org.apache.pig.data.BagFactory Java Examples
The following examples show how to use org.apache.pig.data.BagFactory.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TransposeTupleToBag.java From datafu with Apache License 2.0 | 6 votes |
@Override public DataBag exec(Tuple input) throws IOException { // initialize a reverse mapping HashMap<Integer, String> positionToAlias = new HashMap<Integer, String>(); for (String alias : getFieldAliases().keySet()) { positionToAlias.put(getFieldAliases().get(alias), alias); } DataBag output = BagFactory.getInstance().newDefaultBag(); for (int i=0; i<input.size(); i++) { Tuple tuple = TupleFactory.getInstance().newTuple(); tuple.append(positionToAlias.get(i)); tuple.append(input.get(i)); output.add(tuple); } return output; }
Example #2
Source File: TOBAG2.java From spork with Apache License 2.0 | 6 votes |
/**
 * Collects the fields of the input tuple into a bag.
 *
 * <p>A field that is already a {@code Tuple} is added as-is; any other field
 * (including {@code null}) is first wrapped in a single-field tuple.
 *
 * @param input tuple whose fields become the bag's members
 * @return bag with one tuple per input field
 * @throws IOException declared by the overridden method
 * @throws RuntimeException wrapping any exception raised while building the bag
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    try {
        DataBag result = BagFactory.getInstance().newDefaultBag();
        for (int idx = 0; idx < input.size(); idx++) {
            Object field = input.get(idx);
            if (!(field instanceof Tuple)) {
                // Non-tuple values must be wrapped before they can live in a bag.
                Tuple wrapper = TupleFactory.getInstance().newTuple(1);
                wrapper.set(0, field);
                field = wrapper;
            }
            result.add((Tuple) field);
        }
        return result;
    } catch (Exception cause) {
        throw new RuntimeException("Error while creating a bag", cause);
    }
}
Example #3
Source File: POCross.java From spork with Apache License 2.0 | 6 votes |
/**
 * Drains every input operator except the last one into its own bag and
 * records an iterator over each bag.
 *
 * <p>Populates {@code inputBags} and {@code its}, both sized to
 * {@code inputs.size() - 1}.
 *
 * @throws ExecException if any drained operator reports an error status
 */
@SuppressWarnings("unchecked")
private void accumulateData() throws ExecException {
    final int bagCount = inputs.size() - 1;
    inputBags = new DataBag[bagCount];
    its = new Iterator[bagCount];

    int slot = 0;
    for (int idx = 0; idx < bagCount; idx++) {
        PhysicalOperator source = inputs.get(idx);
        DataBag collected = BagFactory.getInstance().newDefaultBag();
        inputBags[slot] = collected;

        Result current = source.getNextTuple();
        while (current.returnStatus != POStatus.STATUS_EOP) {
            if (current.returnStatus == POStatus.STATUS_NULL) {
                // Null results are skipped; fall through to fetch the next one.
            } else if (current.returnStatus == POStatus.STATUS_ERR) {
                throw new ExecException(
                        "Error accumulating data in the local Cross operator");
            } else if (current.returnStatus == POStatus.STATUS_OK) {
                collected.add((Tuple) current.result);
            }
            current = source.getNextTuple();
        }

        its[slot] = collected.iterator();
        slot++;
    }
}
Example #4
Source File: WeightedReservoirSamplingTests.java From datafu with Apache License 2.0 | 6 votes |
/**
 * Feeds 100 single-tuple bags to the sampler through {@code accumulate} and
 * verifies the accumulated result with {@code verifyNoRepeatAllFound}.
 */
@Test
public void weightedReservoirSampleAccumulateTest() throws IOException {
    WeightedReservoirSample sampler = new WeightedReservoirSample("10", "1");

    for (int value = 0; value < 100; value++) {
        // Each accumulated input is a tuple wrapping a bag with one (value, value + 1) tuple.
        Tuple pair = TupleFactory.getInstance().newTuple(2);
        pair.set(0, value);
        pair.set(1, value + 1);

        DataBag singleton = BagFactory.getInstance().newDefaultBag();
        singleton.add(pair);

        Tuple wrapped = TupleFactory.getInstance().newTuple(singleton);
        sampler.accumulate(wrapped);
    }

    DataBag sampled = sampler.getValue();
    verifyNoRepeatAllFound(sampled, 10, 0, 100);
}
Example #5
Source File: ReverseEnumerate.java From datafu with Apache License 2.0 | 6 votes |
/**
 * Copies every tuple of the input bag and appends a descending index to it.
 *
 * <p>The first tuple receives {@code inputBag.size() - 1 + start} and each
 * following tuple receives one less.
 *
 * @param inputBag bag whose tuples are enumerated in reverse
 * @return new bag with the indexed copies of the input tuples
 * @throws IOException declared for failures while reading the tuples
 */
public DataBag call(DataBag inputBag) throws IOException {
    final long spillInterval = 1000000;

    DataBag outputBag = BagFactory.getInstance().newDefaultBag();
    long i = inputBag.size() - 1 + start;
    long count = 0;

    for (Tuple t : inputBag) {
        Tuple t1 = TupleFactory.getInstance().newTuple(t.getAll());
        t1.append(i);
        outputBag.add(t1);
        i--;

        // Count the tuple before testing the threshold. The previous code
        // tested first, so count == 0 triggered a spill right after the very
        // first tuple was added.
        count++;
        if (count == spillInterval) {
            outputBag.spill();
            count = 0;
        }
    }

    return outputBag;
}
Example #6
Source File: TestMapReduce.java From spork with Apache License 2.0 | 6 votes |
/**
 * Pairs {@code field0} with the string form of the first field of every
 * tuple found in the bag at position 0 of the input.
 *
 * @param input tuple whose first field is converted to a bag
 * @return bag of two-field tuples ({@code field0}, first field as string)
 * @throws IOException wrapping any {@code ExecException} raised while processing
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    try {
        DataBag result = BagFactory.getInstance().newDefaultBag();
        for (Tuple member : DataType.toBag(input.get(0))) {
            Tuple pair = TupleFactory.getInstance().newTuple(2);
            pair.set(0, field0);
            pair.set(1, member.get(0).toString());
            result.add(pair);
        }
        return result;
    } catch (ExecException ee) {
        // Keep the original message and chain the cause for callers.
        throw new IOException(ee.getMessage(), ee);
    }
}
Example #7
Source File: TestEvalPipelineLocal.java From spork with Apache License 2.0 | 6 votes |
/**
 * Produces a bag with one single-field tuple per n-gram of the first input field.
 *
 * <p>The field is converted with {@code toString()} and passed to
 * {@code makeNGrams}. A {@code null} field yields an empty bag.
 *
 * @param input tuple whose first field supplies the text
 * @return bag of n-gram tuples, empty when the field is {@code null}
 * @throws IOException wrapping any {@code ExecException} raised while processing
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    try {
        DataBag output = BagFactory.getInstance().newDefaultBag();

        // Check the field itself for null. The previous code called toString()
        // first, so its null check could never prevent the NullPointerException.
        Object field = input.get(0);
        String title = (field == null) ? null : field.toString();

        if (title != null) {
            for (String nGram : makeNGrams(title)) {
                Tuple t = TupleFactory.getInstance().newTuple(1);
                t.set(0, nGram);
                output.add(t);
            }
        }
        return output;
    } catch (ExecException ee) {
        IOException ioe = new IOException(ee.getMessage());
        ioe.initCause(ee);
        throw ioe;
    }
}
Example #8
Source File: TestLogicalPlanBuilder.java From spork with Apache License 2.0 | 6 votes |
/**
 * Checks that an empty bag literal in a {@code generate} clause is planned as
 * a constant expression of bag type whose value equals a new default bag, and
 * that the foreach schema prints as {@code :bag{}}.
 */
@Test
public void testEmptyBagConst() throws Exception {
    String query = "a = foreach (load 'b') generate {};" + "store a into 'output';";
    LogicalPlan plan = buildPlan(query);

    // Walk from the store sink back to the generate's first output expression.
    Operator storeOp = plan.getSinks().get(0);
    LOForEach forEachOp = (LOForEach) plan.getPredecessors(storeOp).get(0);
    LOGenerate generateOp = (LOGenerate) forEachOp.getInnerPlan().getSinks().get(0);
    LogicalExpressionPlan expressionPlan = generateOp.getOutputPlans().get(0);
    Operator sourceOp = expressionPlan.getSources().get(0);

    Assert.assertTrue(sourceOp instanceof ConstantExpression);
    ConstantExpression constant = (ConstantExpression) sourceOp;
    Assert.assertTrue(constant.getType() == DataType.BAG);
    Assert.assertTrue(constant.getValue() instanceof DataBag);
    Assert.assertTrue(constant.getValue().equals(BagFactory.getInstance().newDefaultBag()));

    String schemaText = forEachOp.getSchema().toString(false);
    Assert.assertTrue(schemaText.equals(":bag{}"));
}
Example #9
Source File: TestPODistinct.java From spork with Apache License 2.0 | 6 votes |
@Test public void testPODistictWithIntAndNullValues() throws ExecException { input = BagFactory.getInstance().newDefaultBag(); TupleFactory tf = TupleFactory.getInstance(); for (int i = 0; i < MAX_SAMPLES; i++) { Tuple t = tf.newTuple(); t.append(r.nextInt(MAX_VALUE)); input.add(t); t = tf.newTuple(); t.append(null); input.add(t); // System.out.println(t); } confirmDistinct(); }
Example #10
Source File: TestBuiltInBagToTupleOrString.java From spork with Apache License 2.0 | 6 votes |
/**
 * Runs {@code BagToString} on a bag of two two-field tuples with "-" as the
 * delimiter and expects {@code "a-5-c-6"}.
 */
@Test
public void testBasicBagToStringUDF() throws Exception {
    TupleFactory tupleFactory = TupleFactory.getInstance();
    BagFactory bagFactory = BagFactory.getInstance();

    Tuple first = tupleFactory.newTuple(2);
    first.set(0, "a");
    first.set(1, 5);

    Tuple second = tupleFactory.newTuple(2);
    second.set(0, "c");
    second.set(1, 6);

    DataBag source = bagFactory.newDefaultBag();
    source.add(first);
    source.add(second);

    // UDF input: (bag, delimiter)
    Tuple arguments = tupleFactory.newTuple(2);
    arguments.set(0, source);
    arguments.set(1, "-");

    BagToString udf = new BagToString();
    String joined = udf.exec(arguments);

    assertEquals("a-5-c-6", joined);
}
Example #11
Source File: TestSkewedJoin.java From spork with Apache License 2.0 | 6 votes |
/**
 * Runs a skewed self-join on {@code INPUT_FILE5} and fails the test if
 * executing or iterating the join throws.
 */
@Test
public void testSkewedJoinNullKeys() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);");

    try {
        DataBag joined = BagFactory.getInstance().newDefaultBag();
        pigServer.registerQuery("C = join A by id, B by id using 'skewed';");

        // Consuming the iterator is what forces the join to run.
        for (Iterator<Tuple> rows = pigServer.openIterator("C"); rows.hasNext(); ) {
            joined.add(rows.next());
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
        e.printStackTrace();
        fail("Should support null keys in skewed join");
    }
}
Example #12
Source File: TestBinInterSedes.java From spork with Apache License 2.0 | 6 votes |
@Test public void testTupleWriteRead1() throws IOException { //create a tuple with columns of different type Tuple tuplein = TupleFactory.getInstance().newTuple(7); tuplein.set(0, 12); Map<String, String> map = new HashMap<String, String>(); map.put("pig", "scalability"); tuplein.set(1, map); tuplein.set(2, null); tuplein.set(3, 12L); tuplein.set(4, 1.2F); Tuple innerTuple = TupleFactory.getInstance().newTuple(1); innerTuple.set(0, "innerTuple"); tuplein.set(5, innerTuple); DataBag bag = BagFactory.getInstance().newDefaultBag(); bag.add(innerTuple); tuplein.set(6, bag); testTupleSedes(tuplein); assertEquals( "(12,[pig#scalability],,12,1.2,(innerTuple),{(innerTuple)})", TupleFormat.format(tuplein)); }
Example #13
Source File: ToBag.java From spork with Apache License 2.0 | 6 votes |
/**
 * Wraps each non-null field of the input in a single-field tuple and
 * collects the wrappers in a bag. Null fields are skipped.
 *
 * @param input tuple whose fields become the bag's members
 * @return bag with one single-field tuple per non-null input field
 * @throws IOException declared by the overridden method
 * @throws RuntimeException wrapping any exception raised while building the bag
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    try {
        DataBag result = BagFactory.getInstance().newDefaultBag();
        for (int idx = 0; idx < input.size(); idx++) {
            Object field = input.get(idx);
            if (field == null) {
                continue;
            }
            Tuple wrapper = TupleFactory.getInstance().newTuple(1);
            wrapper.set(0, field);
            result.add(wrapper);
        }
        return result;
    } catch (Exception cause) {
        throw new RuntimeException("Error while creating a bag", cause);
    }
}
Example #14
Source File: TestPODistinct.java From spork with Apache License 2.0 | 6 votes |
@Test public void testPODistictArityWithNullValues() throws ExecException { input = BagFactory.getInstance().newDefaultBag(); TupleFactory tf = TupleFactory.getInstance(); for (int i = 0; i < MAX_SAMPLES; i++) { Tuple t = tf.newTuple(); if ( r.nextInt(MAX_VALUE) % 3 == 0 ){ t.append(null); } t.append(r.nextInt(MAX_VALUE)); t.append(r.nextInt(MAX_VALUE)); input.add(t); // System.out.println(t); } confirmDistinct(); }
Example #15
Source File: ReservoirSample.java From datafu with Apache License 2.0 | 6 votes |
@Override public Tuple exec(Tuple input) throws IOException { getReservoir().clear(); DataBag bagOfSamples = (DataBag) input.get(0); for (Tuple innerTuple : bagOfSamples) { DataBag samples = (DataBag) innerTuple.get(0); for (Tuple sample : samples) { // use the same score as previously generated getReservoir().consider(ScoredTuple.fromIntermediateTuple(sample)); } } DataBag output = BagFactory.getInstance().newDefaultBag(); for (ScoredTuple scoredTuple : getReservoir()) { // add the score on to the intermediate tuple output.add(scoredTuple.getIntermediateTuple(tupleFactory)); } return tupleFactory.newTuple(output); }
Example #16
Source File: TestLocal.java From spork with Apache License 2.0 | 5 votes |
/**
 * Pairs {@code field0} with the string form of the first field of every
 * tuple found in the bag at position 0 of the input.
 *
 * @param input tuple whose first field is converted to a bag
 * @return bag of two-field tuples ({@code field0}, first field as string)
 * @throws IOException if a field cannot be read or set
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    DataBag source = DataType.toBag(input.get(0));
    DataBag result = BagFactory.getInstance().newDefaultBag();

    for (Tuple member : source) {
        String firstField = member.get(0).toString();

        Tuple pair = TupleFactory.getInstance().newTuple(2);
        pair.set(0, field0);
        pair.set(1, firstField);
        result.add(pair);
    }

    return result;
}
Example #17
Source File: TestBuiltInBagToTupleOrString.java From spork with Apache License 2.0 | 5 votes |
/**
 * Runs {@code BagToString} with "_" as the delimiter on a bag whose second
 * tuple contains a nested tuple, and expects {@code "a_5_c_6_(d,7)"}.
 */
@Test
public void testNestedTupleForBagToStringUDF() throws Exception {
    TupleFactory tupleFactory = TupleFactory.getInstance();
    BagFactory bagFactory = BagFactory.getInstance();

    Tuple flat = tupleFactory.newTuple(2);
    flat.set(0, "a");
    flat.set(1, 5);

    Tuple inner = tupleFactory.newTuple(2);
    inner.set(0, "d");
    inner.set(1, 7);

    // Second member carries the nested tuple as its third field.
    Tuple withNested = tupleFactory.newTuple(3);
    withNested.set(0, "c");
    withNested.set(1, 6);
    withNested.set(2, inner);

    DataBag source = bagFactory.newDefaultBag();
    source.add(flat);
    source.add(withNested);

    // UDF input: (bag, delimiter)
    Tuple arguments = tupleFactory.newTuple(2);
    arguments.set(0, source);
    arguments.set(1, "_");

    BagToString udf = new BagToString();
    String joined = udf.exec(arguments);

    assertEquals("a_5_c_6_(d,7)", joined);
}
Example #18
Source File: Util.java From spork with Apache License 2.0 | 5 votes |
/**
 * Fills {@code t} so that field {@code i} holds a bag containing one tuple
 * loaded from {@code input[i]}.
 *
 * @param t tuple to populate; one field is set per row of {@code input}
 * @param input rows of values, each loaded through the {@code String[]} overload
 * @return the same tuple {@code t}
 * @throws ExecException if a field cannot be set
 */
static public Tuple loadTuple(Tuple t, String[][] input) throws ExecException {
    int column = 0;
    for (String[] row : input) {
        Tuple rowTuple = loadTuple(TupleFactory.getInstance().newTuple(row.length), row);

        DataBag holder = BagFactory.getInstance().newDefaultBag();
        holder.add(rowTuple);

        t.set(column, holder);
        column++;
    }
    return t;
}
Example #19
Source File: TestStitch.java From spork with Apache License 2.0 | 5 votes |
/**
 * Runs {@code Stitch} on a two-tuple bag and a one-tuple bag. Expects two
 * output tuples: the first with the fields of both bags' first tuples, the
 * second with only the first bag's fields.
 */
@Test
public void testSecondShort() throws Exception {
    TupleFactory tupleFactory = TupleFactory.getInstance();
    BagFactory bagFactory = BagFactory.getInstance();

    // First bag: (a,b), (c,d)
    Tuple ab = tupleFactory.newTuple();
    ab.append("a");
    ab.append("b");
    Tuple cd = tupleFactory.newTuple();
    cd.append("c");
    cd.append("d");
    DataBag longer = bagFactory.newDefaultBag();
    longer.add(ab);
    longer.add(cd);

    // Second bag: (1,2) only, so it is shorter than the first.
    Tuple digits = tupleFactory.newTuple();
    digits.append("1");
    digits.append("2");
    DataBag shorter = bagFactory.newDefaultBag();
    shorter.add(digits);

    Tuple arguments = tupleFactory.newTuple();
    arguments.append(longer);
    arguments.append(shorter);

    Stitch func = new Stitch();
    DataBag out = func.exec(arguments);
    assertEquals(2, out.size());

    Iterator<Tuple> rows = out.iterator();

    Tuple stitched = rows.next();
    assertEquals(4, stitched.size());
    assertEquals("a", stitched.get(0));
    assertEquals("b", stitched.get(1));
    assertEquals("1", stitched.get(2));
    assertEquals("2", stitched.get(3));

    Tuple unmatched = rows.next();
    assertEquals(2, unmatched.size());
    assertEquals("c", unmatched.get(0));
    assertEquals("d", unmatched.get(1));
}
Example #20
Source File: TestEvalPipeline.java From spork with Apache License 2.0 | 5 votes |
/**
 * Returns a bag holding three tuples, each created from the string "a".
 * The input is not read.
 *
 * @param input ignored
 * @return bag with three tuples built from "a"
 * @throws IOException declared by the overridden method
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    final int copies = 3;
    TupleFactory factory = TupleFactory.getInstance();
    DataBag result = BagFactory.getInstance().newDefaultBag();
    for (int n = 0; n < copies; n++) {
        result.add(factory.newTuple("a"));
    }
    return result;
}
Example #21
Source File: TestDataModel.java From spork with Apache License 2.0 | 5 votes |
/**
 * Builds a tuple holding one value of each of several data types: a nested
 * tuple, a bag, a map, and Integer, Long, Float, Double, Boolean,
 * DataByteArray and String values.
 *
 * <p>The tuple is created with 11 fields but only indexes 0 through 9 are
 * assigned here.
 *
 * @return the populated tuple
 * @throws Exception if a field cannot be set
 */
private Tuple giveMeOneOfEach() throws Exception {
    TupleFactory tf = TupleFactory.getInstance();

    Tuple t1 = tf.newTuple(11);
    Tuple t2 = tf.newTuple(2);

    // Boxed values come from the valueOf factories rather than the deprecated
    // wrapper constructors; string literals replace redundant new String(...).
    t2.set(0, Integer.valueOf(3));
    t2.set(1, Float.valueOf(3.0f));

    DataBag bag = BagFactory.getInstance().newDefaultBag();
    // Integer.valueOf keeps the argument an Integer object so the
    // single-value newTuple overload is chosen, not the size overload.
    bag.add(tf.newTuple(Integer.valueOf(4)));
    bag.add(tf.newTuple("mary had a little lamb"));

    Map<String, Object> map = new LinkedHashMap<String, Object>(2);
    map.put("hello", "world");
    map.put("goodbye", "all");

    t1.set(0, t2);
    t1.set(1, bag);
    t1.set(2, map);
    t1.set(3, Integer.valueOf(42));
    t1.set(4, Long.valueOf(5000000000L));
    // Narrow the double literal exactly as the Float(double) constructor did.
    t1.set(5, Float.valueOf((float) 3.141592654));
    t1.set(6, Double.valueOf(2.99792458e8));
    t1.set(7, Boolean.TRUE);
    t1.set(8, new DataByteArray("hello"));
    t1.set(9, "goodbye");

    return t1;
}
Example #22
Source File: TestMapSideCogroup.java From spork with Apache License 2.0 | 5 votes |
/**
 * Runs a merge cogroup of {@code INPUT_FILE1} against {@code EMPTY_FILE} and
 * checks that exactly three groups come back, each with an empty second bag.
 */
@Test
public void testEmptyDeltaFile() throws Exception {
    PigServer pigServer = new PigServer(cluster.getExecType(), cluster.getProperties());
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' using "+ DummyCollectableLoader.class.getName() +"() as (c1:chararray,c2:int);");
    pigServer.registerQuery("B = LOAD '" + EMPTY_FILE + "' using "+ DummyIndexableLoader.class.getName() +"() as (c1:chararray,c2:int);");

    DataBag dbMergeCogrp = BagFactory.getInstance().newDefaultBag();
    pigServer.registerQuery("C = cogroup A by c1, B by c1 using 'merge';");
    Iterator<Tuple> iter = pigServer.openIterator("C");
    while (iter.hasNext()) {
        dbMergeCogrp.add(iter.next());
    }

    String[] results = new String[]{
            "(1,{(1,1),(1,2),(1,3)},{})",
            "(2,{(2,1),(2,2),(2,3)},{})",
            "(3,{(3,1),(3,2),(3,3)},{})"
    };

    assertEquals(3, dbMergeCogrp.size());
    Iterator<Tuple> itr = dbMergeCogrp.iterator();
    for (String expected : results) {
        // Expected value goes first so a failure message reports the two
        // sides correctly; the previous call had them swapped.
        assertEquals(expected, itr.next().toString());
    }
    assertFalse(itr.hasNext());
}
Example #23
Source File: TestTuple.java From spork with Apache License 2.0 | 5 votes |
/**
 * Builds a seven-column tuple of mixed types and checks the string produced
 * by {@code TupleFormat.format}. Fails if setting a field throws.
 */
@Test
public void testTupleFormat() {
    try {
        // Nested values used by the map, tuple and bag columns.
        Map<String, String> mapColumn = new HashMap<String, String>();
        mapColumn.put("pig", "scalability");

        Tuple nested = mTupleFactory.newTuple(1);
        nested.set(0, "innerTuple");

        DataBag bagColumn = BagFactory.getInstance().newDefaultBag();
        bagColumn.add(nested);

        // One column per type: int, map, null, long, float, tuple, bag.
        Tuple record = mTupleFactory.newTuple(7);
        record.set(0, 12);
        record.set(1, mapColumn);
        record.set(2, null);
        record.set(3, 12L);
        record.set(4, 1.2F);
        record.set(5, nested);
        record.set(6, bagColumn);

        assertEquals(
                "(12,[pig#scalability],,12,1.2,(innerTuple),{(innerTuple)})",
                TupleFormat.format(record));
    } catch (ExecException e) {
        e.printStackTrace();
        fail();
    }
}
Example #24
Source File: TestEvalPipelineLocal.java From spork with Apache License 2.0 | 5 votes |
/**
 * Returns a bag containing the first input field when it is an Integer
 * greater than 10, and an empty bag otherwise.
 *
 * @param input tuple whose first field is read as an {@code Integer}
 * @return bag with a single one-field tuple, or an empty bag when the value
 *         is {@code null} or not greater than 10
 * @throws IOException if the field cannot be read
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    Integer content = (Integer) input.get(0);
    DataBag bag = BagFactory.getInstance().newDefaultBag();

    // Guard against a null field: unboxing it in the comparison would throw
    // a NullPointerException.
    if (content != null && content > 10) {
        Tuple t = TupleFactory.getInstance().newTuple();
        t.append(content);
        bag.add(t);
    }
    return bag;
}
Example #25
Source File: TestEvalPipelineLocal.java From spork with Apache License 2.0 | 5 votes |
/**
 * Builds a fixed map with one entry per data type: string, int, long, float,
 * double, byte array, nested map, tuple and bag. The input is not read.
 *
 * @param input ignored
 * @return the populated map
 * @throws IOException declared by the overridden method
 */
@Override
public Map<String, Object> exec(Tuple input) throws IOException {
    TupleFactory tupleFactory = TupleFactory.getInstance();

    // Boxed values come from the valueOf factories rather than the deprecated
    // wrapper constructors; string literals replace redundant new String(...).
    ArrayList<Object> objList = new ArrayList<Object>();
    objList.add(Integer.valueOf(1));
    objList.add(Double.valueOf(1.0));
    objList.add(Float.valueOf(1.0f));
    objList.add("World!");
    Tuple tuple = tupleFactory.newTuple(objList);

    BagFactory bagFactory = BagFactory.getInstance();
    DataBag bag = bagFactory.newDefaultBag();
    bag.add(tuple);

    Map<String, Object> mapInMap = new HashMap<String, Object>();
    mapInMap.put("int", Integer.valueOf(10));
    mapInMap.put("float", Float.valueOf(10.0f));

    Map<String, Object> myMap = new HashMap<String, Object>();
    myMap.put("string", "Hello");
    myMap.put("int", Integer.valueOf(1));
    myMap.put("long", Long.valueOf(1L));
    myMap.put("float", Float.valueOf(1.0f));
    myMap.put("double", Double.valueOf(1.0));
    myMap.put("dba", new DataByteArray("bytes".getBytes()));
    myMap.put("map", mapInMap);
    myMap.put("tuple", tuple);
    myMap.put("bag", bag);
    return myMap;
}
Example #26
Source File: TestEvalPipelineLocal.java From spork with Apache License 2.0 | 5 votes |
@Test public void testMapLookup() throws Exception { DataBag b = BagFactory.getInstance().newDefaultBag(); Map<String, Object> colors = new HashMap<String, Object>(); colors.put("apple","red"); colors.put("orange","orange"); Map<String, Object> weights = new HashMap<String, Object>(); weights.put("apple","0.1"); weights.put("orange","0.3"); Tuple t = mTf.newTuple(); t.append(colors); t.append(weights); b.add(t); File tempF = File.createTempFile("tmp", ""); tempF.delete(); // we only needed the temp file name, so delete the file String fileName = Util.removeColon(tempF.getCanonicalPath()); PigFile f = new PigFile(fileName); f.store(b, new FuncSpec(BinStorage.class.getCanonicalName()), pigServer.getPigContext()); pigServer.registerQuery("a = load '" + Util.encodeEscape(fileName) + "' using BinStorage();"); pigServer.registerQuery("b = foreach a generate $0#'apple',flatten($1#'orange');"); Iterator<Tuple> iter = pigServer.openIterator("b"); t = iter.next(); Assert.assertEquals(t.get(0).toString(), "red"); Assert.assertEquals(DataType.toDouble(t.get(1)), 0.3); Assert.assertFalse(iter.hasNext()); }
Example #27
Source File: TestFRJoin.java From spork with Apache License 2.0 | 5 votes |
/**
 * Loads the replicated file with {@code POLoad} and groups its rows by key
 * into {@code replTbl}.
 *
 * <p>For each loaded tuple, the key is the {@code keyField} column cast to a
 * string, and the stored value is a (key, field 1 cast to Integer) tuple
 * added to the bag registered under that key.
 *
 * @throws IOException if connecting, loading or casting fails
 */
private void setUpHashTable() throws IOException {
    FuncSpec storageSpec = new FuncSpec(PigStorage.class.getName() + "()");
    FileSpec replFile = new FileSpec(repl, storageSpec);
    POLoad loader = new POLoad(new OperatorKey("Repl File Loader", 1L), replFile);

    PigContext context = new PigContext(ExecType.MAPREDUCE, PigMapReduce.sJobConfInternal.get());
    context.connect();
    loader.setPc(context);

    Result next = loader.getNextTuple();
    while (next.returnStatus != POStatus.STATUS_EOP) {
        Tuple raw = (Tuple) next.result;

        // A new load function is instantiated from the spec for every tuple,
        // as in the original code.
        LoadFunc caster = ((LoadFunc) PigContext.instantiateFuncFromSpec(loader.getLFile().getFuncSpec()));
        byte[] keyBytes = ((DataByteArray) raw.get(keyField)).get();
        String key = caster.getLoadCaster().bytesToCharArray(keyBytes);

        Tuple typed = TupleFactory.getInstance().newTuple(2);
        typed.set(0, key);
        typed.set(1, caster.getLoadCaster().bytesToInteger(((DataByteArray) raw.get(1)).get()));

        // Look up the bag for this key, creating and registering it on first use.
        final DataBag bucket;
        if (!replTbl.containsKey(key)) {
            bucket = BagFactory.getInstance().newDefaultBag();
            replTbl.put(key, bucket);
        } else {
            bucket = replTbl.get(key);
        }
        bucket.add(typed);

        next = loader.getNextTuple();
    }
}
Example #28
Source File: SessionTests.java From datafu with Apache License 2.0 | 5 votes |
/**
 * Wraps the given timestamps in a one-field tuple whose field is a bag with
 * one tuple per timestamp, each holding the timestamp's milliseconds.
 *
 * @param dt timestamps to add, in order
 * @return tuple whose field 0 is the populated bag
 * @throws Exception if the bag cannot be set on the tuple
 */
private static Tuple buildInputBag(DateTime ...dt) throws Exception {
    DataBag timestamps = BagFactory.getInstance().newDefaultBag();

    // The bag is attached before it is filled; the tuple holds a reference to it.
    Tuple wrapper = TupleFactory.getInstance().newTuple(1);
    wrapper.set(0, timestamps);

    for (int idx = 0; idx < dt.length; idx++) {
        timestamps.add(TupleFactory.getInstance().newTuple(Collections.singletonList(dt[idx].getMillis())));
    }

    return wrapper;
}
Example #29
Source File: TestSkewedJoin.java From spork with Apache License 2.0 | 5 votes |
/**
 * Runs a skewed join keyed on a map lookup cast to chararray and consumes
 * every result tuple. The test passes as long as nothing throws.
 */
@Test
public void testSkewedJoinMapKey() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE4 + "' as (m:[]);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE4 + "' as (n:[]);");

    DataBag joined = BagFactory.getInstance().newDefaultBag();
    pigServer.registerQuery("C = join A by (chararray)m#'a100', B by (chararray)n#'a100' using 'skewed' parallel 20;");

    // Consuming the iterator is what forces the join to run.
    for (Iterator<Tuple> rows = pigServer.openIterator("C"); rows.hasNext(); ) {
        joined.add(rows.next());
    }
}
Example #30
Source File: TestSkewedJoin.java From spork with Apache License 2.0 | 5 votes |
/**
 * Registers a three-way skewed join and iterates over it; the test expects a
 * {@code FrontendException} to be thrown.
 */
@Test(expected = FrontendException.class)
public void testSkewedJoin3Way() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);");

    DataBag joined = BagFactory.getInstance().newDefaultBag();
    pigServer.registerQuery("D = join A by id, B by id, C by id using 'skewed' parallel 5;");

    for (Iterator<Tuple> rows = pigServer.openIterator("D"); rows.hasNext(); ) {
        joined.add(rows.next());
    }
}