org.apache.pig.data.DataBag Java Examples
The following examples show how to use
org.apache.pig.data.DataBag.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Distinct.java From spork with Apache License 2.0 | 6 votes |
static private DataBag getDistinctFromNestedBags(Tuple input, EvalFunc evalFunc) throws IOException { DataBag result = createDataBag(); long progressCounter = 0; try { DataBag bg = (DataBag)input.get(0); if (bg == null) { return result; } for (Tuple tuple : bg) { // Each tuple has a single column // which is a bag. Get tuples out of it // and distinct over all tuples for (Tuple t : (DataBag)tuple.get(0)) { result.add(t); ++progressCounter; if((progressCounter % 1000) == 0){ evalFunc.progress(); } } } } catch (ExecException e) { throw e; } return result; }
Example #2
Source File: TestBagFormat.java From spork with Apache License 2.0 | 6 votes |
/**
 * Checks that {@code BagFormat.format} renders a bag containing a plain tuple
 * and a tuple that wraps a nested bag as {@code {(12),({(12)})}}.
 */
@Test
public void testBagFormat() throws Exception {
    DataBag outerBag = BagFactory.getInstance().newDefaultBag();

    // First element: a one-field tuple holding the integer 12.
    Tuple scalarTuple = TupleFactory.getInstance().newTuple(1);
    scalarTuple.set(0, 12);
    outerBag.add(scalarTuple);

    // Second element: a one-field tuple holding a bag that contains the first tuple.
    Tuple nestedTuple = TupleFactory.getInstance().newTuple(1);
    DataBag nestedBag = BagFactory.getInstance().newDefaultBag();
    nestedBag.add(scalarTuple);
    nestedTuple.set(0, nestedBag);
    outerBag.add(nestedTuple);

    System.out.println(BagFormat.format(outerBag));
    assertEquals("{(12),({(12)})}", BagFormat.format(outerBag));
}
Example #3
Source File: DoubleAvg.java From spork with Apache License 2.0 | 6 votes |
/**
 * Computes the average from the bag in field 0 of {@code input}.
 *
 * <p>The bag is reduced by {@code combine} to a tuple whose field 0 is the sum
 * and field 1 is the count.
 *
 * @param input tuple whose field 0 is the bag to average
 * @return {@code sum / count}, or {@code null} when the sum is null or the
 *         count is not positive
 * @throws IOException an {@link ExecException} from the computation is rethrown
 *         as is; any other exception is wrapped in an {@code ExecException}
 *         with error code 2106
 */
@Override
public Double exec(Tuple input) throws IOException {
    try {
        DataBag b = (DataBag) input.get(0);
        Tuple combined = combine(b);

        Double sum = (Double) combined.get(0);
        if (sum == null) {
            return null;
        }

        double count = (Long) combined.get(1);
        if (count > 0) {
            // Double.valueOf replaces the deprecated Double(double) constructor.
            return Double.valueOf(sum / count);
        }
        return null;
    } catch (ExecException ee) {
        throw ee;
    } catch (Exception e) {
        int errCode = 2106;
        String msg = "Error while computing average in " + this.getClass().getSimpleName();
        throw new ExecException(msg, errCode, PigException.BUG, e);
    }
}
Example #4
Source File: TestPigStreaming.java From spork with Apache License 2.0 | 6 votes |
/**
 * Serializes a tuple whose only field is a bag of two 2-field tuples and
 * expects the byte form of {@code {(A,B),(1,2)}} followed by a newline.
 */
@Test
public void testSerialize__bag() throws IOException {
    Tuple wrapper = tf.newTuple(1);
    Tuple letters = tf.newTuple(2);
    Tuple numbers = tf.newTuple(2);

    List<Tuple> contents = new ArrayList<Tuple>();
    contents.add(letters);
    contents.add(numbers);

    letters.set(0, "A");
    letters.set(1, "B");
    numbers.set(0, 1);
    numbers.set(1, 2);

    DataBag bag = DefaultBagFactory.getInstance().newDefaultBag(contents);
    wrapper.set(0, bag);

    byte[] expected = "{(A,B),(1,2)}\n".getBytes();
    byte[] actual = ps.serialize(wrapper);
    Assert.assertArrayEquals(expected, actual);
}
Example #5
Source File: TestMapReduce.java From spork with Apache License 2.0 | 6 votes |
/**
 * Builds a bag of 2-field tuples from the bag in field 0 of {@code input}.
 *
 * <p>For each source tuple the output tuple holds {@code field0} in position 0
 * and the string form of the source tuple's first field in position 1.
 *
 * @param input tuple whose field 0 is converted to a bag via {@code DataType.toBag}
 * @return the bag of generated tuples
 * @throws IOException wrapping any {@link ExecException} raised while reading or writing fields
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    try {
        DataBag result = BagFactory.getInstance().newDefaultBag();
        for (Tuple source : DataType.toBag(input.get(0))) {
            Tuple pair = TupleFactory.getInstance().newTuple(2);
            pair.set(0, field0);
            pair.set(1, source.get(0).toString());
            result.add(pair);
        }
        return result;
    } catch (ExecException ee) {
        // Surface the failure as an IOException carrying the original as its cause.
        throw new IOException(ee.getMessage(), ee);
    }
}
Example #6
Source File: IteratingAccumulatorEvalFunc.java From spork with Apache License 2.0 | 6 votes |
@Override public void accumulate(Tuple input) throws IOException { if (!isInitialized) initialize(); for (Tuple t : (DataBag)input.get(0)) { if (isFinished) return; boolean added = false; while (!isFinished && !added && !exceptionThrown) try { added = queue.offer(t, WAIT_TO_OFFER, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { } //TODO handle the exception? if (exceptionThrown) throw new RuntimeException("Exception thrown in thread: ", executionThreadException); } }
Example #7
Source File: TestConversions.java From spork with Apache License 2.0 | 6 votes |
/**
 * Converts the string form of a random float bag back into a bag using a
 * field schema obtained from {@code GenRandomData.getFloatDataBagFieldSchema(5)}
 * and checks that each of the 100 x 5 values comes back as the {@code Integer}
 * equal to the original float's {@code intValue()}.
 */
@Test
public void testBytesToBagWithConversion() throws IOException {
    DataBag original = GenRandomData.genFloatDataBag(r, 5, 100);
    ResourceFieldSchema schema = GenRandomData.getFloatDataBagFieldSchema(5);
    DataBag converted = ps.getLoadCaster().bytesToBag(original.toString().getBytes(), schema);

    Iterator<Tuple> originalIt = original.iterator();
    Iterator<Tuple> convertedIt = converted.iterator();
    for (int row = 0; row < 100; row++) {
        Tuple source = originalIt.next();
        assertTrue(convertedIt.hasNext());
        Tuple result = convertedIt.next();
        for (int col = 0; col < 5; col++) {
            assertTrue(result.get(col) instanceof Integer);
            Integer expectedValue = ((Float) source.get(col)).intValue();
            assertEquals(expectedValue, result.get(col));
        }
    }
}
Example #8
Source File: LineageTrimmingVisitor.java From spork with Apache License 2.0 | 6 votes |
/**
 * Creates a visitor that walks {@code plan} with a {@code PreOrderDepthFirstWalker}.
 *
 * <p>All arguments are stored by reference, an empty input-to-data map is
 * created, and {@code init()} is invoked last.
 *
 * @param plan        logical plan to walk
 * @param baseData    base data per load operator; the map itself is kept, not copied
 * @param eg          example generator
 * @param LogToPhyMap mapping from logical operators to physical operators
 * @param physPlan    physical plan
 * @param pc          Pig context
 * @throws IOException          declared by the constructor; raised by {@code init()} presumably
 * @throws InterruptedException declared by the constructor; raised by {@code init()} presumably
 */
public LineageTrimmingVisitor(LogicalPlan plan,
        Map<LOLoad, DataBag> baseData,
        ExampleGenerator eg,
        Map<Operator, PhysicalOperator> LogToPhyMap,
        PhysicalPlan physPlan,
        PigContext pc) throws IOException, InterruptedException {
    super(plan, new PreOrderDepthFirstWalker(plan));

    // Plans and their mapping.
    this.plan = plan;
    this.physPlan = physPlan;
    this.LogToPhyMap = LogToPhyMap;

    // Data: the caller's map is shared rather than copied.
    this.baseData = baseData;
    this.inputToDataMap = new HashMap<FileSpec, DataBag>();

    // Collaborators.
    this.eg = eg;
    this.pc = pc;

    init();
}
Example #9
Source File: INVERSEMAP.java From spork with Apache License 2.0 | 6 votes |
private HashMap<String, DataBag> doInverse(Map<String,Object> original) throws ExecException { final HashMap<String, DataBag> inverseMap = new HashMap<String, DataBag>(original.size()); for (Map.Entry<String, Object> entry : original.entrySet()) { Object o = entry.getValue(); String newKey; // Call toString for all primitive types, else throw an Exception if (!(o instanceof Tuple || o instanceof DataBag)) { newKey = o.toString(); } else { throw new ExecException("Wrong type. Value is of type " + o.getClass()); } // Create a new bag if "newKey" does not exist in Map DataBag bag = inverseMap.get(newKey); if (bag == null) { bag = new NonSpillableDataBag(); bag.add(TUPLE_FACTORY.newTuple(entry.getKey())); inverseMap.put(newKey, bag); } else { bag.add(TUPLE_FACTORY.newTuple(entry.getKey())); } } return inverseMap; }
Example #10
Source File: TestBuiltInBagToTupleOrString.java From spork with Apache License 2.0 | 6 votes |
/**
 * Runs {@code BagToString} on a bag holding ("a", 5) and ("c", 6) without
 * passing a delimiter and expects {@code "a_5_c_6"}.
 */
@Test
public void testUseDefaultDelimiterBagToStringUDF() throws Exception {
    BagFactory bagFactory = BagFactory.getInstance();
    TupleFactory tupleFactory = TupleFactory.getInstance();

    Tuple first = tupleFactory.newTuple(2);
    first.set(0, "a");
    first.set(1, 5);

    Tuple second = tupleFactory.newTuple(2);
    second.set(0, "c");
    second.set(1, 6);

    DataBag source = bagFactory.newDefaultBag();
    source.add(first);
    source.add(second);

    BagToString bagToString = new BagToString();
    // The UDF receives only the bag, so no delimiter is supplied.
    Tuple arguments = tupleFactory.newTuple(1);
    arguments.set(0, source);

    String joined = bagToString.exec(arguments);

    assertEquals("a_5_c_6", joined);
}
Example #11
Source File: ReservoirSample.java From datafu with Apache License 2.0 | 6 votes |
@Override public DataBag exec(Tuple input) throws IOException { getReservoir().clear(); DataBag bagOfSamples = (DataBag) input.get(0); for (Tuple innerTuple : bagOfSamples) { DataBag samples = (DataBag) innerTuple.get(0); for (Tuple sample : samples) { // use the same score as previously generated getReservoir().consider(ScoredTuple.fromIntermediateTuple(sample)); } } DataBag output = BagFactory.getInstance().newDefaultBag(); for (ScoredTuple scoredTuple : getReservoir()) { // output the original tuple output.add(scoredTuple.getTuple()); } return output; }
Example #12
Source File: JoinPackager.java From spork with Apache License 2.0 | 6 votes |
@Override public void attachInput(Object key, DataBag[] bags, boolean[] readOnce) throws ExecException { checkBagType(); this.key = key; this.bags = bags; this.readOnce = readOnce; // JoinPackager expects all but the last bag to be materialized for (int i = 0; i < bags.length - 1; i++) { if (readOnce[i]) { DataBag materializedBag = getBag(); materializedBag.addAll(bags[i]); bags[i] = materializedBag; } } if (readOnce[numInputs - 1] != true) { throw new ExecException( "JoinPackager expects the last input to be streamed"); } this.newKey = true; }
Example #13
Source File: AVG.java From spork with Apache License 2.0 | 6 votes |
/**
 * Computes the average from the bag in field 0 of {@code input}.
 *
 * <p>The bag is reduced by {@code combine} to a tuple whose field 0 is the sum
 * and field 1 is the count.
 *
 * @param input tuple whose field 0 is the bag to average
 * @return {@code sum / count}, or {@code null} when the sum is null or the
 *         count is not positive
 * @throws IOException an {@link ExecException} from the computation is rethrown
 *         as is; any other exception is wrapped in an {@code ExecException}
 *         with error code 2106
 */
@Override
public Double exec(Tuple input) throws IOException {
    try {
        DataBag b = (DataBag) input.get(0);
        Tuple combined = combine(b);

        Double sum = (Double) combined.get(0);
        if (sum == null) {
            return null;
        }

        double count = (Long) combined.get(1);
        if (count > 0) {
            // Double.valueOf replaces the deprecated Double(double) constructor.
            return Double.valueOf(sum / count);
        }
        return null;
    } catch (ExecException ee) {
        throw ee;
    } catch (Exception e) {
        int errCode = 2106;
        String msg = "Error while computing average in " + this.getClass().getSimpleName();
        throw new ExecException(msg, errCode, PigException.BUG, e);
    }
}
Example #14
Source File: StringMax.java From spork with Apache License 2.0 | 6 votes |
@Override public Tuple exec(Tuple input) throws IOException { try { // input is a bag with one tuple containing // the column we are trying to max on DataBag bg = (DataBag) input.get(0); String s = null; if(bg.iterator().hasNext()) { Tuple tp = bg.iterator().next(); s = (String)(tp.get(0)); } return tfact.newTuple(s); } catch (ExecException ee) { throw ee; } catch (Exception e) { int errCode = 2106; String msg = "Error while computing max in " + this.getClass().getSimpleName(); throw new ExecException(msg, errCode, PigException.BUG, e); } }
Example #15
Source File: TestEvalPipeline.java From spork with Apache License 2.0 | 6 votes |
/**
 * Emits one single-field tuple per n-gram of the string form of field 0.
 *
 * @param input tuple whose field 0 supplies the text via {@code toString()}
 * @return a bag with one tuple per element returned by {@code makeNGrams};
 *         empty when the text is null
 * @throws IOException wrapping any {@link ExecException} raised while reading or writing fields
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    try {
        DataBag result = BagFactory.getInstance().newDefaultBag();
        String title = input.get(0).toString();
        if (title == null) {
            return result;
        }
        for (String nGram : makeNGrams(title)) {
            Tuple single = TupleFactory.getInstance().newTuple(1);
            single.set(0, nGram);
            result.add(single);
        }
        return result;
    } catch (ExecException ee) {
        // Surface the failure as an IOException carrying the original as its cause.
        throw new IOException(ee.getMessage(), ee);
    }
}
Example #16
Source File: VALUELIST.java From spork with Apache License 2.0 | 6 votes |
@SuppressWarnings("unchecked") @Override public DataBag exec(Tuple input) throws IOException { if(input == null || input.size() == 0) { return null; } Map<String, Object> m = null; //Input must be of type Map. This is verified at compile time m = (Map<String, Object>)(input.get(0)); if(m == null) { return null; } Collection c = m.values(); DataBag bag = new NonSpillableDataBag(c.size()); Iterator<Object> iter = c.iterator(); while(iter.hasNext()) { Tuple t = TUPLE_FACTORY.newTuple(iter.next()); bag.add(t); } return bag; }
Example #17
Source File: JrubyAlgebraicEvalFunc.java From spork with Apache License 2.0 | 5 votes |
/**
 * Passes the bag in field 0 to the Ruby method named by {@code getStage()} on
 * {@code getReceiver()} and converts the Ruby result to a Pig tuple.
 *
 * @param input tuple whose field 0 is the bag to hand to Ruby
 * @return a tuple built from the converted Ruby result
 * @throws IOException wrapping any exception raised during the call or conversion
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (!isInitialized()) {
        initialize();
    }

    try {
        RubyDataBag rubyBag =
                new RubyDataBag(ruby, ruby.getClass("DataBag"), (DataBag) input.get(0));
        IRubyObject rubyResult =
                rubyEngine.callMethod(getReceiver(), getStage(), rubyBag, IRubyObject.class);
        return mTupleFactory.newTuple(PigJrubyLibrary.rubyToPig(rubyResult));
    } catch (Exception e) {
        throw new IOException("Error executing intermediate function: ", e);
    }
}
Example #18
Source File: TestEvalPipelineLocal.java From spork with Apache License 2.0 | 5 votes |
@Test public void testMapLookup() throws Exception { DataBag b = BagFactory.getInstance().newDefaultBag(); Map<String, Object> colors = new HashMap<String, Object>(); colors.put("apple","red"); colors.put("orange","orange"); Map<String, Object> weights = new HashMap<String, Object>(); weights.put("apple","0.1"); weights.put("orange","0.3"); Tuple t = mTf.newTuple(); t.append(colors); t.append(weights); b.add(t); File tempF = File.createTempFile("tmp", ""); tempF.delete(); // we only needed the temp file name, so delete the file String fileName = Util.removeColon(tempF.getCanonicalPath()); PigFile f = new PigFile(fileName); f.store(b, new FuncSpec(BinStorage.class.getCanonicalName()), pigServer.getPigContext()); pigServer.registerQuery("a = load '" + Util.encodeEscape(fileName) + "' using BinStorage();"); pigServer.registerQuery("b = foreach a generate $0#'apple',flatten($1#'orange');"); Iterator<Tuple> iter = pigServer.openIterator("b"); t = iter.next(); Assert.assertEquals(t.get(0).toString(), "red"); Assert.assertEquals(DataType.toDouble(t.get(1)), 0.3); Assert.assertFalse(iter.hasNext()); }
Example #19
Source File: BigIntegerAvg.java From spork with Apache License 2.0 | 5 votes |
/**
 * Counts the tuples in the bag at field 0 whose first field is non-null.
 *
 * <p>Null tuples and tuples with no fields are not counted.
 *
 * @param input tuple whose field 0 is the bag to count
 * @return the number of counted tuples
 * @throws ExecException if reading a field fails
 */
static protected BigInteger count(Tuple input) throws ExecException {
    DataBag values = (DataBag) input.get(0);
    BigInteger total = BigInteger.ZERO;
    for (Tuple candidate : values) {
        if (candidate == null || candidate.size() <= 0 || candidate.get(0) == null) {
            continue;
        }
        total = total.add(BigInteger.ONE);
    }
    return total;
}
Example #20
Source File: TestEvalPipeline.java From spork with Apache License 2.0 | 5 votes |
/**
 * Returns a bag holding three tuples, each wrapping the string {@code "a"}.
 * The input is not read.
 *
 * @param input ignored
 * @return a new default bag with three {@code ("a")} tuples
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    TupleFactory tupleFactory = TupleFactory.getInstance();
    DataBag result = BagFactory.getInstance().newDefaultBag();
    for (int i = 0; i < 3; i++) {
        result.add(tupleFactory.newTuple("a"));
    }
    return result;
}
Example #21
Source File: TestExampleGenerator.java From spork with Apache License 2.0 | 5 votes |
/**
 * Registers a load, two filters on {@code x} and their union, then checks that
 * {@code getExamples("D")} returns a non-null map.
 */
@Test
public void testFilterUnion() throws Exception {
    PigServer server = new PigServer(pigContext);

    String loadQuery = "A = load " + A.toString() + " as (x:int, y:int);";
    server.registerQuery(loadQuery);
    server.registerQuery("B = FILTER A by x > 3;");
    server.registerQuery("C = FILTER A by x < 3;");
    server.registerQuery("D = UNION B, C;");

    Map<Operator, DataBag> examples = server.getExamples("D");

    assertNotNull(examples);
}
Example #22
Source File: BagToTuple.java From spork with Apache License 2.0 | 5 votes |
/**
 * Computes the size of the output tuple as the sum of the sizes of all
 * tuples in the input bag.
 *
 * @param bag bag whose tuples are measured; may be null
 * @return total number of fields across the bag's tuples, or 0 for a null bag
 */
private long getOuputTupleSize(DataBag bag) {
    if (bag == null) {
        return 0;
    }
    long total = 0;
    for (Tuple element : bag) {
        total += element.size();
    }
    return total;
}
Example #23
Source File: WeightedRangePartitionerTez.java From spork with Apache License 2.0 | 5 votes |
@Override public void init() { Map<String, Object> quantileMap = null; if (PigProcessor.sampleMap != null) { // We've collected sampleMap in PigProcessor quantileMap = PigProcessor.sampleMap; } else { LOG.warn("Quantiles map is empty"); inited = true; return; } long start = System.currentTimeMillis(); try { DataBag quantilesList = (DataBag) quantileMap.get(FindQuantiles.QUANTILES_LIST); InternalMap weightedPartsData = (InternalMap) quantileMap.get(FindQuantiles.WEIGHTED_PARTS); estimatedNumPartitions = (Integer)quantileMap.get(PigProcessor.ESTIMATED_NUM_PARALLELISM); convertToArray(quantilesList); for (Entry<Object, Object> ent : weightedPartsData.entrySet()) { Tuple key = (Tuple) ent.getKey(); // sample item which repeats float[] probVec = getProbVec((Tuple) ent.getValue()); weightedParts.put(getPigNullableWritable(key), new DiscreteProbabilitySampleGenerator(probVec)); } } catch (Exception e) { throw new RuntimeException(e); } comparator = ConfigUtils.getIntermediateInputKeyComparator(job); LOG.info("Initialized WeightedRangePartitionerTez. Time taken: " + (System.currentTimeMillis() - start)); inited = true; }
Example #24
Source File: VAR.java From datafu with Apache License 2.0 | 5 votes |
/**
 * Computes the variance from the bag in field 0 of {@code input}.
 *
 * <p>The bag is reduced by {@code combine} to a tuple holding the sum (field 0),
 * the sum of squares (field 1) and the count (field 2). The result is
 * {@code sumSquare / count - (sum / count)^2}.
 *
 * @param input tuple whose field 0 is the bag of values
 * @return the variance, or {@code null} when the sum is null or the count is
 *         not positive
 * @throws IOException an {@link ExecException} is rethrown as is; any other
 *         exception is wrapped in an {@code ExecException} with error code 2106
 */
@Override
public Double exec(Tuple input) throws IOException {
    try {
        DataBag b = (DataBag) input.get(0);
        Tuple combined = combine(b);

        Double sum = (Double) combined.get(0);
        Double sumSquare = (Double) combined.get(1);
        if (sum == null) {
            return null;
        }

        Long count = (Long) combined.get(2);
        Double variance = null;
        if (count > 0) {
            // Primitive intermediates avoid the deprecated Double(double)
            // constructor and needless boxing.
            double avg = sum / count;
            double avgSquare = sumSquare / count;
            variance = avgSquare - avg * avg;
        }
        return variance;
    } catch (ExecException ee) {
        throw ee;
    } catch (Exception e) {
        int errCode = 2106;
        String msg = "Error while computing variance in " + this.getClass().getSimpleName();
        throw new ExecException(msg, errCode, PigException.BUG, e);
    }
}
Example #25
Source File: DoubleAvg.java From spork with Apache License 2.0 | 5 votes |
/**
 * Counts the tuples in the bag at field 0 whose first field is non-null.
 *
 * <p>Null tuples and tuples with no fields are not counted.
 *
 * @param input tuple whose field 0 is the bag to count
 * @return the number of counted tuples
 * @throws ExecException if reading a field fails
 */
static protected long count(Tuple input) throws ExecException {
    DataBag values = (DataBag) input.get(0);
    long total = 0;
    for (Tuple candidate : values) {
        if (candidate == null || candidate.size() <= 0 || candidate.get(0) == null) {
            continue;
        }
        total++;
    }
    return total;
}
Example #26
Source File: MetricUDF.java From datafu with Apache License 2.0 | 5 votes |
/** * This UDF expects a query vector as the first element, a threshold (double) as the second, and a bag of vectors. * Vectors are represented by tuples with doubles as elements or bags of tuples representing position and value * in the case of sparse vectors. * * <p> * It returns one of the tuples of the bag of vectors. For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash. * </p> * * @see datafu.pig.hash.lsh.CosineDistanceHash */ @Override public Tuple exec(Tuple input) throws IOException { Object firstElement = input.get(0); double distanceRange = ((Number)input.get(1)).doubleValue(); DataBag vectorBag = (DataBag)input.get(2); RealVector referenceVector = null; if(firstElement instanceof Tuple) { //in which case the first element is a non-sparse tuple referenceVector = DataTypeUtil.INSTANCE.convert((Tuple)firstElement, dim); } else { //in which case the first element is a bag, representing a sparse tuple referenceVector = DataTypeUtil.INSTANCE.convert(input, dim); } for(Tuple vecTuple : vectorBag ) { Object vectorObj = vecTuple.get(0); RealVector v2 = null; if(vectorObj instanceof Tuple) { v2 = DataTypeUtil.INSTANCE.convert((Tuple)vecTuple.get(0), referenceVector.getDimension()); } else { v2 = DataTypeUtil.INSTANCE.convert(vecTuple, referenceVector.getDimension()); } double dist = dist(referenceVector, v2); if(dist < distanceRange) { return vecTuple; } } return null; }
Example #27
Source File: AllFirstLetter.java From spork with Apache License 2.0 | 5 votes |
/**
 * Concatenates the first character of the string form of each tuple's first
 * field, in bag iteration order.
 *
 * <p>Null tuples, empty tuples, null first fields and values whose string form
 * is empty contribute nothing. The result is also stored in the {@code result}
 * field.
 *
 * @param input tuple whose field 0 is the bag to scan
 * @return the concatenated first characters; empty when nothing qualified
 * @throws IOException if reading a field fails
 */
public String exec(Tuple input) throws IOException {
    result = "";
    DataBag bag = (DataBag) input.get(0);

    StringBuilder letters = new StringBuilder();
    for (Tuple t : bag) {
        if (t == null || t.size() <= 0) {
            continue;
        }
        Object value = t.get(0);
        if (value == null) {
            continue;
        }
        String text = value.toString();
        // An empty string has no first letter; substring(0, 1) would throw here.
        if (!text.isEmpty()) {
            letters.append(text.charAt(0));
        }
    }

    result = letters.toString();
    return result;
}
Example #28
Source File: FloatAvg.java From spork with Apache License 2.0 | 5 votes |
@Override public Tuple exec(Tuple input) throws IOException { try { Tuple t = mTupleFactory.newTuple(2); // input is a bag with one tuple containing // the column we are trying to avg on DataBag bg = (DataBag) input.get(0); Float f = null; if(bg.iterator().hasNext()) { Tuple tp = bg.iterator().next(); f = (Float)(tp.get(0)); } t.set(0, f != null ? new Double(f) : null); if (f != null) t.set(1, 1L); else t.set(1, 0L); return t; } catch (ExecException ee) { throw ee; } catch (Exception e) { int errCode = 2106; String msg = "Error while computing average in " + this.getClass().getSimpleName(); throw new ExecException(msg, errCode, PigException.BUG, e); } }
Example #29
Source File: DoubleVAR.java From datafu with Apache License 2.0 | 5 votes |
/**
 * Computes the variance from the bag in field 0 of {@code input}.
 *
 * <p>The bag is reduced by {@code combine} to a tuple holding the sum (field 0),
 * the sum of squares (field 1) and the count (field 2). The result is
 * {@code sumSquare / count - (sum / count)^2}.
 *
 * @param input tuple whose field 0 is the bag of values
 * @return the variance, or {@code null} when the sum is null or the count is
 *         not positive
 * @throws IOException an {@link ExecException} is rethrown as is; any other
 *         exception is wrapped in an {@code ExecException} with error code 2106
 */
@Override
public Double exec(Tuple input) throws IOException {
    try {
        DataBag b = (DataBag) input.get(0);
        Tuple combined = combine(b);

        Double sum = (Double) combined.get(0);
        Double sumSquare = (Double) combined.get(1);
        if (sum == null) {
            return null;
        }

        Long count = (Long) combined.get(2);
        Double variance = null;
        if (count > 0) {
            // Primitive intermediates avoid the deprecated Double(double)
            // constructor and needless boxing.
            double avg = sum / count;
            double avgSquare = sumSquare / count;
            variance = avgSquare - avg * avg;
        }
        return variance;
    } catch (ExecException ee) {
        throw ee;
    } catch (Exception e) {
        int errCode = 2106;
        String msg = "Error while computing variance in " + this.getClass().getSimpleName();
        throw new ExecException(msg, errCode, PigException.BUG, e);
    }
}
Example #30
Source File: TestMapSideCogroup.java From spork with Apache License 2.0 | 5 votes |
/**
 * Merge-cogroups a populated relation with one loaded from {@code EMPTY_FILE}
 * and checks that three groups come back, each with an empty bag for the
 * second relation.
 */
@Test
public void testEmptyDeltaFile() throws Exception {
    PigServer server = new PigServer(cluster.getExecType(), cluster.getProperties());

    server.registerQuery("A = LOAD '" + INPUT_FILE1 + "' using "
            + DummyCollectableLoader.class.getName() + "() as (c1:chararray,c2:int);");
    server.registerQuery("B = LOAD '" + EMPTY_FILE + "' using "
            + DummyIndexableLoader.class.getName() + "() as (c1:chararray,c2:int);");

    DataBag cogrouped = BagFactory.getInstance().newDefaultBag();

    server.registerQuery("C = cogroup A by c1, B by c1 using 'merge';");
    Iterator<Tuple> rows = server.openIterator("C");
    while (rows.hasNext()) {
        cogrouped.add(rows.next());
    }

    String[] results = new String[] {
        "(1,{(1,1),(1,2),(1,3)},{})",
        "(2,{(2,1),(2,2),(2,3)},{})",
        "(3,{(3,1),(3,2),(3,3)},{})"
    };

    assertEquals(3, cogrouped.size());
    Iterator<Tuple> collected = cogrouped.iterator();
    for (int idx = 0; idx < 3; idx++) {
        assertEquals(collected.next().toString(), results[idx]);
    }
    assertFalse(collected.hasNext());
}