Java Code Examples for org.apache.pig.data.DataBag#size()
The following examples show how to use
org.apache.pig.data.DataBag#size().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: EmptyBagToNull.java From datafu with Apache License 2.0 | 6 votes |
/**
 * Passes a non-empty bag through unchanged and maps everything "empty" to {@code null}.
 *
 * @param tuple input whose first field is expected to be a bag or null
 * @return {@code null} when the tuple has no fields, the first field is null,
 *         or the bag has size zero; otherwise the bag itself
 * @throws IllegalArgumentException if the first field is non-null and not a {@code DataBag}
 */
@Override
public DataBag exec(Tuple tuple) throws IOException {
    if (tuple.size() == 0 || tuple.get(0) == null) {
        return null;
    }

    Object first = tuple.get(0);
    if (!(first instanceof DataBag)) {
        throw new IllegalArgumentException("expected a null or a bag");
    }

    DataBag candidate = (DataBag) first;
    return candidate.size() == 0 ? null : candidate;
}
Example 2
Source File: ReverseEnumerate.java From datafu with Apache License 2.0 | 6 votes |
/**
 * Builds a new bag in which each tuple of {@code inputBag} is copied and given
 * one extra trailing field holding a descending index.
 *
 * The first tuple receives {@code inputBag.size() - 1 + start}; each following
 * tuple receives one less.
 *
 * @param inputBag bag whose tuples are enumerated
 * @return a new bag with the indexed copies
 */
public DataBag call(DataBag inputBag) throws IOException {
    DataBag result = BagFactory.getInstance().newDefaultBag();
    long index = inputBag.size() - 1 + start;
    long sinceSpill = 0;

    for (Tuple source : inputBag) {
        Tuple numbered = TupleFactory.getInstance().newTuple(source.getAll());
        numbered.append(index);
        result.add(numbered);

        // The counter starts at zero, so a spill is requested right after the
        // first tuple and then again each time the counter reaches 1000000.
        if (sinceSpill % 1000000 == 0) {
            result.spill();
            sinceSpill = 0;
        }

        index--;
        sinceSpill++;
    }
    return result;
}
Example 3
Source File: PhysicalOperator.java From spork with Apache License 2.0 | 6 votes |
/**
 * Drains {@code getNextTuple()} until it reports {@code STATUS_EOP} and
 * collects the produced tuples into a single bag.
 *
 * @return the failing result as-is if any pull reports {@code STATUS_ERR};
 *         otherwise a result holding the collected bag, with status
 *         {@code STATUS_EOP} when the bag is empty and {@code STATUS_OK} otherwise
 */
public Result getNextDataBag() throws ExecException {
    Result outcome = new Result();
    DataBag collected = BagFactory.getInstance().newDefaultBag();

    Result pulled = getNextTuple();
    while (pulled.returnStatus != POStatus.STATUS_EOP) {
        if (pulled.returnStatus == POStatus.STATUS_ERR) {
            return pulled;
        }
        // STATUS_NULL results are skipped; anything else is added to the bag.
        if (pulled.returnStatus != POStatus.STATUS_NULL) {
            collected.add((Tuple) pulled.result);
        }
        pulled = getNextTuple();
    }

    outcome.result = collected;
    if (collected.size() == 0) {
        outcome.returnStatus = POStatus.STATUS_EOP;
    } else {
        outcome.returnStatus = POStatus.STATUS_OK;
    }
    return outcome;
}
Example 4
Source File: TestHelper.java From spork with Apache License 2.0 | 6 votes |
/**
 * Checks that two bags have the same size and that every tuple of {@code db2}
 * compares equal (via {@code compareTo}) to at least one tuple of {@code db1}.
 *
 * Duplicates are not counted: a tuple of {@code db1} may match several tuples
 * of {@code db2}.
 *
 * @return {@code true} when both conditions hold, {@code false} otherwise
 */
public static boolean compareBags(DataBag db1, DataBag db2) {
    if (db1.size() != db2.size()) {
        return false;
    }

    for (Tuple wanted : db2) {
        boolean found = false;
        for (Tuple candidate : db1) {
            if (wanted.compareTo(candidate) == 0) {
                found = true;
                break;
            }
        }
        if (!found) {
            return false;
        }
    }
    return true;
}
Example 5
Source File: AlgebraicFloatMathBase.java From spork with Apache License 2.0 | 6 votes |
protected static Float doTupleWork(Tuple input, KnownOpProvider opProvider) throws ExecException { DataBag values = (DataBag)input.get(0); // if we were handed an empty bag, return NULL // this is in compliance with SQL standard if(values.size() == 0) { return null; } Float sofar = AlgebraicFloatMathBase.getSeed(opProvider.getOp()); boolean sawNonNull = false; for (Iterator<Tuple> it = values.iterator(); it.hasNext();) { Tuple t = it.next(); try { Float d = (Float)(t.get(0)); if (d == null) continue; sawNonNull = true; sofar = doWork(sofar, d, opProvider.getOp()); }catch(RuntimeException exp) { int errCode = 2103; throw new ExecException("Problem doing work on Floats", errCode, PigException.BUG, exp); } } return sawNonNull ? sofar : null; }
Example 6
Source File: AlgebraicIntMathBase.java From spork with Apache License 2.0 | 6 votes |
protected static Integer doTupleWork(Tuple input, KnownOpProvider opProvider) throws ExecException { DataBag values = (DataBag)input.get(0); // if we were handed an empty bag, return NULL // this is in compliance with SQL standard if(values.size() == 0) { return null; } int sofar = AlgebraicIntMathBase.getSeed(opProvider.getOp()); boolean sawNonNull = false; for (Iterator<Tuple> it = values.iterator(); it.hasNext();) { Tuple t = it.next(); try { Integer d = (Integer)(t.get(0)); if (d == null) continue; sawNonNull = true; sofar = doWork(sofar, d, opProvider.getOp()); }catch(RuntimeException exp) { int errCode = 2103; throw new ExecException("Problem doing work on Doubles", errCode, PigException.BUG, exp); } } return sawNonNull ? sofar : null; }
Example 7
Source File: AlgebraicBigIntegerMathBase.java From spork with Apache License 2.0 | 6 votes |
protected static BigInteger doTupleWork(Tuple input, KnownOpProvider opProvider) throws ExecException { DataBag values = (DataBag)input.get(0); // if we were handed an empty bag, return NULL // this is in compliance with SQL standard if(values.size() == 0) { return null; } BigInteger sofar = AlgebraicBigIntegerMathBase.getSeed(opProvider.getOp()); boolean sawNonNull = false; for (Iterator<Tuple> it = values.iterator(); it.hasNext();) { Tuple t = it.next(); try { Number n = (Number)(t.get(0)); if (n == null) continue; BigInteger d = (BigInteger) n; sawNonNull = true; sofar = doWork(sofar, d, opProvider.getOp()); } catch(RuntimeException exp) { int errCode = 2103; throw new ExecException("Problem doing work on BigInteger", errCode, PigException.BUG, exp); } } return sawNonNull ? sofar : null; }
Example 8
Source File: DisplayExamples.java From spork with Apache License 2.0 | 5 votes |
/**
 * Renders a bag as a table of strings: one row per tuple, one column per
 * field of the operator's schema, each cell produced by {@code ShortenField}.
 *
 * @param op operator whose schema determines the number of columns
 * @param bag tuples to render; its size (narrowed to int) determines the number of rows
 * @return the filled {@code rows x cols} table
 */
static String[][] MakeArray(Operator op, DataBag bag) throws Exception {
    int rows = (int) bag.size();
    int cols = ((LogicalRelationalOperator) op).getSchema().getFields().size();
    String[][] table = new String[rows][cols];

    Iterator<Tuple> source = bag.iterator();
    // Exactly one tuple is pulled per allocated row, without consulting hasNext().
    for (String[] row : table) {
        Tuple current = source.next();
        for (int col = 0; col < cols; ++col) {
            row[col] = ShortenField(current.get(col));
        }
    }
    return table;
}
Example 9
Source File: AlgebraicByteArrayMathBase.java From spork with Apache License 2.0 | 5 votes |
protected static Double doTupleWork(Tuple input, KnownOpProvider opProvider, byte expectedType) throws ExecException { DataBag values = (DataBag)input.get(0); // if we were handed an empty bag, return NULL // this is in compliance with SQL standard if(values.size() == 0) { return null; } double sofar = AlgebraicByteArrayMathBase.getSeed(opProvider.getOp()); boolean sawNonNull = false; for (Iterator<Tuple> it = values.iterator(); it.hasNext();) { Tuple t = it.next(); try { Double d; switch (expectedType) { case DataType.BYTEARRAY: DataByteArray dba = (DataByteArray)t.get(0); d = dba != null ? Double.valueOf(dba.toString()): null; break; case DataType.DOUBLE: d = (Double) t.get(0); break; default: throw new ExecException("Unexpected type in AlgebraicByteArrayMath " + DataType.findTypeName(expectedType)); } if (d == null) continue; sawNonNull = true; sofar = doWork(sofar, d, opProvider.getOp()); }catch(RuntimeException exp) { int errCode = 2103; throw new ExecException("Problem doing work on Doubles", errCode, PigException.BUG, exp); } } return sawNonNull ? sofar : null; }
Example 10
Source File: TestStore.java From spork with Apache License 2.0 | 5 votes |
/**
 * Converts each tuple of the bag to a tab-delimited line and hands the lines
 * to {@code Util.createInputFile} for {@code inputFileName} on {@code cluster}.
 *
 * @param inpD bag of tuples, one output line per tuple
 * @throws IOException
 */
private void setUpInputFileOnCluster(DataBag inpD) throws IOException {
    String[] lines = new String[(int) inpD.size()];
    int next = 0;
    for (Tuple row : inpD) {
        lines[next++] = toDelimitedString(row, "\t");
    }
    Util.createInputFile(cluster, inputFileName, lines);
}
Example 11
Source File: FlattenBagOperator.java From Cubert with Apache License 2.0 | 5 votes |
private void initCurrentTuple(Tuple inTuple) throws ExecException { // TODO Auto-generated method stub this.inTuple = inTuple; this.odometerIterators.clear(); for (int columnId : columnIndexArray) { FlattenType ftype = flattenPositions.get(columnId); if (ftype == null || !isFlattenBag(ftype)) { continue; } DataBag dbag = (DataBag) (inTuple.get(columnId));// Rui. change outTuple to // inTuple Iterator<Tuple> tupleIt; // Deal with null and empty bags as if they contained a single null tuple. if (dbag == null || dbag.size() == 0) tupleIt = nullBag.iterator(); else tupleIt = dbag.iterator(); odometerIterators.add(tupleIt); } seedOutTuple();// Rui. move it here. }
Example 12
Source File: PageRank.java From datafu with Apache License 2.0 | 4 votes |
/**
 * Adds the nodes and edges described by the bag in the first field of
 * {@code t} to {@code graph}.
 *
 * Each tuple in the bag is read as (source id, bag of (dest id, weight) edges
 * [, node bias when {@code enableNodeBiasing} is set]). Does nothing once
 * {@code aborted} is set; sets {@code aborted} and stops when the graph's
 * node count plus edge count exceeds {@code maxNodesAndEdges}.
 *
 * @param t tuple whose first field is the bag of source-node tuples
 */
@Override
public void accumulate(Tuple t) throws IOException {
    if (aborted) {
        return;
    }

    DataBag bag = (DataBag) t.get(0);
    if (bag == null || bag.size() == 0) {
        return;
    }

    for (Tuple nodeTuple : bag) {
        Integer sourceId = (Integer) nodeTuple.get(0);
        DataBag edges = (DataBag) nodeTuple.get(1);
        Double nodeBias = enableNodeBiasing ? (Double) nodeTuple.get(2) : null;

        ArrayList<Map<String,Object>> edgesMapList = new ArrayList<Map<String, Object>>();
        for (Tuple edgeTuple : edges) {
            Integer destId = (Integer) edgeTuple.get(0);
            Double weight = (Double) edgeTuple.get(1);

            HashMap<String,Object> edgeMap = new HashMap<String, Object>();
            edgeMap.put("dest", destId);
            edgeMap.put("weight", weight);
            edgesMapList.add(edgeMap);
        }

        if (!enableNodeBiasing) {
            graph.addNode(sourceId, edgesMapList);
        } else {
            graph.addNode(sourceId, edgesMapList, nodeBias.floatValue());
        }

        boolean tooLarge = graph.nodeCount() + graph.edgeCount() > maxNodesAndEdges;
        if (tooLarge) {
            System.out.println(String.format("There are too many nodes and edges (%d + %d > %d). Aborting.", graph.nodeCount(), graph.edgeCount(), maxNodesAndEdges));
            aborted = true;
            break;
        }

        reporter.progress();
    }
}
Example 13
Source File: AugmentBaseDataVisitor.java From spork with Apache License 2.0 | 4 votes |
@Override public void visit(LOLimit lm) throws FrontendException { if (!limit) // not augment for LIMIT in this traversal return; if (oriLimitMap == null) oriLimitMap = new HashMap<LOLimit, Long>(); DataBag outputConstraints = outputConstraintsMap.get(lm); outputConstraintsMap.remove(lm); DataBag inputConstraints = outputConstraintsMap.get(lm.getInput((LogicalPlan) plan)); if (inputConstraints == null) { inputConstraints = BagFactory.getInstance().newDefaultBag(); outputConstraintsMap.put(lm.getInput((LogicalPlan) plan), inputConstraints); } DataBag inputData = derivedData.get(lm.getInput((LogicalPlan) plan)); if (outputConstraints != null && outputConstraints.size() > 0) { // there // 's // one // or // more // output // constraints // ; // generate // corresponding // input // constraints for (Iterator<Tuple> it = outputConstraints.iterator(); it .hasNext();) { inputConstraints.add(it.next()); // ... plus one more if only one if (inputConstraints.size() == 1) { inputConstraints.add(inputData.iterator().next()); ((PreOrderDepthFirstWalker) currentWalker).setBranchFlag(); } } } else if (inputConstraints.size() == 0){ // add all input to input constraints ... inputConstraints.addAll(inputData); // ... plus one more if only one if (inputConstraints.size() == 1) { inputConstraints.add(inputData.iterator().next()); ((PreOrderDepthFirstWalker) currentWalker).setBranchFlag(); } } POLimit poLimit = (POLimit) logToPhysMap.get(lm); oriLimitMap.put(lm, Long.valueOf(poLimit.getLimit())); poLimit.setLimit(inputConstraints.size()-1); lm.setLimit(poLimit.getLimit()); }
Example 14
Source File: SetDifference.java From datafu with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") @Override public DataBag exec(Tuple input) throws IOException { if (input.size() < 2) { throw new RuntimeException("Expected at least two inputs, but found " + input.size()); } for (Object o : input) { if (o != null && !(o instanceof DataBag)) { throw new RuntimeException("Inputs must be bags"); } } DataBag outputBag = bagFactory.newDefaultBag(); DataBag bag1 = (DataBag)input.get(0); DataBag bag2 = (DataBag)input.get(1); if (bag1 == null || bag1.size() == 0) { return outputBag; } // optimization else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) { return bag1; } PriorityQueue<Pair> pq = loadBags(input); Tuple lastData = null; while (true) { Pair nextPair = pq.peek(); // ignore data we've already encountered if (nextPair.data.compareTo(lastData) != 0) { // Only take data from the first bag, where there are no other // bags that have the same data. if (nextPair.index.equals(0) && countMatches(pq) == 0) { outputBag.add(nextPair.data); lastData = nextPair.data; } } Pair p = pq.poll(); // only put the bag back into the queue if it still has data if (p.hasNext()) { p.next(); pq.offer(p); } else if (p.index.equals(0)) { // stop when we exhaust all elements from the first bag break; } } return outputBag; }
Example 15
Source File: Over.java From spork with Apache License 2.0 | 4 votes |
/**
 * Returns the first field of the first tuple in the input bag.
 *
 * @param input tuple whose first field is a bag
 * @return {@code null} when the bag is empty; otherwise field 0 of the bag's first tuple
 */
@Override
public Object exec(Tuple input) throws IOException {
    DataBag inbag = (DataBag) input.get(0);
    if (inbag.size() != 0) {
        Tuple head = inbag.iterator().next();
        return head.get(0);
    }
    return null;
}
Example 16
Source File: AugmentBaseDataVisitor.java From spork with Apache License 2.0 | 4 votes |
@Override public void visit(LOForEach forEach) throws FrontendException { if (limit && !((PreOrderDepthFirstWalker) currentWalker).getBranchFlag()) return; DataBag outputConstraints = outputConstraintsMap.get(forEach); outputConstraintsMap.remove(forEach); LogicalPlan plan = forEach.getInnerPlan(); boolean ableToHandle = true; List<Integer> cols = new ArrayList<Integer>(); boolean cast = false; if (outputConstraints == null || outputConstraints.size() == 0) // we dont have to do anything in this case return; Operator op = plan.getSinks().get(0); if (op instanceof CastExpression) { cast = true; op = ((CastExpression) op).getExpression(); } if (!(op instanceof ProjectExpression)) { ableToHandle = false; } else { cols.add(Integer.valueOf(((ProjectExpression) op).getColNum())); } if (ableToHandle) { // we can only handle simple projections DataBag output = BagFactory.getInstance().newDefaultBag(); for (Iterator<Tuple> it = outputConstraints.iterator(); it .hasNext();) { Tuple outputConstraint = it.next(); try { Tuple inputConstraint = BackPropConstraint( outputConstraint, cols, ((LogicalRelationalOperator)plan .getPredecessors(forEach).get(0)) .getSchema(), cast); output.add(inputConstraint); } catch (Exception e) { e.printStackTrace(); throw new FrontendException( "Operator error during Augmenting Phase in Example Generator " + e.getMessage()); } } outputConstraintsMap.put(plan.getPredecessors(forEach) .get(0), output); } }
Example 17
Source File: PartitionSkewedKeysTez.java From spork with Apache License 2.0 | 4 votes |
/**
 * Delegates to {@code super.exec(in)} and stores the result in
 * {@code PigProcessor.sampleMap}; when parallelism estimation is enabled it
 * first replaces {@code totalReducers_} with an estimate derived from the samples.
 *
 * With estimation on, field 0 of {@code in} is the specified reducer count and
 * field 1 the bag of samples. The last field of each sample is added to the
 * input row total. The estimate is the average sample memory size times that
 * row total, divided by the configured bytes per reducer, rounded up and
 * capped at {@code InputSizeReducerEstimator.DEFAULT_MAX_REDUCER_COUNT_PARAM}.
 *
 * @param in input tuple
 * @return {@code null} when {@code in} is null or has no fields; otherwise the
 *         map from {@code super.exec}, with the estimated parallelism added
 *         when estimation is enabled
 */
@Override
public Map<String, Object> exec(Tuple in) throws IOException {
    if (in == null || in.size() == 0) {
        return null;
    }

    boolean estimateParallelism = PigMapReduce.sJobConfInternal.get().getBoolean
            (PigProcessor.ESTIMATE_PARALLELISM, false);

    if (estimateParallelism) {
        int specifiedNumReducer = (Integer) in.get(0);
        DataBag samples = (DataBag) in.get(1);

        long totalSampleSize = 0;
        long totalInputRows = 0;
        for (Tuple sample : samples) {
            totalInputRows += (Long) sample.get(sample.size() - 1);
            totalSampleSize += getMemorySize(sample);
        }

        long sampleCount = samples.size();
        double averageSampleSize = (double) totalSampleSize / sampleCount;
        long estimatedInputSize = (long) (averageSampleSize * totalInputRows);

        long bytesPerTask = PigMapReduce.sJobConfInternal.get().getLong(
                InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
                InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER);

        int uncapped = (int) Math.ceil((double) estimatedInputSize / bytesPerTask);
        int estimatedNumReducers = Math.min(uncapped, InputSizeReducerEstimator.DEFAULT_MAX_REDUCER_COUNT_PARAM);

        LOG.info("Estimating parallelism: estimatedInputSize is " + estimatedInputSize + ". bytesPerTask is " + bytesPerTask + ". estimatedNumReducers is " + estimatedNumReducers + ".");

        this.totalReducers_ = estimatedNumReducers;
        LOG.info("Use estimated reducer instead:" + estimatedNumReducers + ", orig: " + specifiedNumReducer);
    }

    Map<String, Object> result = super.exec(in);
    if (estimateParallelism) {
        result.put(PigProcessor.ESTIMATED_NUM_PARALLELISM, totalReducers_);
    }
    PigProcessor.sampleMap = result;
    return result;
}
Example 18
Source File: SimpleRandomSample.java From datafu with Apache License 2.0 | 4 votes |
@Override public DataBag exec(Tuple input) throws IOException { DataBag bag = (DataBag) input.get(0); boolean first = true; double p = 0.0d; // the sampling probability long n = 0L; // the size of the population (total number of items) DataBag selected = _BAG_FACTORY.newDefaultBag(); DataBag waiting = _BAG_FACTORY.newSortedBag(ScoredTupleComparator.getInstance()); for (Tuple tuple : bag) { if (first) { p = (Double) tuple.get(0); first = false; } n += (Long) tuple.get(1); selected.addAll((DataBag) tuple.get(3)); waiting.addAll((DataBag) tuple.get(4)); } long numSelected = selected.size(); long numWaiting = waiting.size(); long s = (long) Math.ceil(p * n); // sample size System.out.println("To sample " + s + " items from " + n + ", we pre-selected " + numSelected + ", and waitlisted " + waiting.size() + "."); long numNeeded = s - selected.size(); if (numNeeded < 0) { System.err.println("Pre-selected " + numSelected + " items, but only needed " + s + "."); } for (Tuple scored : waiting) { if (numNeeded <= 0) { break; } selected.add(ScoredTuple.fromIntermediateTuple(scored).getTuple()); numNeeded--; } if (numNeeded > 0) { System.err.println("The waiting list only has " + numWaiting + " items, but needed " + numNeeded + " more."); } return selected; }
Example 19
Source File: MetricEvaluation.java From spork with Apache License 2.0 | 4 votes |
public static float getConciseness( Operator op, Map<Operator, DataBag> exampleData, Map<LogicalRelationalOperator, Collection<IdentityHashSet<Tuple>>> OperatorToEqClasses, boolean overallConciseness) { DataBag bag = exampleData.get(op); int noEqCl = OperatorToEqClasses.get(op).size(); long noTuples = bag.size(); float conciseness = 100 * ((float) noEqCl / (float) noTuples); if (!overallConciseness) { return ((conciseness > 100.0) ? 100.0f : conciseness); } else { noEqCl = 0; noTuples = 0; conciseness = 0; int noOperators = 0; for (Map.Entry<LogicalRelationalOperator, Collection<IdentityHashSet<Tuple>>> e : OperatorToEqClasses .entrySet()) { if (e.getKey().getAlias() == null) continue; noOperators++; // we need to keep a track of these and not use // OperatorToEqClasses.size() as LORead shouldn't // be considered a operator bag = exampleData.get(e.getKey()); noTuples = bag.size(); noEqCl = e.getValue().size(); float concise = 100 * ((float) noEqCl / (float) noTuples); concise = (concise > 100) ? 100 : concise; conciseness += concise; } conciseness /= (float) noOperators; return conciseness; } }
Example 20
Source File: Over.java From spork with Apache License 2.0 | 4 votes |
/**
 * Replaces {@code tuples} with a new list holding every tuple of {@code b},
 * in the bag's iteration order.
 *
 * @param b bag to copy from; its size (narrowed to int) is used as the list's initial capacity
 */
public void addAll(DataBag b) {
    int capacity = (int) b.size();
    tuples = new ArrayList<Tuple>(capacity);
    for (Tuple element : b) {
        tuples.add(element);
    }
}