Java Code Examples for org.apache.pig.data.Tuple#getType()

The following examples show how to use org.apache.pig.data.Tuple#getType(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: DistinctBy.java From datafu with Apache License 2.0

6 votes

/**
 * Adds to the output bag every tuple of the incoming bag whose key has not
 * been encountered before. The key of a tuple is whatever
 * {@code getDistinctFieldTuple} produces for it from {@code this.fields}.
 *
 * @param input a tuple that must hold exactly one field, and that field
 *              must be a bag
 * @throws IOException propagated from the calls made on the input tuple,
 *         if any of them fail
 * @throws RuntimeException if the input does not have exactly one field,
 *         or if that field is not a bag
 */
@Override
public void accumulate(Tuple input) throws IOException
{
  // Validate the shape of the input before touching its content.
  final int fieldCount = input.size();
  if (fieldCount != 1) {
    throw new RuntimeException("Expected input to have only a single field");
  }

  final boolean firstFieldIsBag = input.getType(0) == DataType.BAG;
  if (!firstFieldIsBag) {
    throw new RuntimeException("Expected a BAG as input");
  }

  final DataBag tuples = (DataBag) input.get(0);
  for (Tuple candidate : tuples) {
    final Tuple key = getDistinctFieldTuple(candidate, this.fields);
    if (seen.contains(key)) {
      // A tuple with the same key was already emitted; skip this one.
      continue;
    }
    outputBag.add(candidate);
    seen.add(key);
  }
}

Example 2

Source File: AvroStorageDataConversionUtilities.java From spork with Apache License 2.0

4 votes

/**
 * Builds an Avro record from a Pig tuple, field by field, following the
 * given Avro schema.
 *
 * <p>For each schema field the value at the same position in the tuple is
 * converted according to the field's schema type: records and arrays are
 * packed recursively, bytes are wrapped in a {@link ByteBuffer}, fixed
 * values become {@code GenericData.Fixed}, and any other value is stored
 * as-is unless the tuple reports it as a DATETIME, in which case its
 * millisecond value is stored. A null value in a nullable-union field is
 * stored as null.
 *
 * @param t the Pig tuple to pack into the avro object
 * @param s the avro schema describing the record to build
 * @return the avro record corresponding to the input tuple
 * @throws IOException wrapping any exception raised during the conversion
 */
public static GenericData.Record packIntoAvro(final Tuple t, final Schema s)
    throws IOException {

  try {
    final GenericData.Record result = new GenericData.Record(s);
    for (Field field : s.getFields()) {
      final int pos = field.pos();
      final Object value = t.get(pos);
      Schema fieldSchema = field.schema();

      if (AvroStorageSchemaConversionUtilities.isNullableUnion(fieldSchema)) {
        if (value == null) {
          result.put(pos, null);
          continue;
        }
        // Non-null value: convert against the non-null branch of the union.
        fieldSchema =
            AvroStorageSchemaConversionUtilities.removeSimpleUnion(fieldSchema);
      }

      // Work out the Avro-side value first, then store it in one place.
      final Object converted;
      switch (fieldSchema.getType()) {
      case RECORD:
        converted = packIntoAvro((Tuple) value, fieldSchema);
        break;
      case ARRAY:
        converted = packIntoAvro((DataBag) value, fieldSchema);
        break;
      case BYTES:
        converted = ByteBuffer.wrap(((DataByteArray) value).get());
        break;
      case FIXED:
        converted =
            new GenericData.Fixed(fieldSchema, ((DataByteArray) value).get());
        break;
      default:
        if (t.getType(pos) == DataType.DATETIME) {
          // Date-times are stored as their millisecond value.
          converted = ((DateTime) value).getMillis();
        } else {
          converted = value;
        }
      }
      result.put(pos, converted);
    }
    return result;
  } catch (Exception e) {
    throw new IOException(
        "exception in AvroStorageDataConversionUtilities.packIntoAvro", e);
  }
}

Example 3

Source File: POCounter.java From spork with Apache License 2.0

4 votes

/**
 * Stamps the incoming tuple with the local counter value and the task id.
 *
 * <p>The resulting tuple has two more fields than the incoming one:
 * position 0 holds the local counter value, the original fields follow
 * from position 1, and the last position holds the current task id.
 *
 * <p>For row number and dense rank the local counter advances by one per
 * tuple. Otherwise it advances by the size of the bag found in the last
 * field of the incoming tuple (or by zero when that field is not a bag).
 *
 * @param input result of the previous operator; its {@code result} is
 *              read as a tuple and replaced by the stamped tuple
 * @return the same {@code input} object, carrying the stamped tuple
 * @throws ExecException propagated from the tuple accessors
 **/
protected Result addCounterValue(Result input) throws ExecException {
    final Tuple source = (Tuple) input.result;
    final int fieldCount = source.getAll().size();
    final Tuple stamped = mTupleFactory.newTuple(fieldCount + 2);

    if (isRowNumber() || isDenseRank()) {
        // Each tuple is handled on its own, independently of any grouping.

        // Only dense rank (attached to a reduce phase) increments the
        // reduce counter here; otherwise the mapper does it automatically.
        if (isDenseRank()) {
            incrementReduceCounter(POCounter.ONE);
        }

        stamped.set(0, getLocalCounter());

        // The local counter then moves forward sequentially.
        incrementLocalCounter();

    } else if (!isDenseRank()) {
        // Standard rank: what matters is how many tuples share the group,
        // i.e. the size of the bag in the last field.
        Long groupSize = 0L;
        final int lastField = fieldCount - 1;
        if (source.getType(lastField) == DataType.BAG) {
            groupSize = ((org.apache.pig.data.DefaultAbstractBag) source.get(lastField)).size();
        }

        // The group size advances the current global counter ...
        incrementReduceCounter(groupSize);

        stamped.set(0, getLocalCounter());

        // ... and sets the value for the next tuple on this task.
        addToLocalCounter(groupSize);
    }

    // Copy the original fields right after the counter value.
    int next = 1;
    for (Object field : source) {
        stamped.set(next, field);
        next++;
    }

    // Last position: current task id.
    stamped.set(next, getTaskId());

    input.result = illustratorMarkup(source, stamped, 0);

    return input;
}

Example 4

Source File: CounterConverter.java From spork with Apache License 2.0

4 votes

/**
 * Stamps every tuple of the incoming iterator with counter information.
 *
 * <p>Each output tuple has three more fields than its input tuple:
 * position 0 holds {@code index}, position 1 the value returned by
 * {@code getSparkCounter()}, position 2 the local counter value read
 * before it is advanced, and the original fields follow from position 3.
 *
 * <p>For row number and dense rank both counters advance by one per tuple.
 * Otherwise they advance by the size of the bag found in the last field of
 * the tuple, or by zero when that field is not a bag.
 *
 * @param index value written to position 0 of every output tuple
 * @param input the tuples to stamp
 * @return an iterator over the stamped tuples, in input order
 * @throws RuntimeException wrapping any {@code ExecException} raised while
 *         reading or writing tuple fields
 */
@Override
public Iterator<Tuple> call(Integer index, final Iterator<Tuple> input) {
    final List<Tuple> listOutput = new ArrayList<Tuple>();

    try {
        while (input.hasNext()) {
            final Tuple inp = input.next();
            // Computed once per tuple instead of on every loop-condition check.
            final int fieldCount = inp.getAll().size();
            final Tuple output = TupleFactory.getInstance().newTuple(fieldCount + 3);

            // Original fields go after the three stamp positions.
            for (int i = 0; i < fieldCount; i++) {
                output.set(i + 3, inp.get(i));
            }

            if (poCounter.isRowNumber() || poCounter.isDenseRank()) {
                output.set(2, getLocalCounter());
                incrementSparkCounter();
                incrementLocalCounter();
            } else if (!poCounter.isDenseRank()) {
                // NOTE(review): this condition looks redundant given the branch
                // above, assuming isDenseRank() is a pure getter - confirm.

                // Scoped per tuple so that a tuple whose last field is not a bag
                // contributes zero rather than the previous tuple's bag size.
                long sizeBag = 0L;
                final int positionBag = fieldCount - 1;
                if (inp.getType(positionBag) == DataType.BAG) {
                    sizeBag = ((org.apache.pig.data.DefaultAbstractBag)
                            inp.get(positionBag)).size();
                }

                output.set(2, getLocalCounter());

                addToSparkCounter(sizeBag);
                addToLocalCounter(sizeBag);
            }

            output.set(0, index);
            output.set(1, getSparkCounter());
            listOutput.add(output);
        }
    } catch (ExecException e) {
        throw new RuntimeException(e);
    }

    return listOutput.iterator();
}