Java Code Examples for org.apache.hadoop.io.Text#getLength()
The following examples show how to use org.apache.hadoop.io.Text#getLength().
Each example is drawn from an open-source project; the project, source file, and license are noted above the code.
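As a quick orientation before the examples: getLength() returns the number of valid bytes in the Text's internal UTF-8 buffer, which is not the same as the character count of the decoded string, and the array returned by getBytes() may be longer than that valid length. The standalone sketch below (not taken from any of the projects that follow) illustrates both points.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

public class TextGetLengthDemo {
    public static void main(String[] args) {
        Text t = new Text("héllo");                 // 'é' takes two bytes in UTF-8
        System.out.println(t.getLength());          // 6 -- byte length of the UTF-8 encoding
        System.out.println(t.toString().length());  // 5 -- character count

        // getBytes() exposes the backing array, which may contain stale bytes
        // beyond getLength() after a set(), so always pair the two calls.
        t.set("hi");
        byte[] raw = t.getBytes();
        String valid = new String(raw, 0, t.getLength(), StandardCharsets.UTF_8);
        System.out.println(valid);                  // "hi"
    }
}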
Example 1
Source File: RangeKeyDistributionMapper.java From Kylin with Apache License 2.0
@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    lastKey = key;

    int bytesLength = key.getLength() + value.getLength();
    bytesRead += bytesLength;

    if (bytesRead >= ONE_MEGA_BYTES) {
        outputValue.set(bytesRead);
        context.write(key, outputValue);

        // reset bytesRead
        bytesRead = 0;
    }
}
Example 2
Source File: QseqInputFormat.java From Hadoop-BAM with MIT License
private void setFieldPositionsAndLengths(Text line) {
    int pos = 0;     // the byte position within the record
    int fieldno = 0; // the field index within the record
    while (pos < line.getLength() && fieldno < NUM_QSEQ_COLS) // iterate over each field
    {
        int endpos = line.find(Delim, pos); // the field's end position
        if (endpos < 0)
            endpos = line.getLength();

        fieldPositions[fieldno] = pos;
        fieldLengths[fieldno] = endpos - pos;

        pos = endpos + 1; // the next starting position is the current end + 1
        fieldno += 1;
    }

    if (fieldno != NUM_QSEQ_COLS)
        throw new FormatException("found " + fieldno + " fields instead of 11 at " +
            makePositionMessage(this.pos - line.getLength()) + ". Line: " + line);
}
Example 3
Source File: AccumuloUtils.java From mrgeo with Apache License 2.0
/**
 * Convert a Text object of a tileId back to a long.
 *
 * @param rowId Text object to convert.
 * @return the long value from the Text object.
 */
public static long toLong(Text rowId) {
    byte[] outB = new byte[8];
    for (int x = 0; x < outB.length; x++) {
        if (x >= rowId.getLength()) {
            outB[x] = 0x0;
        } else {
            outB[x] = rowId.getBytes()[x];
        }
    }
    return ByteBuffer.wrap(outB).getLong();
}
Example 4
Source File: ShardedTableTabletBalancer.java From datawave with Apache License 2.0
private byte[] retrieveDate(KeyExtent extent) {
    Text endRow = extent.getEndRow();
    if (endRow == null)
        endRow = extent.getPrevEndRow();
    if (endRow == null) {
        log.warn("Attempting to retrieve date from empty extent " + extent + ". Is your sharded table pre-split?");
        return "null".getBytes();
    } else {
        int idx = endRow.find("_");
        if (idx <= 0) {
            idx = endRow.getLength();
            log.warn("Extent " + extent + " does not conform to sharded date scheme yyyyMMdd_num");
        }
        return Arrays.copyOf(endRow.getBytes(), idx);
    }
}
Example 5
Source File: TeraSort.java From hadoop-gpu with Apache License 2.0
/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly.
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper, Text prefix,
                                  int maxDepth) {
    int depth = prefix.getLength();
    if (depth >= maxDepth || lower == upper) {
        return new LeafTrieNode(depth, splits, lower, upper);
    }
    InnerTrieNode result = new InnerTrieNode(depth);
    Text trial = new Text(prefix);
    // append an extra byte on to the prefix
    trial.append(new byte[1], 0, 1);
    int currentBound = lower;
    for (int ch = 0; ch < 255; ++ch) {
        trial.getBytes()[depth] = (byte) (ch + 1);
        lower = currentBound;
        while (currentBound < upper) {
            if (splits[currentBound].compareTo(trial) >= 0) {
                break;
            }
            currentBound += 1;
        }
        trial.getBytes()[depth] = (byte) ch;
        result.child[ch] = buildTrie(splits, lower, currentBound, trial, maxDepth);
    }
    // pick up the rest
    trial.getBytes()[depth] = 127;
    result.child[255] = buildTrie(splits, currentBound, upper, trial, maxDepth);
    return result;
}
Example 6
Source File: ToISO8601UDF.java From occurrence with Apache License 2.0
public Text evaluate(Text field) {
    if (field == null || field.getLength() == 0) {
        return null;
    } else {
        try {
            text.set(DownloadUtils.ISO_8601_ZONED.format(
                Instant.ofEpochMilli(Long.parseLong(field.toString())).atZone(ZoneOffset.UTC)));
            return text;
        } catch (NumberFormatException e) {
            return null;
        }
    }
}
Example 7
Source File: EventDataScanNestedIterator.java From datawave with Apache License 2.0
boolean isEventKey(Key k) {
    Text cf = k.getColumnFamily();
    return cf.getLength() > 0 && cf.find("\u0000") != -1
            && !((cf.charAt(0) == 'f' && cf.charAt(1) == 'i' && cf.charAt(2) == 0)
                    || (cf.getLength() == 1 && cf.charAt(0) == 'd')
                    || (cf.getLength() == 2 && cf.charAt(0) == 't' && cf.charAt(1) == 'f'));
}
Example 8
Source File: XmlSerDe.java From Hive-XML-SerDe with Apache License 2.0
/**
 * @see org.apache.hadoop.hive.serde2.Deserializer#deserialize(org.apache.hadoop.io.Writable)
 */
@Override
public Object deserialize(Writable writable) throws SerDeException {
    Text text = (Text) writable;
    if (text == null || text.getLength() == 0) {
        return (Object) null;
    }
    try {
        return this.xmlProcessor.parse(text.toString());
    } catch (Exception e) {
        throw new SerDeException(e);
    }
}
Example 9
Source File: TeraSort.java From RDFS with Apache License 2.0
int findPartition(Text key) {
    int level = getLevel();
    if (key.getLength() <= level) {
        return child[0].findPartition(key);
    }
    return child[key.getBytes()[level]].findPartition(key);
}
Example 10
Source File: TopKDataGen.java From sequenceiq-samples with Apache License 2.0
@Override
public void run() throws Exception {
    KeyValueWriter streamOutputWriter = (KeyValueWriter) getOutputs().get(OUTPUT).getWriter();
    long sizeLarge = 0;
    while (sizeLarge < streamOutputFileSize) {
        String str = createRowString();
        Text text = new Text(str);
        int size = text.getLength();
        streamOutputWriter.write(text, NullWritable.get());
        sizeLarge += size;
    }
}
Example 11
Source File: DeflateUDF.java From incubator-hivemall with Apache License 2.0
@Override
public BytesWritable evaluate(DeferredObject[] arguments) throws HiveException {
    if (codec == null) {
        this.codec = new DeflateCodec(true, false);
    }
    Object arg0 = arguments[0].get();
    if (arg0 == null) {
        return null;
    }
    Text text = stringOI.getPrimitiveWritableObject(arg0);
    byte[] original = text.getBytes();
    final int len = text.getLength();
    final byte[] compressed;
    try {
        compressed = codec.compress(original, 0, len, compressionLevel);
    } catch (IOException e) {
        throw new HiveException("Failed to compress", e);
    }
    original = null;
    if (result == null) {
        this.result = new BytesWritable(compressed);
    } else {
        result.set(compressed, 0, compressed.length);
    }
    return result;
}
Example 12
Source File: LineRecordReader.java From hadoop with Apache License 2.0
private int skipUtfByteOrderMark(Text value) throws IOException {
    // Strip BOM(Byte Order Mark)
    // Text only support UTF-8, we only need to check UTF-8 BOM
    // (0xEF,0xBB,0xBF) at the start of the text stream.
    int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength,
        Integer.MAX_VALUE);
    int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos));
    // Even we read 3 extra bytes for the first line,
    // we won't alter existing behavior (no backwards incompat issue).
    // Because the newSize is less than maxLineLength and
    // the number of bytes copied to Text is always no more than newSize.
    // If the return size from readLine is not less than maxLineLength,
    // we will discard the current line and read the next line.
    pos += newSize;
    int textLength = value.getLength();
    byte[] textBytes = value.getBytes();
    if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) &&
        (textBytes[1] == (byte) 0xBB) && (textBytes[2] == (byte) 0xBF)) {
        // find UTF-8 BOM, strip it.
        LOG.info("Found UTF-8 BOM and skipped it");
        textLength -= 3;
        newSize -= 3;
        if (textLength > 0) {
            // It may work to use the same buffer and not do the copyBytes
            textBytes = value.copyBytes();
            value.set(textBytes, 3, textLength);
        } else {
            value.clear();
        }
    }
    return newSize;
}
Example 13
Source File: ShardedTableTabletBalancer.java From datawave with Apache License 2.0
@Override
public String apply(KeyExtent extent) {
    String date = "null"; // Don't return null
    if (extent != null) {
        Text endRow = extent.getEndRow();
        if (endRow != null) {
            int sepIdx = endRow.find("_");
            if (sepIdx < 0)
                sepIdx = endRow.getLength();
            date = new String(endRow.getBytes(), 0, sepIdx);
        }
    }
    return date;
}
Example 14
Source File: TeraSort.java From RDFS with Apache License 2.0
/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly.
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper, Text prefix,
                                  int maxDepth) {
    int depth = prefix.getLength();
    if (depth >= maxDepth || lower == upper) {
        return new LeafTrieNode(depth, splits, lower, upper);
    }
    InnerTrieNode result = new InnerTrieNode(depth);
    Text trial = new Text(prefix);
    // append an extra byte on to the prefix
    trial.append(new byte[1], 0, 1);
    int currentBound = lower;
    for (int ch = 0; ch < 255; ++ch) {
        trial.getBytes()[depth] = (byte) (ch + 1);
        lower = currentBound;
        while (currentBound < upper) {
            if (splits[currentBound].compareTo(trial) >= 0) {
                break;
            }
            currentBound += 1;
        }
        trial.getBytes()[depth] = (byte) ch;
        result.child[ch] = buildTrie(splits, lower, currentBound, trial, maxDepth);
    }
    // pick up the rest
    trial.getBytes()[depth] = 127;
    result.child[255] = buildTrie(splits, currentBound, upper, trial, maxDepth);
    return result;
}
Example 15
Source File: TreeExportUDF.java From incubator-hivemall with Apache License 2.0
@Nonnull
public Text export(@Nonnull Text model, @Nullable String[] featureNames,
        @Nullable String[] classNames) throws HiveException {
    int length = model.getLength();
    byte[] b = model.getBytes();
    b = Base91.decode(b, 0, length);

    final String exported;
    if (regression) {
        exported = exportRegressor(b, featureNames);
    } else {
        exported = exportClassifier(b, featureNames, classNames);
    }
    return new Text(exported);
}
Example 16
Source File: RandomTextWriterJob.java From big-c with Apache License 2.0
/**
 * Given an output filename, write a bunch of random records to it.
 */
public void map(Text key, Text value, Context context)
        throws IOException, InterruptedException {
    int itemCount = 0;
    while (numBytesToWrite > 0) {
        // Generate the key/value
        int noWordsKey = minWordsInKey +
            (wordsInKeyRange != 0 ? random.nextInt(wordsInKeyRange) : 0);
        int noWordsValue = minWordsInValue +
            (wordsInValueRange != 0 ? random.nextInt(wordsInValueRange) : 0);
        Text keyWords = generateSentence(noWordsKey);
        Text valueWords = generateSentence(noWordsValue);

        // Write the sentence
        context.write(keyWords, valueWords);

        numBytesToWrite -= (keyWords.getLength() + valueWords.getLength());

        // Update counters, progress etc.
        context.getCounter(Counters.BYTES_WRITTEN).increment(
            keyWords.getLength() + valueWords.getLength());
        context.getCounter(Counters.RECORDS_WRITTEN).increment(1);
        if (++itemCount % 200 == 0) {
            context.setStatus("wrote record " + itemCount + ". " +
                numBytesToWrite + " bytes left.");
        }
    }
    context.setStatus("done with " + itemCount + " records.");
}
Example 17
Source File: RandomTextWriter.java From tez with Apache License 2.0
/**
 * Given an output filename, write a bunch of random records to it.
 */
public void map(Text key, Text value, Context context)
        throws IOException, InterruptedException {
    int itemCount = 0;
    while (numBytesToWrite > 0) {
        // Generate the key/value
        int noWordsKey = minWordsInKey +
            (wordsInKeyRange != 0 ? random.nextInt(wordsInKeyRange) : 0);
        int noWordsValue = minWordsInValue +
            (wordsInValueRange != 0 ? random.nextInt(wordsInValueRange) : 0);
        Text keyWords = generateSentence(noWordsKey);
        Text valueWords = generateSentence(noWordsValue);

        // Write the sentence
        context.write(keyWords, valueWords);

        numBytesToWrite -= (keyWords.getLength() + valueWords.getLength());

        // Update counters, progress etc.
        context.getCounter(Counters.BYTES_WRITTEN).increment(
            keyWords.getLength() + valueWords.getLength());
        context.getCounter(Counters.RECORDS_WRITTEN).increment(1);
        if (++itemCount % 200 == 0) {
            context.setStatus("wrote record " + itemCount + ". " +
                numBytesToWrite + " bytes left.");
        }
    }
    context.setStatus("done with " + itemCount + " records.");
}
Example 18
Source File: PigStorage.java From spork with Apache License 2.0
@Override
public Tuple getNext() throws IOException {
    mProtoTuple = new ArrayList<Object>();
    if (!mRequiredColumnsInitialized) {
        if (signature != null) {
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
            mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(p.getProperty(signature));
        }
        mRequiredColumnsInitialized = true;
    }
    // Prepend input source path if source tagging is enabled
    if (tagFile) {
        mProtoTuple.add(new DataByteArray(sourcePath.getName()));
    } else if (tagPath) {
        mProtoTuple.add(new DataByteArray(sourcePath.toString()));
    }

    try {
        boolean notDone = in.nextKeyValue();
        if (!notDone) {
            return null;
        }
        Text value = (Text) in.getCurrentValue();
        byte[] buf = value.getBytes();
        int len = value.getLength();
        int start = 0;
        int fieldID = 0;
        for (int i = 0; i < len; i++) {
            if (buf[i] == fieldDel) {
                if (mRequiredColumns == null ||
                        (mRequiredColumns.length > fieldID && mRequiredColumns[fieldID]))
                    addTupleValue(mProtoTuple, buf, start, i);
                start = i + 1;
                fieldID++;
            }
        }
        // pick up the last field
        if (start <= len && (mRequiredColumns == null ||
                (mRequiredColumns.length > fieldID && mRequiredColumns[fieldID]))) {
            addTupleValue(mProtoTuple, buf, start, len);
        }
        Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
        return dontLoadSchema ? t : applySchema(t);
    } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }
}
Example 19
Source File: QseqInputFormat.java From Hadoop-BAM with MIT License
private void scanQseqLine(Text line, Text key, SequencedFragment fragment) {
    setFieldPositionsAndLengths(line);

    // Build the key. We concatenate all fields from 0 to 5 (machine to y-pos)
    // and then the read number, replacing the tabs with colons.
    key.clear();
    // append up and including field[5]
    key.append(line.getBytes(), 0, fieldPositions[5] + fieldLengths[5]);
    // replace tabs with :
    byte[] bytes = key.getBytes();
    int temporaryEnd = key.getLength();
    for (int i = 0; i < temporaryEnd; ++i)
        if (bytes[i] == '\t')
            bytes[i] = ':';
    // append the read number
    key.append(line.getBytes(), fieldPositions[7] - 1, fieldLengths[7] + 1); // +/- 1 to catch the preceding tab.
    // convert the tab preceding the read number into a :
    key.getBytes()[temporaryEnd] = ':';

    // now the fragment
    try {
        fragment.clear();
        fragment.setInstrument(Text.decode(line.getBytes(), fieldPositions[0], fieldLengths[0]));
        fragment.setRunNumber(Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[1], fieldLengths[1])));
        //fragment.setFlowcellId();
        fragment.setLane(Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[2], fieldLengths[2])));
        fragment.setTile(Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[3], fieldLengths[3])));
        fragment.setXpos(Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[4], fieldLengths[4])));
        fragment.setYpos(Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[5], fieldLengths[5])));
        fragment.setRead(Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[7], fieldLengths[7])));
        fragment.setFilterPassed(line.getBytes()[fieldPositions[10]] != '0');
        //fragment.setControlNumber();
        if (fieldLengths[6] > 0 && line.getBytes()[fieldPositions[6]] == '0') // 0 is a null index sequence
            fragment.setIndexSequence(null);
        else
            fragment.setIndexSequence(Text.decode(line.getBytes(), fieldPositions[6], fieldLengths[6]).replace('.', 'N'));
    } catch (CharacterCodingException e) {
        throw new FormatException("Invalid character format at " +
            makePositionMessage(this.pos - line.getLength()) + "; line: " + line);
    }

    fragment.getSequence().append(line.getBytes(), fieldPositions[8], fieldLengths[8]);
    fragment.getQuality().append(line.getBytes(), fieldPositions[9], fieldLengths[9]);
}
Example 20
Source File: ShardedDataTypeHandler.java From datawave with Apache License 2.0
/**
 * Create Key from input parameters.
 *
 * For global index keys, the granularity of the timestamp is to the millisecond, where the semantics of the index
 * record is to the day. This makes MapReduce unable to reduce all index keys together unless they occurred at the
 * same millisecond. If we truncate the timestamp to the day, we should reduce the number of keys output from a job.
 *
 * @param row
 * @param colf
 * @param colq
 * @param vis
 * @param ts
 * @param delete
 * @return Accumulo Key object
 */
protected Key createIndexKey(byte[] row, Text colf, Text colq, byte[] vis, long ts, boolean delete) {
    // Truncate the timestamp to the day
    long tsToDay = (ts / MS_PER_DAY) * MS_PER_DAY;

    Key k = new Key(row, 0, row.length, colf.getBytes(), 0, colf.getLength(), colq.getBytes(), 0, colq.getLength(),
                    vis, 0, vis.length, tsToDay);
    k.setDeleted(delete);

    return k;
}