Java Code Examples for org.apache.hadoop.io.Text#find()
The following examples show how to use org.apache.hadoop.io.Text#find(). Each example is drawn from an open-source project; the source file and license are noted above it.
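Before the project examples, a quick note on semantics: Text#find(String) returns the byte offset of the first occurrence of the given substring within the Text's UTF-8 encoded contents, or -1 if the substring is absent; the overload Text#find(String, int) starts the search at a given byte offset. Because the result is a byte position rather than a character index, the two can differ for multi-byte characters. A minimal standalone sketch (not taken from any project below):

import org.apache.hadoop.io.Text;

public class TextFindDemo {
    public static void main(String[] args) {
        Text text = new Text("shardId_dataType\0uid");

        // find(String) returns the byte offset of the first match, or -1 if absent.
        int underscore = text.find("_");    // 7
        int nullByte = text.find("\0");     // 16
        int missing = text.find("missing"); // -1

        // The two-argument overload starts the search at a given byte offset.
        int fromOffset = text.find("d", underscore); // 8, the 'd' in "dataType"

        System.out.printf("underscore=%d nullByte=%d missing=%d fromOffset=%d%n",
                underscore, nullByte, missing, fromOffset);
    }
}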
Example 1
Source File: ShardedTableTabletBalancer.java From datawave with Apache License 2.0
private byte[] retrieveDate(KeyExtent extent) {
    Text endRow = extent.getEndRow();
    if (endRow == null)
        endRow = extent.getPrevEndRow();
    if (endRow == null) {
        log.warn("Attempting to retrieve date from empty extent " + extent + ". Is your sharded table pre-split?");
        return "null".getBytes();
    } else {
        int idx = endRow.find("_");
        if (idx <= 0) {
            idx = endRow.getLength();
            log.warn("Extent " + extent + " does not conform to sharded date scheme yyyyMMdd_num");
        }
        return Arrays.copyOf(endRow.getBytes(), idx);
    }
}
Example 2
Source File: AncestorIndexBuildingVisitor.java From datawave with Apache License 2.0
/**
 * Extract the uid from an event key, format shardId dataType\0UID FieldName\0FieldValue NULL
 *
 * @param key
 * @return
 */
private String getUid(Key key) {
    Text startColfam = key.getColumnFamily();
    if (startColfam.find(Constants.NULL) != -1) {
        // have a start key with a document uid, add to the end of the cf to ensure we go to the next doc
        // parse out the uid
        String cf = startColfam.toString();
        int index = cf.indexOf('\0');
        if (index >= 0) {
            return cf.substring(index + 1);
        }
    }
    return null;
}
Example 3
Source File: Phase3Step2DistinctDataJobTest.java From dkpro-c4corpus with Apache License 2.0
@Test
public void testSplit() throws Exception {
    Text key = new Text("123_456789");

    // hard-split using array copy
    int i = key.find("_", 0);
    Text outputKey = new Text("");
    byte[] bytes = key.getBytes();
    outputKey.append(bytes, i + 1, bytes.length - i - 2);

    String fileName = new String(bytes, 0, i);

    assertEquals("123", fileName);
    assertEquals("456789", outputKey.toString());
}
Example 4
Source File: QseqInputFormat.java From Hadoop-BAM with MIT License
private void setFieldPositionsAndLengths(Text line) {
    int pos = 0;     // the byte position within the record
    int fieldno = 0; // the field index within the record
    while (pos < line.getLength() && fieldno < NUM_QSEQ_COLS) { // iterate over each field
        int endpos = line.find(Delim, pos); // the field's end position
        if (endpos < 0)
            endpos = line.getLength();
        fieldPositions[fieldno] = pos;
        fieldLengths[fieldno] = endpos - pos;
        pos = endpos + 1; // the next starting position is the current end + 1
        fieldno += 1;
    }

    if (fieldno != NUM_QSEQ_COLS)
        throw new FormatException("found " + fieldno + " fields instead of 11 at "
                + makePositionMessage(this.pos - line.getLength()) + ". Line: " + line);
}
Example 5
Source File: QueryMetricsSummaryLoader.java From datawave with Apache License 2.0
private String getTimeUnit(Key key) {
    if (useHourlyPrecision) {
        return DateHelper.formatToHour(key.getTimestamp());
    } else {
        Text row = key.getRow();
        return new String(row.getBytes(), 0, row.find("_"));
    }
}
Example 6
Source File: ShardedTableTabletBalancer.java From datawave with Apache License 2.0
@Override
public String apply(KeyExtent extent) {
    String date = "null"; // Don't return null
    if (extent != null) {
        Text endRow = extent.getEndRow();
        if (endRow != null) {
            int sepIdx = endRow.find("_");
            if (sepIdx < 0)
                sepIdx = endRow.getLength();
            date = new String(endRow.getBytes(), 0, sepIdx);
        }
    }
    return date;
}
Example 7
Source File: RangeStream.java From datawave with Apache License 2.0
public static boolean isEventSpecific(Range range) {
    Text holder = new Text();
    Key startKey = range.getStartKey();
    startKey.getColumnFamily(holder);
    if (holder.getLength() > 0) {
        if (holder.find("\0") > 0) {
            return true;
        }
    }
    return false;
}
Example 8
Source File: QueryIterator.java From datawave with Apache License 2.0
/**
 * Determines if a range is document specific according to the following criteria
 *
 * <pre>
 *     1. Cannot have a null start or end key
 *     2. Cannot span multiple rows
 *     3. ColumnFamily must contain a null byte separator
 * </pre>
 *
 * @param r
 *            - {@link Range} to be evaluated
 * @return - true if this is a document specific range, false if not.
 */
public static boolean isDocumentSpecificRange(Range r) {
    Preconditions.checkNotNull(r);

    // Also @see datawave.query.index.lookup.TupleToRange
    // We have already made the assertion that the client is sending us
    // an inclusive start key due to the inability to ascertain the
    // difference between an event-specific range and a continueMultiScan.
    //
    // As such, it is acceptable for us to make the same assertion on the
    // inclusivity of the start key.

    // Cannot have a null start or end key
    if (r.isInfiniteStartKey() || r.isInfiniteStopKey()) {
        return false;
    }

    // Cannot span multiple rows.
    Key startKey = r.getStartKey();
    Key endKey = r.getEndKey();
    if (!startKey.getRowData().equals(endKey.getRowData())) {
        return false;
    }

    // Column Family must contain a null byte separator.
    Text startCF = startKey.getColumnFamily();
    Text endCF = endKey.getColumnFamily();
    if (startCF.find(Constants.NULL) == -1 || endCF.find(Constants.NULL) == -1) {
        return false;
    }
    return true;
}
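To make the criteria above concrete, here is a hypothetical sketch (not from datawave; it assumes accumulo-core on the classpath and uses a literal "\0" where datawave uses Constants.NULL) of a range that satisfies all three checks:

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.PartialKey;
import org.apache.accumulo.core.data.Range;

public class DocumentRangeDemo {
    public static void main(String[] args) {
        // Start key: one shard row, with a column family of dataType\0uid for a single document.
        Key start = new Key("20200101_42", "dataType\0uid123");
        // End key: the key immediately following that row/column-family pair.
        Key end = start.followingKey(PartialKey.ROW_COLFAM);

        // Finite endpoints, a single row, and a null byte in both column families,
        // so a check like isDocumentSpecificRange(docRange) above would return true.
        Range docRange = new Range(start, true, end, false);
        System.out.println(docRange);
    }
}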
Example 9
Source File: DocumentDataIterator.java From datawave with Apache License 2.0
protected boolean isEventKey(Key k) {
    Text cf = k.getColumnFamily();
    return cf.getLength() > 0 && cf.find("\u0000") != -1
            && !((cf.charAt(0) == 'f' && cf.charAt(1) == 'i' && cf.charAt(2) == 0)
                    || (cf.getLength() == 1 && cf.charAt(0) == 'd')
                    || (cf.getLength() == 2 && cf.charAt(0) == 't' && cf.charAt(1) == 'f'));
}
Example 10
Source File: EventDataScanNestedIterator.java From datawave with Apache License 2.0
boolean isEventKey(Key k) {
    Text cf = k.getColumnFamily();
    return cf.getLength() > 0 && cf.find("\u0000") != -1
            && !((cf.charAt(0) == 'f' && cf.charAt(1) == 'i' && cf.charAt(2) == 0)
                    || (cf.getLength() == 1 && cf.charAt(0) == 'd')
                    || (cf.getLength() == 2 && cf.charAt(0) == 't' && cf.charAt(1) == 'f'));
}
Example 11
Source File: UserProvider.java From hadoop with Apache License 2.0
@Override
public synchronized List<String> getKeys() throws IOException {
    List<String> list = new ArrayList<String>();
    List<Text> keys = credentials.getAllSecretKeys();
    for (Text key : keys) {
        if (key.find("@") == -1) {
            list.add(key.toString());
        }
    }
    return list;
}
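For context, a hypothetical usage sketch (not part of Hadoop) showing the effect of the find("@") == -1 filter above: secret-key names containing an '@' are skipped, so only plain aliases are returned.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.Credentials;

public class SecretKeyFilterDemo {
    public static void main(String[] args) {
        Credentials credentials = new Credentials();
        credentials.addSecretKey(new Text("db.password"), "hunter2".getBytes(StandardCharsets.UTF_8));
        credentials.addSecretKey(new Text("token@example"), "internal".getBytes(StandardCharsets.UTF_8));

        // The same filter as getKeys() above: keep only names without an '@'.
        // This prints just "db.password".
        for (Text name : credentials.getAllSecretKeys()) {
            if (name.find("@") == -1) {
                System.out.println(name);
            }
        }
    }
}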
Example 12
Source File: UserProvider.java From big-c with Apache License 2.0
@Override
public synchronized List<String> getKeys() throws IOException {
    List<String> list = new ArrayList<String>();
    List<Text> keys = credentials.getAllSecretKeys();
    for (Text key : keys) {
        if (key.find("@") == -1) {
            list.add(key.toString());
        }
    }
    return list;
}
Example 13
Source File: Phase3Step2DistinctDataJob.java From dkpro-c4corpus with Apache License 2.0
@Override
public void reduce(Text key, Iterable<NullWritable> values, Context context)
        throws IOException, InterruptedException {
    // hard-split using array copy to prevent Java Heap Space
    int i = key.find("_", 0);
    Text outputKey = new Text("");
    byte[] bytes = key.getBytes();
    outputKey.append(bytes, i + 1, bytes.length - i - 2);

    String fileName = new String(bytes, 0, i);

    multipleOutputs.write(new Text(outputKey), NullWritable.get(), fileName);
}
Example 14
Source File: AggregatingRecordReader.java From datawave with Apache License 2.0
protected boolean nextKeyValuePastBlock(boolean hasNext) throws IOException {
    Preconditions.checkNotNull(positionAwareLineReader, "positionAwareLineReader cannot be null");

    // If we're in the middle of an element
    // have "run out" of data in the current InputSplit
    // if aggValue is empty (meaning, we just cleared it out and are trying to find a new record)
    // and we got to this point, we do not want to read into the next block
    if (aggValue.getLength() > 0 && positionAwareLineReader.getPos() > positionAwareLineReader.getEnd()) {
        int end = aggValue.find(endToken);
        int prevLength = aggValue.getLength();
        Text endRecordFromNextBlock = new Text();

        // We want to loop until we can find an end token to match the start token we already have
        while (end == -1) {
            endRecordFromNextBlock.clear();

            int newSize = positionAwareLineReader.getLfLineReader().readLine(endRecordFromNextBlock,
                    positionAwareLineReader.getMaxLineLength(), Integer.MAX_VALUE);
            if (0 == newSize) {
                // This fails in the same manner as the process(Text) method does for
                // self-closing XML elements.
                return returnPartialMatches && aggValue.getLength() > 0;
            }

            // Track the extra data read
            positionAwareLineReader.setPos(positionAwareLineReader.getPos() + newSize);

            // Find the start and end in this next segment read
            int newStart = endRecordFromNextBlock.find(startToken), newEnd = endRecordFromNextBlock.find(endToken);

            // We found no start, but did find an end
            if (newStart == -1 && newEnd >= 0) {
                // Append onto the aggValue, and we're done.
                TextUtil.textAppendNoNull(aggValue, Text.decode(endRecordFromNextBlock.getBytes(), 0, newEnd + endToken.length()));
                return true;
            } else if (newStart < newEnd) {
                // We found another start token before an end token, which would imply malformed XML
                if (returnPartialMatches) {
                    // Let's try to be nice and throw everything up until the new start character into the aggregated value.
                    TextUtil.textAppendNoNull(aggValue, Text.decode(endRecordFromNextBlock.getBytes(), 0, newStart));
                    return true;
                } else {
                    return false;
                }
            } else {
                // haven't seen a start or end, so just keep aggregating
                TextUtil.textAppendNoNull(aggValue, endRecordFromNextBlock.toString());
            }

            end = aggValue.find(endToken, prevLength);
            prevLength = aggValue.getLength();
        }
        return true;
    }
    return hasNext;
}
Example 15
Source File: EventDataScanNestedIterator.java From datawave with Apache License 2.0
protected void findNextDocument() {
    topKey = null;

    try {
        Text cf = new Text();

        /*
         * Given that we are already at a document key, this method will continue to advance the underlying source until it is
         * either exhausted (hasTop() returns false), the returned key is not in the totalRange, and the current top key shares
         * the same row and column family as the source's next key.
         */
        while (topKey == null && source.hasTop()) {
            Key k = source.getTopKey();
            if (log.isTraceEnabled())
                log.trace("Sought to " + k);
            k.getColumnFamily(cf);

            if (!isEventKey(k)) {
                if (cf.find("fi\0") == 0) {
                    if (log.isDebugEnabled()) {
                        log.debug("Seeking over 'fi'");
                    }
                    // Try to do an optimized jump over the field index
                    cf.set("fi\1");
                    source.seek(new Range(new Key(source.getTopKey().getRow(), cf), false, totalRange.getEndKey(),
                                    totalRange.isEndKeyInclusive()), columnFamilies, inclusive);
                } else if (cf.getLength() == 1 && cf.charAt(0) == 'd') {
                    if (log.isDebugEnabled()) {
                        log.debug("Seeking over 'd'");
                    }
                    // Try to do an optimized jump over the raw documents
                    cf.set("d\0");
                    source.seek(new Range(new Key(source.getTopKey().getRow(), cf), false, totalRange.getEndKey(),
                                    totalRange.isEndKeyInclusive()), columnFamilies, inclusive);
                } else if (cf.getLength() == 2 && cf.charAt(0) == 't' && cf.charAt(1) == 'f') {
                    if (log.isDebugEnabled()) {
                        log.debug("Seeking over 'tf'");
                    }
                    // Try to do an optimized jump over the term frequencies
                    cf.set("tf\0");
                    source.seek(new Range(new Key(source.getTopKey().getRow(), cf), false, totalRange.getEndKey(),
                                    totalRange.isEndKeyInclusive()), columnFamilies, inclusive);
                } else {
                    if (log.isDebugEnabled()) {
                        log.debug("Next()'ing over the current key");
                    }
                    source.next();
                }
            } else {
                if (dataTypeFilter.apply(source.getTopKey())) {
                    this.topKey = source.getTopKey();
                } else {
                    Range nextCF = new Range(nextStartKey(source.getTopKey()), true, totalRange.getEndKey(), totalRange.isEndKeyInclusive());
                    source.seek(nextCF, columnFamilies, inclusive);
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Could not seek in findNextDocument", e);
    }
}
Example 16
Source File: WikiReverseMapper.java From wikireverse with MIT License
public void map(LongWritable inputKey, WritableWarcRecord inputValue, OutputCollector<Text, LinkArrayWritable> output,
        Reporter reporter, WikiMetadata wikiMetadata) throws IOException, InterruptedException {
    try {
        // Get Warc record from the writable wrapper.
        WarcRecord record = inputValue.getRecord();
        String url = record.getHeaderMetadataItem(WARC_TARGET_URI);

        if (wikiMetadata.isWikiPage(url, reporter) == false) {
            Text metadata = new Text(record.getContent());

            if (metadata.find(WIKIPEDIA_DOMAIN) >= 0) {
                Page page = new Page(url);
                page = MetadataParser.parse(page, metadata, LINK_TYPE, WIKIPEDIA_DOMAIN);

                Hashtable<String, LinkWritable> results = wikiMetadata.createResults(page, reporter);

                if (results != null && results.isEmpty() == false) {
                    Text outputKey = new Text();
                    LinkArrayWritable outputValue = new LinkArrayWritable();
                    LinkWritable[] linkArray = new LinkWritable[1];

                    for (String key : results.keySet()) {
                        linkArray[0] = results.get(key);
                        outputKey.set(key);
                        outputValue.set(linkArray);
                        output.collect(outputKey, outputValue);
                    }

                    reporter.incrCounter(COUNTER_GROUP, URLS_PARSED, results.size());
                }
            }
        }
    } catch (URISyntaxException us) {
        reporter.incrCounter(COUNTER_GROUP, URI_SYNTAX_EXCEPTION, 1);
        LOG.error(StringUtils.stringifyException(us));
    } catch (JsonParseException jp) {
        reporter.incrCounter(COUNTER_GROUP, JSON_PARSE_EXCEPTION, 1);
        LOG.error(StringUtils.stringifyException(jp));
    } catch (IOException io) {
        reporter.incrCounter(COUNTER_GROUP, MAP_IO_EXCEPTION, 1);
        LOG.error(StringUtils.stringifyException(io));
    } catch (Exception e) {
        try {
            reporter.incrCounter(COUNTER_GROUP, MAP_EXCEPTION, 1);
            LOG.error(StringUtils.stringifyException(e));
        } catch (Exception ie) {
            // Log and consume inner exceptions when logging.
            LOG.error(ie.toString());
        }
    }
}