Java Code Examples for org.apache.hadoop.io.Text#find()
The following examples show how to use org.apache.hadoop.io.Text#find(). Each example is drawn from an open-source project; the source file and license are noted above it.
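Before the project examples, a quick note on semantics: Text#find(String) returns the byte offset of the first occurrence of the given substring within the Text's UTF-8 encoded contents, or -1 if the substring is absent; the overload Text#find(String, int) starts the search at a given byte offset. Because the result is a byte position rather than a character index, the two can differ for multi-byte characters. A minimal standalone sketch (not taken from any project below):

import org.apache.hadoop.io.Text;

public class TextFindDemo {
    public static void main(String[] args) {
        Text text = new Text("shardId_dataType\0uid");

        // find(String) returns the byte offset of the first match, or -1 if absent.
        int underscore = text.find("_");    // 7
        int nullByte = text.find("\0");     // 16
        int missing = text.find("missing"); // -1

        // The two-argument overload starts the search at a given byte offset.
        int fromOffset = text.find("d", underscore); // 8, the 'd' in "dataType"

        System.out.printf("underscore=%d nullByte=%d missing=%d fromOffset=%d%n",
                underscore, nullByte, missing, fromOffset);
    }
}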
Example 1
Source File: ShardedTableTabletBalancer.java From datawave with Apache License 2.0
private byte[] retrieveDate(KeyExtent extent) {
    Text endRow = extent.getEndRow();
    if (endRow == null)
        endRow = extent.getPrevEndRow();
    if (endRow == null) {
        log.warn("Attempting to retrieve date from empty extent " + extent + ". Is your sharded table pre-split?");
        return "null".getBytes();
    } else {
        int idx = endRow.find("_");
        if (idx <= 0) {
            idx = endRow.getLength();
            log.warn("Extent " + extent + " does not conform to sharded date scheme yyyyMMdd_num");
        }
        return Arrays.copyOf(endRow.getBytes(), idx);
    }
}
Example 2
Source File: AncestorIndexBuildingVisitor.java From datawave with Apache License 2.0
/**
 * Extract the uid from an event key, format shardId dataType\0UID FieldName\0FieldValue NULL
 *
 * @param key
 * @return
 */
private String getUid(Key key) {
    Text startColfam = key.getColumnFamily();
    if (startColfam.find(Constants.NULL) != -1) {
        // have a start key with a document uid, add to the end of the cf to ensure we go to the next doc
        // parse out the uid
        String cf = startColfam.toString();
        int index = cf.indexOf('\0');
        if (index >= 0) {
            return cf.substring(index + 1);
        }
    }
    return null;
}
Example 3
Source File: Phase3Step2DistinctDataJobTest.java From dkpro-c4corpus with Apache License 2.0
@Test
public void testSplit() throws Exception {
    Text key = new Text("123_456789");

    // hard-split using array copy
    int i = key.find("_", 0);
    Text outputKey = new Text("");
    byte[] bytes = key.getBytes();
    outputKey.append(bytes, i + 1, bytes.length - i - 2);

    String fileName = new String(bytes, 0, i);

    assertEquals("123", fileName);
    assertEquals("456789", outputKey.toString());
}
Example 4
Source File: QseqInputFormat.java From Hadoop-BAM with MIT License
private void setFieldPositionsAndLengths(Text line) {
    int pos = 0;     // the byte position within the record
    int fieldno = 0; // the field index within the record
    while (pos < line.getLength() && fieldno < NUM_QSEQ_COLS) { // iterate over each field
        int endpos = line.find(Delim, pos); // the field's end position
        if (endpos < 0)
            endpos = line.getLength();
        fieldPositions[fieldno] = pos;
        fieldLengths[fieldno] = endpos - pos;
        pos = endpos + 1; // the next starting position is the current end + 1
        fieldno += 1;
    }

    if (fieldno != NUM_QSEQ_COLS)
        throw new FormatException("found " + fieldno + " fields instead of 11 at "
                + makePositionMessage(this.pos - line.getLength()) + ". Line: " + line);
}
Example 5
Source File: QueryMetricsSummaryLoader.java From datawave with Apache License 2.0
private String getTimeUnit(Key key) {
    if (useHourlyPrecision) {
        return DateHelper.formatToHour(key.getTimestamp());
    } else {
        Text row = key.getRow();
        return new String(row.getBytes(), 0, row.find("_"));
    }
}
Example 6
Source File: ShardedTableTabletBalancer.java From datawave with Apache License 2.0
@Override
public String apply(KeyExtent extent) {
    String date = "null"; // Don't return null
    if (extent != null) {
        Text endRow = extent.getEndRow();
        if (endRow != null) {
            int sepIdx = endRow.find("_");
            if (sepIdx < 0)
                sepIdx = endRow.getLength();
            date = new String(endRow.getBytes(), 0, sepIdx);
        }
    }
    return date;
}
Example 7
Source File: RangeStream.java From datawave with Apache License 2.0
public static boolean isEventSpecific(Range range) {
    Text holder = new Text();
    Key startKey = range.getStartKey();
    startKey.getColumnFamily(holder);
    if (holder.getLength() > 0) {
        if (holder.find("\0") > 0) {
            return true;
        }
    }
    return false;
}
Example 8
Source File: QueryIterator.java From datawave with Apache License 2.0
/**
 * Determines if a range is document specific according to the following criteria
 *
 * <pre>
 *     1. Cannot have a null start or end key
 *     2. Cannot span multiple rows
 *     3. ColumnFamily must contain a null byte separator
 * </pre>
 *
 * @param r
 *            - {@link Range} to be evaluated
 * @return - true if this is a document specific range, false if not.
 */
public static boolean isDocumentSpecificRange(Range r) {
    Preconditions.checkNotNull(r);

    // Also @see datawave.query.index.lookup.TupleToRange
    // We have already made the assertion that the client is sending us
    // an inclusive start key due to the inability to ascertain the
    // difference between an event-specific range and a continueMultiScan.
    //
    // As such, it is acceptable for us to make the same assertion on the
    // inclusivity of the start key.

    // Cannot have a null start or end key
    if (r.isInfiniteStartKey() || r.isInfiniteStopKey()) {
        return false;
    }

    // Cannot span multiple rows.
    Key startKey = r.getStartKey();
    Key endKey = r.getEndKey();
    if (!startKey.getRowData().equals(endKey.getRowData())) {
        return false;
    }

    // Column Family must contain a null byte separator.
    Text startCF = startKey.getColumnFamily();
    Text endCF = endKey.getColumnFamily();
    if (startCF.find(Constants.NULL) == -1 || endCF.find(Constants.NULL) == -1) {
        return false;
    }
    return true;
}
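To make the criteria above concrete, here is a hypothetical sketch (not from datawave; it assumes accumulo-core on the classpath and uses a literal "\0" where datawave uses Constants.NULL) of a range that satisfies all three checks:

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.PartialKey;
import org.apache.accumulo.core.data.Range;

public class DocumentRangeDemo {
    public static void main(String[] args) {
        // Start key: one shard row, with a column family of dataType\0uid for a single document.
        Key start = new Key("20200101_42", "dataType\0uid123");
        // End key: the key immediately following that row/column-family pair.
        Key end = start.followingKey(PartialKey.ROW_COLFAM);

        // Finite endpoints, a single row, and a null byte in both column families,
        // so a check like isDocumentSpecificRange(docRange) above would return true.
        Range docRange = new Range(start, true, end, false);
        System.out.println(docRange);
    }
}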
Example 9
Source File: DocumentDataIterator.java From datawave with Apache License 2.0
protected boolean isEventKey(Key k) {
    Text cf = k.getColumnFamily();
    return cf.getLength() > 0 && cf.find("\u0000") != -1
            && !((cf.charAt(0) == 'f' && cf.charAt(1) == 'i' && cf.charAt(2) == 0)
                    || (cf.getLength() == 1 && cf.charAt(0) == 'd')
                    || (cf.getLength() == 2 && cf.charAt(0) == 't' && cf.charAt(1) == 'f'));
}
Example 10
Source File: EventDataScanNestedIterator.java From datawave with Apache License 2.0
boolean isEventKey(Key k) {
    Text cf = k.getColumnFamily();
    return cf.getLength() > 0 && cf.find("\u0000") != -1
            && !((cf.charAt(0) == 'f' && cf.charAt(1) == 'i' && cf.charAt(2) == 0)
                    || (cf.getLength() == 1 && cf.charAt(0) == 'd')
                    || (cf.getLength() == 2 && cf.charAt(0) == 't' && cf.charAt(1) == 'f'));
}
Example 11
Source File: UserProvider.java From hadoop with Apache License 2.0
@Override
public synchronized List<String> getKeys() throws IOException {
    List<String> list = new ArrayList<String>();
    List<Text> keys = credentials.getAllSecretKeys();
    for (Text key : keys) {
        if (key.find("@") == -1) {
            list.add(key.toString());
        }
    }
    return list;
}
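For context, a hypothetical usage sketch (not part of Hadoop) showing the effect of the find("@") == -1 filter above: secret-key names containing an '@' are skipped, so only plain aliases are returned.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.Credentials;

public class SecretKeyFilterDemo {
    public static void main(String[] args) {
        Credentials credentials = new Credentials();
        credentials.addSecretKey(new Text("db.password"), "hunter2".getBytes(StandardCharsets.UTF_8));
        credentials.addSecretKey(new Text("token@example"), "internal".getBytes(StandardCharsets.UTF_8));

        // The same filter as getKeys() above: keep only names without an '@'.
        // This prints just "db.password".
        for (Text name : credentials.getAllSecretKeys()) {
            if (name.find("@") == -1) {
                System.out.println(name);
            }
        }
    }
}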
Example 12
Source File: UserProvider.java From big-c with Apache License 2.0
@Override
public synchronized List<String> getKeys() throws IOException {
    List<String> list = new ArrayList<String>();
    List<Text> keys = credentials.getAllSecretKeys();
    for (Text key : keys) {
        if (key.find("@") == -1) {
            list.add(key.toString());
        }
    }
    return list;
}
Example 13
Source File: Phase3Step2DistinctDataJob.java From dkpro-c4corpus with Apache License 2.0
@Override
public void reduce(Text key, Iterable<NullWritable> values, Context context)
        throws IOException, InterruptedException {
    // hard-split using array copy to prevent Java Heap Space
    int i = key.find("_", 0);
    Text outputKey = new Text("");
    byte[] bytes = key.getBytes();
    outputKey.append(bytes, i + 1, bytes.length - i - 2);

    String fileName = new String(bytes, 0, i);

    multipleOutputs.write(new Text(outputKey), NullWritable.get(), fileName);
}
Example 14
Source File: AggregatingRecordReader.java From datawave with Apache License 2.0
protected boolean nextKeyValuePastBlock(boolean hasNext) throws IOException {
    Preconditions.checkNotNull(positionAwareLineReader, "positionAwareLineReader cannot be null");

    // If we're in the middle of an element
    // have "run out" of data in the current InputSplit
    // if aggValue is empty (meaning, we just cleared it out and are trying to find a new record)
    // and we got to this point, we do not want to read into the next block
    if (aggValue.getLength() > 0 && positionAwareLineReader.getPos() > positionAwareLineReader.getEnd()) {
        int end = aggValue.find(endToken);
        int prevLength = aggValue.getLength();
        Text endRecordFromNextBlock = new Text();

        // We want to loop until we can find an end token to match the start token we already have
        while (end == -1) {
            endRecordFromNextBlock.clear();

            int newSize = positionAwareLineReader.getLfLineReader().readLine(endRecordFromNextBlock,
                    positionAwareLineReader.getMaxLineLength(), Integer.MAX_VALUE);
            if (0 == newSize) {
                // This fails in the same manner as the process(Text) method does for
                // self-closing XML elements.
                return returnPartialMatches && aggValue.getLength() > 0;
            }

            // Track the extra data read
            positionAwareLineReader.setPos(positionAwareLineReader.getPos() + newSize);

            // Find the start and end in this next segment read
            int newStart = endRecordFromNextBlock.find(startToken), newEnd = endRecordFromNextBlock.find(endToken);

            // We found no start, but did find an end
            if (newStart == -1 && newEnd >= 0) {
                // Append onto the aggValue, and we're done.
                TextUtil.textAppendNoNull(aggValue, Text.decode(endRecordFromNextBlock.getBytes(), 0, newEnd + endToken.length()));
                return true;
            } else if (newStart < newEnd) {
                // We found another start token before an end token, which would imply malformed XML
                if (returnPartialMatches) {
                    // Let's try to be nice and throw everything up until the new start character into the aggregated value.
                    TextUtil.textAppendNoNull(aggValue, Text.decode(endRecordFromNextBlock.getBytes(), 0, newStart));
                    return true;
                } else {
                    return false;
                }
            } else {
                // haven't seen a start or end, so just keep aggregating
                TextUtil.textAppendNoNull(aggValue, endRecordFromNextBlock.toString());
            }

            end = aggValue.find(endToken, prevLength);
            prevLength = aggValue.getLength();
        }
        return true;
    }
    return hasNext;
}
Example 15
Source File: EventDataScanNestedIterator.java From datawave with Apache License 2.0
protected void findNextDocument() {
    topKey = null;

    try {
        Text cf = new Text();

        /*
         * Given that we are already at a document key, this method will continue to advance the underlying source until it is
         * either exhausted (hasTop() returns false), the returned key is not in the totalRange, and the current top key shares
         * the same row and column family as the source's next key.
         */
        while (topKey == null && source.hasTop()) {
            Key k = source.getTopKey();
            if (log.isTraceEnabled())
                log.trace("Sought to " + k);
            k.getColumnFamily(cf);

            if (!isEventKey(k)) {
                if (cf.find("fi\0") == 0) {
                    if (log.isDebugEnabled()) {
                        log.debug("Seeking over 'fi'");
                    }
                    // Try to do an optimized jump over the field index
                    cf.set("fi\1");
                    source.seek(new Range(new Key(source.getTopKey().getRow(), cf), false, totalRange.getEndKey(),
                                    totalRange.isEndKeyInclusive()), columnFamilies, inclusive);
                } else if (cf.getLength() == 1 && cf.charAt(0) == 'd') {
                    if (log.isDebugEnabled()) {
                        log.debug("Seeking over 'd'");
                    }
                    // Try to do an optimized jump over the raw documents
                    cf.set("d\0");
                    source.seek(new Range(new Key(source.getTopKey().getRow(), cf), false, totalRange.getEndKey(),
                                    totalRange.isEndKeyInclusive()), columnFamilies, inclusive);
                } else if (cf.getLength() == 2 && cf.charAt(0) == 't' && cf.charAt(1) == 'f') {
                    if (log.isDebugEnabled()) {
                        log.debug("Seeking over 'tf'");
                    }
                    // Try to do an optimized jump over the term frequencies
                    cf.set("tf\0");
                    source.seek(new Range(new Key(source.getTopKey().getRow(), cf), false, totalRange.getEndKey(),
                                    totalRange.isEndKeyInclusive()), columnFamilies, inclusive);
                } else {
                    if (log.isDebugEnabled()) {
                        log.debug("Next()'ing over the current key");
                    }
                    source.next();
                }
            } else {
                if (dataTypeFilter.apply(source.getTopKey())) {
                    this.topKey = source.getTopKey();
                } else {
                    Range nextCF = new Range(nextStartKey(source.getTopKey()), true, totalRange.getEndKey(), totalRange.isEndKeyInclusive());
                    source.seek(nextCF, columnFamilies, inclusive);
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Could not seek in findNextDocument", e);
    }
}
Example 16
Source File: WikiReverseMapper.java From wikireverse with MIT License
public void map(LongWritable inputKey, WritableWarcRecord inputValue, OutputCollector<Text, LinkArrayWritable> output,
        Reporter reporter, WikiMetadata wikiMetadata) throws IOException, InterruptedException {
    try {
        // Get Warc record from the writable wrapper.
        WarcRecord record = inputValue.getRecord();
        String url = record.getHeaderMetadataItem(WARC_TARGET_URI);

        if (wikiMetadata.isWikiPage(url, reporter) == false) {
            Text metadata = new Text(record.getContent());

            if (metadata.find(WIKIPEDIA_DOMAIN) >= 0) {
                Page page = new Page(url);
                page = MetadataParser.parse(page, metadata, LINK_TYPE, WIKIPEDIA_DOMAIN);

                Hashtable<String, LinkWritable> results = wikiMetadata.createResults(page, reporter);

                if (results != null && results.isEmpty() == false) {
                    Text outputKey = new Text();
                    LinkArrayWritable outputValue = new LinkArrayWritable();
                    LinkWritable[] linkArray = new LinkWritable[1];

                    for (String key : results.keySet()) {
                        linkArray[0] = results.get(key);
                        outputKey.set(key);
                        outputValue.set(linkArray);
                        output.collect(outputKey, outputValue);
                    }

                    reporter.incrCounter(COUNTER_GROUP, URLS_PARSED, results.size());
                }
            }
        }
    } catch (URISyntaxException us) {
        reporter.incrCounter(COUNTER_GROUP, URI_SYNTAX_EXCEPTION, 1);
        LOG.error(StringUtils.stringifyException(us));
    } catch (JsonParseException jp) {
        reporter.incrCounter(COUNTER_GROUP, JSON_PARSE_EXCEPTION, 1);
        LOG.error(StringUtils.stringifyException(jp));
    } catch (IOException io) {
        reporter.incrCounter(COUNTER_GROUP, MAP_IO_EXCEPTION, 1);
        LOG.error(StringUtils.stringifyException(io));
    } catch (Exception e) {
        try {
            reporter.incrCounter(COUNTER_GROUP, MAP_EXCEPTION, 1);
            LOG.error(StringUtils.stringifyException(e));
        } catch (Exception ie) {
            // Log and consume inner exceptions when logging.
            LOG.error(ie.toString());
        }
    }
}