Java Code Examples for org.apache.hadoop.io.Text#clear()
The following examples show how to use
org.apache.hadoop.io.Text#clear() .
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TeraInputFormat.java From hadoop-book with Apache License 2.0 | 6 votes |
public boolean next(Text key, Text value) throws IOException { if (in.next(junk, line)) { if (line.getLength() < KEY_LENGTH) { key.set(line); value.clear(); } else { byte[] bytes = line.getBytes(); key.set(bytes, 0, KEY_LENGTH); value.set(bytes, KEY_LENGTH, line.getLength() - KEY_LENGTH); } return true; } else { return false; } }
Example 2
Source File: FastaInputFormat.java From Hadoop-BAM with MIT License | 6 votes |
private void scanFastaLine(Text line, Text key, ReferenceFragment fragment) { // Build the key. We concatenate the chromosome/fragment descripion and // the start position of the FASTA sequence line, replacing the tabs with colons. key.clear(); key.append(current_split_indexseq.getBytes(UTF8), 0, current_split_indexseq.getBytes(UTF8).length); key.append(Integer.toString(current_split_pos).getBytes(UTF8), 0, Integer.toString(current_split_pos).getBytes(UTF8).length); // replace tabs with : byte[] bytes = key.getBytes(); int temporaryEnd = key.getLength(); for (int i = 0; i < temporaryEnd; ++i) if (bytes[i] == '\t') bytes[i] = ':'; fragment.clear(); fragment.setPosition(current_split_pos); fragment.setIndexSequence(current_split_indexseq); fragment.getSequence().append(line.getBytes(), 0, line.getBytes().length); }
Example 3
Source File: TeraInputFormat.java From RDFS with Apache License 2.0 | 5 votes |
public boolean next(Text key, Text value) throws IOException { if (in.next(junk, line)) { if (line.getLength() < KEY_LENGTH) { key.set(line); value.clear(); } else { byte[] bytes = line.getBytes(); key.set(bytes, 0, KEY_LENGTH); value.set(bytes, KEY_LENGTH, line.getLength() - KEY_LENGTH); } return true; } else { return false; } }
Example 4
Source File: LazyStringDictionaryTreeReader.java From hive-dwrf with Apache License 2.0 | 5 votes |
private void nextFromDictionary(Text result) throws IOException { int entry = (int) reader.next(); int offset = dictionaryOffsets[entry]; int length = dictionaryOffsets[entry + 1] - dictionaryOffsets[entry]; // If the column is just empty strings, the size will be zero, so the buffer will be null, // in that case just return result as it will default to empty if (dictionaryBuffer != null) { dictionaryBuffer.setText(result, offset, length); } else { result.clear(); } }
Example 5
Source File: TeraInputFormat.java From hadoop-gpu with Apache License 2.0 | 5 votes |
public boolean next(Text key, Text value) throws IOException { if (in.next(junk, line)) { if (line.getLength() < KEY_LENGTH) { key.set(line); value.clear(); } else { byte[] bytes = line.getBytes(); key.set(bytes, 0, KEY_LENGTH); value.set(bytes, KEY_LENGTH, line.getLength() - KEY_LENGTH); } return true; } else { return false; } }
Example 6
Source File: LfLineReader.java From datawave with Apache License 2.0 | 4 votes |
/** * Read one line from the InputStream into the given Text. A line can be terminated by '\n' (LF). EOF also terminates an otherwise unterminated line. * * @param str * the object to store the given line (without newline) * @param maxLineLength * the maximum number of bytes to store into str; the rest of the line is silently discarded. * @param maxBytesToConsume * the maximum number of bytes to consume in this call. This is only a hint, because if the line cross this threshold, we allow it to happen. It * can overshoot potentially by as much as one buffer length. * * @return the number of bytes read including the (longest) newline found. * * @throws IOException * if the underlying stream throws */ public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* * We're reading data from in, but the head of the stream may be already buffered in buffer, so we have several cases: 1. No newline characters are in * the buffer, so we need to copy everything and read another buffer from the stream. 2. An unambiguously terminated line is in buffer, so we just copy * to str. */ str.clear(); int txtLength = 0; // tracks str.getLength(), as an optimization int newlineLength = 0; // length of terminating newline long bytesConsumed = 0; do { int startPosn = bufferPosn; // starting from where we left off the // last time if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; bufferLength = in.read(buffer); if (bufferLength <= 0) break; // EOF } for (; bufferPosn < bufferLength; ++bufferPosn) { // search for // newline if (buffer[bufferPosn] == LF) { newlineLength = 1; ++bufferPosn; // at next invocation proceed from following // byte break; } } int readLength = bufferPosn - startPosn; bytesConsumed += readLength; int appendLength; if (isNewLineIncluded()) { appendLength = readLength; } else { appendLength = readLength - newlineLength; } if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { str.append(buffer, startPosn, appendLength); txtLength += appendLength; } } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); if (bytesConsumed > Integer.MAX_VALUE) throw new IOException("Too many bytes before newline: " + bytesConsumed); return (int) bytesConsumed; }
Example 7
Source File: IndexedStorage.java From spork with Apache License 2.0 | 4 votes |
/** * Read one line from the InputStream into the given Text. A line * can be terminated by one of the following: '\n' (LF) , '\r' (CR), * or '\r\n' (CR+LF). EOF also terminates an otherwise unterminated * line. * * @param str the object to store the given line (without newline) * @param maxLineLength the maximum number of bytes to store into str; * the rest of the line is silently discarded. * @param maxBytesToConsume the maximum number of bytes to consume * in this call. This is only a hint, because if the line cross * this threshold, we allow it to happen. It can overshoot * potentially by as much as one buffer length. * * @return the number of bytes read including the (longest) newline * found. * * @throws IOException if the underlying stream throws */ public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from in, but the head of the stream may be * already buffered in buffer, so we have several cases: * 1. No newline characters are in the buffer, so we need to copy * everything and read another buffer from the stream. * 2. An unambiguously terminated line is in buffer, so we just * copy to str. * 3. Ambiguously terminated line is in buffer, i.e. buffer ends * in CR. In this case we copy everything up to CR to str, but * we also need to see what follows CR: if it's LF, then we * need consume LF as well, so next call to readLine will read * from after that. * We use a flag prevCharCR to signal if previous character was CR * and, if it happens to be at the end of the buffer, delay * consuming it until we have a chance to look at the char that * follows. */ str.clear(); int txtLength = 0; //tracks str.getLength(), as an optimization int newlineLength = 0; //length of terminating newline boolean prevCharCR = false; //true of prev char was CR long bytesConsumed = 0; do { int startPosn = bufferPosn; //starting from where we left off the last time if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; if (prevCharCR) ++bytesConsumed; //account for CR from previous read bufferOffset = ((Seekable)in).getPos(); bufferLength = in.read(buffer); if (bufferLength <= 0) break; // EOF } for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline if (buffer[bufferPosn] == LF) { newlineLength = (prevCharCR) ? 2 : 1; ++bufferPosn; // at next invocation proceed from following byte break; } if (prevCharCR) { //CR + notLF, we are at notLF newlineLength = 1; break; } prevCharCR = (buffer[bufferPosn] == CR); } int readLength = bufferPosn - startPosn; if (prevCharCR && newlineLength == 0) --readLength; //CR at the end of the buffer bytesConsumed += readLength; int appendLength = readLength - newlineLength; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { str.append(buffer, startPosn, appendLength); txtLength += appendLength; } } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long)Integer.MAX_VALUE) throw new IOException("Too many bytes before newline: " + bytesConsumed); return (int)bytesConsumed; }
Example 8
Source File: LineReader.java From incubator-tajo with Apache License 2.0 | 4 votes |
/** * Read a line terminated by a custom delimiter. */ private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from inputStream, but the head of the stream may be * already captured in the previous buffer, so we have several cases: * * 1. The buffer tail does not contain any character sequence which * matches with the head of delimiter. We count it as a * ambiguous byte count = 0 * * 2. The buffer tail contains a X number of characters, * that forms a sequence, which matches with the * head of delimiter. We count ambiguous byte count = X * * // *** eg: A segment of input file is as follows * * " record 1792: I found this bug very interesting and * I have completely read about it. record 1793: This bug * can be solved easily record 1794: This ." * * delimiter = "record"; * * supposing:- String at the end of buffer = * "I found this bug very interesting and I have completely re" * There for next buffer = "ad about it. record 179 ...." * * The matching characters in the input * buffer tail and delimiter head = "re" * Therefore, ambiguous byte count = 2 **** // * * 2.1 If the following bytes are the remaining characters of * the delimiter, then we have to capture only up to the starting * position of delimiter. That means, we need not include the * ambiguous characters in str. * * 2.2 If the following bytes are not the remaining characters of * the delimiter ( as mentioned in the example ), * then we have to include the ambiguous characters in str. */ str.clear(); int txtLength = 0; // tracks str.getLength(), as an optimization long bytesConsumed = 0; int delPosn = 0; int ambiguousByteCount = 0; // To capture the ambiguous characters count do { int startPosn = bufferPosn; // Start from previous end position if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0); if (bufferLength <= 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) { delPosn++; if (delPosn >= recordDelimiterBytes.length) { bufferPosn++; break; } } else if (delPosn != 0) { bufferPosn--; delPosn = 0; } } int readLength = bufferPosn - startPosn; bytesConsumed += readLength; int appendLength = readLength - delPosn; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { if (ambiguousByteCount > 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); //appending the ambiguous characters (refer case 2.2) bytesConsumed += ambiguousByteCount; ambiguousByteCount = 0; } str.append(buffer, startPosn, appendLength); txtLength += appendLength; } if (bufferPosn >= bufferLength) { if (delPosn > 0 && delPosn < recordDelimiterBytes.length) { ambiguousByteCount = delPosn; bytesConsumed -= ambiguousByteCount; //to be consumed in next } } } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long) Integer.MAX_VALUE) { throw new IOException("Too many bytes before delimiter: " + bytesConsumed); } return (int) bytesConsumed; }
Example 9
Source File: LineReader.java From hadoop-gpu with Apache License 2.0 | 4 votes |
/** * Read one line from the InputStream into the given Text. A line * can be terminated by one of the following: '\n' (LF) , '\r' (CR), * or '\r\n' (CR+LF). EOF also terminates an otherwise unterminated * line. * * @param str the object to store the given line (without newline) * @param maxLineLength the maximum number of bytes to store into str; * the rest of the line is silently discarded. * @param maxBytesToConsume the maximum number of bytes to consume * in this call. This is only a hint, because if the line cross * this threshold, we allow it to happen. It can overshoot * potentially by as much as one buffer length. * * @return the number of bytes read including the (longest) newline * found. * * @throws IOException if the underlying stream throws */ public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from in, but the head of the stream may be * already buffered in buffer, so we have several cases: * 1. No newline characters are in the buffer, so we need to copy * everything and read another buffer from the stream. * 2. An unambiguously terminated line is in buffer, so we just * copy to str. * 3. Ambiguously terminated line is in buffer, i.e. buffer ends * in CR. In this case we copy everything up to CR to str, but * we also need to see what follows CR: if it's LF, then we * need consume LF as well, so next call to readLine will read * from after that. * We use a flag prevCharCR to signal if previous character was CR * and, if it happens to be at the end of the buffer, delay * consuming it until we have a chance to look at the char that * follows. */ str.clear(); int txtLength = 0; //tracks str.getLength(), as an optimization int newlineLength = 0; //length of terminating newline boolean prevCharCR = false; //true of prev char was CR long bytesConsumed = 0; do { int startPosn = bufferPosn; //starting from where we left off the last time if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; if (prevCharCR) ++bytesConsumed; //account for CR from previous read bufferLength = in.read(buffer); if (bufferLength <= 0) break; // EOF } for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline if (buffer[bufferPosn] == LF) { newlineLength = (prevCharCR) ? 2 : 1; ++bufferPosn; // at next invocation proceed from following byte break; } if (prevCharCR) { //CR + notLF, we are at notLF newlineLength = 1; break; } prevCharCR = (buffer[bufferPosn] == CR); } int readLength = bufferPosn - startPosn; if (prevCharCR && newlineLength == 0) --readLength; //CR at the end of the buffer bytesConsumed += readLength; int appendLength = readLength - newlineLength; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { str.append(buffer, startPosn, appendLength); txtLength += appendLength; } } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long)Integer.MAX_VALUE) throw new IOException("Too many bytes before newline: " + bytesConsumed); return (int)bytesConsumed; }
Example 10
Source File: LineReader.java From RDFS with Apache License 2.0 | 4 votes |
/** * Read one line from the InputStream into the given Text. A line * can be terminated by one of the following: '\n' (LF) , '\r' (CR), * or '\r\n' (CR+LF). EOF also terminates an otherwise unterminated * line. * * @param str the object to store the given line (without newline) * @param maxLineLength the maximum number of bytes to store into str; * the rest of the line is silently discarded. * @param maxBytesToConsume the maximum number of bytes to consume * in this call. This is only a hint, because if the line cross * this threshold, we allow it to happen. It can overshoot * potentially by as much as one buffer length. * * @return the number of bytes read including the (longest) newline * found. * * @throws IOException if the underlying stream throws */ public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from in, but the head of the stream may be * already buffered in buffer, so we have several cases: * 1. No newline characters are in the buffer, so we need to copy * everything and read another buffer from the stream. * 2. An unambiguously terminated line is in buffer, so we just * copy to str. * 3. Ambiguously terminated line is in buffer, i.e. buffer ends * in CR. In this case we copy everything up to CR to str, but * we also need to see what follows CR: if it's LF, then we * need consume LF as well, so next call to readLine will read * from after that. * We use a flag prevCharCR to signal if previous character was CR * and, if it happens to be at the end of the buffer, delay * consuming it until we have a chance to look at the char that * follows. */ str.clear(); int txtLength = 0; //tracks str.getLength(), as an optimization int newlineLength = 0; //length of terminating newline boolean prevCharCR = false; //true of prev char was CR long bytesConsumed = 0; do { int startPosn = bufferPosn; //starting from where we left off the last time if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; if (prevCharCR) ++bytesConsumed; //account for CR from previous read bufferLength = in.read(buffer); if (bufferLength <= 0) break; // EOF } for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline if (buffer[bufferPosn] == LF) { newlineLength = (prevCharCR) ? 2 : 1; ++bufferPosn; // at next invocation proceed from following byte break; } if (prevCharCR) { //CR + notLF, we are at notLF newlineLength = 1; break; } prevCharCR = (buffer[bufferPosn] == CR); } int readLength = bufferPosn - startPosn; if (prevCharCR && newlineLength == 0) --readLength; //CR at the end of the buffer bytesConsumed += readLength; int appendLength = readLength - newlineLength; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { str.append(buffer, startPosn, appendLength); txtLength += appendLength; } } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); if (bytesConsumed > (long)Integer.MAX_VALUE) throw new IOException("Too many bytes before newline: " + bytesConsumed); return (int)bytesConsumed; }
Example 11
Source File: LineReader.java From big-c with Apache License 2.0 | 4 votes |
/** * Read a line terminated by a custom delimiter. */ private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from inputStream, but the head of the stream may be * already captured in the previous buffer, so we have several cases: * * 1. The buffer tail does not contain any character sequence which * matches with the head of delimiter. We count it as a * ambiguous byte count = 0 * * 2. The buffer tail contains a X number of characters, * that forms a sequence, which matches with the * head of delimiter. We count ambiguous byte count = X * * // *** eg: A segment of input file is as follows * * " record 1792: I found this bug very interesting and * I have completely read about it. record 1793: This bug * can be solved easily record 1794: This ." * * delimiter = "record"; * * supposing:- String at the end of buffer = * "I found this bug very interesting and I have completely re" * There for next buffer = "ad about it. record 179 ...." * * The matching characters in the input * buffer tail and delimiter head = "re" * Therefore, ambiguous byte count = 2 **** // * * 2.1 If the following bytes are the remaining characters of * the delimiter, then we have to capture only up to the starting * position of delimiter. That means, we need not include the * ambiguous characters in str. * * 2.2 If the following bytes are not the remaining characters of * the delimiter ( as mentioned in the example ), * then we have to include the ambiguous characters in str. */ str.clear(); int txtLength = 0; // tracks str.getLength(), as an optimization long bytesConsumed = 0; int delPosn = 0; int ambiguousByteCount=0; // To capture the ambiguous characters count do { int startPosn = bufferPosn; // Start from previous end position if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0); if (bufferLength <= 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) { delPosn++; if (delPosn >= recordDelimiterBytes.length) { bufferPosn++; break; } } else if (delPosn != 0) { bufferPosn--; delPosn = 0; } } int readLength = bufferPosn - startPosn; bytesConsumed += readLength; int appendLength = readLength - delPosn; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { if (ambiguousByteCount > 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); //appending the ambiguous characters (refer case 2.2) bytesConsumed += ambiguousByteCount; ambiguousByteCount=0; } str.append(buffer, startPosn, appendLength); txtLength += appendLength; } if (bufferPosn >= bufferLength) { if (delPosn > 0 && delPosn < recordDelimiterBytes.length) { ambiguousByteCount = delPosn; bytesConsumed -= ambiguousByteCount; //to be consumed in next } } } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume); if (bytesConsumed > Integer.MAX_VALUE) { throw new IOException("Too many bytes before delimiter: " + bytesConsumed); } return (int) bytesConsumed; }
Example 12
Source File: LineReader.java From big-c with Apache License 2.0 | 4 votes |
/** * Read a line terminated by one of CR, LF, or CRLF. */ private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from in, but the head of the stream may be * already buffered in buffer, so we have several cases: * 1. No newline characters are in the buffer, so we need to copy * everything and read another buffer from the stream. * 2. An unambiguously terminated line is in buffer, so we just * copy to str. * 3. Ambiguously terminated line is in buffer, i.e. buffer ends * in CR. In this case we copy everything up to CR to str, but * we also need to see what follows CR: if it's LF, then we * need consume LF as well, so next call to readLine will read * from after that. * We use a flag prevCharCR to signal if previous character was CR * and, if it happens to be at the end of the buffer, delay * consuming it until we have a chance to look at the char that * follows. */ str.clear(); int txtLength = 0; //tracks str.getLength(), as an optimization int newlineLength = 0; //length of terminating newline boolean prevCharCR = false; //true of prev char was CR long bytesConsumed = 0; do { int startPosn = bufferPosn; //starting from where we left off the last time if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; if (prevCharCR) { ++bytesConsumed; //account for CR from previous read } bufferLength = fillBuffer(in, buffer, prevCharCR); if (bufferLength <= 0) { break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline if (buffer[bufferPosn] == LF) { newlineLength = (prevCharCR) ? 2 : 1; ++bufferPosn; // at next invocation proceed from following byte break; } if (prevCharCR) { //CR + notLF, we are at notLF newlineLength = 1; break; } prevCharCR = (buffer[bufferPosn] == CR); } int readLength = bufferPosn - startPosn; if (prevCharCR && newlineLength == 0) { --readLength; //CR at the end of the buffer } bytesConsumed += readLength; int appendLength = readLength - newlineLength; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { str.append(buffer, startPosn, appendLength); txtLength += appendLength; } } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); if (bytesConsumed > Integer.MAX_VALUE) { throw new IOException("Too many bytes before newline: " + bytesConsumed); } return (int)bytesConsumed; }
Example 13
Source File: LineReader.java From hadoop with Apache License 2.0 | 4 votes |
/** * Read a line terminated by a custom delimiter. */ private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from inputStream, but the head of the stream may be * already captured in the previous buffer, so we have several cases: * * 1. The buffer tail does not contain any character sequence which * matches with the head of delimiter. We count it as a * ambiguous byte count = 0 * * 2. The buffer tail contains a X number of characters, * that forms a sequence, which matches with the * head of delimiter. We count ambiguous byte count = X * * // *** eg: A segment of input file is as follows * * " record 1792: I found this bug very interesting and * I have completely read about it. record 1793: This bug * can be solved easily record 1794: This ." * * delimiter = "record"; * * supposing:- String at the end of buffer = * "I found this bug very interesting and I have completely re" * There for next buffer = "ad about it. record 179 ...." * * The matching characters in the input * buffer tail and delimiter head = "re" * Therefore, ambiguous byte count = 2 **** // * * 2.1 If the following bytes are the remaining characters of * the delimiter, then we have to capture only up to the starting * position of delimiter. That means, we need not include the * ambiguous characters in str. * * 2.2 If the following bytes are not the remaining characters of * the delimiter ( as mentioned in the example ), * then we have to include the ambiguous characters in str. */ str.clear(); int txtLength = 0; // tracks str.getLength(), as an optimization long bytesConsumed = 0; int delPosn = 0; int ambiguousByteCount=0; // To capture the ambiguous characters count do { int startPosn = bufferPosn; // Start from previous end position if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0); if (bufferLength <= 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) { delPosn++; if (delPosn >= recordDelimiterBytes.length) { bufferPosn++; break; } } else if (delPosn != 0) { bufferPosn--; delPosn = 0; } } int readLength = bufferPosn - startPosn; bytesConsumed += readLength; int appendLength = readLength - delPosn; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { if (ambiguousByteCount > 0) { str.append(recordDelimiterBytes, 0, ambiguousByteCount); //appending the ambiguous characters (refer case 2.2) bytesConsumed += ambiguousByteCount; ambiguousByteCount=0; } str.append(buffer, startPosn, appendLength); txtLength += appendLength; } if (bufferPosn >= bufferLength) { if (delPosn > 0 && delPosn < recordDelimiterBytes.length) { ambiguousByteCount = delPosn; bytesConsumed -= ambiguousByteCount; //to be consumed in next } } } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume); if (bytesConsumed > Integer.MAX_VALUE) { throw new IOException("Too many bytes before delimiter: " + bytesConsumed); } return (int) bytesConsumed; }
Example 14
Source File: LineReader.java From hadoop with Apache License 2.0 | 4 votes |
/** * Read a line terminated by one of CR, LF, or CRLF. */ private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException { /* We're reading data from in, but the head of the stream may be * already buffered in buffer, so we have several cases: * 1. No newline characters are in the buffer, so we need to copy * everything and read another buffer from the stream. * 2. An unambiguously terminated line is in buffer, so we just * copy to str. * 3. Ambiguously terminated line is in buffer, i.e. buffer ends * in CR. In this case we copy everything up to CR to str, but * we also need to see what follows CR: if it's LF, then we * need consume LF as well, so next call to readLine will read * from after that. * We use a flag prevCharCR to signal if previous character was CR * and, if it happens to be at the end of the buffer, delay * consuming it until we have a chance to look at the char that * follows. */ str.clear(); int txtLength = 0; //tracks str.getLength(), as an optimization int newlineLength = 0; //length of terminating newline boolean prevCharCR = false; //true of prev char was CR long bytesConsumed = 0; do { int startPosn = bufferPosn; //starting from where we left off the last time if (bufferPosn >= bufferLength) { startPosn = bufferPosn = 0; if (prevCharCR) { ++bytesConsumed; //account for CR from previous read } bufferLength = fillBuffer(in, buffer, prevCharCR); if (bufferLength <= 0) { break; // EOF } } for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline if (buffer[bufferPosn] == LF) { newlineLength = (prevCharCR) ? 2 : 1; ++bufferPosn; // at next invocation proceed from following byte break; } if (prevCharCR) { //CR + notLF, we are at notLF newlineLength = 1; break; } prevCharCR = (buffer[bufferPosn] == CR); } int readLength = bufferPosn - startPosn; if (prevCharCR && newlineLength == 0) { --readLength; //CR at the end of the buffer } bytesConsumed += readLength; int appendLength = readLength - newlineLength; if (appendLength > maxLineLength - txtLength) { appendLength = maxLineLength - txtLength; } if (appendLength > 0) { str.append(buffer, startPosn, appendLength); txtLength += appendLength; } } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume); if (bytesConsumed > Integer.MAX_VALUE) { throw new IOException("Too many bytes before newline: " + bytesConsumed); } return (int)bytesConsumed; }
Example 15
Source File: KeyValueReader.java From localization_nifi with Apache License 2.0 | 4 votes |
@Override public Set<FlowFile> readSequenceFile(Path file, Configuration configuration, FileSystem fileSystem) throws IOException { final SequenceFile.Reader reader; Set<FlowFile> flowFiles = new HashSet<>(); reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file))); final Text key = new Text(); final KeyValueWriterCallback callback = new KeyValueWriterCallback(reader); final String inputfileName = file.getName() + "." + System.nanoTime() + "."; int counter = 0; LOG.debug("Read from SequenceFile: {} ", new Object[]{file}); try { while (reader.next(key)) { String fileName = key.toString(); // the key may be a file name, and may not if (LOOKS_LIKE_FILENAME.matcher(fileName).matches()) { if (fileName.contains(File.separator)) { fileName = StringUtils.substringAfterLast(fileName, File.separator); } fileName = fileName + "." + System.nanoTime(); } else { fileName = inputfileName + ++counter; } FlowFile flowFile = session.create(); flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), fileName); callback.key = key; try { flowFile = session.write(flowFile, callback); flowFiles.add(flowFile); } catch (ProcessException e) { LOG.error("Could not write to flowfile {}", new Object[]{flowFile}, e); session.remove(flowFile); } key.clear(); } } finally { IOUtils.closeQuietly(reader); } return flowFiles; }
Example 16
Source File: KeyValueReader.java From nifi with Apache License 2.0 | 4 votes |
@Override public Set<FlowFile> readSequenceFile(Path file, Configuration configuration, FileSystem fileSystem) throws IOException { final SequenceFile.Reader reader; Set<FlowFile> flowFiles = new HashSet<>(); reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file))); final Text key = new Text(); final KeyValueWriterCallback callback = new KeyValueWriterCallback(reader); final String inputfileName = file.getName() + "." + System.nanoTime() + "."; int counter = 0; LOG.debug("Read from SequenceFile: {} ", new Object[]{file}); try { while (reader.next(key)) { String fileName = key.toString(); // the key may be a file name, and may not if (LOOKS_LIKE_FILENAME.matcher(fileName).matches()) { if (fileName.contains(File.separator)) { fileName = StringUtils.substringAfterLast(fileName, File.separator); } fileName = fileName + "." + System.nanoTime(); } else { fileName = inputfileName + ++counter; } FlowFile flowFile = session.create(); flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), fileName); callback.key = key; try { flowFile = session.write(flowFile, callback); flowFiles.add(flowFile); } catch (ProcessException e) { LOG.error("Could not write to flowfile {}", new Object[]{flowFile}, e); session.remove(flowFile); } key.clear(); } } finally { IOUtils.closeQuietly(reader); } return flowFiles; }
Example 17
Source File: StreamKeyValUtil.java From big-c with Apache License 2.0 | 2 votes |
/** * Read a utf8 encoded line from a data input stream. * @param lineReader LineReader to read the line from. * @param out Text to read into * @return number of bytes read * @throws IOException */ public static int readLine(LineReader lineReader, Text out) throws IOException { out.clear(); return lineReader.readLine(out); }
Example 18
Source File: StreamKeyValUtil.java From hadoop with Apache License 2.0 | 2 votes |
/** * Read a utf8 encoded line from a data input stream. * @param lineReader LineReader to read the line from. * @param out Text to read into * @return number of bytes read * @throws IOException */ public static int readLine(LineReader lineReader, Text out) throws IOException { out.clear(); return lineReader.readLine(out); }
Example 19
Source File: StreamKeyValUtil.java From hadoop-gpu with Apache License 2.0 | 2 votes |
/** * Read a utf8 encoded line from a data input stream. * @param lineReader LineReader to read the line from. * @param out Text to read into * @return number of bytes read * @throws IOException */ public static int readLine(LineReader lineReader, Text out) throws IOException { out.clear(); return lineReader.readLine(out); }
Example 20
Source File: DynamicByteArray.java From hive-dwrf with Apache License 2.0 | 2 votes |
/** * Set a text value from the bytes in this dynamic array. * @param result the value to set * @param offset the start of the bytes to copy * @param length the number of bytes to copy */ public void setText(Text result, int offset, int length) { result.clear(); result.set(data.getBytes(), offset, length); }