org.apache.hadoop.io.Text#clear

Source File: TeraInputFormat.java From hadoop-book with Apache License 2.0

6 votes

/**
 * Reads the next input line and splits it into a key and a value.
 *
 * The first {@code KEY_LENGTH} bytes of the line become the key and the
 * remaining bytes become the value.  A line shorter than {@code KEY_LENGTH}
 * is used in its entirety as the key and the value is emptied.
 *
 * @param key receives the key portion of the line
 * @param value receives the remainder of the line (possibly empty)
 * @return {@code true} if a line was read, {@code false} once {@code in}
 *         reports that nothing more is available
 * @throws IOException if the underlying reader fails
 */
public boolean next(Text key, Text value) throws IOException {
    if (!in.next(junk, line)) {
        return false;
    }

    final int lineLength = line.getLength();
    if (lineLength >= KEY_LENGTH) {
        // Only the first lineLength bytes of the backing array are valid.
        final byte[] raw = line.getBytes();
        key.set(raw, 0, KEY_LENGTH);
        value.set(raw, KEY_LENGTH, lineLength - KEY_LENGTH);
    } else {
        // Short line: everything is key, nothing is left for the value.
        key.set(line);
        value.clear();
    }
    return true;
}

Source File: FastaInputFormat.java From Hadoop-BAM with MIT License

6 votes

/**
 * Fills {@code key} and {@code fragment} from one FASTA sequence line.
 *
 * The key is the current index-sequence description followed by the current
 * position, with every tab byte replaced by a colon.  The fragment is reset
 * and then given the current position, the index sequence, and the bytes of
 * {@code line}.
 *
 * @param line the sequence line just read; only its first
 *             {@code line.getLength()} bytes are used
 * @param key receives the generated record key (previous content is discarded)
 * @param fragment receives the position, index sequence and sequence bytes
 */
private void scanFastaLine(Text line, Text key, ReferenceFragment fragment)
{
    // Build the key.  We concatenate the chromosome/fragment description and
    // the start position of the FASTA sequence line, replacing the tabs with colons.
    key.clear();

    // Encode each component once instead of once for the data and once for the length.
    final byte[] indexSeqBytes = current_split_indexseq.getBytes(UTF8);
    key.append(indexSeqBytes, 0, indexSeqBytes.length);
    final byte[] positionBytes = Integer.toString(current_split_pos).getBytes(UTF8);
    key.append(positionBytes, 0, positionBytes.length);

    // replace tabs with :
    final byte[] keyBytes = key.getBytes();
    final int keyLength = key.getLength();
    for (int i = 0; i < keyLength; ++i) {
        if (keyBytes[i] == '\t') {
            keyBytes[i] = ':';
        }
    }

    fragment.clear();
    fragment.setPosition(current_split_pos);
    fragment.setIndexSequence(current_split_indexseq);
    // Text.getBytes() exposes the whole backing array, which can be longer than
    // the valid content when the Text object is reused; bound the copy by
    // getLength() so stale bytes from an earlier, longer line are not appended.
    fragment.getSequence().append(line.getBytes(), 0, line.getLength());
}

Source File: TeraInputFormat.java From RDFS with Apache License 2.0

5 votes

/**
 * Reads the next input line and splits it into a key and a value.
 *
 * The first {@code KEY_LENGTH} bytes of the line become the key and the
 * remaining bytes become the value.  A line shorter than {@code KEY_LENGTH}
 * is used in its entirety as the key and the value is emptied.
 *
 * @param key receives the key portion of the line
 * @param value receives the remainder of the line (possibly empty)
 * @return {@code true} if a line was read, {@code false} once {@code in}
 *         reports that nothing more is available
 * @throws IOException if the underlying reader fails
 */
public boolean next(Text key, Text value) throws IOException {
  if (!in.next(junk, line)) {
    return false;
  }

  final int lineLength = line.getLength();
  if (lineLength >= KEY_LENGTH) {
    // Only the first lineLength bytes of the backing array are valid.
    final byte[] raw = line.getBytes();
    key.set(raw, 0, KEY_LENGTH);
    value.set(raw, KEY_LENGTH, lineLength - KEY_LENGTH);
  } else {
    // Short line: everything is key, nothing is left for the value.
    key.set(line);
    value.clear();
  }
  return true;
}

Source File: LazyStringDictionaryTreeReader.java From hive-dwrf with Apache License 2.0

5 votes

/**
 * Reads the next dictionary index from {@code reader} and stores the
 * corresponding dictionary entry in {@code result}.
 *
 * @param result receives the entry's bytes, or is emptied when no dictionary
 *               buffer exists
 * @throws IOException if reading the next index fails
 */
private void nextFromDictionary(Text result) throws IOException {
  final int entry = (int) reader.next();
  // An entry spans from its own offset up to the offset of the following entry.
  final int start = dictionaryOffsets[entry];
  final int end = dictionaryOffsets[entry + 1];

  // If the column is just empty strings, the size will be zero, so the buffer
  // will be null; in that case the result is simply the empty string.
  if (dictionaryBuffer == null) {
    result.clear();
    return;
  }

  dictionaryBuffer.setText(result, start, end - start);
}

Source File: TeraInputFormat.java From hadoop-gpu with Apache License 2.0

5 votes

/**
 * Reads the next input line and splits it into a key and a value.
 *
 * The first {@code KEY_LENGTH} bytes of the line become the key and the
 * remaining bytes become the value.  A line shorter than {@code KEY_LENGTH}
 * is used in its entirety as the key and the value is emptied.
 *
 * @param key receives the key portion of the line
 * @param value receives the remainder of the line (possibly empty)
 * @return {@code true} if a line was read, {@code false} once {@code in}
 *         reports that nothing more is available
 * @throws IOException if the underlying reader fails
 */
public boolean next(Text key, Text value) throws IOException {
  if (!in.next(junk, line)) {
    return false;
  }

  final int lineLength = line.getLength();
  if (lineLength >= KEY_LENGTH) {
    // Only the first lineLength bytes of the backing array are valid.
    final byte[] raw = line.getBytes();
    key.set(raw, 0, KEY_LENGTH);
    value.set(raw, KEY_LENGTH, lineLength - KEY_LENGTH);
  } else {
    // Short line: everything is key, nothing is left for the value.
    key.set(line);
    value.clear();
  }
  return true;
}

Source File: LfLineReader.java From datawave with Apache License 2.0

4 votes

/**
 * Reads a single LF-terminated line from the input stream into {@code str}. Reaching EOF also ends a line that has no terminator.
 * 
 * @param str
 *            destination for the line; it is cleared first. The newline is kept only when {@code isNewLineIncluded()} returns true.
 * @param maxLineLength
 *            upper bound on the number of bytes stored into str; any excess is consumed but silently dropped.
 * @param maxBytesToConsume
 *            soft limit on the number of bytes consumed by this call. It is checked only after each buffer segment has been processed, so it can be
 *            exceeded by up to one buffer length.
 * 
 * @return the number of bytes consumed from the stream, including the newline if one was found.
 * 
 * @throws IOException
 *             if the underlying stream throws, or if more than Integer.MAX_VALUE bytes were consumed
 */
public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    /*
     * The start of the line may already be sitting in buffer from a previous call. Each pass handles one buffer segment: either the segment holds no
     * newline, in which case it is copied whole and the buffer is refilled, or it holds a newline, in which case the part up to it is copied and we stop.
     */
    str.clear();
    int stored = 0; // mirrors str.getLength() so it need not be queried
    boolean sawNewline = false;
    long consumed = 0;
    while (true) {
        if (bufferPosn >= bufferLength) {
            // Buffer exhausted: start over at the front with fresh data.
            bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                break; // EOF
            }
        }
        final int segmentStart = bufferPosn;
        // Advance past each byte; stop right after the first LF so the next call resumes on the following byte.
        while (bufferPosn < bufferLength) {
            if (buffer[bufferPosn++] == LF) {
                sawNewline = true;
                break;
            }
        }
        final int segmentLength = bufferPosn - segmentStart;
        consumed += segmentLength;

        final int terminatorLength = sawNewline ? 1 : 0;
        int toStore = isNewLineIncluded() ? segmentLength : segmentLength - terminatorLength;
        // Never store more than the caller's remaining allowance.
        toStore = Math.min(toStore, maxLineLength - stored);
        if (toStore > 0) {
            str.append(buffer, segmentStart, toStore);
            stored += toStore;
        }

        if (sawNewline || consumed >= maxBytesToConsume) {
            break;
        }
    }

    if (consumed > Integer.MAX_VALUE) {
        throw new IOException("Too many bytes before newline: " + consumed);
    }
    return (int) consumed;
}

Source File: IndexedStorage.java From spork with Apache License 2.0

4 votes

/**
 * Read one line from the InputStream into the given Text.  A line
 * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
 * or '\r\n' (CR+LF).  EOF also terminates an otherwise unterminated
 * line.
 *
 * @param str the object to store the given line (without newline);
 *  it is cleared before anything is stored
 * @param maxLineLength the maximum number of bytes to store into str;
 *  the rest of the line is silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume
 *  in this call.  This is only a hint, because if the line crosses
 *  this threshold, we allow it to happen.  It can overshoot
 *  potentially by as much as one buffer length.
 *
 * @return the number of bytes read including the (longest) newline
 * found.
 *
 * @throws IOException if the underlying stream throws, or if more than
 *  Integer.MAX_VALUE bytes were consumed before a newline was found
 */
public int readLine(Text str, int maxLineLength,
        int maxBytesToConsume) throws IOException {
    /* We're reading data from in, but the head of the stream may be
     * already buffered in buffer, so we have several cases:
     * 1. No newline characters are in the buffer, so we need to copy
     *    everything and read another buffer from the stream.
     * 2. An unambiguously terminated line is in buffer, so we just
     *    copy to str.
     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
     *    in CR.  In this case we copy everything up to CR to str, but
     *    we also need to see what follows CR: if it's LF, then we
     *    need to consume LF as well, so next call to readLine will read
     *    from after that.
     * We use a flag prevCharCR to signal if previous character was CR
     * and, if it happens to be at the end of the buffer, delay
     * consuming it until we have a chance to look at the char that
     * follows.
     */
    str.clear();
    int txtLength = 0; //tracks str.getLength(), as an optimization
    int newlineLength = 0; //length of terminating newline
    boolean prevCharCR = false; //true if prev char was CR
    long bytesConsumed = 0;
    do {
        int startPosn = bufferPosn; //starting from where we left off the last time
        if (bufferPosn >= bufferLength) {
            // Buffer exhausted: refill it and restart scanning from index 0.
            startPosn = bufferPosn = 0;
            if (prevCharCR)
                ++bytesConsumed; //account for CR from previous read

            // Remember the stream position reported just before the refill.
            // NOTE(review): presumably the stream offset of buffer[0] -- confirm
            // against the other users of bufferOffset.
            bufferOffset = ((Seekable)in).getPos();
            bufferLength = in.read(buffer);

            if (bufferLength <= 0)
                break; // EOF
        }
        for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
            if (buffer[bufferPosn] == LF) {
                // LF ends the line; it is CR+LF when the previous byte was CR.
                newlineLength = (prevCharCR) ? 2 : 1;
                ++bufferPosn; // at next invocation proceed from following byte
                break;
            }
            if (prevCharCR) { //CR + notLF, we are at notLF
                // The lone CR was the terminator; the current byte is left
                // unconsumed for the next call.
                newlineLength = 1;
                break;
            }
            prevCharCR = (buffer[bufferPosn] == CR);
        }
        int readLength = bufferPosn - startPosn;
        if (prevCharCR && newlineLength == 0)
            --readLength; //CR at the end of the buffer
        bytesConsumed += readLength;
        // Store the scanned bytes minus the terminator, capped so that str
        // never exceeds maxLineLength.
        int appendLength = readLength - newlineLength;
        if (appendLength > maxLineLength - txtLength) {
            appendLength = maxLineLength - txtLength;
        }
        if (appendLength > 0) {
            str.append(buffer, startPosn, appendLength);
            txtLength += appendLength;
        }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    // The count is kept as a long but returned as an int, so reject values
    // that would not survive the narrowing cast.
    if (bytesConsumed > (long)Integer.MAX_VALUE)
        throw new IOException("Too many bytes before newline: " + bytesConsumed);
    return (int)bytesConsumed;
}

Source File: LineReader.java From incubator-tajo with Apache License 2.0

4 votes

/**
 * Read a line terminated by a custom delimiter.
 *
 * @param str the object to store the record (without the delimiter); it is
 *  cleared before anything is stored
 * @param maxLineLength the maximum number of buffer bytes to store into str;
 *  the rest of the record is silently discarded
 * @param maxBytesToConsume soft limit on the number of bytes consumed in
 *  this call; it is checked only after each buffer has been processed
 * @return the number of bytes consumed, including the delimiter if found
 * @throws IOException if the underlying stream throws, or if more than
 *  Integer.MAX_VALUE bytes were consumed before a delimiter was found
 */
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume)
    throws IOException {
 /* We're reading data from inputStream, but the head of the stream may be
  *  already captured in the previous buffer, so we have several cases:
  *
  * 1. The buffer tail does not contain any character sequence which
  *    matches with the head of delimiter. We count it as a
  *    ambiguous byte count = 0
  *
  * 2. The buffer tail contains a X number of characters,
  *    that forms a sequence, which matches with the
  *    head of delimiter. We count ambiguous byte count = X
  *
  *    // ***  eg: A segment of input file is as follows
  *
  *    " record 1792: I found this bug very interesting and
  *     I have completely read about it. record 1793: This bug
  *     can be solved easily record 1794: This ."
  *
  *    delimiter = "record";
  *
  *    supposing:- String at the end of buffer =
  *    "I found this bug very interesting and I have completely re"
  *    Therefore next buffer = "ad about it. record 179       ...."
  *
  *     The matching characters in the input
  *     buffer tail and delimiter head = "re"
  *     Therefore, ambiguous byte count = 2 ****   //
  *
  *     2.1 If the following bytes are the remaining characters of
  *         the delimiter, then we have to capture only up to the starting
  *         position of delimiter. That means, we need not include the
  *         ambiguous characters in str.
  *
  *     2.2 If the following bytes are not the remaining characters of
  *         the delimiter ( as mentioned in the example ),
  *         then we have to include the ambiguous characters in str.
  */
  str.clear();
  int txtLength = 0; // tracks str.getLength(), as an optimization
  long bytesConsumed = 0;
  int delPosn = 0;
  int ambiguousByteCount = 0; // To capture the ambiguous characters count
  do {
    int startPosn = bufferPosn; // Start from previous end position
    if (bufferPosn >= bufferLength) {
      startPosn = bufferPosn = 0;
      bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0);
      if (bufferLength <= 0) {
        if (ambiguousByteCount > 0) {
          // The stream ended in the middle of a possible delimiter, so the
          // held-back bytes are record data after all.  They were subtracted
          // from bytesConsumed when they were held back; add them back so a
          // record made only of such bytes is not reported as 0 bytes.
          str.append(recordDelimiterBytes, 0, ambiguousByteCount);
          bytesConsumed += ambiguousByteCount;
        }
        break; // EOF
      }
    }
    for (; bufferPosn < bufferLength; ++bufferPosn) {
      if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
        delPosn++;
        if (delPosn >= recordDelimiterBytes.length) {
          bufferPosn++;
          break;
        }
      } else if (delPosn != 0) {
        // Partial match failed.  Resume one byte after where that match
        // began (the loop's ++ supplies the "+ 1"), otherwise a delimiter
        // overlapping the failed prefix is missed, e.g. "aab" in "aaab".
        // If the match began in the previous buffer we can only restart at
        // the front of this one.
        bufferPosn -= delPosn;
        if (bufferPosn < -1) {
          bufferPosn = -1;
        }
        delPosn = 0;
      }
    }
    int readLength = bufferPosn - startPosn;
    bytesConsumed += readLength;
    int appendLength = readLength - delPosn;
    if (appendLength > maxLineLength - txtLength) {
      appendLength = maxLineLength - txtLength;
    }
    // The bytes held back from the previous buffer are consumed now, either
    // as part of the delimiter (2.1) or as record data (2.2).  If the match
    // is still undecided they are subtracted again below as part of delPosn.
    bytesConsumed += ambiguousByteCount;
    if (appendLength >= 0 && ambiguousByteCount > 0) {
      // appendLength is negative only while the carried-over match is still
      // intact; otherwise the held-back bytes were data (refer case 2.2),
      // even when no further data bytes from this buffer are appended.
      str.append(recordDelimiterBytes, 0, ambiguousByteCount);
      ambiguousByteCount = 0;
    }
    if (appendLength > 0) {
      str.append(buffer, startPosn, appendLength);
      txtLength += appendLength;
    }
    if (bufferPosn >= bufferLength) {
      if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
        ambiguousByteCount = delPosn;
        bytesConsumed -= ambiguousByteCount; //to be consumed in next
      }
    }
  } while (delPosn < recordDelimiterBytes.length
      && bytesConsumed < maxBytesToConsume);
  if (bytesConsumed > (long) Integer.MAX_VALUE) {
    throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
  }
  return (int) bytesConsumed;
}

Source File: LineReader.java From hadoop-gpu with Apache License 2.0

4 votes

/**
 * Read one line from the InputStream into the given Text.  A line
 * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
 * or '\r\n' (CR+LF).  EOF also terminates an otherwise unterminated
 * line.
 *
 * @param str the object to store the given line (without newline);
 *  it is cleared before anything is stored
 * @param maxLineLength the maximum number of bytes to store into str;
 *  the rest of the line is silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume
 *  in this call.  This is only a hint, because if the line crosses
 *  this threshold, we allow it to happen.  It can overshoot
 *  potentially by as much as one buffer length.
 *
 * @return the number of bytes read including the (longest) newline
 * found.
 *
 * @throws IOException if the underlying stream throws, or if more than
 *  Integer.MAX_VALUE bytes were consumed before a newline was found
 */
public int readLine(Text str, int maxLineLength,
                    int maxBytesToConsume) throws IOException {
  /* We're reading data from in, but the head of the stream may be
   * already buffered in buffer, so we have several cases:
   * 1. No newline characters are in the buffer, so we need to copy
   *    everything and read another buffer from the stream.
   * 2. An unambiguously terminated line is in buffer, so we just
   *    copy to str.
   * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
   *    in CR.  In this case we copy everything up to CR to str, but
   *    we also need to see what follows CR: if it's LF, then we
   *    need to consume LF as well, so next call to readLine will read
   *    from after that.
   * We use a flag prevCharCR to signal if previous character was CR
   * and, if it happens to be at the end of the buffer, delay
   * consuming it until we have a chance to look at the char that
   * follows.
   */
  str.clear();
  int txtLength = 0; //tracks str.getLength(), as an optimization
  int newlineLength = 0; //length of terminating newline
  boolean prevCharCR = false; //true if prev char was CR
  long bytesConsumed = 0;
  do {
    int startPosn = bufferPosn; //starting from where we left off the last time
    if (bufferPosn >= bufferLength) {
      // Buffer exhausted: refill it and restart scanning from index 0.
      startPosn = bufferPosn = 0;
      if (prevCharCR)
        ++bytesConsumed; //account for CR from previous read
      bufferLength = in.read(buffer);
      if (bufferLength <= 0)
        break; // EOF
    }
    for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
      if (buffer[bufferPosn] == LF) {
        // LF ends the line; it is CR+LF when the previous byte was CR.
        newlineLength = (prevCharCR) ? 2 : 1;
        ++bufferPosn; // at next invocation proceed from following byte
        break;
      }
      if (prevCharCR) { //CR + notLF, we are at notLF
        // The lone CR was the terminator; the current byte is left
        // unconsumed for the next call.
        newlineLength = 1;
        break;
      }
      prevCharCR = (buffer[bufferPosn] == CR);
    }
    int readLength = bufferPosn - startPosn;
    if (prevCharCR && newlineLength == 0)
      --readLength; //CR at the end of the buffer
    bytesConsumed += readLength;
    // Store the scanned bytes minus the terminator, capped so that str
    // never exceeds maxLineLength.
    int appendLength = readLength - newlineLength;
    if (appendLength > maxLineLength - txtLength) {
      appendLength = maxLineLength - txtLength;
    }
    if (appendLength > 0) {
      str.append(buffer, startPosn, appendLength);
      txtLength += appendLength;
    }
  } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

  // The count is kept as a long but returned as an int, so reject values
  // that would not survive the narrowing cast.
  if (bytesConsumed > (long)Integer.MAX_VALUE)
    throw new IOException("Too many bytes before newline: " + bytesConsumed);    
  return (int)bytesConsumed;
}

Source File: LineReader.java From RDFS with Apache License 2.0

4 votes

/**
 * Read one line from the InputStream into the given Text.  A line
 * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
 * or '\r\n' (CR+LF).  EOF also terminates an otherwise unterminated
 * line.
 *
 * @param str the object to store the given line (without newline);
 *  it is cleared before anything is stored
 * @param maxLineLength the maximum number of bytes to store into str;
 *  the rest of the line is silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume
 *  in this call.  This is only a hint, because if the line crosses
 *  this threshold, we allow it to happen.  It can overshoot
 *  potentially by as much as one buffer length.
 *
 * @return the number of bytes read including the (longest) newline
 * found.
 *
 * @throws IOException if the underlying stream throws, or if more than
 *  Integer.MAX_VALUE bytes were consumed before a newline was found
 */
public int readLine(Text str, int maxLineLength,
                    int maxBytesToConsume) throws IOException {
  /* We're reading data from in, but the head of the stream may be
   * already buffered in buffer, so we have several cases:
   * 1. No newline characters are in the buffer, so we need to copy
   *    everything and read another buffer from the stream.
   * 2. An unambiguously terminated line is in buffer, so we just
   *    copy to str.
   * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
   *    in CR.  In this case we copy everything up to CR to str, but
   *    we also need to see what follows CR: if it's LF, then we
   *    need to consume LF as well, so next call to readLine will read
   *    from after that.
   * We use a flag prevCharCR to signal if previous character was CR
   * and, if it happens to be at the end of the buffer, delay
   * consuming it until we have a chance to look at the char that
   * follows.
   */
  str.clear();
  int txtLength = 0; //tracks str.getLength(), as an optimization
  int newlineLength = 0; //length of terminating newline
  boolean prevCharCR = false; //true if prev char was CR
  long bytesConsumed = 0;
  do {
    int startPosn = bufferPosn; //starting from where we left off the last time
    if (bufferPosn >= bufferLength) {
      // Buffer exhausted: refill it and restart scanning from index 0.
      startPosn = bufferPosn = 0;
      if (prevCharCR)
        ++bytesConsumed; //account for CR from previous read
      bufferLength = in.read(buffer);
      if (bufferLength <= 0)
        break; // EOF
    }
    for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
      if (buffer[bufferPosn] == LF) {
        // LF ends the line; it is CR+LF when the previous byte was CR.
        newlineLength = (prevCharCR) ? 2 : 1;
        ++bufferPosn; // at next invocation proceed from following byte
        break;
      }
      if (prevCharCR) { //CR + notLF, we are at notLF
        // The lone CR was the terminator; the current byte is left
        // unconsumed for the next call.
        newlineLength = 1;
        break;
      }
      prevCharCR = (buffer[bufferPosn] == CR);
    }
    int readLength = bufferPosn - startPosn;
    if (prevCharCR && newlineLength == 0)
      --readLength; //CR at the end of the buffer
    bytesConsumed += readLength;
    // Store the scanned bytes minus the terminator, capped so that str
    // never exceeds maxLineLength.
    int appendLength = readLength - newlineLength;
    if (appendLength > maxLineLength - txtLength) {
      appendLength = maxLineLength - txtLength;
    }
    if (appendLength > 0) {
      str.append(buffer, startPosn, appendLength);
      txtLength += appendLength;
    }
  } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

  // The count is kept as a long but returned as an int, so reject values
  // that would not survive the narrowing cast.
  if (bytesConsumed > (long)Integer.MAX_VALUE)
    throw new IOException("Too many bytes before newline: " + bytesConsumed);    
  return (int)bytesConsumed;
}

Source File: LineReader.java From big-c with Apache License 2.0

4 votes

/**
 * Read a line terminated by a custom delimiter.
 *
 * @param str the object to store the record (without the delimiter); it is
 *  cleared before anything is stored
 * @param maxLineLength the maximum number of buffer bytes to store into str;
 *  the rest of the record is silently discarded
 * @param maxBytesToConsume soft limit on the number of bytes consumed in
 *  this call; it is checked only after each buffer has been processed
 * @return the number of bytes consumed, including the delimiter if found
 * @throws IOException if the underlying stream throws, or if more than
 *  Integer.MAX_VALUE bytes were consumed before a delimiter was found
 */
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume)
    throws IOException {
 /* We're reading data from inputStream, but the head of the stream may be
  *  already captured in the previous buffer, so we have several cases:
  *
  * 1. The buffer tail does not contain any character sequence which
  *    matches with the head of delimiter. We count it as a
  *    ambiguous byte count = 0
  *
  * 2. The buffer tail contains a X number of characters,
  *    that forms a sequence, which matches with the
  *    head of delimiter. We count ambiguous byte count = X
  *
  *    // ***  eg: A segment of input file is as follows
  *
  *    " record 1792: I found this bug very interesting and
  *     I have completely read about it. record 1793: This bug
  *     can be solved easily record 1794: This ."
  *
  *    delimiter = "record";
  *
  *    supposing:- String at the end of buffer =
  *    "I found this bug very interesting and I have completely re"
  *    Therefore next buffer = "ad about it. record 179       ...."
  *
  *     The matching characters in the input
  *     buffer tail and delimiter head = "re"
  *     Therefore, ambiguous byte count = 2 ****   //
  *
  *     2.1 If the following bytes are the remaining characters of
  *         the delimiter, then we have to capture only up to the starting
  *         position of delimiter. That means, we need not include the
  *         ambiguous characters in str.
  *
  *     2.2 If the following bytes are not the remaining characters of
  *         the delimiter ( as mentioned in the example ),
  *         then we have to include the ambiguous characters in str.
  */
  str.clear();
  int txtLength = 0; // tracks str.getLength(), as an optimization
  long bytesConsumed = 0;
  int delPosn = 0;
  int ambiguousByteCount = 0; // To capture the ambiguous characters count
  do {
    int startPosn = bufferPosn; // Start from previous end position
    if (bufferPosn >= bufferLength) {
      startPosn = bufferPosn = 0;
      bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0);
      if (bufferLength <= 0) {
        if (ambiguousByteCount > 0) {
          // The stream ended in the middle of a possible delimiter, so the
          // held-back bytes are record data after all.  They were subtracted
          // from bytesConsumed when they were held back; add them back so a
          // record made only of such bytes is not reported as 0 bytes.
          str.append(recordDelimiterBytes, 0, ambiguousByteCount);
          bytesConsumed += ambiguousByteCount;
        }
        break; // EOF
      }
    }
    for (; bufferPosn < bufferLength; ++bufferPosn) {
      if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
        delPosn++;
        if (delPosn >= recordDelimiterBytes.length) {
          bufferPosn++;
          break;
        }
      } else if (delPosn != 0) {
        // Partial match failed.  Resume one byte after where that match
        // began (the loop's ++ supplies the "+ 1"), otherwise a delimiter
        // overlapping the failed prefix is missed, e.g. "aab" in "aaab".
        // If the match began in the previous buffer we can only restart at
        // the front of this one.
        bufferPosn -= delPosn;
        if (bufferPosn < -1) {
          bufferPosn = -1;
        }
        delPosn = 0;
      }
    }
    int readLength = bufferPosn - startPosn;
    bytesConsumed += readLength;
    int appendLength = readLength - delPosn;
    if (appendLength > maxLineLength - txtLength) {
      appendLength = maxLineLength - txtLength;
    }
    // The bytes held back from the previous buffer are consumed now, either
    // as part of the delimiter (2.1) or as record data (2.2).  If the match
    // is still undecided they are subtracted again below as part of delPosn.
    bytesConsumed += ambiguousByteCount;
    if (appendLength >= 0 && ambiguousByteCount > 0) {
      // appendLength is negative only while the carried-over match is still
      // intact; otherwise the held-back bytes were data (refer case 2.2),
      // even when no further data bytes from this buffer are appended.
      str.append(recordDelimiterBytes, 0, ambiguousByteCount);
      ambiguousByteCount = 0;
    }
    if (appendLength > 0) {
      str.append(buffer, startPosn, appendLength);
      txtLength += appendLength;
    }
    if (bufferPosn >= bufferLength) {
      if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
        ambiguousByteCount = delPosn;
        bytesConsumed -= ambiguousByteCount; //to be consumed in next
      }
    }
  } while (delPosn < recordDelimiterBytes.length
      && bytesConsumed < maxBytesToConsume);
  if (bytesConsumed > Integer.MAX_VALUE) {
    throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
  }
  return (int) bytesConsumed;
}

Source File: LineReader.java From big-c with Apache License 2.0

4 votes

/**
 * Read a line terminated by one of CR, LF, or CRLF.  EOF also ends a line
 * that has no terminator.
 *
 * @param str the object to store the line (without the terminator); it is
 *  cleared before anything is stored
 * @param maxLineLength the maximum number of bytes to store into str; the
 *  rest of the line is consumed but not stored
 * @param maxBytesToConsume soft limit on the number of bytes consumed in
 *  this call; it is checked only after each buffer has been processed
 * @return the number of bytes consumed, including the terminator if found
 * @throws IOException if the underlying stream throws, or if more than
 *  Integer.MAX_VALUE bytes were consumed before a newline was found
 */
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume)
throws IOException {
  /* We're reading data from in, but the head of the stream may be
   * already buffered in buffer, so we have several cases:
   * 1. No newline characters are in the buffer, so we need to copy
   *    everything and read another buffer from the stream.
   * 2. An unambiguously terminated line is in buffer, so we just
   *    copy to str.
   * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
   *    in CR.  In this case we copy everything up to CR to str, but
   *    we also need to see what follows CR: if it's LF, then we
   *    need to consume LF as well, so next call to readLine will read
   *    from after that.
   * We use a flag prevCharCR to signal if previous character was CR
   * and, if it happens to be at the end of the buffer, delay
   * consuming it until we have a chance to look at the char that
   * follows.
   */
  str.clear();
  int txtLength = 0; //tracks str.getLength(), as an optimization
  int newlineLength = 0; //length of terminating newline
  boolean prevCharCR = false; //true if prev char was CR
  long bytesConsumed = 0;
  do {
    int startPosn = bufferPosn; //starting from where we left off the last time
    if (bufferPosn >= bufferLength) {
      // Buffer exhausted: refill it and restart scanning from index 0.
      startPosn = bufferPosn = 0;
      if (prevCharCR) {
        ++bytesConsumed; //account for CR from previous read
      }
      // The third argument tells fillBuffer that the previous buffer ended
      // in a CR whose terminator status is still undecided.
      bufferLength = fillBuffer(in, buffer, prevCharCR);
      if (bufferLength <= 0) {
        break; // EOF
      }
    }
    for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
      if (buffer[bufferPosn] == LF) {
        // LF ends the line; it is CR+LF when the previous byte was CR.
        newlineLength = (prevCharCR) ? 2 : 1;
        ++bufferPosn; // at next invocation proceed from following byte
        break;
      }
      if (prevCharCR) { //CR + notLF, we are at notLF
        // The lone CR was the terminator; the current byte is left
        // unconsumed for the next call.
        newlineLength = 1;
        break;
      }
      prevCharCR = (buffer[bufferPosn] == CR);
    }
    int readLength = bufferPosn - startPosn;
    if (prevCharCR && newlineLength == 0) {
      --readLength; //CR at the end of the buffer
    }
    bytesConsumed += readLength;
    // Store the scanned bytes minus the terminator, capped so that str
    // never exceeds maxLineLength.
    int appendLength = readLength - newlineLength;
    if (appendLength > maxLineLength - txtLength) {
      appendLength = maxLineLength - txtLength;
    }
    if (appendLength > 0) {
      str.append(buffer, startPosn, appendLength);
      txtLength += appendLength;
    }
  } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

  // The count is kept as a long but returned as an int, so reject values
  // that would not survive the narrowing cast.
  if (bytesConsumed > Integer.MAX_VALUE) {
    throw new IOException("Too many bytes before newline: " + bytesConsumed);
  }
  return (int)bytesConsumed;
}

Source File: LineReader.java From hadoop with Apache License 2.0

4 votes

/**
 * Read a line terminated by a custom delimiter.
 *
 * @param str the object to store the record (without the delimiter); it is
 *  cleared before anything is stored
 * @param maxLineLength the maximum number of buffer bytes to store into str;
 *  the rest of the record is silently discarded
 * @param maxBytesToConsume soft limit on the number of bytes consumed in
 *  this call; it is checked only after each buffer has been processed
 * @return the number of bytes consumed, including the delimiter if found
 * @throws IOException if the underlying stream throws, or if more than
 *  Integer.MAX_VALUE bytes were consumed before a delimiter was found
 */
private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume)
    throws IOException {
 /* We're reading data from inputStream, but the head of the stream may be
  *  already captured in the previous buffer, so we have several cases:
  *
  * 1. The buffer tail does not contain any character sequence which
  *    matches with the head of delimiter. We count it as a
  *    ambiguous byte count = 0
  *
  * 2. The buffer tail contains a X number of characters,
  *    that forms a sequence, which matches with the
  *    head of delimiter. We count ambiguous byte count = X
  *
  *    // ***  eg: A segment of input file is as follows
  *
  *    " record 1792: I found this bug very interesting and
  *     I have completely read about it. record 1793: This bug
  *     can be solved easily record 1794: This ."
  *
  *    delimiter = "record";
  *
  *    supposing:- String at the end of buffer =
  *    "I found this bug very interesting and I have completely re"
  *    Therefore next buffer = "ad about it. record 179       ...."
  *
  *     The matching characters in the input
  *     buffer tail and delimiter head = "re"
  *     Therefore, ambiguous byte count = 2 ****   //
  *
  *     2.1 If the following bytes are the remaining characters of
  *         the delimiter, then we have to capture only up to the starting
  *         position of delimiter. That means, we need not include the
  *         ambiguous characters in str.
  *
  *     2.2 If the following bytes are not the remaining characters of
  *         the delimiter ( as mentioned in the example ),
  *         then we have to include the ambiguous characters in str.
  */
  str.clear();
  int txtLength = 0; // tracks str.getLength(), as an optimization
  long bytesConsumed = 0;
  int delPosn = 0;
  int ambiguousByteCount = 0; // To capture the ambiguous characters count
  do {
    int startPosn = bufferPosn; // Start from previous end position
    if (bufferPosn >= bufferLength) {
      startPosn = bufferPosn = 0;
      bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0);
      if (bufferLength <= 0) {
        if (ambiguousByteCount > 0) {
          // The stream ended in the middle of a possible delimiter, so the
          // held-back bytes are record data after all.  They were subtracted
          // from bytesConsumed when they were held back; add them back so a
          // record made only of such bytes is not reported as 0 bytes.
          str.append(recordDelimiterBytes, 0, ambiguousByteCount);
          bytesConsumed += ambiguousByteCount;
        }
        break; // EOF
      }
    }
    for (; bufferPosn < bufferLength; ++bufferPosn) {
      if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
        delPosn++;
        if (delPosn >= recordDelimiterBytes.length) {
          bufferPosn++;
          break;
        }
      } else if (delPosn != 0) {
        // Partial match failed.  Resume one byte after where that match
        // began (the loop's ++ supplies the "+ 1"), otherwise a delimiter
        // overlapping the failed prefix is missed, e.g. "aab" in "aaab".
        // If the match began in the previous buffer we can only restart at
        // the front of this one.
        bufferPosn -= delPosn;
        if (bufferPosn < -1) {
          bufferPosn = -1;
        }
        delPosn = 0;
      }
    }
    int readLength = bufferPosn - startPosn;
    bytesConsumed += readLength;
    int appendLength = readLength - delPosn;
    if (appendLength > maxLineLength - txtLength) {
      appendLength = maxLineLength - txtLength;
    }
    // The bytes held back from the previous buffer are consumed now, either
    // as part of the delimiter (2.1) or as record data (2.2).  If the match
    // is still undecided they are subtracted again below as part of delPosn.
    bytesConsumed += ambiguousByteCount;
    if (appendLength >= 0 && ambiguousByteCount > 0) {
      // appendLength is negative only while the carried-over match is still
      // intact; otherwise the held-back bytes were data (refer case 2.2),
      // even when no further data bytes from this buffer are appended.
      str.append(recordDelimiterBytes, 0, ambiguousByteCount);
      ambiguousByteCount = 0;
    }
    if (appendLength > 0) {
      str.append(buffer, startPosn, appendLength);
      txtLength += appendLength;
    }
    if (bufferPosn >= bufferLength) {
      if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
        ambiguousByteCount = delPosn;
        bytesConsumed -= ambiguousByteCount; //to be consumed in next
      }
    }
  } while (delPosn < recordDelimiterBytes.length
      && bytesConsumed < maxBytesToConsume);
  if (bytesConsumed > Integer.MAX_VALUE) {
    throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
  }
  return (int) bytesConsumed;
}

Source File: LineReader.java From hadoop with Apache License 2.0

4 votes

/**
 * Read a line terminated by one of CR, LF, or CRLF.
 *
 * <p>{@code str} is cleared first and then filled with the bytes of the line,
 * without the terminator. Scanning resumes at {@code bufferPosn} inside the
 * shared {@code buffer} and refills it through {@code fillBuffer} whenever it
 * has been fully consumed.
 *
 * @param str receives the line content; cleared on entry
 * @param maxLineLength maximum number of bytes appended to {@code str};
 *        bytes beyond this limit are still consumed but are not stored
 * @param maxBytesToConsume scanning stops once at least this many bytes have
 *        been consumed; the limit is only checked after each pass over the
 *        buffer, so the returned count may exceed it
 * @return the number of bytes consumed, including the terminator bytes
 * @throws IOException if more than {@code Integer.MAX_VALUE} bytes were
 *         consumed before a newline was found (other causes depend on
 *         {@code fillBuffer}, which is defined elsewhere)
 */
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume)
throws IOException {
  /* We're reading data from in, but the head of the stream may be
   * already buffered in buffer, so we have several cases:
   * 1. No newline characters are in the buffer, so we need to copy
   *    everything and read another buffer from the stream.
   * 2. An unambiguously terminated line is in buffer, so we just
   *    copy to str.
   * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
   *    in CR.  In this case we copy everything up to CR to str, but
   *    we also need to see what follows CR: if it's LF, then we
   *    need to consume LF as well, so the next call to readLine will
   *    read from after that.
   * We use a flag prevCharCR to signal if the previous character was CR
   * and, if it happens to be at the end of the buffer, delay
   * consuming it until we have a chance to look at the char that
   * follows.
   */
  str.clear();
  int txtLength = 0; //tracks str.getLength(), as an optimization
  int newlineLength = 0; //length of terminating newline; 0 means none found yet
  boolean prevCharCR = false; //true if prev char was CR
  long bytesConsumed = 0; //long so the overflow check after the loop is possible
  do {
    int startPosn = bufferPosn; //starting from where we left off the last time
    if (bufferPosn >= bufferLength) {
      // Buffer exhausted: restart at its beginning and refill it.
      startPosn = bufferPosn = 0;
      if (prevCharCR) {
        ++bytesConsumed; //account for CR from previous read
      }
      bufferLength = fillBuffer(in, buffer, prevCharCR);
      if (bufferLength <= 0) {
        // A non-positive fill is treated as end of stream.
        break; // EOF
      }
    }
    for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
      if (buffer[bufferPosn] == LF) {
        // LF ends the line; together with a preceding CR it is a 2-byte CRLF.
        newlineLength = (prevCharCR) ? 2 : 1;
        ++bufferPosn; // at next invocation proceed from following byte
        break;
      }
      if (prevCharCR) { //CR + notLF, we are at notLF
        // The lone CR was the terminator. bufferPosn is deliberately not
        // advanced, so the current byte starts the next line.
        newlineLength = 1;
        break;
      }
      prevCharCR = (buffer[bufferPosn] == CR);
    }
    int readLength = bufferPosn - startPosn;
    if (prevCharCR && newlineLength == 0) {
      // Do not count the trailing CR yet; it is added to bytesConsumed on
      // the next refill, once the following byte can be inspected.
      --readLength; //CR at the end of the buffer
    }
    bytesConsumed += readLength;
    // Exclude the terminator bytes from what is copied into str.
    int appendLength = readLength - newlineLength;
    if (appendLength > maxLineLength - txtLength) {
      // Truncate so that str never grows beyond maxLineLength bytes.
      appendLength = maxLineLength - txtLength;
    }
    if (appendLength > 0) {
      str.append(buffer, startPosn, appendLength);
      txtLength += appendLength;
    }
  } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

  // The count is returned as an int, so refuse values that would not fit.
  if (bytesConsumed > Integer.MAX_VALUE) {
    throw new IOException("Too many bytes before newline: " + bytesConsumed);
  }
  return (int)bytesConsumed;
}

Source File: KeyValueReader.java From localization_nifi with Apache License 2.0

4 votes

/**
 * Reads every key of the given SequenceFile and creates one flow file per key.
 *
 * <p>Each flow file gets a {@code filename} attribute derived from the key when
 * the key matches {@code LOOKS_LIKE_FILENAME}, or a generated name based on the
 * input file name otherwise, and is then written through a
 * {@code KeyValueWriterCallback} bound to the reader. A flow file whose write
 * fails with a {@code ProcessException} is logged and removed from the session.
 *
 * @param file the SequenceFile to read
 * @param configuration the configuration handed to the SequenceFile reader
 * @param fileSystem the file system used to qualify {@code file}
 * @return the flow files that were written successfully
 * @throws IOException if opening or iterating the SequenceFile fails
 */
@Override
public Set<FlowFile> readSequenceFile(Path file, Configuration configuration, FileSystem fileSystem) throws IOException {
    final Set<FlowFile> results = new HashSet<>();
    final SequenceFile.Reader reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file)));
    final Text key = new Text();
    final KeyValueWriterCallback callback = new KeyValueWriterCallback(reader);
    // Prefix for generated names, used when a key does not look like a file name.
    final String generatedPrefix = file.getName() + "." + System.nanoTime() + ".";
    int generatedCount = 0;
    LOG.debug("Read from SequenceFile: {} ", new Object[]{file});
    try {
        while (reader.next(key)) {
            final String keyText = key.toString();
            final String fileName;
            if (!LOOKS_LIKE_FILENAME.matcher(keyText).matches()) {
                // Not a file name: fall back to "<input>.<nanos>.<n>".
                generatedCount++;
                fileName = generatedPrefix + generatedCount;
            } else {
                // Keep only the last path segment and make it unique with a timestamp.
                final String baseName = keyText.contains(File.separator)
                        ? StringUtils.substringAfterLast(keyText, File.separator)
                        : keyText;
                fileName = baseName + "." + System.nanoTime();
            }

            FlowFile flowFile = session.putAttribute(session.create(), CoreAttributes.FILENAME.key(), fileName);
            callback.key = key;
            try {
                flowFile = session.write(flowFile, callback);
                results.add(flowFile);
            } catch (ProcessException e) {
                LOG.error("Could not write to flowfile {}", new Object[]{flowFile}, e);
                session.remove(flowFile);
            }
            // Reset the reusable key before the next record is read into it.
            key.clear();
        }
    } finally {
        IOUtils.closeQuietly(reader);
    }

    return results;
}

Source File: KeyValueReader.java From nifi with Apache License 2.0

4 votes

/**
 * Reads every key of the given SequenceFile and creates one flow file per key.
 *
 * <p>Each flow file gets a {@code filename} attribute derived from the key when
 * the key matches {@code LOOKS_LIKE_FILENAME}, or a generated name based on the
 * input file name otherwise, and is then written through a
 * {@code KeyValueWriterCallback} bound to the reader. A flow file whose write
 * fails with a {@code ProcessException} is logged and removed from the session.
 *
 * @param file the SequenceFile to read
 * @param configuration the configuration handed to the SequenceFile reader
 * @param fileSystem the file system used to qualify {@code file}
 * @return the flow files that were written successfully
 * @throws IOException if opening or iterating the SequenceFile fails
 */
@Override
public Set<FlowFile> readSequenceFile(Path file, Configuration configuration, FileSystem fileSystem) throws IOException {
    final Set<FlowFile> results = new HashSet<>();
    final SequenceFile.Reader reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file)));
    final Text key = new Text();
    final KeyValueWriterCallback callback = new KeyValueWriterCallback(reader);
    // Prefix for generated names, used when a key does not look like a file name.
    final String generatedPrefix = file.getName() + "." + System.nanoTime() + ".";
    int generatedCount = 0;
    LOG.debug("Read from SequenceFile: {} ", new Object[]{file});
    try {
        while (reader.next(key)) {
            final String keyText = key.toString();
            final String fileName;
            if (!LOOKS_LIKE_FILENAME.matcher(keyText).matches()) {
                // Not a file name: fall back to "<input>.<nanos>.<n>".
                generatedCount++;
                fileName = generatedPrefix + generatedCount;
            } else {
                // Keep only the last path segment and make it unique with a timestamp.
                final String baseName = keyText.contains(File.separator)
                        ? StringUtils.substringAfterLast(keyText, File.separator)
                        : keyText;
                fileName = baseName + "." + System.nanoTime();
            }

            FlowFile flowFile = session.putAttribute(session.create(), CoreAttributes.FILENAME.key(), fileName);
            callback.key = key;
            try {
                flowFile = session.write(flowFile, callback);
                results.add(flowFile);
            } catch (ProcessException e) {
                LOG.error("Could not write to flowfile {}", new Object[]{flowFile}, e);
                session.remove(flowFile);
            }
            // Reset the reusable key before the next record is read into it.
            key.clear();
        }
    } finally {
        IOUtils.closeQuietly(reader);
    }

    return results;
}

Source File: StreamKeyValUtil.java From big-c with Apache License 2.0

2 votes

/**
 * Reads the next line from a line reader into {@code out}, discarding
 * whatever {@code out} held before.
 * @param lineReader LineReader to read the line from
 * @param out Text that receives the line; it is cleared first
 * @return number of bytes read, as reported by {@code lineReader.readLine(out)}
 * @throws IOException if the underlying read fails
 */
public static int readLine(LineReader lineReader, Text out) 
throws IOException {
  out.clear();
  final int bytesRead = lineReader.readLine(out);
  return bytesRead;
}

Source File: StreamKeyValUtil.java From hadoop with Apache License 2.0

2 votes

/**
 * Reads the next line from a line reader into {@code out}, discarding
 * whatever {@code out} held before.
 * @param lineReader LineReader to read the line from
 * @param out Text that receives the line; it is cleared first
 * @return number of bytes read, as reported by {@code lineReader.readLine(out)}
 * @throws IOException if the underlying read fails
 */
public static int readLine(LineReader lineReader, Text out) 
throws IOException {
  out.clear();
  final int bytesRead = lineReader.readLine(out);
  return bytesRead;
}

Source File: StreamKeyValUtil.java From hadoop-gpu with Apache License 2.0

2 votes

/**
 * Reads the next line from a line reader into {@code out}, discarding
 * whatever {@code out} held before.
 * @param lineReader LineReader to read the line from
 * @param out Text that receives the line; it is cleared first
 * @return number of bytes read, as reported by {@code lineReader.readLine(out)}
 * @throws IOException if the underlying read fails
 */
public static int readLine(LineReader lineReader, Text out) 
throws IOException {
  out.clear();
  final int bytesRead = lineReader.readLine(out);
  return bytesRead;
}

Source File: DynamicByteArray.java From hive-dwrf with Apache License 2.0

2 votes

/**
 * Replaces the content of a text value with a slice of this array's bytes.
 * @param result the Text to overwrite; it is cleared before being set
 * @param offset index of the first byte to copy
 * @param length how many bytes to copy
 */
public void setText(Text result, int offset, int length) {
  result.clear();
  final byte[] source = data.getBytes();
  result.set(source, offset, length);
}

Java Code Examples for org.apache.hadoop.io.Text#clear()