org.apache.nutch.protocol.http.api.HttpException Java Examples

Source File: HttpResponse.java From anthelion with Apache License 2.0

6 votes

/**
 * Reads the status line through {@code readLine} and extracts the numeric
 * status code from it.
 *
 * <p>The code is the token that follows the first space and runs up to the
 * next space, or to the end of the line when no reason phrase is present
 * ("HTTP/1.1 200" as well as "HTTP/1.1 200 OK").
 *
 * @param in   stream the status line is read from
 * @param line buffer handed to {@code readLine}; holds the status line afterwards
 * @return the parsed status code
 * @throws IOException   declared for {@code readLine}
 * @throws HttpException if the code token is not a valid integer
 */
private int parseStatusLine(PushbackInputStream in, StringBuffer line)
  throws IOException, HttpException {
  readLine(in, line, false);

  final int tokenStart = line.indexOf(" ") + 1;
  int tokenEnd = line.indexOf(" ", tokenStart);
  if (tokenEnd < 0) {
    // no reason phrase: the code extends to the end of the line
    tokenEnd = line.length();
  }

  final String token = line.substring(tokenStart, tokenEnd);
  try {
    return Integer.parseInt(token);
  } catch (NumberFormatException nfe) {
    throw new HttpException("bad status line '" + line 
                            + "': " + nfe.getMessage(), nfe);
  }
}

Source File: HttpResponse.java From anthelion with Apache License 2.0

6 votes

/**
 * Splits one header line at its first colon and stores the pair in
 * {@code headers}.
 *
 * <p>A line without a colon is ignored when it consists only of whitespace
 * (or is empty) and rejected otherwise. Spaces and tabs directly after the
 * colon are skipped; the key is stored exactly as it appears before the colon.
 *
 * @param line the header line to interpret
 * @throws IOException   declared by the signature; not raised by this body
 * @throws HttpException if a non-blank line contains no colon
 */
private void processHeaderLine(StringBuffer line)
  throws IOException, HttpException {

  final int colon = line.indexOf(":");
  final int end = line.length();

  if (colon == -1) {
    // tolerate blank lines, complain about anything else
    for (int k = 0; k < end; k++) {
      if (!Character.isWhitespace(line.charAt(k))) {
        throw new HttpException("No colon in header:" + line);
      }
    }
    return;
  }

  int start = colon + 1;
  while (start < end
         && (line.charAt(start) == ' ' || line.charAt(start) == '\t')) {
    start++;
  }

  headers.set(line.substring(0, colon), line.substring(start));
}

Source File: HttpResponse.java From nutch-selenium with Apache License 2.0

6 votes

/**
 * Pulls the status line in via {@code readLine} and returns its status code.
 *
 * <p>The code is taken from just after the first space up to the following
 * space; when there is no following space (no reason phrase, e.g.
 * "HTTP/1.1 200") it runs to the end of the line.
 *
 * @param in   stream supplying the status line
 * @param line buffer passed to {@code readLine}
 * @return the integer status code
 * @throws IOException   declared for {@code readLine}
 * @throws HttpException when the extracted token cannot be parsed as an integer
 */
private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
    readLine(in, line, false);

    final int begin = line.indexOf(" ") + 1;
    final int nextSpace = line.indexOf(" ", begin);
    final int finish = (nextSpace == -1) ? line.length() : nextSpace;

    final String codeText = line.substring(begin, finish);
    try {
        return Integer.parseInt(codeText);
    } catch (NumberFormatException ex) {
        throw new HttpException("bad status line '" + line + "': " + ex.getMessage(), ex);
    }
}

Source File: HttpResponse.java From nutch-selenium with Apache License 2.0

6 votes

/**
 * Interprets a single header line as {@code key: value} and records it in
 * {@code headers}.
 *
 * <p>Lines lacking a colon are skipped if they hold nothing but whitespace and
 * cause an {@link HttpException} otherwise. Only spaces and tabs following the
 * colon are stripped from the value.
 *
 * @param line header line to process
 * @throws IOException   declared by the signature; not raised by this body
 * @throws HttpException if the line has content but no colon
 */
private void processHeaderLine(StringBuffer line) throws IOException, HttpException {

        final int separator = line.indexOf(":");
        if (separator == -1) {
            int firstNonBlank = 0;
            while (firstNonBlank < line.length() && Character.isWhitespace(line.charAt(firstNonBlank))) {
                firstNonBlank++;
            }
            if (firstNonBlank != line.length()) {
                throw new HttpException("No colon in header:" + line);
            }
            return; // blank line, nothing to store
        }

        final String name = line.substring(0, separator);

        // advance past spaces/tabs that pad the value
        int cursor = separator + 1;
        for (; cursor < line.length(); cursor++) {
            final char ch = line.charAt(cursor);
            if (ch != ' ' && ch != '\t') {
                break;
            }
        }

        headers.set(name, line.substring(cursor));
    }

Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Reads the status line with {@code readLine} and returns the status code
 * found after the first space.
 *
 * <p>Both "HTTP/1.1 200 OK" and the phrase-less "HTTP/1.1 200" are accepted:
 * the code ends at the second space if there is one, otherwise at the end of
 * the line.
 *
 * @param in   stream the line is read from
 * @param line buffer filled by {@code readLine}
 * @return the status code as an int
 * @throws IOException   declared for {@code readLine}
 * @throws HttpException if the code portion is not an integer
 */
private int parseStatusLine(PushbackInputStream in, StringBuffer line)
  throws IOException, HttpException {
  readLine(in, line, false);

  final int afterFirstSpace = line.indexOf(" ") + 1;
  int limit = line.indexOf(" ", afterFirstSpace);
  if (limit == -1) {
    limit = line.length();
  }

  final String digits = line.substring(afterFirstSpace, limit);
  final int status;
  try {
    status = Integer.parseInt(digits);
  } catch (NumberFormatException cause) {
    throw new HttpException("bad status line '" + line 
                            + "': " + cause.getMessage(), cause);
  }
  return status;
}

Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Stores one header line in {@code headers}, using the text before the first
 * colon as key and the text after it (minus leading spaces/tabs) as value.
 *
 * <p>If the line has no colon it is silently dropped when it is empty or
 * whitespace-only; any other colon-less line is an error.
 *
 * @param line header line to process
 * @throws IOException   declared by the signature; not raised by this body
 * @throws HttpException for a non-blank line without a colon
 */
private void processHeaderLine(StringBuffer line)
  throws IOException, HttpException {

  final int length = line.length();
  final int colonAt = line.indexOf(":");

  if (colonAt == -1) {
    boolean blank = true;
    for (int k = 0; k < length && blank; k++) {
      blank = Character.isWhitespace(line.charAt(k));
    }
    if (blank) {
      return;
    }
    throw new HttpException("No colon in header:" + line);
  }

  final String headerName = line.substring(0, colonAt);

  int valueFrom = colonAt + 1;
  while (valueFrom < length) {
    final char ch = line.charAt(valueFrom);
    if (ch == ' ' || ch == '\t') {
      valueFrom++;
    } else {
      break;
    }
  }

  headers.set(headerName, line.substring(valueFrom));
}

Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Reads a non-chunked body from {@code in} into {@code content}.
 *
 * <p>The number of bytes read is bounded by the Content-Length header (when
 * present and non-empty) and by {@code http.getMaxContent()} (when that is
 * non-negative). Each read is capped at the bytes still allowed, so a
 * truncated body holds exactly the limit instead of losing the last partial
 * buffer, and nothing beyond the limit is consumed from the stream.
 *
 * @param in stream positioned at the start of the body
 * @throws HttpException if the Content-Length header is not a valid integer
 * @throws IOException   if reading from {@code in} fails
 */
private void readPlainContent(InputStream in) throws HttpException, IOException {

        int contentLength = Integer.MAX_VALUE; // no header: read until EOF or limit
        String contentLengthString = headers.get(Response.CONTENT_LENGTH);
        if (contentLengthString != null) {
            contentLengthString = contentLengthString.trim();
            try {
                if (!contentLengthString.isEmpty())
                    contentLength = Integer.parseInt(contentLengthString);
            } catch (NumberFormatException e) {
                // keep the cause so the offending parse error is not lost
                throw new HttpException("bad content length: " + contentLengthString, e);
            }
        }
        final int maxContent = http.getMaxContent();
        if (maxContent >= 0 && contentLength > maxContent) // limit download size
            contentLength = maxContent;

        ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
        byte[] bytes = new byte[Http.BUFFER_SIZE];
        int length = 0; // bytes stored so far
        while (length < contentLength) {
            // never ask for more than is still allowed
            int toRead = Math.min(bytes.length, contentLength - length);
            int i = in.read(bytes, 0, toRead);
            if (i == -1)
                break;
            out.write(bytes, 0, i);
            length += i;
        }
        content = out.toByteArray();
    }

Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Parses a header line of the form {@code key:value} and puts it into
 * {@code headers}.
 *
 * <p>Blank (empty or whitespace-only) lines without a colon are ignored;
 * other colon-less lines raise an {@link HttpException}. Spaces and tabs
 * right after the colon are not part of the value.
 *
 * @param line header line to process
 * @throws IOException   declared by the signature; not raised by this body
 * @throws HttpException if the line is non-blank and has no colon
 */
private void processHeaderLine(StringBuffer line) throws IOException, HttpException {

        final int split = line.indexOf(":");
        final int size = line.length();

        if (split == -1) {
            for (int idx = 0; idx < size; idx++) {
                if (!Character.isWhitespace(line.charAt(idx))) {
                    // real content but no separator
                    throw new HttpException("No colon in header:" + line);
                }
            }
            return;
        }

        int valuePos = split + 1;
        while (valuePos < size && isSpaceOrTab(line.charAt(valuePos))) {
            valuePos++;
        }

        final String key = line.substring(0, split);
        final String value = line.substring(valuePos);
        headers.set(key, value);
    }

/** Tells whether {@code ch} is a space or a horizontal tab. */
private static boolean isSpaceOrTab(char ch) {
        return ch == ' ' || ch == '\t';
    }

Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Obtains the status line through {@code readLine} and converts the token
 * between its first and second space into the status code.
 *
 * <p>If the line has no second space (no reason phrase), the token extends to
 * the end of the line.
 *
 * @param in   stream providing the status line
 * @param line buffer handed to {@code readLine}
 * @return the numeric status code
 * @throws IOException   declared for {@code readLine}
 * @throws HttpException if the token is not a parsable integer
 */
private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
    readLine(in, line, false);

    final int start = line.indexOf(" ") + 1;
    int stop = line.indexOf(" ", start);
    if (stop < 0) {
        stop = line.length(); // "HTTP/1.1 200" style line
    }

    final String candidate = line.substring(start, stop);
    try {
        return Integer.parseInt(candidate);
    } catch (NumberFormatException problem) {
        throw new HttpException("bad status line '" + line + "': " + problem.getMessage(), problem);
    }
}

Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Feeds header lines to {@code processHeaderLine} for as long as
 * {@code readLine} returns a non-zero value.
 *
 * <p>Some responses omit the blank line after the headers. When a line
 * contains {@code <!DOCTYPE}, {@code <HTML} or {@code <html}, everything from
 * that marker on is pushed back onto {@code in}, the remainder is processed
 * as a last header (failures are only logged), and parsing stops.
 *
 * @param in   stream the header lines are read from
 * @param line buffer handed to {@code readLine}
 * @throws IOException   if reading or pushing back bytes fails
 * @throws HttpException if a regular header line is malformed
 */
private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {

        while (readLine(in, line, true) != 0) {

            // look for the start of an HTML body on this line
            int bodyStart = line.indexOf("<!DOCTYPE");
            if (bodyStart == -1) {
                bodyStart = line.indexOf("<HTML");
            }
            if (bodyStart == -1) {
                bodyStart = line.indexOf("<html");
            }

            if (bodyStart == -1) {
                processHeaderLine(line);
                continue;
            }

            // hand the body part back to the stream, keep the header part
            in.unread(line.substring(bodyStart).getBytes("UTF-8"));
            line.setLength(bodyStart);

            try {
                processHeaderLine(line);
            } catch (Exception e) {
                // best effort: a broken last header must not fail the response
                Http.LOG.warn("Error: ", e);
            }
            return;
        }
    }

Source File: HttpResponse.java From anthelion with Apache License 2.0

5 votes

/**
 * Reads header lines with {@code readLine} until it returns 0 and passes each
 * one to {@code processHeaderLine}.
 *
 * <p>Handles responses that lack the blank line after the headers: if a line
 * contains {@code <!DOCTYPE}, {@code <HTML} or {@code <html}, the text from
 * the marker onward is unread back into {@code in}, the text before it is
 * processed as a final header with errors merely logged, and the method
 * returns.
 *
 * @param in   stream the header lines are read from
 * @param line buffer handed to {@code readLine}
 * @throws IOException   if reading or pushing back bytes fails
 * @throws HttpException if a regular header line is malformed
 */
private void parseHeaders(PushbackInputStream in, StringBuffer line)
  throws IOException, HttpException {

  while (readLine(in, line, true) != 0) {

    int markup = line.indexOf("<!DOCTYPE");
    if (markup == -1) {
      markup = line.indexOf("<HTML");
    }
    if (markup == -1) {
      markup = line.indexOf("<html");
    }

    if (markup != -1) {
      // body started on the header line: give it back to the stream
      in.unread(line.substring(markup).getBytes("UTF-8"));
      line.setLength(markup);

      try {
        processHeaderLine(line);
      } catch (Exception e) {
        // deliberately tolerant for this trailing fragment
        Http.LOG.warn("Error: ", e);
      }
      return;
    }

    processHeaderLine(line);
  }
}

Source File: HttpResponse.java From nutch-selenium with Apache License 2.0

5 votes

/**
 * Processes header lines one by one until {@code readLine} returns 0.
 *
 * <p>If a line carries the beginning of an HTML document ({@code <!DOCTYPE},
 * {@code <HTML} or {@code <html}) the blank line after the headers is assumed
 * to be missing: the markup is pushed back onto {@code in}, what precedes it
 * is treated as the last header (errors are logged, not thrown), and the
 * method returns.
 *
 * @param in   stream the header lines are read from
 * @param line buffer handed to {@code readLine}
 * @throws IOException   if reading or pushing back bytes fails
 * @throws HttpException if a regular header line is malformed
 */
private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {

        for (;;) {
            if (readLine(in, line, true) == 0) {
                return;
            }

            int htmlAt = line.indexOf("<!DOCTYPE");
            if (htmlAt == -1) {
                htmlAt = line.indexOf("<HTML");
            }
            if (htmlAt == -1) {
                htmlAt = line.indexOf("<html");
            }

            if (htmlAt == -1) {
                processHeaderLine(line);
                continue;
            }

            final byte[] bodyBytes = line.substring(htmlAt).getBytes("UTF-8");
            in.unread(bodyBytes);
            line.setLength(htmlAt);

            try {
                processHeaderLine(line);
            } catch (Exception e) {
                // the fragment before the markup may not be a valid header
                Http.LOG.warn("Error: ", e);
            }
            return;
        }
    }

Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Reads a non-chunked body from {@code in} and stores it in {@code content}.
 *
 * <p>At most {@code Content-Length} bytes are read (when the header is present
 * and non-empty), further capped by {@code http.getMaxContent()} when that
 * value is non-negative. Every read requests no more than the bytes still
 * allowed, so truncated content is exactly as long as the limit and no bytes
 * past the limit are taken from the stream.
 *
 * @param in stream positioned at the start of the body
 * @throws HttpException if the Content-Length header is not a valid integer
 * @throws IOException   if reading from {@code in} fails
 */
private void readPlainContent(InputStream in) 
  throws HttpException, IOException {

  int contentLength = Integer.MAX_VALUE;    // no header: read until EOF or limit
  String contentLengthString = headers.get(Response.CONTENT_LENGTH);
  if (contentLengthString != null) {
    contentLengthString = contentLengthString.trim();
    try {
      if (!contentLengthString.isEmpty()) 
        contentLength = Integer.parseInt(contentLengthString);
    } catch (NumberFormatException e) {
      // keep the cause so the offending parse error is not lost
      throw new HttpException("bad content length: "+contentLengthString, e);
    }
  }
  final int maxContent = http.getMaxContent();
  if (maxContent >= 0 && contentLength > maxContent)   // limit download size
    contentLength = maxContent;

  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  int length = 0;                           // bytes stored so far
  while (length < contentLength) {
    // never ask for more than is still allowed
    int toRead = Math.min(bytes.length, contentLength - length);
    int i = in.read(bytes, 0, toRead);
    if (i == -1)
      break;
    out.write(bytes, 0, i);
    length += i;
  }
  content = out.toByteArray();
}

Source File: HttpResponse.java From anthelion with Apache License 2.0

5 votes

/**
 * Reads a non-chunked body from {@code in} and stores it in {@code content}.
 *
 * <p>The amount read is limited by the Content-Length header (if present and
 * non-empty) and by {@code http.getMaxContent()} (if non-negative). Reads are
 * sized to the remaining allowance, so a body cut off at the limit keeps every
 * byte up to that limit and the stream is not consumed beyond it.
 *
 * @param in stream positioned at the start of the body
 * @throws HttpException if the Content-Length header is not a valid integer
 * @throws IOException   if reading from {@code in} fails
 */
private void readPlainContent(InputStream in) 
  throws HttpException, IOException {

  int contentLength = Integer.MAX_VALUE;    // no header: read until EOF or limit
  String contentLengthString = headers.get(Response.CONTENT_LENGTH);
  if (contentLengthString != null) {
    contentLengthString = contentLengthString.trim();
    try {
      if (!contentLengthString.isEmpty()) 
        contentLength = Integer.parseInt(contentLengthString);
    } catch (NumberFormatException e) {
      // preserve the original parse failure as the cause
      throw new HttpException("bad content length: "+contentLengthString, e);
    }
  }
  final int maxContent = http.getMaxContent();
  if (maxContent >= 0 && contentLength > maxContent)   // limit download size
    contentLength = maxContent;

  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  int length = 0;                           // bytes stored so far
  while (length < contentLength) {
    // request only what the limit still permits
    int toRead = Math.min(bytes.length, contentLength - length);
    int i = in.read(bytes, 0, toRead);
    if (i == -1)
      break;
    out.write(bytes, 0, i);
    length += i;
  }
  content = out.toByteArray();
}

Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Hands every header line to {@code processHeaderLine} until
 * {@code readLine} returns 0.
 *
 * <p>Works around responses without a blank line after the headers: when a
 * line contains {@code <!DOCTYPE}, {@code <HTML} or {@code <html}, the markup
 * is pushed back onto {@code in}, the leading part is processed as the last
 * header (any exception is logged and swallowed), and parsing ends.
 *
 * @param in   stream the header lines are read from
 * @param line buffer handed to {@code readLine}
 * @throws IOException   if reading or pushing back bytes fails
 * @throws HttpException if a regular header line is malformed
 */
private void parseHeaders(PushbackInputStream in, StringBuffer line)
  throws IOException, HttpException {

  while (readLine(in, line, true) != 0) {

    int cut = line.indexOf("<!DOCTYPE");
    if (cut == -1) {
      cut = line.indexOf("<HTML");
    }
    if (cut == -1) {
      cut = line.indexOf("<html");
    }

    if (cut == -1) {
      // ordinary header line
      processHeaderLine(line);
      continue;
    }

    // the body begins on this line; return it to the stream
    final String body = line.substring(cut);
    in.unread(body.getBytes("UTF-8"));
    line.setLength(cut);

    try {
      processHeaderLine(line);
    } catch (Exception e) {
      // tolerate a malformed fragment in front of the markup
      Http.LOG.warn("Error: ", e);
    }
    return;
  }
}

Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Reads a body sent with chunked transfer coding and stores the decoded
 * bytes in {@code content}.
 *
 * <p>Each chunk header is read with {@code readLine}; a chunk extension after
 * {@code ';'} is ignored and the size is parsed as hexadecimal. A size of 0
 * ends the body, after which trailer lines are handed to
 * {@code parseHeaders}.
 *
 * <p>When {@code http.getMaxContent()} is non-negative the total body is
 * limited to that many bytes: the chunk that would exceed it is read only up
 * to the limit, reading stops there, and the truncated bytes are stored
 * without parsing trailers. A negative value is treated as "no limit".
 *
 * @param in   stream positioned at the first chunk header
 * @param line scratch buffer handed to {@code readLine}
 * @throws HttpException on a malformed or negative chunk size, or if the
 *                       stream ends inside a chunk
 * @throws IOException   if reading from {@code in} fails
 */
@SuppressWarnings("unused")
private void readChunkedContent(PushbackInputStream in,  
                                StringBuffer line) 
  throws HttpException, IOException {
  boolean doneChunks= false;
  int contentBytesRead= 0;
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  final int maxContent= http.getMaxContent();

  while (!doneChunks) {
    if (Http.LOG.isTraceEnabled()) {
      Http.LOG.trace("Http: starting chunk");
    }

    readLine(in, line, false);

    // drop an optional chunk extension (";name=value")
    int pos= line.indexOf(";");
    String chunkLenStr= (pos < 0) ? line.toString() : line.substring(0, pos);
    chunkLenStr= chunkLenStr.trim();

    int chunkLen;
    try {
      chunkLen= Integer.parseInt(chunkLenStr, 16);
    } catch (NumberFormatException e){ 
      throw new HttpException("bad chunk length: "+line.toString(), e);
    }
    // parseInt accepts a leading '-', which is never a valid chunk size
    if (chunkLen < 0)
      throw new HttpException("bad chunk length: "+line.toString());

    if (chunkLen == 0) {
      doneChunks= true;
      break;
    }

    // Cap the whole body, not just this chunk. Written as a subtraction so
    // a huge chunk size cannot overflow the comparison.
    boolean truncated= false;
    if (maxContent >= 0 && chunkLen > maxContent - contentBytesRead) {
      chunkLen= maxContent - contentBytesRead;
      truncated= true;
    }

    // read one chunk
    int chunkBytesRead= 0;
    while (chunkBytesRead < chunkLen) {

      int toRead= Math.min(chunkLen - chunkBytesRead, Http.BUFFER_SIZE);
      int len= in.read(bytes, 0, toRead);

      if (len == -1) 
        throw new HttpException("chunk eof after " + contentBytesRead
                                    + " bytes in successful chunks"
                                    + " and " + chunkBytesRead 
                                    + " in current chunk");

      // NOTE: do not log the bytes here, they may be compressed binary data
      out.write(bytes, 0, len);
      chunkBytesRead+= len;  
    }
    contentBytesRead+= chunkBytesRead;

    // The stream is now in the middle of a chunk; reading on would
    // misinterpret body bytes as the next chunk header.
    if (truncated)
      break;

    readLine(in, line, false);   // consume the line break that ends the chunk

  }

  content = out.toByteArray();
  // trailers only follow a properly terminated body
  if (doneChunks)
    parseHeaders(in, line);

}

Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Reads a body sent with chunked transfer coding and stores the decoded
 * bytes in {@code content}.
 *
 * <p>Each chunk header is read with {@code readLine}; a chunk extension after
 * {@code ';'} is ignored and the size is parsed as hexadecimal. A size of 0
 * ends the body, after which trailer lines are handed to
 * {@code parseHeaders}.
 *
 * <p>When {@code http.getMaxContent()} is non-negative the total body is
 * limited to that many bytes: the chunk that would exceed it is read only up
 * to the limit, reading stops there, and the truncated bytes are stored
 * without parsing trailers. A negative value is treated as "no limit".
 *
 * @param in   stream positioned at the first chunk header
 * @param line scratch buffer handed to {@code readLine}
 * @throws HttpException on a malformed or negative chunk size, or if the
 *                       stream ends inside a chunk
 * @throws IOException   if reading from {@code in} fails
 */
@SuppressWarnings("unused")
private void readChunkedContent(PushbackInputStream in, StringBuffer line) throws HttpException, IOException {
    boolean doneChunks = false;
    int contentBytesRead = 0;
    byte[] bytes = new byte[Http.BUFFER_SIZE];
    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
    final int maxContent = http.getMaxContent();

    while (!doneChunks) {
        if (Http.LOG.isTraceEnabled()) {
            Http.LOG.trace("Http: starting chunk");
        }

        readLine(in, line, false);

        // drop an optional chunk extension (";name=value")
        int pos = line.indexOf(";");
        String chunkLenStr = (pos < 0) ? line.toString() : line.substring(0, pos);
        chunkLenStr = chunkLenStr.trim();

        int chunkLen;
        try {
            chunkLen = Integer.parseInt(chunkLenStr, 16);
        } catch (NumberFormatException e) {
            throw new HttpException("bad chunk length: " + line.toString(), e);
        }
        // parseInt accepts a leading '-', which is never a valid chunk size
        if (chunkLen < 0)
            throw new HttpException("bad chunk length: " + line.toString());

        if (chunkLen == 0) {
            doneChunks = true;
            break;
        }

        // Cap the whole body, not just this chunk. Written as a subtraction
        // so a huge chunk size cannot overflow the comparison.
        boolean truncated = false;
        if (maxContent >= 0 && chunkLen > maxContent - contentBytesRead) {
            chunkLen = maxContent - contentBytesRead;
            truncated = true;
        }

        // read one chunk
        int chunkBytesRead = 0;
        while (chunkBytesRead < chunkLen) {

            int toRead = Math.min(chunkLen - chunkBytesRead, Http.BUFFER_SIZE);
            int len = in.read(bytes, 0, toRead);

            if (len == -1)
                throw new HttpException("chunk eof after " + contentBytesRead + " bytes in successful chunks"
                        + " and " + chunkBytesRead + " in current chunk");

            // NOTE: do not log the bytes here, they may be compressed binary data
            out.write(bytes, 0, len);
            chunkBytesRead += len;
        }
        contentBytesRead += chunkBytesRead;

        // The stream is now in the middle of a chunk; reading on would
        // misinterpret body bytes as the next chunk header.
        if (truncated)
            break;

        readLine(in, line, false); // consume the line break that ends the chunk

    }

    content = out.toByteArray();
    // trailers only follow a properly terminated body
    if (doneChunks)
        parseHeaders(in, line);

}

Source File: HttpResponse.java From anthelion with Apache License 2.0

4 votes

/**
 * Reads a body sent with chunked transfer coding and stores the decoded
 * bytes in {@code content}.
 *
 * <p>Each chunk header is read with {@code readLine}; a chunk extension after
 * {@code ';'} is ignored and the size is parsed as hexadecimal. A size of 0
 * ends the body, after which trailer lines are handed to
 * {@code parseHeaders}.
 *
 * <p>When {@code http.getMaxContent()} is non-negative the total body is
 * limited to that many bytes: the chunk that would exceed it is read only up
 * to the limit, reading stops there, and the truncated bytes are stored
 * without parsing trailers. A negative value is treated as "no limit".
 *
 * @param in   stream positioned at the first chunk header
 * @param line scratch buffer handed to {@code readLine}
 * @throws HttpException on a malformed or negative chunk size, or if the
 *                       stream ends inside a chunk
 * @throws IOException   if reading from {@code in} fails
 */
private void readChunkedContent(PushbackInputStream in,  
                                StringBuffer line) 
  throws HttpException, IOException {
  boolean doneChunks= false;
  int contentBytesRead= 0;
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  final int maxContent= http.getMaxContent();

  while (!doneChunks) {
    if (Http.LOG.isTraceEnabled()) {
      Http.LOG.trace("Http: starting chunk");
    }

    readLine(in, line, false);

    // drop an optional chunk extension (";name=value")
    int pos= line.indexOf(";");
    String chunkLenStr= (pos < 0) ? line.toString() : line.substring(0, pos);
    chunkLenStr= chunkLenStr.trim();

    int chunkLen;
    try {
      chunkLen= Integer.parseInt(chunkLenStr, 16);
    } catch (NumberFormatException e){ 
      throw new HttpException("bad chunk length: "+line.toString(), e);
    }
    // parseInt accepts a leading '-', which is never a valid chunk size
    if (chunkLen < 0)
      throw new HttpException("bad chunk length: "+line.toString());

    if (chunkLen == 0) {
      doneChunks= true;
      break;
    }

    // Cap the whole body, not just this chunk. Written as a subtraction so
    // a huge chunk size cannot overflow the comparison.
    boolean truncated= false;
    if (maxContent >= 0 && chunkLen > maxContent - contentBytesRead) {
      chunkLen= maxContent - contentBytesRead;
      truncated= true;
    }

    // read one chunk
    int chunkBytesRead= 0;
    while (chunkBytesRead < chunkLen) {

      int toRead= Math.min(chunkLen - chunkBytesRead, Http.BUFFER_SIZE);
      int len= in.read(bytes, 0, toRead);

      if (len == -1) 
        throw new HttpException("chunk eof after " + contentBytesRead
                                    + " bytes in successful chunks"
                                    + " and " + chunkBytesRead 
                                    + " in current chunk");

      // NOTE: do not log the bytes here, they may be compressed binary data
      out.write(bytes, 0, len);
      chunkBytesRead+= len;  
    }
    contentBytesRead+= chunkBytesRead;

    // The stream is now in the middle of a chunk; reading on would
    // misinterpret body bytes as the next chunk header.
    if (truncated)
      break;

    readLine(in, line, false);   // consume the line break that ends the chunk

  }

  content = out.toByteArray();
  // trailers only follow a properly terminated body
  if (doneChunks)
    parseHeaders(in, line);

}

org.apache.nutch.protocol.http.api.HttpException Java Examples