org.apache.nutch.protocol.http.api.HttpException Java Examples

The following examples show how to use org.apache.nutch.protocol.http.api.HttpException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HttpResponse.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Calls {@code readLine(in, line, false)} and then extracts the numeric
 * status code from {@code line}.
 *
 * <p>The code is the text between the first space and the next space, or
 * the end of the line when nothing follows the code
 * (e.g. "HTTP/1.1 200" as opposed to "HTTP/1.1 200 OK").
 *
 * @param in   stream handed to {@code readLine}
 * @param line buffer handed to {@code readLine} and parsed afterwards
 * @return the status code as an integer
 * @throws IOException   declared in the signature
 * @throws HttpException if the code portion is not a valid integer
 */
private int parseStatusLine(PushbackInputStream in, StringBuffer line)
  throws IOException, HttpException {
  readLine(in, line, false);

  final int firstSpace = line.indexOf(" ");
  final int secondSpace = line.indexOf(" ", firstSpace + 1);

  // Without a reason phrase the code runs to the end of the line.
  final int codeEnd = (secondSpace == -1) ? line.length() : secondSpace;
  final String codeText = line.substring(firstSpace + 1, codeEnd);

  try {
    return Integer.parseInt(codeText);
  } catch (NumberFormatException e) {
    throw new HttpException("bad status line '" + line
                            + "': " + e.getMessage(), e);
  }
}
 
Example #2
Source File: HttpResponse.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Splits a header line at its first colon and stores the resulting
 * key/value pair via {@code headers.set}.
 *
 * <p>A line with no colon is silently ignored when it is empty or made up
 * only of whitespace; any other colon-less line is rejected. Spaces and
 * tabs directly after the colon are skipped; the key is stored untrimmed.
 *
 * @param line the header line to process
 * @throws IOException   declared in the signature
 * @throws HttpException if a non-blank line contains no colon
 */
private void processHeaderLine(StringBuffer line)
  throws IOException, HttpException {

  final int colon = line.indexOf(":");
  final int end = line.length();

  if (colon == -1) {
    // Blank lines are tolerated; anything else without a colon is malformed.
    int firstNonBlank = 0;
    while (firstNonBlank < end
           && Character.isWhitespace(line.charAt(firstNonBlank))) {
      firstNonBlank++;
    }
    if (firstNonBlank == end) {
      return;
    }
    throw new HttpException("No colon in header:" + line);
  }

  final String key = line.substring(0, colon);

  // Advance past the spaces/tabs that separate the colon from the value.
  int valueStart = colon + 1;
  while (valueStart < end) {
    final char ch = line.charAt(valueStart);
    if (ch != ' ' && ch != '\t') {
      break;
    }
    valueStart++;
  }

  headers.set(key, line.substring(valueStart));
}
 
Example #3
Source File: HttpResponse.java    From nutch-selenium with Apache License 2.0 6 votes vote down vote up
/**
 * Calls {@code readLine(in, line, false)} and parses the status code found
 * in {@code line}.
 *
 * <p>The code starts after the first space and ends at the following space;
 * when there is no following space (e.g. "HTTP/1.1 200" rather than
 * "HTTP/1.1 200 OK") it ends at the end of the line.
 *
 * @param in   stream handed to {@code readLine}
 * @param line buffer handed to {@code readLine} and parsed afterwards
 * @return the status code as an integer
 * @throws IOException   declared in the signature
 * @throws HttpException if the code portion is not a valid integer
 */
private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
    readLine(in, line, false);

    final int afterFirstSpace = line.indexOf(" ") + 1;

    // A status line without a reason phrase has no second space.
    int codeEnd = line.indexOf(" ", afterFirstSpace);
    if (codeEnd == -1) {
        codeEnd = line.length();
    }

    final String codeText = line.substring(afterFirstSpace, codeEnd);
    try {
        return Integer.parseInt(codeText);
    } catch (NumberFormatException e) {
        throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
    }
}
 
Example #4
Source File: HttpResponse.java    From nutch-selenium with Apache License 2.0 6 votes vote down vote up
/**
 * Parses one "key: value" header line and records it via {@code headers.set}.
 *
 * <p>Lines without a colon are dropped when blank (empty or whitespace only)
 * and rejected otherwise. The key is everything before the first colon; the
 * value is what remains after the colon once leading spaces and tabs are
 * skipped.
 *
 * @param line the header line to process
 * @throws IOException   declared in the signature
 * @throws HttpException if a non-blank line contains no colon
 */
private void processHeaderLine(StringBuffer line) throws IOException, HttpException {

    final int colonIndex = line.indexOf(":");
    final int lineLength = line.length();

    if (colonIndex == -1) {
        // Decide whether the colon-less line is merely blank.
        boolean blank = true;
        for (int i = 0; i < lineLength; i++) {
            if (!Character.isWhitespace(line.charAt(i))) {
                blank = false;
                break;
            }
        }
        if (blank) {
            return;
        }
        throw new HttpException("No colon in header:" + line);
    }

    final String key = line.substring(0, colonIndex);

    // Skip the spaces/tabs between the colon and the value.
    int valueStart = colonIndex + 1;
    while (valueStart < lineLength
            && (line.charAt(valueStart) == ' ' || line.charAt(valueStart) == '\t')) {
        valueStart++;
    }

    final String value = line.substring(valueStart);
    headers.set(key, value);
}
 
Example #5
Source File: HttpResponse.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Calls {@code readLine(in, line, false)} and returns the status code parsed
 * from {@code line}.
 *
 * <p>The code is delimited by the first and second space of the line; a line
 * with no text after the code ("HTTP/1.1 200" vs "HTTP/1.1 200 OK") is
 * handled by treating the end of the line as the delimiter.
 *
 * @param in   stream handed to {@code readLine}
 * @param line buffer handed to {@code readLine} and parsed afterwards
 * @return the status code as an integer
 * @throws IOException   declared in the signature
 * @throws HttpException if the code portion is not a valid integer
 */
private int parseStatusLine(PushbackInputStream in, StringBuffer line)
  throws IOException, HttpException {
  readLine(in, line, false);

  final int begin = line.indexOf(" ") + 1;
  final int nextSpace = line.indexOf(" ", begin);
  final int end;
  if (nextSpace == -1) {
    // no plaintext result code after the number
    end = line.length();
  } else {
    end = nextSpace;
  }

  final String digits = line.substring(begin, end);
  try {
    return Integer.parseInt(digits);
  } catch (NumberFormatException e) {
    throw new HttpException("bad status line '" + line
                            + "': " + e.getMessage(), e);
  }
}
 
Example #6
Source File: HttpResponse.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Splits a header line at its first colon and stores the resulting
 * key/value pair via {@code headers.set}.
 *
 * <p>A colon-less line is ignored if it is empty or whitespace only and
 * rejected otherwise. Spaces and tabs immediately after the colon are not
 * part of the value; the key is stored exactly as it appears.
 *
 * @param line the header line to process
 * @throws IOException   declared in the signature
 * @throws HttpException if a non-blank line contains no colon
 */
private void processHeaderLine(StringBuffer line)
  throws IOException, HttpException {

  final int colon = line.indexOf(":");
  final int end = line.length();

  if (colon == -1) {
    // Blank lines are tolerated; anything else without a colon is malformed.
    int firstNonBlank = 0;
    while (firstNonBlank < end
           && Character.isWhitespace(line.charAt(firstNonBlank))) {
      firstNonBlank++;
    }
    if (firstNonBlank == end) {
      return;
    }
    throw new HttpException("No colon in header:" + line);
  }

  final String key = line.substring(0, colon);

  // Advance past the spaces/tabs that separate the colon from the value.
  int valueStart = colon + 1;
  while (valueStart < end) {
    final char ch = line.charAt(valueStart);
    if (ch != ' ' && ch != '\t') {
      break;
    }
    valueStart++;
  }

  headers.set(key, line.substring(valueStart));
}
 
Example #7
Source File: HttpResponse.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Reads a non-chunked response body from {@code in} into {@code content}.
 *
 * <p>The number of bytes kept is capped by the Content-Length header (when
 * present and non-empty) and additionally by {@code http.getMaxContent()}
 * when that value is non-negative. Reading stops at end of stream or as soon
 * as the cap is reached, whichever comes first.
 *
 * <p>Each read requests no more than the bytes still allowed, so the stored
 * body is exactly the cap when the stream has that much data. Previously a
 * buffer that would cross the cap was read and then thrown away, leaving the
 * body short.
 *
 * @param in stream to read the body from
 * @throws HttpException if the Content-Length header is not a valid integer
 * @throws IOException   if reading from {@code in} fails
 */
private void readPlainContent(InputStream in) throws HttpException, IOException {

    int contentLength = Integer.MAX_VALUE; // default: read until end of stream
    String contentLengthString = headers.get(Response.CONTENT_LENGTH);
    if (contentLengthString != null) {
        contentLengthString = contentLengthString.trim();
        try {
            if (!contentLengthString.isEmpty()) {
                contentLength = Integer.parseInt(contentLengthString);
            }
        } catch (NumberFormatException e) {
            // keep the parse failure as the cause for easier diagnosis
            throw new HttpException("bad content length: " + contentLengthString, e);
        }
    }

    // A negative max content means "no limit".
    final int maxContent = http.getMaxContent();
    if (maxContent >= 0 && contentLength > maxContent) {
        contentLength = maxContent; // limit download size
    }

    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
    byte[] bytes = new byte[Http.BUFFER_SIZE];
    int length = 0;
    // A negative Content-Length skips the loop entirely, as before.
    while (length < contentLength) {
        // Never ask for more than is still allowed, so nothing read is discarded.
        int wanted = Math.min(bytes.length, contentLength - length);
        int read = in.read(bytes, 0, wanted);
        if (read == -1) {
            break;
        }
        out.write(bytes, 0, read);
        length += read;
    }
    content = out.toByteArray();
}
 
Example #8
Source File: HttpResponse.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Parses one "key: value" header line and records it via {@code headers.set}.
 *
 * <p>Lines without a colon are dropped when blank (empty or whitespace only)
 * and rejected otherwise. The key is everything before the first colon; the
 * value is what remains after the colon once leading spaces and tabs are
 * skipped.
 *
 * @param line the header line to process
 * @throws IOException   declared in the signature
 * @throws HttpException if a non-blank line contains no colon
 */
private void processHeaderLine(StringBuffer line) throws IOException, HttpException {

    final int colonIndex = line.indexOf(":");
    final int lineLength = line.length();

    if (colonIndex == -1) {
        // Decide whether the colon-less line is merely blank.
        boolean blank = true;
        for (int i = 0; i < lineLength; i++) {
            if (!Character.isWhitespace(line.charAt(i))) {
                blank = false;
                break;
            }
        }
        if (blank) {
            return;
        }
        throw new HttpException("No colon in header:" + line);
    }

    final String key = line.substring(0, colonIndex);

    // Skip the spaces/tabs between the colon and the value.
    int valueStart = colonIndex + 1;
    while (valueStart < lineLength
            && (line.charAt(valueStart) == ' ' || line.charAt(valueStart) == '\t')) {
        valueStart++;
    }

    final String value = line.substring(valueStart);
    headers.set(key, value);
}
 
Example #9
Source File: HttpResponse.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Calls {@code readLine(in, line, false)} and parses the status code found
 * in {@code line}.
 *
 * <p>The code starts after the first space and ends at the following space;
 * when there is no following space (e.g. "HTTP/1.1 200" rather than
 * "HTTP/1.1 200 OK") it ends at the end of the line.
 *
 * @param in   stream handed to {@code readLine}
 * @param line buffer handed to {@code readLine} and parsed afterwards
 * @return the status code as an integer
 * @throws IOException   declared in the signature
 * @throws HttpException if the code portion is not a valid integer
 */
private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
    readLine(in, line, false);

    final int afterFirstSpace = line.indexOf(" ") + 1;

    // A status line without a reason phrase has no second space.
    int codeEnd = line.indexOf(" ", afterFirstSpace);
    if (codeEnd == -1) {
        codeEnd = line.length();
    }

    final String codeText = line.substring(afterFirstSpace, codeEnd);
    try {
        return Integer.parseInt(codeText);
    } catch (NumberFormatException e) {
        throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
    }
}
 
Example #10
Source File: HttpResponse.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Processes header lines until {@code readLine(in, line, true)} returns 0.
 *
 * <p>Also copes with responses that omit the blank line after the headers:
 * if a line contains "&lt;!DOCTYPE", "&lt;HTML" or "&lt;html" (checked in
 * that order), the text from that marker onward is pushed back onto
 * {@code in}, the remaining prefix is processed as a final header on a
 * best-effort basis, and parsing stops.
 *
 * @param in   stream to read lines from and to push body bytes back onto
 * @param line buffer handed to {@code readLine} and {@code processHeaderLine}
 * @throws IOException   if reading or unreading fails
 * @throws HttpException if a regular header line is rejected by
 *                       {@code processHeaderLine}
 */
private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {

    while (readLine(in, line, true) != 0) {

        // Probe for the start of an HTML body, keeping the original marker order.
        int bodyStart = line.indexOf("<!DOCTYPE");
        if (bodyStart == -1) {
            bodyStart = line.indexOf("<HTML");
        }
        if (bodyStart == -1) {
            bodyStart = line.indexOf("<html");
        }

        if (bodyStart == -1) {
            // Ordinary header line.
            processHeaderLine(line);
            continue;
        }

        // Give the body part back to the stream and keep only the header part.
        in.unread(line.substring(bodyStart).getBytes("UTF-8"));
        line.setLength(bodyStart);

        try {
            // TODO: (CM) Header names are handled generically here; a mapping
            // from the returned header names to the standard metadata names
            // in the ParseData class would be nice to have.
            processHeaderLine(line);
        } catch (Exception e) {
            // fixme:
            Http.LOG.warn("Error: ", e);
        }
        return;
    }
}
 
Example #11
Source File: HttpResponse.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Processes header lines until {@code readLine(in, line, true)} returns 0.
 *
 * <p>Handles HTTP responses with a missing blank line after the headers: when
 * a line contains "&lt;!DOCTYPE", "&lt;HTML" or "&lt;html" (checked in that
 * order), everything from the marker onward is pushed back onto {@code in},
 * the prefix is processed as one last header (failures are only logged), and
 * the method returns.
 *
 * @param in   stream to read lines from and to push body bytes back onto
 * @param line buffer handed to {@code readLine} and {@code processHeaderLine}
 * @throws IOException   if reading or unreading fails
 * @throws HttpException if a regular header line is rejected by
 *                       {@code processHeaderLine}
 */
private void parseHeaders(PushbackInputStream in, StringBuffer line)
  throws IOException, HttpException {

  while (readLine(in, line, true) != 0) {

    // Locate the first HTML marker, trying them in the original order.
    int marker = line.indexOf("<!DOCTYPE");
    if (marker == -1) {
      marker = line.indexOf("<HTML");
    }
    if (marker == -1) {
      marker = line.indexOf("<html");
    }

    if (marker == -1) {
      // Ordinary header line.
      processHeaderLine(line);
      continue;
    }

    // Return the body part to the stream; what is left is header text.
    in.unread(line.substring(marker).getBytes("UTF-8"));
    line.setLength(marker);

    try {
      // TODO: (CM) Header names are handled generically here; a mapping
      // from the returned header names to the standard metadata names in
      // the ParseData class would be nice to have.
      processHeaderLine(line);
    } catch (Exception e) {
      // fixme:
      Http.LOG.warn("Error: ", e);
    }
    return;
  }
}
 
Example #12
Source File: HttpResponse.java    From nutch-selenium with Apache License 2.0 5 votes vote down vote up
/**
 * Processes header lines until {@code readLine(in, line, true)} returns 0.
 *
 * <p>Also copes with responses that omit the blank line after the headers:
 * if a line contains "&lt;!DOCTYPE", "&lt;HTML" or "&lt;html" (checked in
 * that order), the text from that marker onward is pushed back onto
 * {@code in}, the remaining prefix is processed as a final header on a
 * best-effort basis, and parsing stops.
 *
 * @param in   stream to read lines from and to push body bytes back onto
 * @param line buffer handed to {@code readLine} and {@code processHeaderLine}
 * @throws IOException   if reading or unreading fails
 * @throws HttpException if a regular header line is rejected by
 *                       {@code processHeaderLine}
 */
private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {

    while (readLine(in, line, true) != 0) {

        // Probe for the start of an HTML body, keeping the original marker order.
        int bodyStart = line.indexOf("<!DOCTYPE");
        if (bodyStart == -1) {
            bodyStart = line.indexOf("<HTML");
        }
        if (bodyStart == -1) {
            bodyStart = line.indexOf("<html");
        }

        if (bodyStart == -1) {
            // Ordinary header line.
            processHeaderLine(line);
            continue;
        }

        // Give the body part back to the stream and keep only the header part.
        in.unread(line.substring(bodyStart).getBytes("UTF-8"));
        line.setLength(bodyStart);

        try {
            // TODO: (CM) Header names are handled generically here; a mapping
            // from the returned header names to the standard metadata names
            // in the ParseData class would be nice to have.
            processHeaderLine(line);
        } catch (Exception e) {
            // fixme:
            Http.LOG.warn("Error: ", e);
        }
        return;
    }
}
 
Example #13
Source File: HttpResponse.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Reads a non-chunked response body from {@code in} into {@code content}.
 *
 * <p>The number of bytes kept is capped by the Content-Length header (when
 * present and non-empty) and additionally by {@code http.getMaxContent()}
 * when that value is non-negative. Reading stops at end of stream or as soon
 * as the cap is reached, whichever comes first.
 *
 * <p>Each read requests no more than the bytes still allowed, so the stored
 * body is exactly the cap when the stream has that much data. Previously a
 * buffer that would cross the cap was read and then thrown away, leaving the
 * body short.
 *
 * @param in stream to read the body from
 * @throws HttpException if the Content-Length header is not a valid integer
 * @throws IOException   if reading from {@code in} fails
 */
private void readPlainContent(InputStream in) 
  throws HttpException, IOException {

  int contentLength = Integer.MAX_VALUE;    // default: read until end of stream
  String contentLengthString = headers.get(Response.CONTENT_LENGTH);
  if (contentLengthString != null) {
    contentLengthString = contentLengthString.trim();
    try {
      if (!contentLengthString.isEmpty()) {
        contentLength = Integer.parseInt(contentLengthString);
      }
    } catch (NumberFormatException e) {
      // keep the parse failure as the cause for easier diagnosis
      throw new HttpException("bad content length: "+contentLengthString, e);
    }
  }

  // A negative max content means "no limit".
  final int maxContent = http.getMaxContent();
  if (maxContent >= 0 && contentLength > maxContent) {
    contentLength = maxContent;             // limit download size
  }

  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  int length = 0;
  // A negative Content-Length skips the loop entirely, as before.
  while (length < contentLength) {
    // Never ask for more than is still allowed, so nothing read is discarded.
    int wanted = Math.min(bytes.length, contentLength - length);
    int read = in.read(bytes, 0, wanted);
    if (read == -1) {
      break;
    }
    out.write(bytes, 0, read);
    length += read;
  }
  content = out.toByteArray();
}
 
Example #14
Source File: HttpResponse.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Reads a non-chunked response body from {@code in} into {@code content}.
 *
 * <p>The number of bytes kept is capped by the Content-Length header (when
 * present and non-empty) and additionally by {@code http.getMaxContent()}
 * when that value is non-negative. Reading stops at end of stream or as soon
 * as the cap is reached, whichever comes first.
 *
 * <p>Each read requests no more than the bytes still allowed, so the stored
 * body is exactly the cap when the stream has that much data. Previously a
 * buffer that would cross the cap was read and then thrown away, leaving the
 * body short.
 *
 * @param in stream to read the body from
 * @throws HttpException if the Content-Length header is not a valid integer
 * @throws IOException   if reading from {@code in} fails
 */
private void readPlainContent(InputStream in) 
  throws HttpException, IOException {

  int contentLength = Integer.MAX_VALUE;    // default: read until end of stream
  String contentLengthString = headers.get(Response.CONTENT_LENGTH);
  if (contentLengthString != null) {
    contentLengthString = contentLengthString.trim();
    try {
      if (!contentLengthString.isEmpty()) {
        contentLength = Integer.parseInt(contentLengthString);
      }
    } catch (NumberFormatException e) {
      // keep the parse failure as the cause for easier diagnosis
      throw new HttpException("bad content length: "+contentLengthString, e);
    }
  }

  // A negative max content means "no limit".
  final int maxContent = http.getMaxContent();
  if (maxContent >= 0 && contentLength > maxContent) {
    contentLength = maxContent;             // limit download size
  }

  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  int length = 0;
  // A negative Content-Length skips the loop entirely, as before.
  while (length < contentLength) {
    // Never ask for more than is still allowed, so nothing read is discarded.
    int wanted = Math.min(bytes.length, contentLength - length);
    int read = in.read(bytes, 0, wanted);
    if (read == -1) {
      break;
    }
    out.write(bytes, 0, read);
    length += read;
  }
  content = out.toByteArray();
}
 
Example #15
Source File: HttpResponse.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Processes header lines until {@code readLine(in, line, true)} returns 0.
 *
 * <p>Handles HTTP responses with a missing blank line after the headers: when
 * a line contains "&lt;!DOCTYPE", "&lt;HTML" or "&lt;html" (checked in that
 * order), everything from the marker onward is pushed back onto {@code in},
 * the prefix is processed as one last header (failures are only logged), and
 * the method returns.
 *
 * @param in   stream to read lines from and to push body bytes back onto
 * @param line buffer handed to {@code readLine} and {@code processHeaderLine}
 * @throws IOException   if reading or unreading fails
 * @throws HttpException if a regular header line is rejected by
 *                       {@code processHeaderLine}
 */
private void parseHeaders(PushbackInputStream in, StringBuffer line)
  throws IOException, HttpException {

  while (readLine(in, line, true) != 0) {

    // Locate the first HTML marker, trying them in the original order.
    int marker = line.indexOf("<!DOCTYPE");
    if (marker == -1) {
      marker = line.indexOf("<HTML");
    }
    if (marker == -1) {
      marker = line.indexOf("<html");
    }

    if (marker == -1) {
      // Ordinary header line.
      processHeaderLine(line);
      continue;
    }

    // Return the body part to the stream; what is left is header text.
    in.unread(line.substring(marker).getBytes("UTF-8"));
    line.setLength(marker);

    try {
      // TODO: (CM) Header names are handled generically here; a mapping
      // from the returned header names to the standard metadata names in
      // the ParseData class would be nice to have.
      processHeaderLine(line);
    } catch (Exception e) {
      // fixme:
      Http.LOG.warn("Error: ", e);
    }
    return;
  }
}
 
Example #16
Source File: HttpResponse.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Reads a chunked response body from {@code in} into {@code content}.
 *
 * <p>Each chunk starts with a header line holding a hexadecimal length
 * (anything from a ';' onward is ignored), followed by that many bytes of
 * data and one more line. A zero-length chunk ends the body, after which
 * {@code parseHeaders} is called for whatever follows.
 *
 * <p>When {@code http.getMaxContent()} is non-negative, at most that many
 * bytes are kept in total across all chunks. On reaching the limit, reading
 * stops, the truncated body is stored and {@code parseHeaders} is not
 * called. A negative limit is treated as "no limit", matching
 * {@code readPlainContent}.
 *
 * @param in   stream to read chunk headers and chunk data from
 * @param line buffer handed to {@code readLine} and {@code parseHeaders}
 * @throws HttpException if a chunk length is malformed or negative, or the
 *                       stream ends in the middle of a chunk
 * @throws IOException   if reading from {@code in} fails
 */
@SuppressWarnings("unused")
private void readChunkedContent(PushbackInputStream in,  
                                StringBuffer line) 
  throws HttpException, IOException {
  boolean doneChunks= false;
  int contentBytesRead= 0;
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  final int maxContent= http.getMaxContent();

  while (!doneChunks) {
    if (Http.LOG.isTraceEnabled()) {
      Http.LOG.trace("Http: starting chunk");
    }

    readLine(in, line, false);

    // The chunk length precedes any ';'-introduced chunk extension.
    int pos= line.indexOf(";");
    String chunkLenStr= (pos < 0) ? line.toString() : line.substring(0, pos);
    chunkLenStr= chunkLenStr.trim();

    int chunkLen;
    try {
      chunkLen= Integer.parseInt(chunkLenStr, 16);
    } catch (NumberFormatException e){ 
      throw new HttpException("bad chunk length: "+line.toString(), e);
    }
    if (chunkLen < 0) {
      // parseInt accepts a leading '-', which is never a valid chunk size
      throw new HttpException("bad chunk length: "+line.toString());
    }

    if (chunkLen == 0) {
      doneChunks= true;
      break;
    }

    // Clamp to the bytes still allowed. Written as a subtraction so the
    // comparison cannot overflow; contentBytesRead never exceeds maxContent.
    boolean truncated= false;
    if (maxContent >= 0 && chunkLen > maxContent - contentBytesRead) {
      chunkLen= maxContent - contentBytesRead;
      truncated= true;
    }

    // read one chunk
    int chunkBytesRead= 0;
    while (chunkBytesRead < chunkLen) {

      int toRead= Math.min(chunkLen - chunkBytesRead, Http.BUFFER_SIZE);
      int len= in.read(bytes, 0, toRead);

      if (len == -1) 
        throw new HttpException("chunk eof after " + contentBytesRead
                                    + " bytes in successful chunks"
                                    + " and " + chunkBytesRead 
                                    + " in current chunk");

      // Do not log the bytes read here: the data may be GZIPed and would be
      // printed straight to the terminal.

      out.write(bytes, 0, len);
      chunkBytesRead+= len;  
    }
    // Track the running total so the limit applies to the whole body.
    contentBytesRead+= chunkBytesRead;

    if (truncated) {
      // The rest of this chunk is unread, so the stream is no longer aligned
      // on a chunk boundary; stop rather than misparse data as a header.
      break;
    }

    readLine(in, line, false);

  }

  content = out.toByteArray();
  if (doneChunks) {
    // Only a cleanly terminated body can be followed by further headers.
    parseHeaders(in, line);
  }

}
 
Example #17
Source File: HttpResponse.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Reads a chunked response body from {@code in} into {@code content}.
 *
 * <p>Each chunk starts with a header line holding a hexadecimal length
 * (anything from a ';' onward is ignored), followed by that many bytes of
 * data and one more line. A zero-length chunk ends the body, after which
 * {@code parseHeaders} is called for whatever follows.
 *
 * <p>When {@code http.getMaxContent()} is non-negative, at most that many
 * bytes are kept in total across all chunks. On reaching the limit, reading
 * stops, the truncated body is stored and {@code parseHeaders} is not
 * called. A negative limit is treated as "no limit", matching
 * {@code readPlainContent}.
 *
 * @param in   stream to read chunk headers and chunk data from
 * @param line buffer handed to {@code readLine} and {@code parseHeaders}
 * @throws HttpException if a chunk length is malformed or negative, or the
 *                       stream ends in the middle of a chunk
 * @throws IOException   if reading from {@code in} fails
 */
@SuppressWarnings("unused")
private void readChunkedContent(PushbackInputStream in, StringBuffer line) throws HttpException, IOException {
    boolean doneChunks = false;
    int contentBytesRead = 0;
    byte[] bytes = new byte[Http.BUFFER_SIZE];
    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
    final int maxContent = http.getMaxContent();

    while (!doneChunks) {
        if (Http.LOG.isTraceEnabled()) {
            Http.LOG.trace("Http: starting chunk");
        }

        readLine(in, line, false);

        // The chunk length precedes any ';'-introduced chunk extension.
        int pos = line.indexOf(";");
        String chunkLenStr = (pos < 0) ? line.toString() : line.substring(0, pos);
        chunkLenStr = chunkLenStr.trim();

        int chunkLen;
        try {
            chunkLen = Integer.parseInt(chunkLenStr, 16);
        } catch (NumberFormatException e) {
            throw new HttpException("bad chunk length: " + line.toString(), e);
        }
        if (chunkLen < 0) {
            // parseInt accepts a leading '-', which is never a valid chunk size
            throw new HttpException("bad chunk length: " + line.toString());
        }

        if (chunkLen == 0) {
            doneChunks = true;
            break;
        }

        // Clamp to the bytes still allowed. Written as a subtraction so the
        // comparison cannot overflow; contentBytesRead never exceeds maxContent.
        boolean truncated = false;
        if (maxContent >= 0 && chunkLen > maxContent - contentBytesRead) {
            chunkLen = maxContent - contentBytesRead;
            truncated = true;
        }

        // read one chunk
        int chunkBytesRead = 0;
        while (chunkBytesRead < chunkLen) {

            int toRead = Math.min(chunkLen - chunkBytesRead, Http.BUFFER_SIZE);
            int len = in.read(bytes, 0, toRead);

            if (len == -1)
                throw new HttpException("chunk eof after " + contentBytesRead + " bytes in successful chunks"
                        + " and " + chunkBytesRead + " in current chunk");

            // Do not log the bytes read here: the data may be GZIPed and
            // would be printed straight to the terminal.

            out.write(bytes, 0, len);
            chunkBytesRead += len;
        }
        // Track the running total so the limit applies to the whole body.
        contentBytesRead += chunkBytesRead;

        if (truncated) {
            // The rest of this chunk is unread, so the stream is no longer
            // aligned on a chunk boundary; stop rather than misparse data as
            // a header.
            break;
        }

        readLine(in, line, false);

    }

    content = out.toByteArray();
    if (doneChunks) {
        // Only a cleanly terminated body can be followed by further headers.
        parseHeaders(in, line);
    }

}
 
Example #18
Source File: HttpResponse.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Reads a chunked response body from {@code in} into {@code content}.
 *
 * <p>Each chunk starts with a header line holding a hexadecimal length
 * (anything from a ';' onward is ignored), followed by that many bytes of
 * data and one more line. A zero-length chunk ends the body, after which
 * {@code parseHeaders} is called for whatever follows.
 *
 * <p>When {@code http.getMaxContent()} is non-negative, at most that many
 * bytes are kept in total across all chunks. On reaching the limit, reading
 * stops, the truncated body is stored and {@code parseHeaders} is not
 * called. A negative limit is treated as "no limit", matching
 * {@code readPlainContent}.
 *
 * @param in   stream to read chunk headers and chunk data from
 * @param line buffer handed to {@code readLine} and {@code parseHeaders}
 * @throws HttpException if a chunk length is malformed or negative, or the
 *                       stream ends in the middle of a chunk
 * @throws IOException   if reading from {@code in} fails
 */
private void readChunkedContent(PushbackInputStream in,  
                                StringBuffer line) 
  throws HttpException, IOException {
  boolean doneChunks= false;
  int contentBytesRead= 0;
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  final int maxContent= http.getMaxContent();

  while (!doneChunks) {
    if (Http.LOG.isTraceEnabled()) {
      Http.LOG.trace("Http: starting chunk");
    }

    readLine(in, line, false);

    // The chunk length precedes any ';'-introduced chunk extension.
    int pos= line.indexOf(";");
    String chunkLenStr= (pos < 0) ? line.toString() : line.substring(0, pos);
    chunkLenStr= chunkLenStr.trim();

    int chunkLen;
    try {
      chunkLen= Integer.parseInt(chunkLenStr, 16);
    } catch (NumberFormatException e){ 
      throw new HttpException("bad chunk length: "+line.toString(), e);
    }
    if (chunkLen < 0) {
      // parseInt accepts a leading '-', which is never a valid chunk size
      throw new HttpException("bad chunk length: "+line.toString());
    }

    if (chunkLen == 0) {
      doneChunks= true;
      break;
    }

    // Clamp to the bytes still allowed. Written as a subtraction so the
    // comparison cannot overflow; contentBytesRead never exceeds maxContent.
    boolean truncated= false;
    if (maxContent >= 0 && chunkLen > maxContent - contentBytesRead) {
      chunkLen= maxContent - contentBytesRead;
      truncated= true;
    }

    // read one chunk
    int chunkBytesRead= 0;
    while (chunkBytesRead < chunkLen) {

      int toRead= Math.min(chunkLen - chunkBytesRead, Http.BUFFER_SIZE);
      int len= in.read(bytes, 0, toRead);

      if (len == -1) 
        throw new HttpException("chunk eof after " + contentBytesRead
                                    + " bytes in successful chunks"
                                    + " and " + chunkBytesRead 
                                    + " in current chunk");

      // Do not log the bytes read here: the data may be GZIPed and would be
      // printed straight to the terminal.

      out.write(bytes, 0, len);
      chunkBytesRead+= len;  
    }
    // Track the running total so the limit applies to the whole body.
    contentBytesRead+= chunkBytesRead;

    if (truncated) {
      // The rest of this chunk is unread, so the stream is no longer aligned
      // on a chunk boundary; stop rather than misparse data as a header.
      break;
    }

    readLine(in, line, false);

  }

  content = out.toByteArray();
  if (doneChunks) {
    // Only a cleanly terminated body can be followed by further headers.
    parseHeaders(in, line);
  }

}