org.apache.nutch.net.protocols.Response Java Examples

The following examples show how to use org.apache.nutch.net.protocols.Response. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: LanguageIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Adds a "lang" field to the document being indexed.
 *
 * The language stored in the parse metadata takes precedence; when it is
 * absent, the Content-Language entry of the content metadata is used. If
 * neither yields a non-empty value, the field is set to "unknown".
 *
 * @param doc the document to enrich
 * @param parse the parse result whose metadata is consulted
 * @return the same document instance, with the "lang" field added
 * @throws IndexingException as declared by the signature
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // Preferred source: the language recorded in the parse metadata.
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Fallback: the language announced by the HTTP response header.
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean missing = (language == null) || language.length() == 0;
  doc.add("lang", missing ? "unknown" : language);

  return doc;
}
 
Example #2
Source File: HTMLLanguageParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Tries to determine the document's language, consulting in order: the
 * parse metadata, the parsed document itself, and finally the
 * Content-Language entry of the content metadata.
 *
 * @param page the parse result providing both metadata sets
 * @param doc the parsed document handed to the language parser
 * @return the first non-null language found, or null if no source has one
 */
private String detectLanguage(Parse page, DocumentFragment doc) {
    String fromMetadata = getLanguageFromMetadata(page.getData().getParseMeta());
    if (fromMetadata != null) {
        return fromMetadata;
    }

    String fromDocument = new LanguageParser(doc).getLanguage();
    if (fromDocument != null) {
        return fromDocument;
    }

    // Last resort: whatever the HTTP response header declared (may be null).
    return page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
}
 
Example #3
Source File: LanguageIndexingFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Records the document language in the "lang" index field.
 *
 * Lookup order: the LANGUAGE entry of the parse metadata, then the
 * Content-Language entry of the content metadata. A null or empty result is
 * replaced by the literal "unknown".
 *
 * @param doc the document to enrich
 * @param parse the parse result whose metadata is consulted
 * @return the document that was passed in
 * @throws IndexingException as declared by the signature
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  String detected = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
  if (detected == null) {
    // nothing in the parse metadata: fall back to the HTTP header value
    detected = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  String value = "unknown";
  if (detected != null && detected.length() > 0) {
    value = detected;
  }
  doc.add("lang", value);

  return doc;
}
 
Example #4
Source File: TestProtocolFile.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches a sample file through the protocol registered for its
 * <code>file:</code> URL and verifies that the fetch succeeds and that the
 * expected MIME type is reported both as the content type and in the
 * <code>Response.CONTENT_TYPE</code> metadata field.
 *
 * @param testTextFile file name, appended to the sample directory
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
  assertNotNull(urlString);

  Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
  ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString), datum);
  assertNotNull(output);

  // The fetch must have succeeded; the message exposes the actual status.
  String statusMessage = "Status code: [" + output.getStatus().getCode()
      + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
      + output.getStatus().getArgs() + "]";
  assertEquals(statusMessage, ProtocolStatus.SUCCESS, output.getStatus().getCode());

  // MIME type as exposed directly by the content object.
  assertNotNull(output.getContent());
  assertNotNull(output.getContent().getContentType());
  assertEquals(expectedMimeType, output.getContent().getContentType());

  // MIME type as exposed through the metadata.
  assertNotNull(output.getContent().getMetadata());
  assertEquals(expectedMimeType,
      output.getContent().getMetadata().get(Response.CONTENT_TYPE));
}
 
Example #5
Source File: FileResponse.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Builds an HTTP-like response for a directory: the body is the HTML listing
 * produced by list2html, the Content-Length, Content-Type and Last-Modified
 * headers are filled in, and the response code is set to 200.
 *
 * @param f the directory to list
 * @throws IOException as declared by the signature
 */
private void getDirAsHttpResponse(java.io.File f) throws IOException {

  String path = f.toString();

  // The third list2html argument is true only when parent crawling is
  // enabled and the path is not "/".
  // NOTE(review): presumably it controls a link to the parent directory -
  // confirm against list2html.
  boolean flag = this.file.crawlParents && !"/".equals(path);
  this.content = list2html(f.listFiles(), path, flag);

  // set headers
  // Integer.toString avoids the deprecated boxing constructor.
  headers.set(Response.CONTENT_LENGTH, Integer.toString(this.content.length));
  headers.set(Response.CONTENT_TYPE, "text/html");
  headers.set(Response.LAST_MODIFIED,
      HttpDateFormat.toString(f.lastModified()));

  // response code
  this.code = 200; // http OK
}
 
Example #6
Source File: FileResponse.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Answers a request for a directory with an HTML listing of its entries.
 *
 * The listing becomes the response content; Content-Length is taken from the
 * listing's size, Content-Type is "text/html", Last-Modified is the
 * directory's modification time, and the response code is 200.
 *
 * @param f the directory to list
 * @throws IOException as declared by the signature
 */
private void getDirAsHttpResponse(java.io.File f) throws IOException {

    String path = f.toString();

    // True only when parent crawling is enabled and the path is not "/".
    // NOTE(review): presumably tells list2html to link to the parent
    // directory - confirm against list2html.
    boolean flag = this.file.crawlParents && !"/".equals(path);
    this.content = list2html(f.listFiles(), path, flag);

    // set headers
    // Integer.toString avoids the deprecated boxing constructor.
    headers.set(Response.CONTENT_LENGTH, Integer.toString(this.content.length));
    headers.set(Response.CONTENT_TYPE, "text/html");
    headers.set(Response.LAST_MODIFIED,
        HttpDateFormat.toString(f.lastModified()));

    // response code
    this.code = 200; // http OK
}
 
Example #7
Source File: TestProtocolFile.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that fetching a sample file yields a successful protocol status and
 * that the expected MIME type shows up both as the content type and under
 * the <code>Response.CONTENT_TYPE</code> metadata key.
 *
 * @param testTextFile file name, appended to the sample directory
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
  assertNotNull(urlString);

  Protocol fileProtocol = new ProtocolFactory(conf).getProtocol(urlString);
  ProtocolOutput output = fileProtocol.getProtocolOutput(new Text(urlString), datum);
  assertNotNull(output);

  // Build the diagnostic first so the assertion itself stays readable.
  String diagnostic = "Status code: [" + output.getStatus().getCode()
      + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
      + output.getStatus().getArgs() + "]";
  assertEquals(diagnostic, ProtocolStatus.SUCCESS, output.getStatus().getCode());

  assertNotNull(output.getContent());

  // content type reported by the content object
  assertNotNull(output.getContent().getContentType());
  assertEquals(expectedMimeType, output.getContent().getContentType());

  // content type reported by the metadata
  assertNotNull(output.getContent().getMetadata());
  assertEquals(expectedMimeType,
      output.getContent().getMetadata().get(Response.CONTENT_TYPE));
}
 
Example #8
Source File: HttpResponse.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the response body from the stream into the content field.
 *
 * The number of bytes kept is bounded by the Content-Length header (when it
 * is present and non-empty) and by http.getMaxContent() (when that is
 * non-negative); the smaller of the two applies. Reading stops at end of
 * stream or as soon as the bound is reached.
 *
 * @param in stream positioned at the start of the body
 * @throws HttpException if the Content-Length header is not a valid integer
 * @throws IOException if reading from the stream fails
 */
private void readPlainContent(InputStream in) throws HttpException, IOException {

    int contentLength = Integer.MAX_VALUE; // unbounded unless limited below
    String contentLengthString = headers.get(Response.CONTENT_LENGTH);
    if (contentLengthString != null) {
        contentLengthString = contentLengthString.trim();
        try {
            if (!contentLengthString.isEmpty()) {
                contentLength = Integer.parseInt(contentLengthString);
            }
        } catch (NumberFormatException e) {
            throw new HttpException("bad content length: " + contentLengthString);
        }
    }

    int maxContent = http.getMaxContent();
    if (maxContent >= 0 && contentLength > maxContent) { // limit download size
        contentLength = maxContent;
    }

    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
    byte[] bytes = new byte[Http.BUFFER_SIZE];
    int length = 0;
    // Never request more than the remaining budget. A chunk that would cross
    // the limit is therefore cut at the limit instead of being dropped
    // entirely, and no bytes beyond the limit are consumed from the stream.
    while (length < contentLength) {
        int wanted = Math.min(bytes.length, contentLength - length);
        int read = in.read(bytes, 0, wanted);
        if (read == -1) {
            break; // end of stream
        }
        out.write(bytes, 0, read);
        length += read;
    }
    content = out.toByteArray();
}
 
Example #9
Source File: HTMLLanguageParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Determines the document's language from, in order of preference: the
 * parse metadata, the document content as analysed by LanguageParser, and
 * the Content-Language entry of the content metadata.
 *
 * @param page the parse result providing both metadata sets
 * @param doc the parsed document handed to the language parser
 * @return the detected language, or null when no source provides one
 */
private String detectLanguage(Parse page, DocumentFragment doc) {
    String language = getLanguageFromMetadata(page.getData().getParseMeta());
    if (language == null) {
        language = new LanguageParser(doc).getLanguage();
    }

    // Only consult the HTTP header when both earlier sources came up empty.
    return (language != null)
        ? language
        : page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
}
 
Example #10
Source File: TestMoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that a file name carried in the Content-Disposition metadata is
 * exposed by MoreIndexingFilter as the document's "title" field.
 */
public void testContentDispositionTitle() throws IndexingException {
  Configuration conf = NutchConfiguration.create();

  Metadata meta = new Metadata();
  meta.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(conf);

  // Minimal parse result: only the metadata matters for this test.
  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], meta);
  NutchDocument indexed = indexingFilter.filter(
      new NutchDocument(),
      new ParseImpl("text", parseData),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext", indexed.getFieldValue("title"));
}
 
Example #11
Source File: MoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Copies the Content-Length metadata value into the "contentLength" field,
 * trimmed of surrounding whitespace. Nothing is added when the value is
 * missing or blank after trimming.
 *
 * @param doc the document to enrich
 * @param data the parse data holding the metadata
 * @param url not used by this method
 * @return the document that was passed in
 */
private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
  String rawLength = data.getMeta(Response.CONTENT_LENGTH);
  if (rawLength == null) {
    return doc;
  }

  // NUTCH-1010: the header value may carry surrounding whitespace.
  String value = rawLength.trim();
  if (!value.isEmpty()) {
    doc.add("contentLength", value);
  }
  return doc;
}
 
Example #12
Source File: TestMoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Runs MoreIndexingFilter over a document whose metadata carries the given
 * Content-Type and asserts the resulting "type" field.
 *
 * @param conf configuration applied to the filter
 * @param source the Content-Type value placed in the metadata
 * @param expected the value the "type" field must have afterwards
 */
private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
  Metadata meta = new Metadata();
  meta.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], meta);
  NutchDocument indexed = indexingFilter.filter(
      new NutchDocument(),
      new ParseImpl("text", parseData),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());

  assertEquals("mime type not detected", expected, indexed.getFieldValue("type"));
}
 
Example #13
Source File: HttpResponse.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the response body from the stream into the content field.
 *
 * The number of bytes kept is bounded by the Content-Length header (when it
 * is present and non-empty) and by http.getMaxContent() (when that is
 * non-negative); the smaller of the two applies. Reading stops at end of
 * stream or as soon as the bound is reached.
 *
 * @param in stream positioned at the start of the body
 * @throws HttpException if the Content-Length header is not a valid integer
 * @throws IOException if reading from the stream fails
 */
private void readPlainContent(InputStream in) 
  throws HttpException, IOException {

  int contentLength = Integer.MAX_VALUE;    // unbounded unless limited below
  String contentLengthString = headers.get(Response.CONTENT_LENGTH);
  if (contentLengthString != null) {
    contentLengthString = contentLengthString.trim();
    try {
      if (!contentLengthString.isEmpty()) {
        contentLength = Integer.parseInt(contentLengthString);
      }
    } catch (NumberFormatException e) {
      throw new HttpException("bad content length: "+contentLengthString);
    }
  }

  int maxContent = http.getMaxContent();
  if (maxContent >= 0 && contentLength > maxContent) {   // limit download size
    contentLength = maxContent;
  }

  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  int length = 0;
  // Never request more than the remaining budget. A chunk that would cross
  // the limit is therefore cut at the limit instead of being dropped
  // entirely, and no bytes beyond the limit are consumed from the stream.
  while (length < contentLength) {
    int wanted = Math.min(bytes.length, contentLength - length);
    int read = in.read(bytes, 0, wanted);
    if (read == -1) {
      break; // end of stream
    }
    out.write(bytes, 0, read);
    length += read;
  }
  content = out.toByteArray();
}
 
Example #14
Source File: HttpBase.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Command-line helper: fetches a single URL with the given protocol
 * instance and prints the status and, when content is returned, its content
 * type, its Content-Length metadata value and its body.
 *
 * Usage: Http [-verbose] [-timeout N] url
 *
 * Prints the usage string and exits with -1 when no arguments are given,
 * when -timeout has no value, when an unknown argument precedes the URL, or
 * when no URL is supplied.
 *
 * @param http the protocol instance used for the fetch; its timeout field is
 *        set to N * 1000 when -timeout N is given
 * @param args command-line arguments
 * @throws Exception propagated from argument parsing or the fetch
 */
protected static void main(HttpBase http, String[] args) throws Exception {
  String url = null;

  String usage = "Usage: Http [-verbose] [-timeout N] url";

  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }

  for (int i = 0; i < args.length; i++) { // parse command line
    boolean last = (i == args.length - 1);
    if (args[i].equals("-timeout")) { // found -timeout option
      if (last) { // the option's value is missing
        System.err.println(usage);
        System.exit(-1);
      }
      http.timeout = Integer.parseInt(args[++i]) * 1000;
    } else if (args[i].equals("-verbose")) {
      // Accepted so existing invocations keep working; it currently has no
      // effect because no log level is adjusted here.
    } else if (!last) {
      System.err.println(usage);
      System.exit(-1);
    } else { // url is the required, final parameter
      url = args[i];
    }
  }

  // An option may have been the final argument, leaving no URL to fetch.
  if (url == null) {
    System.err.println(usage);
    System.exit(-1);
  }

  ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum());
  Content content = out.getContent();

  System.out.println("Status: " + out.getStatus());
  if (content != null) {
    System.out.println("Content Type: " + content.getContentType());
    System.out.println("Content Length: " +
                       content.getMetadata().get(Response.CONTENT_LENGTH));
    System.out.println("Content:");
    String text = new String(content.getContent());
    System.out.println(text);
  }
}
 
Example #15
Source File: EncodingDetector.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Collects character-encoding clues for the given content.
 *
 * When detection is enabled (minConfidence is non-negative), the content
 * type is one of DETECTABLES and the content is longer than MIN_LENGTH,
 * every match reported by the detector is added as a "detect" clue with its
 * confidence. Independently of that, the encoding parsed from the
 * Content-Type metadata value is added as a "header" clue.
 *
 * @param content the fetched content to inspect
 * @param filter passed to the detector's enableInputFilter
 */
public void autoDetectClues(Content content, boolean filter) {
  byte[] data = content.getContent();

  boolean detectable = minConfidence >= 0
      && DETECTABLES.contains(content.getContentType())
      && data.length > MIN_LENGTH;

  if (detectable) {
    CharsetMatch[] matches = null;

    // The detector calls sometimes throw; a failure here only means that
    // no "detect" clues are contributed.
    try {
      detector.enableInputFilter(filter);
      detector.setText(data);
      matches = detector.detectAll();
    } catch (Exception e) {
      LOG.debug("Exception from ICU4J (ignoring): ", e);
    }

    if (matches != null) {
      for (CharsetMatch candidate : matches) {
        addClue(candidate.getName(), "detect", candidate.getConfidence());
      }
    }
  }

  // add character encoding coming from HTTP response header
  String headerContentType = content.getMetadata().get(Response.CONTENT_TYPE);
  addClue(parseCharacterEncoding(headerContentType), "header");
}
 
Example #16
Source File: ParseSegment.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Checks whether the page's content is shorter than its Content-Length
 * metadata value claims.
 *
 * @param content the fetched content
 * @return <code>true</code> if the declared length exceeds the number of
 * bytes actually held; <code>false</code> if it does not, or if this cannot
 * be determined (no bytes, no metadata, or a missing, empty or non-numeric
 * length value).
 */
public static boolean isTruncated(Content content) {
  byte[] contentBytes = content.getContent();
  if (contentBytes == null) {
    return false;
  }

  Metadata metadata = content.getMetadata();
  if (metadata == null) {
    return false;
  }

  String declared = metadata.get(Response.CONTENT_LENGTH);
  if (declared != null) {
    declared = declared.trim();
  }
  if (StringUtil.isEmpty(declared)) {
    return false;
  }

  String url = content.getUrl();
  int inHeaderSize;
  try {
    inHeaderSize = Integer.parseInt(declared);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  int actualSize = contentBytes.length;
  boolean truncated = inHeaderSize > actualSize;
  if (truncated) {
    LOG.info(url + " skipped. Content of size " + inHeaderSize
        + " was truncated to " + actualSize);
  } else if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
  }
  return truncated;
}
 
Example #17
Source File: EncodingDetector.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Gathers encoding clues from two sources: the charset detector (only when
 * minConfidence is non-negative, the content type is listed in DETECTABLES
 * and the content exceeds MIN_LENGTH bytes) and the Content-Type metadata
 * value, which is always consulted.
 *
 * @param content the fetched content to inspect
 * @param filter passed to the detector's enableInputFilter
 */
public void autoDetectClues(Content content, boolean filter) {
  byte[] data = content.getContent();

  boolean runDetector = minConfidence >= 0
      && DETECTABLES.contains(content.getContentType())
      && data.length > MIN_LENGTH;

  if (runDetector) {
    CharsetMatch[] found = null;

    // setText and detectAll will sometimes throw exceptions; treat that as
    // "no matches" rather than failing.
    try {
      detector.enableInputFilter(filter);
      detector.setText(data);
      found = detector.detectAll();
    } catch (Exception e) {
      LOG.debug("Exception from ICU4J (ignoring): ", e);
    }

    if (found != null) {
      for (CharsetMatch m : found) {
        addClue(m.getName(), "detect", m.getConfidence());
      }
    }
  }

  // add character encoding coming from HTTP response header
  String declaredType = content.getMetadata().get(Response.CONTENT_TYPE);
  addClue(parseCharacterEncoding(declaredType), "header");
}
 
Example #18
Source File: HttpResponse.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the response body from the stream into the content field.
 *
 * The number of bytes kept is bounded by the Content-Length header (when it
 * is present and non-empty) and by http.getMaxContent() (when that is
 * non-negative); the smaller of the two applies. Reading stops at end of
 * stream or as soon as the bound is reached.
 *
 * @param in stream positioned at the start of the body
 * @throws HttpException if the Content-Length header is not a valid integer
 * @throws IOException if reading from the stream fails
 */
private void readPlainContent(InputStream in) 
  throws HttpException, IOException {

  int contentLength = Integer.MAX_VALUE;    // unbounded unless limited below
  String contentLengthString = headers.get(Response.CONTENT_LENGTH);
  if (contentLengthString != null) {
    contentLengthString = contentLengthString.trim();
    try {
      if (!contentLengthString.isEmpty()) {
        contentLength = Integer.parseInt(contentLengthString);
      }
    } catch (NumberFormatException e) {
      throw new HttpException("bad content length: "+contentLengthString);
    }
  }

  int maxContent = http.getMaxContent();
  if (maxContent >= 0 && contentLength > maxContent) {   // limit download size
    contentLength = maxContent;
  }

  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  int length = 0;
  // Never request more than the remaining budget. A chunk that would cross
  // the limit is therefore cut at the limit instead of being dropped
  // entirely, and no bytes beyond the limit are consumed from the stream.
  while (length < contentLength) {
    int wanted = Math.min(bytes.length, contentLength - length);
    int read = in.read(bytes, 0, wanted);
    if (read == -1) {
      break; // end of stream
    }
    out.write(bytes, 0, read);
    length += read;
  }
  content = out.toByteArray();
}
 
Example #19
Source File: TestMoreIndexingFilter.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Helper assertion: indexes a stub document whose metadata declares the
 * given Content-Type and checks the "type" field the filter produces.
 *
 * @param conf configuration applied to the filter
 * @param source the Content-Type value placed in the metadata
 * @param expected the value the "type" field must have afterwards
 */
private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
  Metadata declared = new Metadata();
  declared.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  // Stub parse result; only its metadata is relevant here.
  ParseData stubData = new ParseData(new ParseStatus(), "title", new Outlink[0], declared);
  ParseImpl stubParse = new ParseImpl("text", stubData);

  NutchDocument result = moreFilter.filter(new NutchDocument(), stubParse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  assertEquals("mime type not detected", expected, result.getFieldValue("type"));
}
 
Example #20
Source File: TestMoreIndexingFilter.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that MoreIndexingFilter takes the document title from the file
 * name given in the Content-Disposition metadata.
 */
public void testContentDispositionTitle() throws IndexingException {
  Configuration conf = NutchConfiguration.create();

  Metadata declared = new Metadata();
  declared.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  // Stub parse result; only its metadata is relevant here.
  ParseData stubData = new ParseData(new ParseStatus(), "title", new Outlink[0], declared);
  ParseImpl stubParse = new ParseImpl("text", stubData);

  NutchDocument result = moreFilter.filter(new NutchDocument(), stubParse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext", result.getFieldValue("title"));
}
 
Example #21
Source File: ParseSegment.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Tells whether fewer bytes were stored for a page than its Content-Length
 * metadata value announces.
 *
 * @param content the fetched content
 * @return <code>true</code> when the announced length is larger than the
 * stored byte count; <code>false</code> otherwise, including every case
 * where the comparison cannot be made (no bytes, no metadata, or a missing,
 * empty or non-numeric length value).
 */
public static boolean isTruncated(Content content) {
  byte[] stored = content.getContent();
  if (stored == null) {
    return false;
  }

  Metadata metadata = content.getMetadata();
  if (metadata == null) {
    return false;
  }

  String lengthStr = metadata.get(Response.CONTENT_LENGTH);
  if (lengthStr != null) {
    lengthStr = lengthStr.trim();
  }
  if (StringUtil.isEmpty(lengthStr)) {
    return false;
  }

  String url = content.getUrl();
  int announced;
  try {
    announced = Integer.parseInt(lengthStr);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  int actual = stored.length;
  if (announced <= actual) {
    if (LOG.isDebugEnabled()) {
      LOG.debug(url + " actualSize=" + actual + " inHeaderSize=" + announced);
    }
    return false;
  }

  LOG.info(url + " skipped. Content of size " + announced
      + " was truncated to " + actual);
  return true;
}
 
Example #22
Source File: MoreIndexingFilter.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Adds the trimmed Content-Length metadata value to the document as the
 * "contentLength" field, provided the value exists and is not blank.
 *
 * @param doc the document to enrich
 * @param data the parse data holding the metadata
 * @param url not used by this method
 * @return the document that was passed in
 */
private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
  String headerValue = data.getMeta(Response.CONTENT_LENGTH);

  // NUTCH-1010: trim first, because the raw value may be padded.
  String trimmed = (headerValue == null) ? null : headerValue.trim();
  if (trimmed != null && !trimmed.isEmpty()) {
    doc.add("contentLength", trimmed);
  }
  return doc;
}
 
Example #23
Source File: HttpBase.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Obtains the protocol response for the given URL. The method is abstract:
 * concrete subclasses supply the implementation.
 *
 * @param url the URL to request
 * @param datum the crawl datum supplied alongside the URL
 * @param followRedirects NOTE(review): presumably whether the implementation
 *        should follow redirects itself - confirm against the subclasses
 * @return the response obtained for the URL
 * @throws ProtocolException as declared by the signature
 * @throws IOException as declared by the signature
 */
protected abstract Response getResponse(URL url,
                                      CrawlDatum datum,
                                      boolean followRedirects)
throws ProtocolException, IOException;
 
Example #24
Source File: File.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Quick way for running this class. Useful for debugging.
 *
 * Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url
 *
 * Fetches the URL and prints its content type plus the Content-Length and
 * Last-Modified metadata values to standard error; with -dumpContent the
 * content itself is written to standard output. Prints the usage string and
 * exits with -1 when no arguments are given, when an option is missing its
 * value, when an unknown argument precedes the URL, or when no URL is
 * supplied.
 *
 * @param args command-line arguments
 * @throws Exception propagated from argument parsing or the fetch
 */
public static void main(String[] args) throws Exception {
  int maxContentLength = Integer.MIN_VALUE;
  boolean dumpContent = false;
  String urlString = null;

  String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url";

  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }

  for (int i = 0; i < args.length; i++) {
    String arg = args[i];
    boolean last = (i == args.length - 1);
    boolean takesValue = arg.equals("-logLevel") || arg.equals("-maxContentLength");

    if (takesValue && last) { // the option's value is missing
      System.err.println(usage);
      System.exit(-1);
    }

    if (arg.equals("-logLevel")) {
      // The level is consumed so the remaining arguments stay aligned, but
      // it is not applied: no log level is adjusted here.
      i++;
    } else if (arg.equals("-maxContentLength")) {
      maxContentLength = Integer.parseInt(args[++i]);
    } else if (arg.equals("-dumpContent")) {
      dumpContent = true;
    } else if (!last) {
      System.err.println(usage);
      System.exit(-1);
    } else {
      urlString = arg;
    }
  }

  // An option may have been the final argument, leaving no URL to fetch.
  if (urlString == null) {
    System.err.println(usage);
    System.exit(-1);
  }

  File file = new File();
  file.setConf(NutchConfiguration.create());

  if (maxContentLength != Integer.MIN_VALUE) { // set maxContentLength
    file.setMaxContentLength(maxContentLength);
  }

  Content content = file.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

  System.err.println("Content-Type: " + content.getContentType());
  System.err.println("Content-Length: " +
                     content.getMetadata().get(Response.CONTENT_LENGTH));
  System.err.println("Last-Modified: " +
                     content.getMetadata().get(Response.LAST_MODIFIED));
  if (dumpContent) {
    System.out.print(new String(content.getContent()));
  }
}
 
Example #25
Source File: FileResponse.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Builds an HTTP-like response for a regular file.
 *
 * The file's bytes become the response content, capped at
 * file.maxContentLength when that limit is non-negative. The Content-Length
 * header always carries the full file size, even when the captured content
 * was capped. Last-Modified and Content-Type (empty string when no MIME type
 * is found) are set as well, and the response code is 200.
 *
 * @param f the file to read
 * @throws FileException if the file is larger than Integer.MAX_VALUE bytes
 * @throws IOException if the file cannot be opened, read or closed
 */
private void getFileAsHttpResponse(java.io.File f) throws FileException,
    IOException {

  // ignore file of size larger than
  // Integer.MAX_VALUE = 2^31-1 = 2147483647
  long size = f.length();
  if (size > Integer.MAX_VALUE) {
    throw new FileException("file is too large, size: " + size);
    // or we can do this?
    // this.code = 400; // http Bad request
    // return;
  }

  // capture content
  int len = (int) size;

  if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
    len = this.file.maxContentLength;

  this.content = new byte[len];

  java.io.InputStream is = new java.io.FileInputStream(f);
  try {
    int offset = 0;
    int n = 0;
    while (offset < len
        && (n = is.read(this.content, offset, len - offset)) >= 0) {
      offset += n;
    }
    if (offset < len) { // keep whatever already have, but issue a warning
      if (File.LOG.isWarnEnabled()) {
        File.LOG.warn("not enough bytes read from file: " + f.getPath());
      }
    }
  } finally {
    // release the file handle even when a read fails
    is.close();
  }

  // set headers
  // Long.toString avoids the deprecated boxing constructor.
  headers.set(Response.CONTENT_LENGTH, Long.toString(size));
  headers.set(Response.LAST_MODIFIED,
      HttpDateFormat.toString(f.lastModified()));

  String mimeType = MIME.getMimeType(f);

  headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");

  // response code
  this.code = 200; // http OK
}
 
Example #26
Source File: FileResponse.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the response for a <code>file:</code> URL.
 *
 * The URL path is URL-decoded as UTF-8 and resolved to a local file. The
 * response code is then set as follows:
 * <ul>
 * <li>404 - the file does not exist</li>
 * <li>401 - the file is not readable</li>
 * <li>300 - the file differs from its canonical form; the Location header
 *     is set to the canonical file's URL</li>
 * <li>304 - the file's modification time is not later than
 *     datum.getModifiedTime(); Last-Modified is set</li>
 * <li>500 - the path is neither a directory nor a regular file</li>
 * </ul>
 * Otherwise the directory or file handler fills in content, headers and
 * code.
 *
 * @param url the URL to fetch; its protocol must be "file"
 * @param datum crawl datum whose modified time drives the 304 check
 * @param file the protocol instance this response belongs to
 * @param conf configuration used to build the MIME utility
 * @throws FileException if the URL's protocol is not "file"
 * @throws IOException if file system access fails
 */
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
  throws FileException, IOException {

  this.orig = url.toString();
  this.base = url.toString();
  this.file = file;
  this.conf = conf;
  
  MIME = new MimeUtil(conf);
  tika = new Tika();

  if (!"file".equals(url.getProtocol()))
    throw new FileException("Not a file url:" + url);

  if (File.LOG.isTraceEnabled()) {
    File.LOG.trace("fetching " + url);
  }

  // Compare by value: a reference comparison depends on whether the URL
  // happens to hand out the same String instance for both accessors.
  if (!url.getPath().equals(url.getFile())) {
    if (File.LOG.isWarnEnabled()) {
      File.LOG.warn("url.getPath() != url.getFile(): " + url);
    }
  }

  String path = "".equals(url.getPath()) ? "/" : url.getPath();

  try {
    // specify the encoding via the config later?
    path = java.net.URLDecoder.decode(path, "UTF-8");
  } catch (UnsupportedEncodingException ignored) {
    // Every Java platform is required to support UTF-8, so this is not
    // expected to happen; the undecoded path is used if it ever does.
  }

  this.content = null;

  // url.toURI() is only in j2se 1.5.0
  //java.io.File f = new java.io.File(url.toURI());
  java.io.File f = new java.io.File(path);

  if (!f.exists()) {
    this.code = 404;  // http Not Found
    return;
  }

  if (!f.canRead()) {
    this.code = 401;  // http Unauthorized
    return;
  }

  // symbolic link or relative path on unix
  // fix me: what's the consequence on windows platform
  // where case is insensitive
  if (!f.equals(f.getCanonicalFile())) {
    // set headers
    //hdrs.put("Location", f.getCanonicalFile().toURI());
    //
    // we want to automatically escape characters that are illegal in URLs. 
    // It is recommended that new code convert an abstract pathname into a URL 
    // by first converting it into a URI, via the toURI method, and then 
    // converting the URI into a URL via the URI.toURL method.
    headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());

    this.code = 300;  // http redirect
    return;
  }
  if (f.lastModified() <= datum.getModifiedTime()) {
    this.code = 304;
    this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
    return;
  }

  if (f.isDirectory()) {
    getDirAsHttpResponse(f);
  } else if (f.isFile()) {
    getFileAsHttpResponse(f);
  } else {
    this.code = 500; // http Internal Server Error
  }

}
 
Example #27
Source File: MoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * <p>
 * Adds the resolved MIME type to the "type" field and, unless disabled via
 * <code>moreIndexingFilter.indexMimeTypeParts</code>, each part returned by
 * getParts for it as well, so that search results can be confined by the
 * full content type or by one of its parts.
 * </p>
 * <p>
 * For example, if contentType is application/vnd.ms-powerpoint, search can be
 * done with one of the following qualifiers
 * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint
 * all case insensitive. The query filter is implemented in
 * {@link TypeQueryFilter}.
 * </p>
 * <p>
 * The declared content type is taken from the crawl datum's metadata, or
 * from the parse metadata when the datum has none. Without any declared
 * type the MIME type is detected from the URL.
 * </p>
 *
 * @param doc the document to enrich
 * @param data parse data consulted when the datum carries no content type
 * @param url used for detection when no content type is declared
 * @param datum crawl datum whose metadata is consulted first
 * @return the document that was passed in; unchanged when no MIME type
 *         could be resolved
 */
private NutchDocument addType(NutchDocument doc, ParseData data, String url,
    CrawlDatum datum) {
  // The crawl datum's metadata takes precedence over the parse metadata.
  Writable datumContentType = datum.getMetaData().get(
      new Text(Response.CONTENT_TYPE));
  String contentType = (datumContentType != null)
      ? datumContentType.toString()
      : data.getMeta(Response.CONTENT_TYPE);

  String mimeType;
  if (contentType != null) {
    mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
  } else {
    // No declared content type: detection has only the URL to go on.
    mimeType = tika.detect(url);
  }

  // Nothing to index when the type could not be resolved.
  if (mimeType == null) {
    return doc;
  }

  // Replace the type by its configured mapping, when mapping is enabled
  // and an entry exists for it.
  if (mapMimes && mimeMap.containsKey(mimeType)) {
    mimeType = mimeMap.get(mimeType);
  }

  doc.add("type", mimeType);

  // Optionally index the individual parts of the content type as well.
  if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
    for (String part : getParts(mimeType)) {
      doc.add("type", part);
    }
  }

  return doc;
}
 
Example #28
Source File: FileResponse.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Packages this response as a Content object built from the original and
 * base URLs, the captured bytes (EMPTY_CONTENT when none were captured), the
 * Content-Type header value, all headers and the configuration.
 *
 * @return a new Content describing this response
 */
public Content toContent() {
  byte[] body = content;
  if (body == null) {
    body = EMPTY_CONTENT;
  }
  return new Content(orig, base, body,
      getHeader(Response.CONTENT_TYPE), headers, this.conf);
}
 
Example #29
Source File: Ftp.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * For debugging.
 *
 * Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N]
 * [-maxContentLength L] [-dumpContent] url
 *
 * Fetches the URL and prints its content type plus the Content-Length and
 * Last-Modified metadata values to standard error; with -dumpContent the
 * content itself is written to standard output. Prints the usage string and
 * exits with -1 when no arguments are given, when an option is missing its
 * value, when an unknown argument precedes the URL, or when no URL is
 * supplied.
 *
 * @param args command-line arguments
 * @throws Exception propagated from argument parsing or the fetch
 */
public static void main(String[] args) throws Exception {
  int timeout = Integer.MIN_VALUE;
  int maxContentLength = Integer.MIN_VALUE;
  boolean followTalk = false;
  boolean keepConnection = false;
  boolean dumpContent = false;
  String urlString = null;

  String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";

  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }

  for (int i = 0; i < args.length; i++) {
    String arg = args[i];
    boolean last = (i == args.length - 1);
    boolean takesValue = arg.equals("-logLevel") || arg.equals("-timeout")
        || arg.equals("-maxContentLength");

    if (takesValue && last) { // the option's value is missing
      System.err.println(usage);
      System.exit(-1);
    }

    if (arg.equals("-logLevel")) {
      // The level is consumed so the remaining arguments stay aligned, but
      // it is not applied: no log level is adjusted here.
      i++;
    } else if (arg.equals("-followTalk")) {
      followTalk = true;
    } else if (arg.equals("-keepConnection")) {
      keepConnection = true;
    } else if (arg.equals("-timeout")) {
      timeout = Integer.parseInt(args[++i]) * 1000;
    } else if (arg.equals("-maxContentLength")) {
      maxContentLength = Integer.parseInt(args[++i]);
    } else if (arg.equals("-dumpContent")) {
      dumpContent = true;
    } else if (!last) {
      System.err.println(usage);
      System.exit(-1);
    } else {
      urlString = arg;
    }
  }

  // An option may have been the final argument, leaving no URL to fetch.
  if (urlString == null) {
    System.err.println(usage);
    System.exit(-1);
  }

  Ftp ftp = new Ftp();

  ftp.setFollowTalk(followTalk);
  ftp.setKeepConnection(keepConnection);

  if (timeout != Integer.MIN_VALUE) { // set timeout
    ftp.setTimeout(timeout);
  }

  if (maxContentLength != Integer.MIN_VALUE) { // set maxContentLength
    ftp.setMaxContentLength(maxContentLength);
  }

  Content content = ftp.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

  System.err.println("Content-Type: " + content.getContentType());
  System.err.println("Content-Length: " +
                     content.getMetadata().get(Response.CONTENT_LENGTH));
  System.err.println("Last-Modified: " +
                    content.getMetadata().get(Response.LAST_MODIFIED));
  if (dumpContent) {
    System.out.print(new String(content.getContent()));
  }
}
 
Example #30
Source File: Http.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Produces the response for a URL by constructing an HttpResponse bound to
 * this protocol instance.
 *
 * @param url the URL to request
 * @param datum the crawl datum handed to the response
 * @param redirect not consulted by this implementation
 * @return the newly created response
 * @throws ProtocolException as declared by the signature
 * @throws IOException as declared by the signature
 */
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
  throws ProtocolException, IOException {
  Response response = new HttpResponse(this, url, datum);
  return response;
}