org.apache.nutch.net.protocols.Response Java Examples

Source File: LanguageIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Adds a {@code "lang"} field to the document being indexed.
 * <p>
 * The value comes from the {@code Metadata.LANGUAGE} entry of the parse
 * metadata; when that entry is absent, the {@code Response.CONTENT_LANGUAGE}
 * entry of the content metadata is used instead. If the result is still
 * {@code null} or empty, the literal {@code "unknown"} is indexed.
 * </p>
 *
 * @param doc     document that receives the {@code "lang"} field
 * @param parse   parse result whose metadata is inspected
 * @param url     not used by this method
 * @param datum   not used by this method
 * @param inlinks not used by this method
 * @return the same {@code doc} instance that was passed in
 * @throws IndexingException declared by the signature; not thrown directly here
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // language recorded during parsing, possibly put there by HTMLLanguageParser
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  if (language == null) {
    // nothing recorded by the parser: see whether the HTTP header tells us the language
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean resolved = language != null && language.length() > 0;
  doc.add("lang", resolved ? language : "unknown");

  return doc;
}

Source File: HTMLLanguageParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Try to find the document's language from page headers and metadata.
 * <p>
 * Sources are consulted in this order, and the first non-null answer wins:
 * the parse metadata (via {@code getLanguageFromMetadata}), a
 * {@code LanguageParser} built over the DOM fragment, and finally the
 * {@code Response.CONTENT_LANGUAGE} entry of the content metadata.
 * </p>
 *
 * @param page parse result supplying parse and content metadata
 * @param doc  DOM fragment handed to {@code LanguageParser}
 * @return the detected language, or {@code null} when no source provides one
 */
private String detectLanguage(Parse page, DocumentFragment doc) {
    String fromParseMeta = getLanguageFromMetadata(page.getData().getParseMeta());
    if (fromParseMeta != null) {
        return fromParseMeta;
    }

    String fromDocument = new LanguageParser(doc).getLanguage();
    if (fromDocument != null) {
        return fromDocument;
    }

    // last resort: whatever the content metadata holds (may be null)
    return page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
}

Source File: LanguageIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Indexes the language of a page under the {@code "lang"} field.
 * <p>
 * Looks first at the {@code Metadata.LANGUAGE} entry of the parse metadata
 * and, only when that entry is missing, at the
 * {@code Response.CONTENT_LANGUAGE} entry of the content metadata. A
 * {@code null} or empty outcome is replaced by {@code "unknown"}.
 * </p>
 *
 * @param doc     document to enrich
 * @param parse   source of the parse and content metadata
 * @param url     unused here
 * @param datum   unused here
 * @param inlinks unused here
 * @return {@code doc}, after the field has been added
 * @throws IndexingException declared by the signature; not thrown directly here
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // value stored at parse time, possibly by HTMLLanguageParser
  String detected = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // fall back to the language announced in the HTTP header
  if (detected == null) {
    detected = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  if (detected != null && detected.length() != 0) {
    doc.add("lang", detected);
  } else {
    doc.add("lang", "unknown");
  }

  return doc;
}

Source File: TestProtocolFile.java From anthelion with Apache License 2.0

6 votes

/**
 * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata field.
 * <p>
 * Builds a <code>file:</code> URL for the given sample file, fetches it through
 * the protocol returned by <code>ProtocolFactory</code>, and asserts that the
 * fetch succeeded and that both the content type and the
 * <code>Response.CONTENT_TYPE</code> metadata entry equal
 * <code>expectedMimeType</code>.
 * </p>
 *
 * @param testTextFile name of the sample file, relative to <code>sampleDir</code>
 * @throws ProtocolException declared by the signature; propagated from the protocol lookup/fetch
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  final String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
  assertNotNull(urlString);

  final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
  final ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString), datum);
  assertNotNull(output);

  // the fetch itself must have succeeded before the content is inspected
  final String statusMessage = "Status code: [" + output.getStatus().getCode()
      + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
      + output.getStatus().getArgs() + "]";
  assertEquals(statusMessage, ProtocolStatus.SUCCESS, output.getStatus().getCode());

  // content type as reported by the content object
  assertNotNull(output.getContent());
  assertNotNull(output.getContent().getContentType());
  assertEquals(expectedMimeType, output.getContent().getContentType());

  // content type as stored in the metadata
  assertNotNull(output.getContent().getMetadata());
  assertEquals(expectedMimeType,
      output.getContent().getMetadata().get(Response.CONTENT_TYPE));
}

Source File: FileResponse.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Renders the listing of directory {@code f} as an HTML body and fills in the
 * response headers and status code as if it were an HTTP response.
 * <p>
 * Sets {@code content} to the output of {@code list2html}, sets the
 * Content-Length, Content-Type ({@code text/html}) and Last-Modified headers,
 * and sets the response code to 200.
 * </p>
 *
 * @param f directory to list
 * @throws IOException declared by the signature; propagated from {@code list2html}
 */
private void getDirAsHttpResponse(java.io.File f) throws IOException {

  String path = f.toString();

  // The third list2html argument is true only when parents are crawled and
  // the directory is not the root "/".
  // NOTE(review): presumably it controls a link to the parent directory - confirm in list2html.
  boolean crawlParentsAndNotRoot = this.file.crawlParents && !"/".equals(path);
  this.content = list2html(f.listFiles(), path, crawlParentsAndNotRoot);

  // set headers
  // Integer.toString replaces the deprecated new Integer(int) boxing constructor
  headers.set(Response.CONTENT_LENGTH, Integer.toString(this.content.length));
  headers.set(Response.CONTENT_TYPE, "text/html");
  headers.set(Response.LAST_MODIFIED,
      HttpDateFormat.toString(f.lastModified()));

  // response code
  this.code = 200; // http OK
}

Source File: FileResponse.java From anthelion with Apache License 2.0

6 votes

/**
 * Produces an HTML directory listing for {@code f} and stores it, together
 * with matching headers and a 200 status code, in this response.
 *
 * @param f directory whose entries are listed
 * @throws IOException declared by the signature; propagated from {@code list2html}
 */
private void getDirAsHttpResponse(java.io.File f) throws IOException {

    String path = f.toString();

    // The flag passed to list2html is set only when parent crawling is enabled
    // and the directory is not the root "/".
    // NOTE(review): presumably it toggles a parent-directory link - confirm in list2html.
    boolean crawlParentsAndNotRoot = this.file.crawlParents && !"/".equals(path);
    this.content = list2html(f.listFiles(), path, crawlParentsAndNotRoot);

    // set headers
    // avoid the deprecated new Integer(int) constructor
    headers.set(Response.CONTENT_LENGTH, Integer.toString(this.content.length));
    headers.set(Response.CONTENT_TYPE, "text/html");
    headers.set(Response.LAST_MODIFIED,
        HttpDateFormat.toString(f.lastModified()));

    // response code
    this.code = 200; // http OK
}

Source File: TestProtocolFile.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata field.
 * <p>
 * The sample file is fetched through its <code>file:</code> URL; the test then
 * checks the protocol status, the content type of the returned content and
 * the <code>Response.CONTENT_TYPE</code> metadata entry against
 * <code>expectedMimeType</code>.
 * </p>
 *
 * @param testTextFile sample file name, resolved against <code>sampleDir</code>
 * @throws ProtocolException declared by the signature; propagated from the protocol lookup/fetch
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
  assertNotNull(urlString);

  ProtocolFactory factory = new ProtocolFactory(conf);
  Protocol protocol = factory.getProtocol(urlString);
  Text fetchUrl = new Text(urlString);
  ProtocolOutput output = protocol.getProtocolOutput(fetchUrl, datum);
  assertNotNull(output);

  // status check comes first so a failed fetch is reported with its details
  String onFailure = "Status code: [" + output.getStatus().getCode()
      + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
      + output.getStatus().getArgs() + "]";
  assertEquals(onFailure, ProtocolStatus.SUCCESS, output.getStatus().getCode());

  assertNotNull(output.getContent());
  assertNotNull(output.getContent().getContentType());
  assertEquals(expectedMimeType, output.getContent().getContentType());

  assertNotNull(output.getContent().getMetadata());
  assertEquals(expectedMimeType,
      output.getContent().getMetadata().get(Response.CONTENT_TYPE));
}

Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Reads a plain (non-chunked) response body from {@code in} into
 * {@code content}.
 * <p>
 * The number of bytes kept is bounded by the Content-Length header, when it
 * is present and non-empty, and by {@code http.getMaxContent()} when that is
 * non-negative. Each read requests no more than the bytes still allowed, so
 * the body is cut exactly at the limit rather than at the last whole buffer
 * below it, and no further read is issued once the limit is reached.
 * </p>
 *
 * @param in stream positioned at the start of the body
 * @throws HttpException if the Content-Length header is not a valid integer
 * @throws IOException   if reading from {@code in} fails
 */
private void readPlainContent(InputStream in) throws HttpException, IOException {

        // without a usable header, read until end of stream
        int contentLength = Integer.MAX_VALUE;
        String contentLengthString = headers.get(Response.CONTENT_LENGTH);
        if (contentLengthString != null) {
            contentLengthString = contentLengthString.trim();
            try {
                if (!contentLengthString.isEmpty())
                    contentLength = Integer.parseInt(contentLengthString);
            } catch (NumberFormatException e) {
                throw new HttpException("bad content length: " + contentLengthString);
            }
        }
        if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) // limit download size
            contentLength = http.getMaxContent();

        ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
        byte[] bytes = new byte[Http.BUFFER_SIZE];
        int length = 0; // bytes stored so far
        while (length < contentLength) {
            // never ask for more than is still allowed; this keeps the final
            // partial buffer instead of discarding it
            int wanted = Math.min(bytes.length, contentLength - length);
            int read = in.read(bytes, 0, wanted);
            if (read == -1) {
                break; // end of stream before the limit
            }
            out.write(bytes, 0, read);
            length += read;
        }
        content = out.toByteArray();
    }

Source File: HTMLLanguageParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Try to find the document's language from page headers and metadata.
 * <p>
 * Lookup order: parse metadata, then a {@code LanguageParser} over the DOM
 * fragment, then the {@code Response.CONTENT_LANGUAGE} content-metadata entry.
 * </p>
 *
 * @param page parse result providing both metadata sets
 * @param doc  DOM fragment given to {@code LanguageParser}
 * @return the first non-null language found, or {@code null} if none is available
 */
private String detectLanguage(Parse page, DocumentFragment doc) {
    String language = getLanguageFromMetadata(page.getData().getParseMeta());

    // only build a LanguageParser when the metadata gave no answer
    language = (language != null) ? language : new LanguageParser(doc).getLanguage();

    if (language == null) {
        // fall back to the HTTP header value, which may itself be null
        language = page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
    }
    return language;
}

Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Verifies that the file name from a <code>Response.CONTENT_DISPOSITION</code>
 * metadata entry ends up in the document's <code>title</code> field after
 * <code>MoreIndexingFilter</code> has run.
 *
 * @throws IndexingException declared by the signature; propagated from the filter
 */
public void testContentDispositionTitle() throws IndexingException {
  Configuration conf = NutchConfiguration.create();

  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(conf);

  // minimal parse result carrying only the metadata under test
  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument indexed = indexingFilter.filter(new NutchDocument(), parse, pageUrl,
      new CrawlDatum(), new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext", indexed.getFieldValue("title"));
}

Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds the trimmed <code>Response.CONTENT_LENGTH</code> metadata value to the
 * document as field <code>contentLength</code>, if the value is present and
 * not blank after trimming.
 *
 * @param doc  document to add the field to
 * @param data parse data supplying the metadata value
 * @param url  not used by this method
 * @return the <code>doc</code> instance that was passed in
 */
private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
  String rawLength = data.getMeta(Response.CONTENT_LENGTH);
  if (rawLength == null) {
    return doc;
  }

  // NUTCH-1010 ContentLength not trimmed
  String value = rawLength.trim();
  if (value.length() > 0) {
    doc.add("contentLength", value);
  }
  return doc;
}

Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Runs <code>MoreIndexingFilter</code> over a minimal parse whose metadata
 * holds <code>source</code> as <code>Response.CONTENT_TYPE</code>, and asserts
 * that the document's <code>type</code> field equals <code>expected</code>.
 *
 * @param conf     configuration handed to the filter
 * @param source   content-type value placed in the metadata
 * @param expected value expected in the <code>type</code> field
 * @throws IndexingException declared by the signature; propagated from the filter
 */
private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument indexed = indexingFilter.filter(new NutchDocument(), parse, pageUrl,
      new CrawlDatum(), new Inlinks());

  assertEquals("mime type not detected", expected, indexed.getFieldValue("type"));
}

Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Reads a plain (non-chunked) response body from {@code in} into
 * {@code content}.
 * <p>
 * At most {@code contentLength} bytes are kept, where that limit is the
 * Content-Length header value (if present and non-empty) capped by
 * {@code http.getMaxContent()} when the latter is non-negative. Reads are
 * sized to the remaining allowance, so the body is truncated exactly at the
 * limit and no extra read is attempted after the limit is reached.
 * </p>
 *
 * @param in stream positioned at the start of the body
 * @throws HttpException if the Content-Length header is not a valid integer
 * @throws IOException   if reading from {@code in} fails
 */
private void readPlainContent(InputStream in) 
  throws HttpException, IOException {

  int contentLength = Integer.MAX_VALUE;    // default: read until end of stream
  String contentLengthString = headers.get(Response.CONTENT_LENGTH);
  if (contentLengthString != null) {
    contentLengthString = contentLengthString.trim();
    try {
      if (!contentLengthString.isEmpty()) 
        contentLength = Integer.parseInt(contentLengthString);
    } catch (NumberFormatException e) {
      throw new HttpException("bad content length: "+contentLengthString);
    }
  }
  if (http.getMaxContent() >= 0
    && contentLength > http.getMaxContent())   // limit download size
    contentLength  = http.getMaxContent();

  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  int length = 0;                           // bytes stored so far
  while (length < contentLength) {
    // request only what is still allowed, so the last partial buffer is
    // kept rather than dropped as a whole
    int wanted = Math.min(bytes.length, contentLength - length);
    int read = in.read(bytes, 0, wanted);
    if (read == -1) {
      break;                                // end of stream before the limit
    }
    out.write(bytes, 0, read);
    length += read;
  }
  content = out.toByteArray();
}

Source File: HttpBase.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Command-line driver: fetches one URL through {@code http} and prints the
 * status, content type, content length and body to standard output.
 * <p>
 * Usage: {@code Http [-verbose] [-timeout N] url}. {@code -timeout} takes a
 * value in seconds and is stored in milliseconds. {@code -verbose} is
 * accepted but currently has no effect. On any usage error (no arguments,
 * an option missing its value, an unexpected argument, or no URL) the usage
 * line is printed to standard error and the JVM exits with status -1.
 * </p>
 *
 * @param http protocol instance used for the fetch; its timeout may be modified
 * @param args command-line arguments
 * @throws Exception on a non-numeric timeout or any failure while fetching
 */
protected static void main(HttpBase http, String[] args) throws Exception {
    String url = null;

    String usage = "Usage: Http [-verbose] [-timeout N] url";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    for (int i = 0; i < args.length; i++) { // parse command line
      if (args[i].equals("-timeout")) { // found -timeout option
        if (i + 1 >= args.length) { // option given without its value
          System.err.println(usage);
          System.exit(-1);
        }
        http.timeout = Integer.parseInt(args[++i]) * 1000;
      } else if (args[i].equals("-verbose")) { // found -verbose option
        // accepted for compatibility; no logging level is changed here
      } else if (i != args.length - 1) {
        System.err.println(usage);
        System.exit(-1);
      } else // url is the required last parameter
        url = args[i];
    }

    if (url == null) { // only options were given
      System.err.println(usage);
      System.exit(-1);
    }

    ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum());
    Content content = out.getContent();

    System.out.println("Status: " + out.getStatus());
    if (content != null) {
      System.out.println("Content Type: " + content.getContentType());
      System.out.println("Content Length: " +
                         content.getMetadata().get(Response.CONTENT_LENGTH));
      System.out.println("Content:");
      // NOTE(review): decodes with the platform default charset
      String text = new String(content.getContent());
      System.out.println(text);
    }
  }

Source File: EncodingDetector.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Collects character-encoding clues for the given content.
 * <p>
 * When {@code minConfidence} is non-negative, the content type is listed in
 * {@code DETECTABLES} and the data is longer than {@code MIN_LENGTH}, the
 * detector is run and every match is added as a {@code "detect"} clue with
 * its confidence. Independently of that, the encoding parsed from the
 * {@code Response.CONTENT_TYPE} metadata entry is added as a
 * {@code "header"} clue.
 * </p>
 *
 * @param content content whose bytes and metadata are examined
 * @param filter  value passed to {@code detector.enableInputFilter}
 */
public void autoDetectClues(Content content, boolean filter) {
  byte[] raw = content.getContent();

  boolean worthDetecting = minConfidence >= 0
      && DETECTABLES.contains(content.getContentType())
      && raw.length > MIN_LENGTH;

  if (worthDetecting) {
    CharsetMatch[] candidates = null;

    // setText and detectAll will sometimes throw exceptions, so the whole
    // detection step is guarded and failures are only logged
    try {
      detector.enableInputFilter(filter);
      detector.setText(raw);
      candidates = detector.detectAll();
    } catch (Exception e) {
      LOG.debug("Exception from ICU4J (ignoring): ", e);
    }

    if (candidates != null) {
      for (int k = 0; k < candidates.length; k++) {
        addClue(candidates[k].getName(), "detect", candidates[k].getConfidence());
      }
    }
  }

  // add character encoding coming from HTTP response header
  String headerContentType = content.getMetadata().get(Response.CONTENT_TYPE);
  addClue(parseCharacterEncoding(headerContentType), "header");
}

Source File: ParseSegment.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Checks if the page's content is truncated, by comparing the size announced
 * in the <code>Response.CONTENT_LENGTH</code> metadata entry with the number
 * of bytes actually held.
 *
 * @param content content to inspect
 * @return <code>true</code> if the announced size is larger than the actual
 * size. <code>false</code> if it is not, or if this could not be determined
 * (no bytes, no metadata, missing/blank or non-numeric length).
 */
public static boolean isTruncated(Content content) {
  byte[] body = content.getContent();
  if (body == null) {
    return false;
  }
  Metadata headers = content.getMetadata();
  if (headers == null) {
    return false;
  }

  String declared = headers.get(Response.CONTENT_LENGTH);
  if (declared != null) {
    declared = declared.trim();
  }
  if (StringUtil.isEmpty(declared)) {
    return false;
  }

  String url = content.getUrl();
  int declaredSize;
  try {
    declaredSize = Integer.parseInt(declared);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  int actualSize = body.length;
  boolean truncated = declaredSize > actualSize;
  if (truncated) {
    LOG.info(url + " skipped. Content of size " + declaredSize
        + " was truncated to " + actualSize);
  } else if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + declaredSize);
  }
  return truncated;
}

Source File: EncodingDetector.java From anthelion with Apache License 2.0

5 votes

/**
 * Gathers encoding clues from automatic detection and from the HTTP header.
 * <p>
 * Detection runs only if {@code minConfidence} is non-negative, the content
 * type is in {@code DETECTABLES} and the data exceeds {@code MIN_LENGTH};
 * each resulting match becomes a {@code "detect"} clue. The encoding parsed
 * from the {@code Response.CONTENT_TYPE} metadata entry is always added as a
 * {@code "header"} clue.
 * </p>
 *
 * @param content content to analyse
 * @param filter  forwarded to {@code detector.enableInputFilter}
 */
public void autoDetectClues(Content content, boolean filter) {
  byte[] bytes = content.getContent();

  if (minConfidence >= 0
      && bytes.length > MIN_LENGTH
      && DETECTABLES.contains(content.getContentType())) {
    CharsetMatch[] found = null;

    // the detector may throw from setText or detectAll; such failures are
    // logged at debug level and otherwise ignored
    try {
      detector.enableInputFilter(filter);
      detector.setText(bytes);
      found = detector.detectAll();
    } catch (Exception e) {
      LOG.debug("Exception from ICU4J (ignoring): ", e);
    }

    int count = (found == null) ? 0 : found.length;
    for (int k = 0; k < count; k++) {
      CharsetMatch match = found[k];
      addClue(match.getName(), "detect", match.getConfidence());
    }
  }

  // add character encoding coming from HTTP response header
  String declaredEncoding = parseCharacterEncoding(
      content.getMetadata().get(Response.CONTENT_TYPE));
  addClue(declaredEncoding, "header");
}

Source File: HttpResponse.java From anthelion with Apache License 2.0

5 votes

/**
 * Reads a plain (non-chunked) response body from {@code in} into
 * {@code content}.
 * <p>
 * The amount kept is limited by the Content-Length header (when present and
 * non-empty) and by {@code http.getMaxContent()} (when non-negative). Every
 * read asks only for the bytes still permitted, so truncation happens exactly
 * at the limit and reading stops as soon as the limit is met.
 * </p>
 *
 * @param in stream positioned at the start of the body
 * @throws HttpException if the Content-Length header is not a valid integer
 * @throws IOException   if reading from {@code in} fails
 */
private void readPlainContent(InputStream in) 
  throws HttpException, IOException {

  int contentLength = Integer.MAX_VALUE;    // default: read until end of stream
  String contentLengthString = headers.get(Response.CONTENT_LENGTH);
  if (contentLengthString != null) {
    contentLengthString = contentLengthString.trim();
    try {
      if (!contentLengthString.isEmpty()) 
        contentLength = Integer.parseInt(contentLengthString);
    } catch (NumberFormatException e) {
      throw new HttpException("bad content length: "+contentLengthString);
    }
  }
  if (http.getMaxContent() >= 0
    && contentLength > http.getMaxContent())   // limit download size
    contentLength  = http.getMaxContent();

  ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
  byte[] bytes = new byte[Http.BUFFER_SIZE];
  int length = 0;                           // bytes stored so far
  while (length < contentLength) {
    // cap each read at the remaining allowance; a buffer that would cross
    // the limit is stored partially instead of being discarded
    int remaining = contentLength - length;
    int read = in.read(bytes, 0, Math.min(bytes.length, remaining));
    if (read == -1) {
      break;                                // end of stream before the limit
    }
    out.write(bytes, 0, read);
    length += read;
  }
  content = out.toByteArray();
}

Source File: TestMoreIndexingFilter.java From anthelion with Apache License 2.0

5 votes

/**
 * Helper assertion: feeds <code>source</code> to <code>MoreIndexingFilter</code>
 * as the <code>Response.CONTENT_TYPE</code> metadata value and checks that the
 * resulting document's <code>type</code> field equals <code>expected</code>.
 *
 * @param conf     configuration for the filter
 * @param source   content-type string to index
 * @param expected expected <code>type</code> field value
 * @throws IndexingException declared by the signature; propagated from the filter
 */
private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
  Metadata meta = new Metadata();
  meta.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  NutchDocument emptyDoc = new NutchDocument();
  ParseImpl parse = new ParseImpl("text",
      new ParseData(new ParseStatus(), "title", new Outlink[0], meta));
  NutchDocument result = moreFilter.filter(emptyDoc, parse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  Object actualType = result.getFieldValue("type");
  assertEquals("mime type not detected", expected, actualType);
}

Source File: TestMoreIndexingFilter.java From anthelion with Apache License 2.0

5 votes

/**
 * Checks that <code>MoreIndexingFilter</code> derives the <code>title</code>
 * field from the file name given in a
 * <code>Response.CONTENT_DISPOSITION</code> metadata entry.
 *
 * @throws IndexingException declared by the signature; propagated from the filter
 */
public void testContentDispositionTitle() throws IndexingException {
  Configuration conf = NutchConfiguration.create();

  Metadata meta = new Metadata();
  meta.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  NutchDocument emptyDoc = new NutchDocument();
  ParseImpl parse = new ParseImpl("text",
      new ParseData(new ParseStatus(), "title", new Outlink[0], meta));
  NutchDocument result = moreFilter.filter(emptyDoc, parse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  Object title = result.getFieldValue("title");
  assertEquals("content-disposition not detected", "filename.ext", title);
}

Source File: ParseSegment.java From anthelion with Apache License 2.0

5 votes

/**
 * Checks if the page's content is truncated: the size given by the
 * <code>Response.CONTENT_LENGTH</code> metadata entry is compared with the
 * length of the stored bytes.
 *
 * @param content content to inspect
 * @return <code>true</code> when the header announces more bytes than are
 * stored; <code>false</code> otherwise, including every case where the
 * comparison could not be made (null bytes, null metadata, absent/blank or
 * unparsable length).
 */
public static boolean isTruncated(Content content) {
  byte[] stored = content.getContent();
  if (stored == null) return false;

  Metadata meta = content.getMetadata();
  if (meta == null) return false;

  String headerValue = meta.get(Response.CONTENT_LENGTH);
  String trimmed = (headerValue == null) ? null : headerValue.trim();
  if (StringUtil.isEmpty(trimmed)) {
    return false;
  }

  String url = content.getUrl();
  int announced;
  try {
    announced = Integer.parseInt(trimmed);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  int actualSize = stored.length;
  if (announced <= actualSize) {
    // nothing missing; only worth a debug line
    if (LOG.isDebugEnabled()) {
      LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + announced);
    }
    return false;
  }

  LOG.info(url + " skipped. Content of size " + announced
      + " was truncated to " + actualSize);
  return true;
}

Source File: MoreIndexingFilter.java From anthelion with Apache License 2.0

5 votes

/**
 * Indexes the <code>Response.CONTENT_LENGTH</code> metadata value, trimmed,
 * under the field <code>contentLength</code>. Nothing is added when the value
 * is missing or empty after trimming.
 *
 * @param doc  document receiving the field
 * @param data parse data holding the metadata
 * @param url  unused here
 * @return <code>doc</code>
 */
private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
  String headerValue = data.getMeta(Response.CONTENT_LENGTH);

  // NUTCH-1010 ContentLength not trimmed
  String cleaned = (headerValue == null) ? "" : headerValue.trim();
  if (cleaned.length() != 0) {
    doc.add("contentLength", cleaned);
  }
  return doc;
}

Source File: HttpBase.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Produces the {@link Response} for the given URL. Concrete subclasses supply
 * the implementation.
 *
 * @param url             URL the response is requested for
 * @param datum           crawl datum associated with the URL
 * @param followRedirects presumably tells the implementation whether to
 *                        follow redirects itself - TODO confirm against
 *                        the implementations
 * @return the response for {@code url}
 * @throws ProtocolException declared for implementations to report protocol-level failures
 * @throws IOException       declared for implementations to report I/O failures
 */
protected abstract Response getResponse(URL url,
                                      CrawlDatum datum,
                                      boolean followRedirects)
throws ProtocolException, IOException;

Source File: File.java From nutch-htmlunit with Apache License 2.0

4 votes

/** 
 * Quick way for running this class. Useful for debugging.
 * <p>
 * Usage: {@code File [-logLevel level] [-maxContentLength L] [-dumpContent] url}.
 * Fetches the URL and prints Content-Type, Content-Length and Last-Modified
 * to standard error; with {@code -dumpContent} the body is written to
 * standard output. {@code -logLevel} is accepted, but its value is currently
 * ignored. On any usage error (no arguments, an option missing its value, an
 * unexpected argument, or no URL) the usage line is printed to standard error
 * and the JVM exits with status -1.
 * </p>
 *
 * @param args command-line arguments
 * @throws Exception on a non-numeric option value or any failure while fetching
 */
public static void main(String[] args) throws Exception {
  int maxContentLength = Integer.MIN_VALUE;
  boolean dumpContent = false;
  String urlString = null;

  String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url";

  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }
    
  for (int i = 0; i < args.length; i++) {
    String arg = args[i];
    if (arg.equals("-logLevel") || arg.equals("-maxContentLength")) {
      if (i + 1 >= args.length) { // option given without its value
        System.err.println(usage);
        System.exit(-1);
      }
      String value = args[++i];
      if (arg.equals("-maxContentLength")) {
        maxContentLength = Integer.parseInt(value);
      }
      // the -logLevel value is consumed but not applied
    } else if (arg.equals("-dumpContent")) {
      dumpContent = true;
    } else if (i != args.length-1) {
      System.err.println(usage);
      System.exit(-1);
    } else
      urlString = arg;
  }

  if (urlString == null) { // only options were given
    System.err.println(usage);
    System.exit(-1);
  }

  File file = new File();
  file.setConf(NutchConfiguration.create());

  if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
    file.setMaxContentLength(maxContentLength);

  Content content = file.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
  if (content == null) { // nothing was fetched; avoid a NullPointerException below
    System.err.println("No content returned for " + urlString);
    return;
  }

  System.err.println("Content-Type: " + content.getContentType());
  System.err.println("Content-Length: " +
                     content.getMetadata().get(Response.CONTENT_LENGTH));
  System.err.println("Last-Modified: " +
                     content.getMetadata().get(Response.LAST_MODIFIED));
  if (dumpContent) {
    // NOTE(review): decodes with the platform default charset
    System.out.print(new String(content.getContent()));
  }
}

Source File: FileResponse.java From anthelion with Apache License 2.0

4 votes

/**
 * Loads regular file {@code f} into {@code content} and fills in the headers
 * and status code as if it were an HTTP response.
 * <p>
 * At most {@code file.maxContentLength} bytes are read when that limit is
 * non-negative. The Content-Length header always carries the full file size,
 * even when fewer bytes are stored. The input stream is closed even if
 * reading fails.
 * </p>
 *
 * @param f file to read
 * @throws FileException if the file is larger than {@code Integer.MAX_VALUE} bytes
 * @throws IOException   if opening, reading or closing the file fails
 */
private void getFileAsHttpResponse(java.io.File f) throws FileException,
    IOException {

  // ignore file of size larger than
  // Integer.MAX_VALUE = 2^31-1 = 2147483647
  long size = f.length();
  if (size > Integer.MAX_VALUE) {
    throw new FileException("file is too large, size: " + size);
  }

  // capture content
  int len = (int) size;

  if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
    len = this.file.maxContentLength;

  this.content = new byte[len];

  java.io.InputStream is = new java.io.FileInputStream(f);
  int offset = 0;
  try {
    int n = 0;
    while (offset < len
        && (n = is.read(this.content, offset, len - offset)) >= 0) {
      offset += n;
    }
  } finally {
    // release the file handle even when read() throws
    is.close();
  }
  if (offset < len) { // keep whatever already have, but issue a warning
    if (File.LOG.isWarnEnabled()) {
      File.LOG.warn("not enough bytes read from file: " + f.getPath());
    }
  }

  // set headers
  // Long.toString replaces the deprecated new Long(long) boxing constructor
  headers.set(Response.CONTENT_LENGTH, Long.toString(size));
  headers.set(Response.LAST_MODIFIED,
      HttpDateFormat.toString(f.lastModified()));

  String mimeType = MIME.getMimeType(f);

  headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");

  // response code
  this.code = 200; // http OK
}

Source File: FileResponse.java From anthelion with Apache License 2.0

4 votes

/**
 * Builds a response for a {@code file:} URL, mimicking HTTP status codes.
 * <p>
 * Outcome by case, in the order checked:
 * </p>
 * <ul>
 * <li>path does not exist: code 404</li>
 * <li>path is not readable: code 401</li>
 * <li>path differs from its canonical form (symbolic link or relative path):
 * code 300 with the canonical location in the {@code Response.LOCATION}
 * header</li>
 * <li>last-modified time is not newer than {@code datum.getModifiedTime()}:
 * code 304 with a Last-Modified header</li>
 * <li>directory: handled by {@code getDirAsHttpResponse}</li>
 * <li>regular file: handled by {@code getFileAsHttpResponse}</li>
 * <li>anything else: code 500</li>
 * </ul>
 *
 * @param url   URL to fetch; its protocol must be {@code file}
 * @param datum crawl datum supplying the previously recorded modified time
 * @param file  owning protocol instance
 * @param conf  configuration stored on the response and passed to {@code MimeUtil}
 * @throws FileException if {@code url} is not a {@code file:} URL, or as
 *                       propagated from {@code getFileAsHttpResponse}
 * @throws IOException   if resolving the canonical path or reading fails
 */
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
  throws FileException, IOException {

  this.orig = url.toString();
  this.base = url.toString();
  this.file = file;
  this.conf = conf;
  
  MIME = new MimeUtil(conf);
  tika = new Tika();

  if (!"file".equals(url.getProtocol()))
    throw new FileException("Not a file url:" + url);

  if (File.LOG.isTraceEnabled()) {
    File.LOG.trace("fetching " + url);
  }

  // compare by value: != on Strings tests object identity, which only
  // matched by accident of how URL stores its fields
  if (!url.getPath().equals(url.getFile())) {
    if (File.LOG.isWarnEnabled()) {
      File.LOG.warn("url.getPath() != url.getFile(): " + url);
    }
  }

  String path = "".equals(url.getPath()) ? "/" : url.getPath();

  try {
    // specify the encoding via the config later?
    path = java.net.URLDecoder.decode(path, "UTF-8");
  } catch (UnsupportedEncodingException ex) {
    // deliberately ignored: every Java platform is required to support
    // UTF-8, so this cannot occur; the undecoded path is used if it does
  }

  this.content = null;

  // url.toURI() is only in j2se 1.5.0
  //java.io.File f = new java.io.File(url.toURI());
  java.io.File f = new java.io.File(path);

  if (!f.exists()) {
    this.code = 404;  // http Not Found
    return;
  }

  if (!f.canRead()) {
    this.code = 401;  // http Unauthorized
    return;
  }

  // symbolic link or relative path on unix
  // fix me: what's the consequence on windows platform
  // where case is insensitive
  if (!f.equals(f.getCanonicalFile())) {
    // Going through toURI() before toURL() automatically escapes
    // characters that are illegal in URLs.
    headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());

    this.code = 300;  // http redirect
    return;
  }
  if (f.lastModified() <= datum.getModifiedTime()) {
    this.code = 304;
    this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
    return;
  }

  if (f.isDirectory()) {
    getDirAsHttpResponse(f);
  } else if (f.isFile()) {
    getFileAsHttpResponse(f);
  } else {
    this.code = 500; // http Internal Server Error
  }

}

Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * <p>
 * Adds the resolved content type to field "type", optionally followed by the
 * parts returned by {@code getParts}, so that search results can be confined
 * by contentType or its primaryType or its subType.
 * </p>
 * <p>
 * For example, if contentType is application/vnd.ms-powerpoint, search can be
 * done with one of the following qualifiers
 * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint
 * all case insensitive. The query filter is implemented in
 * {@link TypeQueryFilter}.
 * </p>
 * <p>
 * The declared content type is read from the crawl datum's metadata first and
 * from the parse metadata otherwise. If neither has one, the type is detected
 * from the URL with Tika. When {@code mapMimes} is set and {@code mimeMap}
 * has an entry for the resolved type, the mapped value replaces it.
 * </p>
 *
 * @param doc   document receiving the "type" field values
 * @param data  parse data consulted for {@code Response.CONTENT_TYPE}
 * @param url   URL used for detection when no content type is declared
 * @param datum crawl datum whose metadata is consulted first
 * @return {@code doc}; unchanged when no mime type could be resolved
 */
private NutchDocument addType(NutchDocument doc, ParseData data, String url,
    CrawlDatum datum) {
  // a content type stored on the crawl datum takes precedence over the parse metadata
  Writable datumContentType = datum.getMetaData().get(
      new Text(Response.CONTENT_TYPE));
  String declaredType = (datumContentType != null)
      ? datumContentType.toString()
      : data.getMeta(Response.CONTENT_TYPE);

  String resolvedType;
  if (declaredType != null) {
    resolvedType = MIME.forName(MimeUtil.cleanMimeType(declaredType));
  } else {
    // Content type not solved by a previous plugin: fall back to detection
    // from the URL alone (the document content is not available here).
    resolvedType = tika.detect(url);
  }

  // Checks if we solved the content-type.
  if (resolvedType == null) {
    return doc;
  }

  // Replace the mime type when mapping is enabled and a mapping exists
  if (mapMimes && mimeMap.containsKey(resolvedType)) {
    resolvedType = mimeMap.get(resolvedType);
  }

  doc.add("type", resolvedType);

  // Check if we need to split the content type in sub parts
  if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
    String[] parts = getParts(resolvedType);
    for (int k = 0; k < parts.length; k++) {
      doc.add("type", parts[k]);
    }
  }

  return doc;
}

Source File: FileResponse.java From anthelion with Apache License 2.0

4 votes

/**
 * Wraps this response in a {@code Content} object, using
 * {@code EMPTY_CONTENT} as the body when no content was captured.
 *
 * @return a new {@code Content} built from the original and base URLs, the
 *         body, the {@code Response.CONTENT_TYPE} header, all headers and
 *         the configuration
 */
public Content toContent() {
  if (content != null) {
    return new Content(orig, base, content,
        getHeader(Response.CONTENT_TYPE), headers, this.conf);
  }
  // no body was captured: substitute the shared empty placeholder
  return new Content(orig, base, EMPTY_CONTENT,
      getHeader(Response.CONTENT_TYPE), headers, this.conf);
}

Source File: Ftp.java From anthelion with Apache License 2.0

4 votes

/**
 * For debugging.
 * <p>
 * Usage: {@code Ftp [-logLevel level] [-followTalk] [-keepConnection]
 * [-timeout N] [-maxContentLength L] [-dumpContent] url}. Fetches the URL and
 * prints Content-Type, Content-Length and Last-Modified to standard error;
 * with {@code -dumpContent} the body is written to standard output.
 * {@code -timeout} is given in seconds and passed on in milliseconds.
 * {@code -logLevel} is accepted, but its value is currently ignored. On any
 * usage error (no arguments, an option missing its value, an unexpected
 * argument, or no URL) the usage line is printed to standard error and the
 * JVM exits with status -1.
 * </p>
 *
 * @param args command-line arguments
 * @throws Exception on a non-numeric option value or any failure while fetching
 */
public static void main(String[] args) throws Exception {
  int timeout = Integer.MIN_VALUE;
  int maxContentLength = Integer.MIN_VALUE;
  boolean followTalk = false;
  boolean keepConnection = false;
  boolean dumpContent = false;
  String urlString = null;

  String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";

  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }
    
  for (int i = 0; i < args.length; i++) {
    String arg = args[i];
    boolean takesValue = arg.equals("-logLevel") || arg.equals("-timeout")
        || arg.equals("-maxContentLength");
    if (takesValue) {
      if (i + 1 >= args.length) { // option given without its value
        System.err.println(usage);
        System.exit(-1);
      }
      String value = args[++i];
      if (arg.equals("-timeout")) {
        timeout = Integer.parseInt(value) * 1000;
      } else if (arg.equals("-maxContentLength")) {
        maxContentLength = Integer.parseInt(value);
      }
      // the -logLevel value is consumed but not applied
    } else if (arg.equals("-followTalk")) {
      followTalk = true;
    } else if (arg.equals("-keepConnection")) {
      keepConnection = true;
    } else if (arg.equals("-dumpContent")) {
      dumpContent = true;
    } else if (i != args.length-1) {
      System.err.println(usage);
      System.exit(-1);
    } else {
      urlString = arg;
    }
  }

  if (urlString == null) { // only options were given
    System.err.println(usage);
    System.exit(-1);
  }

  Ftp ftp = new Ftp();

  ftp.setFollowTalk(followTalk);
  ftp.setKeepConnection(keepConnection);

  if (timeout != Integer.MIN_VALUE) // set timeout
    ftp.setTimeout(timeout);

  if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
    ftp.setMaxContentLength(maxContentLength);

  Content content = ftp.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
  if (content == null) { // nothing was fetched; avoid a NullPointerException below
    System.err.println("No content returned for " + urlString);
    return;
  }

  System.err.println("Content-Type: " + content.getContentType());
  System.err.println("Content-Length: " +
                     content.getMetadata().get(Response.CONTENT_LENGTH));
  System.err.println("Last-Modified: " +
                    content.getMetadata().get(Response.LAST_MODIFIED));
  if (dumpContent) {
    // NOTE(review): decodes with the platform default charset
    System.out.print(new String(content.getContent()));
  }
}

Source File: Http.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Creates the {@code HttpResponse} for the given URL and crawl datum.
 *
 * @param url      URL to build the response for
 * @param datum    crawl datum passed on to {@code HttpResponse}
 * @param redirect not used by this implementation
 * @return a new {@code HttpResponse}
 * @throws ProtocolException declared by the signature; propagated from {@code HttpResponse}
 * @throws IOException       declared by the signature; propagated from {@code HttpResponse}
 */
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
  throws ProtocolException, IOException {
  Response response = new HttpResponse(this, url, datum);
  return response;
}

org.apache.nutch.net.protocols.Response Java Examples