org.apache.nutch.net.protocols.Response Java Examples
The following examples show how to use
org.apache.nutch.net.protocols.Response.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: LanguageIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { // check if LANGUAGE found, possibly put there by HTMLLanguageParser String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE); // check if HTTP-header tels us the language if (lang == null) { lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); } if (lang == null || lang.length() == 0) { lang = "unknown"; } doc.add("lang", lang); return doc; }
Example #2
Source File: HTMLLanguageParser.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Determines the language of a page, trying three sources in order:
 * the parse metadata (via {@code getLanguageFromMetadata}), a
 * {@code LanguageParser} run over the document fragment, and finally the
 * {@code Response.CONTENT_LANGUAGE} entry of the content metadata.
 *
 * @param page supplies the parse metadata and content metadata
 * @param doc the document fragment handed to {@code LanguageParser}
 * @return the first non-null language found; may be {@code null} when the
 *         content metadata has no Content-Language entry either
 */
private String detectLanguage(Parse page, DocumentFragment doc) {
  String detected = getLanguageFromMetadata(page.getData().getParseMeta());
  if (detected == null) {
    detected = new LanguageParser(doc).getLanguage();
  }
  return detected != null
      ? detected
      : page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
}
Example #3
Source File: LanguageIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { // check if LANGUAGE found, possibly put there by HTMLLanguageParser String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE); // check if HTTP-header tels us the language if (lang == null) { lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); } if (lang == null || lang.length() == 0) { lang = "unknown"; } doc.add("lang", lang); return doc; }
Example #4
Source File: TestProtocolFile.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Fetches a sample file through the protocol resolved for its
 * {@code file:} URL and verifies that the fetch succeeded and that both the
 * content type and the <code>Response.CONTENT_TYPE</code> metadata entry
 * equal {@code expectedMimeType}.
 *
 * @param testTextFile file name, resolved against {@code sampleDir}
 * @throws ProtocolException declared by the signature
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  final String target = "file:" + sampleDir + fileSeparator + testTextFile;
  assertNotNull(target);

  final Protocol fileProtocol = new ProtocolFactory(conf).getProtocol(target);
  final ProtocolOutput result =
      fileProtocol.getProtocolOutput(new Text(target), datum);
  assertNotNull(result);

  // The fetch itself must have succeeded before the content is inspected.
  final String failure = "Status code: [" + result.getStatus().getCode()
      + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
      + result.getStatus().getArgs() + "]";
  assertEquals(failure, ProtocolStatus.SUCCESS, result.getStatus().getCode());

  // Content type as exposed directly on the content object.
  assertNotNull(result.getContent());
  assertNotNull(result.getContent().getContentType());
  assertEquals(expectedMimeType, result.getContent().getContentType());

  // Content type as exposed through the metadata.
  assertNotNull(result.getContent().getMetadata());
  assertEquals(expectedMimeType,
      result.getContent().getMetadata().get(Response.CONTENT_TYPE));
}
Example #5
Source File: FileResponse.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/** * get dir list as http response * @param f * @throws IOException */ private void getDirAsHttpResponse(java.io.File f) throws IOException { String path = f.toString(); if (this.file.crawlParents) this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true); else this.content = list2html(f.listFiles(), path, false); // set headers headers.set(Response.CONTENT_LENGTH, new Integer(this.content.length).toString()); headers.set(Response.CONTENT_TYPE, "text/html"); headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f.lastModified())); // response code this.code = 200; // http OK }
Example #6
Source File: FileResponse.java From anthelion with Apache License 2.0 | 6 votes |
private void getDirAsHttpResponse(java.io.File f) throws IOException { String path = f.toString(); if (this.file.crawlParents) this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true); else this.content = list2html(f.listFiles(), path, false); // set headers headers.set(Response.CONTENT_LENGTH, new Integer(this.content.length).toString()); headers.set(Response.CONTENT_TYPE, "text/html"); headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f.lastModified())); // response code this.code = 200; // http OK }
Example #7
Source File: TestProtocolFile.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Fetches a sample file through the protocol resolved for its
 * {@code file:} URL and verifies that the fetch succeeded and that both the
 * content type and the <code>Response.CONTENT_TYPE</code> metadata entry
 * equal {@code expectedMimeType}.
 *
 * @param testTextFile file name, resolved against {@code sampleDir}
 * @throws ProtocolException declared by the signature
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  final String target = "file:" + sampleDir + fileSeparator + testTextFile;
  assertNotNull(target);

  final Protocol fileProtocol = new ProtocolFactory(conf).getProtocol(target);
  final ProtocolOutput result =
      fileProtocol.getProtocolOutput(new Text(target), datum);
  assertNotNull(result);

  // The fetch itself must have succeeded before the content is inspected.
  final String failure = "Status code: [" + result.getStatus().getCode()
      + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
      + result.getStatus().getArgs() + "]";
  assertEquals(failure, ProtocolStatus.SUCCESS, result.getStatus().getCode());

  // Content type as exposed directly on the content object.
  assertNotNull(result.getContent());
  assertNotNull(result.getContent().getContentType());
  assertEquals(expectedMimeType, result.getContent().getContentType());

  // Content type as exposed through the metadata.
  assertNotNull(result.getContent().getMetadata());
  assertEquals(expectedMimeType,
      result.getContent().getMetadata().get(Response.CONTENT_TYPE));
}
Example #8
Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
private void readPlainContent(InputStream in) throws HttpException, IOException { int contentLength = Integer.MAX_VALUE; // get content length String contentLengthString = headers.get(Response.CONTENT_LENGTH); if (contentLengthString != null) { contentLengthString = contentLengthString.trim(); try { if (!contentLengthString.isEmpty()) contentLength = Integer.parseInt(contentLengthString); } catch (NumberFormatException e) { throw new HttpException("bad content length: " + contentLengthString); } } if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) // limit download size contentLength = http.getMaxContent(); ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); byte[] bytes = new byte[Http.BUFFER_SIZE]; int length = 0; // read content for (int i = in.read(bytes); i != -1 && length + i <= contentLength; i = in.read(bytes)) { out.write(bytes, 0, i); length += i; } content = out.toByteArray(); }
Example #9
Source File: HTMLLanguageParser.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Determines the language of a page, trying three sources in order:
 * the parse metadata (via {@code getLanguageFromMetadata}), a
 * {@code LanguageParser} run over the document fragment, and finally the
 * {@code Response.CONTENT_LANGUAGE} entry of the content metadata.
 *
 * @param page supplies the parse metadata and content metadata
 * @param doc the document fragment handed to {@code LanguageParser}
 * @return the first non-null language found; may be {@code null} when the
 *         content metadata has no Content-Language entry either
 */
private String detectLanguage(Parse page, DocumentFragment doc) {
  String detected = getLanguageFromMetadata(page.getData().getParseMeta());
  if (detected == null) {
    detected = new LanguageParser(doc).getLanguage();
  }
  return detected != null
      ? detected
      : page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
}
Example #10
Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Verifies that a Content-Disposition entry of "filename=filename.ext" in the
 * parse metadata results in a "title" field value of "filename.ext" after
 * {@code MoreIndexingFilter} has run.
 *
 * @throws IndexingException declared by the signature
 */
public void testContentDispositionTitle() throws IndexingException {
  // Filter under test, configured with the default Nutch configuration.
  Configuration conf = NutchConfiguration.create();
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);

  // Parse result carrying only the Content-Disposition entry.
  Metadata metadata = new Metadata();
  metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
  ParseData parseData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
  ParseImpl parse = new ParseImpl("text", parseData);

  NutchDocument doc = filter.filter(new NutchDocument(), parse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext",
      doc.getFieldValue("title"));
}
Example #11
Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
private NutchDocument addLength(NutchDocument doc, ParseData data, String url) { String contentLength = data.getMeta(Response.CONTENT_LENGTH); if (contentLength != null) { // NUTCH-1010 ContentLength not trimmed String trimmed = contentLength.toString().trim(); if (!trimmed.isEmpty()) doc.add("contentLength", trimmed); } return doc; }
Example #12
Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Runs {@code MoreIndexingFilter} over a parse whose metadata carries the
 * given Content-Type and asserts the resulting "type" field value.
 *
 * @param conf configuration handed to the filter
 * @param source Content-Type value placed in the parse metadata
 * @param expected expected value of the "type" field
 * @throws IndexingException declared by the signature
 */
private void assertContentType(Configuration conf, String source,
    String expected) throws IndexingException {
  // Filter under test.
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);

  // Parse result carrying only the Content-Type entry.
  Metadata metadata = new Metadata();
  metadata.add(Response.CONTENT_TYPE, source);
  ParseData parseData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
  ParseImpl parse = new ParseImpl("text", parseData);

  NutchDocument doc = filter.filter(new NutchDocument(), parse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
}
Example #13
Source File: HttpResponse.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
private void readPlainContent(InputStream in) throws HttpException, IOException { int contentLength = Integer.MAX_VALUE; // get content length String contentLengthString = headers.get(Response.CONTENT_LENGTH); if (contentLengthString != null) { contentLengthString = contentLengthString.trim(); try { if (!contentLengthString.isEmpty()) contentLength = Integer.parseInt(contentLengthString); } catch (NumberFormatException e) { throw new HttpException("bad content length: "+contentLengthString); } } if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) // limit download size contentLength = http.getMaxContent(); ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); byte[] bytes = new byte[Http.BUFFER_SIZE]; int length = 0; // read content for (int i = in.read(bytes); i != -1 && length + i <= contentLength; i = in.read(bytes)) { out.write(bytes, 0, i); length += i; } content = out.toByteArray(); }
Example #14
Source File: HttpBase.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
protected static void main(HttpBase http, String[] args) throws Exception { boolean verbose = false; String url = null; String usage = "Usage: Http [-verbose] [-timeout N] url"; if (args.length == 0) { System.err.println(usage); System.exit(-1); } for (int i = 0; i < args.length; i++) { // parse command line if (args[i].equals("-timeout")) { // found -timeout option http.timeout = Integer.parseInt(args[++i]) * 1000; } else if (args[i].equals("-verbose")) { // found -verbose option verbose = true; } else if (i != args.length - 1) { System.err.println(usage); System.exit(-1); } else // root is required parameter url = args[i]; } // if (verbose) { // LOGGER.setLevel(Level.FINE); // } ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum()); Content content = out.getContent(); System.out.println("Status: " + out.getStatus()); if (content != null) { System.out.println("Content Type: " + content.getContentType()); System.out.println("Content Length: " + content.getMetadata().get(Response.CONTENT_LENGTH)); System.out.println("Content:"); String text = new String(content.getContent()); System.out.println(text); } }
Example #15
Source File: EncodingDetector.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public void autoDetectClues(Content content, boolean filter) { byte[] data = content.getContent(); if (minConfidence >= 0 && DETECTABLES.contains(content.getContentType()) && data.length > MIN_LENGTH) { CharsetMatch[] matches = null; // do all these in a try/catch; setText and detect/detectAll // will sometimes throw exceptions try { detector.enableInputFilter(filter); if (data.length > MIN_LENGTH) { detector.setText(data); matches = detector.detectAll(); } } catch (Exception e) { LOG.debug("Exception from ICU4J (ignoring): ", e); } if (matches != null) { for (CharsetMatch match : matches) { addClue(match.getName(), "detect", match.getConfidence()); } } } // add character encoding coming from HTTP response header addClue(parseCharacterEncoding( content.getMetadata().get(Response.CONTENT_TYPE)), "header"); }
Example #16
Source File: ParseSegment.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Checks whether the page's content is shorter than its Content-Length
 * header claims.
 *
 * @param content the fetched content to inspect
 * @return <code>true</code> if the Content-Length metadata value is larger
 *         than the number of content bytes; <code>false</code> if it is not,
 *         or if this cannot be determined (no content bytes, no metadata,
 *         missing/empty or non-numeric Content-Length)
 */
public static boolean isTruncated(Content content) {
  final byte[] body = content.getContent();
  if (body == null) {
    return false;
  }

  final Metadata meta = content.getMetadata();
  if (meta == null) {
    return false;
  }

  String declared = meta.get(Response.CONTENT_LENGTH);
  if (declared != null) {
    declared = declared.trim();
  }
  if (StringUtil.isEmpty(declared)) {
    return false;
  }

  final String url = content.getUrl();
  int expected;
  try {
    expected = Integer.parseInt(declared);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  final int received = body.length;
  if (expected > received) {
    LOG.info(url + " skipped. Content of size " + expected
        + " was truncated to " + received);
    return true;
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + received + " inHeaderSize=" + expected);
  }
  return false;
}
Example #17
Source File: EncodingDetector.java From anthelion with Apache License 2.0 | 5 votes |
public void autoDetectClues(Content content, boolean filter) { byte[] data = content.getContent(); if (minConfidence >= 0 && DETECTABLES.contains(content.getContentType()) && data.length > MIN_LENGTH) { CharsetMatch[] matches = null; // do all these in a try/catch; setText and detect/detectAll // will sometimes throw exceptions try { detector.enableInputFilter(filter); if (data.length > MIN_LENGTH) { detector.setText(data); matches = detector.detectAll(); } } catch (Exception e) { LOG.debug("Exception from ICU4J (ignoring): ", e); } if (matches != null) { for (CharsetMatch match : matches) { addClue(match.getName(), "detect", match.getConfidence()); } } } // add character encoding coming from HTTP response header addClue(parseCharacterEncoding( content.getMetadata().get(Response.CONTENT_TYPE)), "header"); }
Example #18
Source File: HttpResponse.java From anthelion with Apache License 2.0 | 5 votes |
private void readPlainContent(InputStream in) throws HttpException, IOException { int contentLength = Integer.MAX_VALUE; // get content length String contentLengthString = headers.get(Response.CONTENT_LENGTH); if (contentLengthString != null) { contentLengthString = contentLengthString.trim(); try { if (!contentLengthString.isEmpty()) contentLength = Integer.parseInt(contentLengthString); } catch (NumberFormatException e) { throw new HttpException("bad content length: "+contentLengthString); } } if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) // limit download size contentLength = http.getMaxContent(); ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); byte[] bytes = new byte[Http.BUFFER_SIZE]; int length = 0; // read content for (int i = in.read(bytes); i != -1 && length + i <= contentLength; i = in.read(bytes)) { out.write(bytes, 0, i); length += i; } content = out.toByteArray(); }
Example #19
Source File: TestMoreIndexingFilter.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Runs {@code MoreIndexingFilter} over a parse whose metadata carries the
 * given Content-Type and asserts the resulting "type" field value.
 *
 * @param conf configuration handed to the filter
 * @param source Content-Type value placed in the parse metadata
 * @param expected expected value of the "type" field
 * @throws IndexingException declared by the signature
 */
private void assertContentType(Configuration conf, String source,
    String expected) throws IndexingException {
  // Filter under test.
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);

  // Parse result carrying only the Content-Type entry.
  Metadata metadata = new Metadata();
  metadata.add(Response.CONTENT_TYPE, source);
  ParseData parseData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
  ParseImpl parse = new ParseImpl("text", parseData);

  NutchDocument doc = filter.filter(new NutchDocument(), parse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
}
Example #20
Source File: TestMoreIndexingFilter.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Verifies that a Content-Disposition entry of "filename=filename.ext" in the
 * parse metadata results in a "title" field value of "filename.ext" after
 * {@code MoreIndexingFilter} has run.
 *
 * @throws IndexingException declared by the signature
 */
public void testContentDispositionTitle() throws IndexingException {
  // Filter under test, configured with the default Nutch configuration.
  Configuration conf = NutchConfiguration.create();
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);

  // Parse result carrying only the Content-Disposition entry.
  Metadata metadata = new Metadata();
  metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
  ParseData parseData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
  ParseImpl parse = new ParseImpl("text", parseData);

  NutchDocument doc = filter.filter(new NutchDocument(), parse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext",
      doc.getFieldValue("title"));
}
Example #21
Source File: ParseSegment.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Checks whether the page's content is shorter than its Content-Length
 * header claims.
 *
 * @param content the fetched content to inspect
 * @return <code>true</code> if the Content-Length metadata value is larger
 *         than the number of content bytes; <code>false</code> if it is not,
 *         or if this cannot be determined (no content bytes, no metadata,
 *         missing/empty or non-numeric Content-Length)
 */
public static boolean isTruncated(Content content) {
  final byte[] body = content.getContent();
  if (body == null) {
    return false;
  }

  final Metadata meta = content.getMetadata();
  if (meta == null) {
    return false;
  }

  String declared = meta.get(Response.CONTENT_LENGTH);
  if (declared != null) {
    declared = declared.trim();
  }
  if (StringUtil.isEmpty(declared)) {
    return false;
  }

  final String url = content.getUrl();
  int expected;
  try {
    expected = Integer.parseInt(declared);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  final int received = body.length;
  if (expected > received) {
    LOG.info(url + " skipped. Content of size " + expected
        + " was truncated to " + received);
    return true;
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + received + " inHeaderSize=" + expected);
  }
  return false;
}
Example #22
Source File: MoreIndexingFilter.java From anthelion with Apache License 2.0 | 5 votes |
private NutchDocument addLength(NutchDocument doc, ParseData data, String url) { String contentLength = data.getMeta(Response.CONTENT_LENGTH); if (contentLength != null) { // NUTCH-1010 ContentLength not trimmed String trimmed = contentLength.toString().trim(); if (!trimmed.isEmpty()) doc.add("contentLength", trimmed); } return doc; }
Example #23
Source File: HttpBase.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
/**
 * Produces the {@link Response} for the given URL; each concrete subclass
 * supplies the implementation.
 *
 * @param url the URL to fetch
 * @param datum the crawl datum associated with the URL
 * @param followRedirects NOTE(review): presumably tells the implementation
 *        whether to follow redirects itself; the name is the only evidence
 *        visible here, so confirm against the implementations
 * @return the response obtained for {@code url}
 * @throws ProtocolException declared by the signature
 * @throws IOException declared by the signature
 */
protected abstract Response getResponse(URL url, CrawlDatum datum, boolean followRedirects) throws ProtocolException, IOException;
Example #24
Source File: File.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
/** * Quick way for running this class. Useful for debugging. */ public static void main(String[] args) throws Exception { int maxContentLength = Integer.MIN_VALUE; String logLevel = "info"; boolean dumpContent = false; String urlString = null; String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url"; if (args.length == 0) { System.err.println(usage); System.exit(-1); } for (int i = 0; i < args.length; i++) { if (args[i].equals("-logLevel")) { logLevel = args[++i]; } else if (args[i].equals("-maxContentLength")) { maxContentLength = Integer.parseInt(args[++i]); } else if (args[i].equals("-dumpContent")) { dumpContent = true; } else if (i != args.length-1) { System.err.println(usage); System.exit(-1); } else urlString = args[i]; } File file = new File(); file.setConf(NutchConfiguration.create()); if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength file.setMaxContentLength(maxContentLength); // set log level //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); Content content = file.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); System.err.println("Content-Type: " + content.getContentType()); System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH)); System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED)); if (dumpContent) { System.out.print(new String(content.getContent())); } file = null; }
Example #25
Source File: FileResponse.java From anthelion with Apache License 2.0 | 4 votes |
private void getFileAsHttpResponse(java.io.File f) throws FileException, IOException { // ignore file of size larger than // Integer.MAX_VALUE = 2^31-1 = 2147483647 long size = f.length(); if (size > Integer.MAX_VALUE) { throw new FileException("file is too large, size: " + size); // or we can do this? // this.code = 400; // http Bad request // return; } // capture content int len = (int) size; if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength) len = this.file.maxContentLength; this.content = new byte[len]; java.io.InputStream is = new java.io.FileInputStream(f); int offset = 0; int n = 0; while (offset < len && (n = is.read(this.content, offset, len - offset)) >= 0) { offset += n; } if (offset < len) { // keep whatever already have, but issue a warning if (File.LOG.isWarnEnabled()) { File.LOG.warn("not enough bytes read from file: " + f.getPath()); } } is.close(); // set headers headers.set(Response.CONTENT_LENGTH, new Long(size).toString()); headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f.lastModified())); String mimeType = MIME.getMimeType(f); headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : ""); // response code this.code = 200; // http OK }
Example #26
Source File: FileResponse.java From anthelion with Apache License 2.0 | 4 votes |
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) throws FileException, IOException { this.orig = url.toString(); this.base = url.toString(); this.file = file; this.conf = conf; MIME = new MimeUtil(conf); tika = new Tika(); if (!"file".equals(url.getProtocol())) throw new FileException("Not a file url:" + url); if (File.LOG.isTraceEnabled()) { File.LOG.trace("fetching " + url); } if (url.getPath() != url.getFile()) { if (File.LOG.isWarnEnabled()) { File.LOG.warn("url.getPath() != url.getFile(): " + url); } } String path = "".equals(url.getPath()) ? "/" : url.getPath(); try { // specify the encoding via the config later? path = java.net.URLDecoder.decode(path, "UTF-8"); } catch (UnsupportedEncodingException ex) { } try { this.content = null; // url.toURI() is only in j2se 1.5.0 //java.io.File f = new java.io.File(url.toURI()); java.io.File f = new java.io.File(path); if (!f.exists()) { this.code = 404; // http Not Found return; } if (!f.canRead()) { this.code = 401; // http Unauthorized return; } // symbolic link or relative path on unix // fix me: what's the consequence on windows platform // where case is insensitive if (!f.equals(f.getCanonicalFile())) { // set headers //hdrs.put("Location", f.getCanonicalFile().toURI()); // // we want to automatically escape characters that are illegal in URLs. // It is recommended that new code convert an abstract pathname into a URL // by first converting it into a URI, via the toURI method, and then // converting the URI into a URL via the URI.toURL method. 
headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString()); this.code = 300; // http redirect return; } if (f.lastModified() <= datum.getModifiedTime()) { this.code = 304; this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified())); return; } if (f.isDirectory()) { getDirAsHttpResponse(f); } else if (f.isFile()) { getFileAsHttpResponse(f); } else { this.code = 500; // http Internal Server Error return; } } catch (IOException e) { throw e; } }
Example #27
Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
/** * <p> * Add Content-Type and its primaryType and subType add contentType, * primaryType and subType to field "type" as un-stored, indexed and * un-tokenized, so that search results can be confined by contentType or its * primaryType or its subType. * </p> * <p> * For example, if contentType is application/vnd.ms-powerpoint, search can be * done with one of the following qualifiers * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint * all case insensitive. The query filter is implemented in * {@link TypeQueryFilter}. * </p> * * @param doc * @param data * @param url * @return */ private NutchDocument addType(NutchDocument doc, ParseData data, String url, CrawlDatum datum) { String mimeType = null; String contentType = null; Writable tcontentType = datum.getMetaData().get( new Text(Response.CONTENT_TYPE)); if (tcontentType != null) { contentType = tcontentType.toString(); } else contentType = data.getMeta(Response.CONTENT_TYPE); if (contentType == null) { // Note by Jerome Charron on 20050415: // Content Type not solved by a previous plugin // Or unable to solve it... Trying to find it // Should be better to use the doc content too // (using MimeTypes.getMimeType(byte[], String), but I don't know // which field it is? // if (MAGIC) { // contentType = MIME.getMimeType(url, content); // } else { // contentType = MIME.getMimeType(url); // } mimeType = tika.detect(url); } else { mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType)); } // Checks if we solved the content-type. 
if (mimeType == null) { return doc; } // Check if we have to map mime types if (mapMimes) { // Check if the current mime is mapped if (mimeMap.containsKey(mimeType)) { // It's mapped, let's replace it mimeType = mimeMap.get(mimeType); } } contentType = mimeType; doc.add("type", contentType); // Check if we need to split the content type in sub parts if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) { String[] parts = getParts(contentType); for(String part: parts) { doc.add("type", part); } } // leave this for future improvement //MimeTypeParameterList parameterList = mimeType.getParameters() return doc; }
Example #28
Source File: FileResponse.java From anthelion with Apache License 2.0 | 4 votes |
/**
 * Wraps this response in a {@code Content} object built from the original
 * and base URLs, the body (or {@code EMPTY_CONTENT} when no body was
 * captured), the Content-Type header, all headers and the configuration.
 *
 * @return a new {@code Content} describing this response
 */
public Content toContent() {
  byte[] body = content;
  if (body == null) {
    body = EMPTY_CONTENT;
  }
  return new Content(orig, base, body, getHeader(Response.CONTENT_TYPE),
      headers, this.conf);
}
Example #29
Source File: Ftp.java From anthelion with Apache License 2.0 | 4 votes |
/** For debugging. */ public static void main(String[] args) throws Exception { int timeout = Integer.MIN_VALUE; int maxContentLength = Integer.MIN_VALUE; String logLevel = "info"; boolean followTalk = false; boolean keepConnection = false; boolean dumpContent = false; String urlString = null; String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url"; if (args.length == 0) { System.err.println(usage); System.exit(-1); } for (int i = 0; i < args.length; i++) { if (args[i].equals("-logLevel")) { logLevel = args[++i]; } else if (args[i].equals("-followTalk")) { followTalk = true; } else if (args[i].equals("-keepConnection")) { keepConnection = true; } else if (args[i].equals("-timeout")) { timeout = Integer.parseInt(args[++i]) * 1000; } else if (args[i].equals("-maxContentLength")) { maxContentLength = Integer.parseInt(args[++i]); } else if (args[i].equals("-dumpContent")) { dumpContent = true; } else if (i != args.length-1) { System.err.println(usage); System.exit(-1); } else { urlString = args[i]; } } Ftp ftp = new Ftp(); ftp.setFollowTalk(followTalk); ftp.setKeepConnection(keepConnection); if (timeout != Integer.MIN_VALUE) // set timeout ftp.setTimeout(timeout); if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength ftp.setMaxContentLength(maxContentLength); // set log level //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); Content content = ftp.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); System.err.println("Content-Type: " + content.getContentType()); System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH)); System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED)); if (dumpContent) { System.out.print(new String(content.getContent())); } ftp = null; }
Example #30
Source File: Http.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
/**
 * Creates the response for the given URL by constructing an
 * {@code HttpResponse} bound to this protocol instance.
 *
 * @param url the URL to fetch
 * @param datum the crawl datum associated with the URL
 * @param redirect not read by this implementation
 * @return the newly constructed {@code HttpResponse}
 * @throws ProtocolException declared by the signature
 * @throws IOException declared by the signature
 */
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
    throws ProtocolException, IOException {
  final Response response = new HttpResponse(this, url, datum);
  return response;
}