Java Code Examples for org.apache.nutch.metadata.Metadata#get()
The following examples show how to use
org.apache.nutch.metadata.Metadata#get().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ParseSegment.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Checks if the page's content is truncated by comparing the length recorded
 * in the metadata under {@code Response.CONTENT_LENGTH} with the number of
 * content bytes actually held.
 *
 * @param content the fetched content to inspect
 * @return <code>true</code> if the recorded length is larger than the number
 *         of bytes held. <code>false</code> when the content is not truncated,
 *         or when truncation could not be determined (no content bytes, no
 *         metadata, or a missing, empty or unparsable length value).
 */
public static boolean isTruncated(Content content) {
  byte[] contentBytes = content.getContent();
  if (contentBytes == null) {
    return false;
  }

  Metadata metadata = content.getMetadata();
  if (metadata == null) {
    return false;
  }

  String lengthStr = metadata.get(Response.CONTENT_LENGTH);
  if (lengthStr != null) {
    lengthStr = lengthStr.trim();
  }
  if (StringUtil.isEmpty(lengthStr)) {
    return false;
  }

  String url = content.getUrl();

  // Parse as long: a declared length above Integer.MAX_VALUE is still a
  // well-formed number. Parsing it as int would throw, be reported as a
  // format error and wrongly yield "not truncated", although a byte[] can
  // never hold that many bytes.
  long inHeaderSize;
  try {
    inHeaderSize = Long.parseLong(lengthStr);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  int actualSize = contentBytes.length;
  if (inHeaderSize > actualSize) {
    LOG.info(url + " skipped. Content of size " + inHeaderSize
        + " was truncated to " + actualSize);
    return true;
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize="
        + inHeaderSize);
  }
  return false;
}
Example 2
Source File: HTMLLanguageParser.java From anthelion with Apache License 2.0 | 5 votes |
private static String getLanguageFromMetadata(Metadata meta) { if (meta == null) return null; // dublin core String lang = meta.get("dc.language"); if (lang != null) return lang; // meta content-language lang = meta.get("content-language"); if (lang != null) return lang; // lang attribute return meta.get("lang"); }
Example 3
Source File: CCIndexingFilter.java From anthelion with Apache License 2.0 | 5 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { Metadata metadata = parse.getData().getParseMeta(); // index the license String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL); if (licenseUrl != null) { if (LOG.isInfoEnabled()) { LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString()); } // add the entire license as cc:license=xxx addFeature(doc, "license=" + licenseUrl); // index license attributes extracted of the license url addUrlFeatures(doc, licenseUrl); } // index the license location as cc:meta=xxx String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION); if (licenseLocation != null) { addFeature(doc, "meta=" + licenseLocation); } // index the work type cc:type=xxx String workType = metadata.get(CreativeCommons.WORK_TYPE); if (workType != null) { addFeature(doc, workType); } return doc; }
Example 4
Source File: ParseSegment.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Checks if the page's content is truncated by comparing the length recorded
 * in the metadata under {@code Response.CONTENT_LENGTH} with the number of
 * content bytes actually held.
 *
 * @param content the fetched content to inspect
 * @return <code>true</code> if the recorded length is larger than the number
 *         of bytes held. <code>false</code> when the content is not truncated,
 *         or when truncation could not be determined (no content bytes, no
 *         metadata, or a missing, empty or unparsable length value).
 */
public static boolean isTruncated(Content content) {
  byte[] contentBytes = content.getContent();
  if (contentBytes == null) {
    return false;
  }

  Metadata metadata = content.getMetadata();
  if (metadata == null) {
    return false;
  }

  String lengthStr = metadata.get(Response.CONTENT_LENGTH);
  if (lengthStr != null) {
    lengthStr = lengthStr.trim();
  }
  if (StringUtil.isEmpty(lengthStr)) {
    return false;
  }

  String url = content.getUrl();

  // Parse as long: a declared length above Integer.MAX_VALUE is still a
  // well-formed number. Parsing it as int would throw, be reported as a
  // format error and wrongly yield "not truncated", although a byte[] can
  // never hold that many bytes.
  long inHeaderSize;
  try {
    inHeaderSize = Long.parseLong(lengthStr);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  int actualSize = contentBytes.length;
  if (inHeaderSize > actualSize) {
    LOG.info(url + " skipped. Content of size " + inHeaderSize
        + " was truncated to " + actualSize);
    return true;
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize="
        + inHeaderSize);
  }
  return false;
}
Example 5
Source File: HTMLLanguageParser.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
private static String getLanguageFromMetadata(Metadata meta) { if (meta == null) return null; // dublin core String lang = meta.get("dc.language"); if (lang != null) return lang; // meta content-language lang = meta.get("content-language"); if (lang != null) return lang; // lang attribute return meta.get("lang"); }
Example 6
Source File: CCIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { Metadata metadata = parse.getData().getParseMeta(); // index the license String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL); if (licenseUrl != null) { if (LOG.isInfoEnabled()) { LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString()); } // add the entire license as cc:license=xxx addFeature(doc, "license=" + licenseUrl); // index license attributes extracted of the license url addUrlFeatures(doc, licenseUrl); } // index the license location as cc:meta=xxx String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION); if (licenseLocation != null) { addFeature(doc, "meta=" + licenseLocation); } // index the work type cc:type=xxx String workType = metadata.get(CreativeCommons.WORK_TYPE); if (workType != null) { addFeature(doc, workType); } return doc; }