org.apache.tika.metadata.TikaCoreProperties Java Examples
The following examples show how to use
org.apache.tika.metadata.TikaCoreProperties.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: EmbedSpawner.java From extract with MIT License | 6 votes |
@Override public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata, final boolean outputHtml) throws SAXException, IOException { // There's no need to spawn inline embeds, like images in PDFs. These should be concatenated to the main // document as usual. if (TikaCoreProperties.EmbeddedResourceType.INLINE.toString().equals(metadata .get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) { final ContentHandler embedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler)); if (outputHtml) { writeStart(handler, metadata); } delegateParsing(input, embedHandler, metadata); if (outputHtml) { writeEnd(handler); } } else { try (final TikaInputStream tis = TikaInputStream.get(input)) { spawnEmbedded(tis, metadata); } } }
Example #2
Source File: TikaFormat.java From gate-core with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Transfers selected Tika metadata values onto the feature map of a document.
 *
 * <p>The title, author, comments and creator properties are passed to {@code setTikaFeature};
 * afterwards an {@code "AUTHORS"} feature is filled in when it is absent but an {@code "AUTHOR"}
 * feature exists, and the content type is stored under {@code "MimeType"}.
 *
 * @param metadata metadata produced by the Tika parse
 * @param doc document whose features are updated in place
 */
private void setDocumentFeatures(Metadata metadata, Document doc) {
    FeatureMap features = doc.getFeatures();

    setTikaFeature(metadata, TikaCoreProperties.TITLE, features);
    setTikaFeature(metadata, Office.AUTHOR, features);
    setTikaFeature(metadata, TikaCoreProperties.COMMENTS, features);
    setTikaFeature(metadata, TikaCoreProperties.CREATOR, features);

    if (features.get("AUTHORS") == null) {
        if (features.get("AUTHOR") != null) {
            // NOTE(review): the guard tests the string key "AUTHOR" but the value copied is looked
            // up with the Office.AUTHOR property object as key. Confirm against setTikaFeature
            // which key it actually stores under; as written this may copy null.
            features.put("AUTHORS", features.get(Office.AUTHOR));
        }
    }

    features.put("MimeType", metadata.get(Metadata.CONTENT_TYPE));
}
Example #3
Source File: TikaCallable.java From flink-crawler with Apache License 2.0 | 5 votes |
/** * See if a language was set by the parser, from meta tags. As a last resort falls back to the result from the * ProfilingHandler. * * @param metadata * @param profilingHandler * @return The first language found (two char lang code) or empty string if no language was detected. */ private static String detectLanguage(Metadata metadata, ProfilingHandler profilingHandler) { String result = null; String dubCoreLang = metadata.get(TikaCoreProperties.LANGUAGE); String httpEquivLang = metadata.get(Metadata.CONTENT_LANGUAGE); if (dubCoreLang != null) { result = dubCoreLang; } else if (httpEquivLang != null) { result = httpEquivLang; } result = getFirstLanguage(result); if (result == null) { // Language is still unspecified, so use ProfileHandler's result LanguageIdentifier langIdentifier = profilingHandler.getLanguage(); // FUTURE KKr - provide config for specifying required certainty level. if (langIdentifier.isReasonablyCertain()) { result = langIdentifier.getLanguage(); LOGGER.trace("Using language specified by profiling handler: " + result); } else { result = ""; } } return result; }
Example #4
Source File: DocUtils.java From geoportal-server-harvester with Apache License 2.0 | 4 votes |
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException { // Input & Output Variables ByteArrayInputStream base_input = new ByteArrayInputStream(file_bytes); byte[] xml_bytes = null; // Tika Parser Objects Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try { // Populate Metadata Object with Tika Parser parser.parse(base_input, handler, metadata, context); // Container & Writer for Metadata Properties meta_props = new Properties(); StringWriter sw = new StringWriter(); // Put Tika Metadata in Properties for(String name : metadata.names()) { if (!metadata.get(name).isEmpty()) { meta_props.put(name, metadata.get(name)); } } meta_props.store(sw, "Tika Values"); // Expected Harvester Properties String meta_descr = metadata.get(TikaCoreProperties.DESCRIPTION); String meta_modif = metadata.get(TikaCoreProperties.MODIFIED); String meta_title = metadata.get(TikaCoreProperties.TITLE); // Default Label for Undefined Tika Properties DateFormat date_format = new SimpleDateFormat("yyyy/MM/dd"); Date date = new Date(); String date_today = date_format.format(date); String tika_label = String.format("TIKA_%s", date_today); // Check For Null Values & Set Defaults if (meta_descr == null) { meta_props.put(WKAConstants.WKA_DESCRIPTION, "" + sw.toString()); } else { meta_props.put(WKAConstants.WKA_DESCRIPTION, meta_descr); } if (meta_modif == null) { meta_props.put(WKAConstants.WKA_MODIFIED, tika_label); } else { meta_props.put(WKAConstants.WKA_MODIFIED, meta_modif); } if (meta_title == null) { meta_props.put(WKAConstants.WKA_TITLE, file_name); } else { meta_props.put(WKAConstants.WKA_TITLE, meta_title); } // Build XML as Bytes MapAttribute attr = AttributeUtils.fromProperties(meta_props); Document document = new SimpleDcMetaBuilder().create(attr); xml_bytes = XmlUtils.toString(document).getBytes("UTF-8"); } catch 
(Exception ex) { LOG.error(String.format("Error reading data."), ex); } finally { base_input.close(); } return xml_bytes; }
Example #5
Source File: UpdatableInputStreamDigester.java From extract with MIT License | 4 votes |
/**
 * Builds the metadata key for this digester by joining the Tika meta prefix, the literal
 * {@code "digest"}, the namespace prefix delimiter and the algorithm key name.
 *
 * @return the composed metadata key
 */
private String getMetadataKey() {
    final String digestNamespace = TikaCoreProperties.TIKA_META_PREFIX + "digest";
    return digestNamespace + Metadata.NAMESPACE_PREFIX_DELIMITER + algorithmKeyName;
}
Example #6
Source File: Identifier.java From extract with MIT License | 4 votes |
/**
 * Builds the digest metadata key for an algorithm: the Tika meta prefix, the literal
 * {@code "digest"}, the namespace prefix delimiter, and the algorithm name with every
 * {@code "-"} removed.
 *
 * @param algorithm digest algorithm name; hyphens are stripped before use
 * @return the composed metadata key
 */
static String getKey(String algorithm) {
    final String digestNamespace = TikaCoreProperties.TIKA_META_PREFIX + "digest";
    final String hyphenless = algorithm.replace("-", "");
    return digestNamespace + Metadata.NAMESPACE_PREFIX_DELIMITER + hyphenless;
}