Java Code Examples for org.apache.tika.metadata.Metadata#get()
The following examples show how to use
org.apache.tika.metadata.Metadata#get().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TikaAudioMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
/** * Generates the release date */ private Date generateReleaseDate(Metadata metadata) { String date = metadata.get(XMPDM.RELEASE_DATE); if(date == null || date.length() == 0) { return null; } // Is it just a year? if(date.matches("\\d\\d\\d\\d")) { // Just a year, we need a full date // Go for the 1st of the 1st Calendar c = Calendar.getInstance(); c.set( Integer.parseInt(date), Calendar.JANUARY, 1, 0, 0, 0 ); c.set(Calendar.MILLISECOND, 0); return c.getTime(); } // Treat as a normal date return makeDate(date); }
Example 2
Source File: FTConnector.java From openprodoc with GNU Affero General Public License v3.0 | 6 votes |
/** * * @param Bytes * @return * @throws PDException */ protected String Convert(InputStream Bytes) throws PDException { try { ContentHandler textHandler=new BodyContentHandler(-1); Metadata metadata=new Metadata(); Parser parser=new AutoDetectParser(); ParseContext context=new ParseContext(); parser.parse(Bytes, textHandler, metadata, context); FileMetadata=""; for (String key : metadata.names()) FileMetadata+=key+"="+metadata.get(key)+"\n"; FullText=textHandler.toString(); } catch (Exception ex) { PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage()); } return(FullText); }
Example 3
Source File: TikaFormat.java From gate-core with GNU Lesser General Public License v3.0 | 6 votes |
private void setTikaFeature(Metadata metadata, Property property, FeatureMap fmap) { String value = metadata.get(property); if (value == null) { return; } value = value.trim(); if (value.length() == 0) { return; } String key = property.getName().toUpperCase(); if (fmap.containsKey(key)) { fmap.put("TIKA_" + key, value); } else { fmap.put(key, value); fmap.put("TIKA_" + key, value); } }
Example 4
Source File: TikaAutoMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
/** * Because some editors use JPEG_IMAGE_HEIGHT_TAG when * saving JPEG images , a more reliable source for * image size are the values provided by Tika * and not the exif/tiff metadata read from the file * This will override the tiff:Image size * which gets embedded into the alfresco node properties * for jpeg files that contain such exif information */ @Override protected Map<String, Serializable> extractSpecific(Metadata metadata, Map<String, Serializable> properties, Map<String,String> headers) { if(MimetypeMap.MIMETYPE_IMAGE_JPEG.equals(metadata.get(Metadata.CONTENT_TYPE))) { //check if the image has exif information if(metadata.get(EXIF_IMAGE_WIDTH_TAG) != null && metadata.get(EXIF_IMAGE_HEIGHT_TAG) != null && metadata.get(COMPRESSION_TAG) != null) { //replace the exif size properties that will be embedded in the node with //the guessed dimensions from Tika putRawValue(TIFF.IMAGE_LENGTH.getName(), extractSize(metadata.get(EXIF_IMAGE_HEIGHT_TAG)), properties); putRawValue(TIFF.IMAGE_WIDTH.getName(), extractSize(metadata.get(EXIF_IMAGE_WIDTH_TAG)), properties); putRawValue(JPEG_IMAGE_HEIGHT_TAG, metadata.get(EXIF_IMAGE_HEIGHT_TAG), properties); putRawValue(JPEG_IMAGE_WIDTH_TAG, metadata.get(EXIF_IMAGE_WIDTH_TAG), properties); } } return properties; }
Example 5
Source File: TikaPoweredMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
private String getMetadataValue(Metadata metadata, String key) { if (metadata.isMultiValued(key)) { String[] parts = metadata.getValues(key); // use Set to prevent duplicates Set<String> value = new LinkedHashSet<String>(parts.length); for (int i = 0; i < parts.length; i++) { value.add(parts[i]); } String valueStr = value.toString(); // remove leading/trailing braces [] return valueStr.substring(1, valueStr.length() - 1); } else { return metadata.get(key); } }
Example 6
Source File: TikaAudioMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
/** * Generate the description * * @param metadata the metadata extracted from the file * @return the description */ @SuppressWarnings("deprecation") private String generateDescription(Metadata metadata) { StringBuilder result = new StringBuilder(); if (metadata.get(Metadata.TITLE) != null) { result.append(metadata.get(Metadata.TITLE)); if (metadata.get(XMPDM.ALBUM) != null) { result .append(" - ") .append(metadata.get(XMPDM.ALBUM)); } if (metadata.get(XMPDM.ARTIST) != null) { result .append(" (") .append(metadata.get(XMPDM.ARTIST)) .append(")"); } } return result.toString(); }
Example 7
Source File: MediaTypeDisablingDocumentSelector.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
@Override public boolean select(Metadata metadata) { String contentType = metadata.get(Metadata.CONTENT_TYPE); if (contentType == null || contentType.equals("") || disabledMediaTypes == null) { return true; } return !disabledMediaTypes.contains(contentType); }
Example 8
Source File: OpenDocumentMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
@SuppressWarnings("deprecation") @Override protected Map<String, Serializable> extractSpecific(Metadata metadata, Map<String, Serializable> properties, Map<String, String> headers) { putRawValue(KEY_CREATION_DATE, getDateOrNull(metadata.get(Metadata.CREATION_DATE)), properties); putRawValue(KEY_CREATOR, metadata.get(Metadata.CREATOR), properties); putRawValue(KEY_DATE, getDateOrNull(metadata.get(Metadata.DATE)), properties); putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.DESCRIPTION), properties); putRawValue(KEY_GENERATOR, metadata.get("generator"), properties); putRawValue(KEY_INITIAL_CREATOR, metadata.get("initial-creator"), properties); putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties); putRawValue(KEY_LANGUAGE, metadata.get(Metadata.LANGUAGE), properties); // putRawValue(KEY_PRINT_DATE, getDateOrNull(metadata.get(Metadata.)), rawProperties); // putRawValue(KEY_PRINTED_BY, metadata.get(Metadata.), rawProperties); // Handle user-defined properties dynamically Map<String, Set<QName>> mapping = super.getMapping(); for (String key : mapping.keySet()) { if (metadata.get(CUSTOM_PREFIX + key) != null) { putRawValue(key, metadata.get(CUSTOM_PREFIX + key), properties); } } return properties; }
Example 9
Source File: TikaDocumentItemProcessor.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
private String extractContentTypeMetadata(Document doc, Set<String> metaKeys, Metadata metadata) { if (metaKeys.contains("Content-Type")) { doc.getAssociativeArray().put("X-TL-CONTENT-TYPE", metadata.get("Content-Type")); return metadata.get("Content-Type"); } else { doc.getAssociativeArray().put("X-TL-CONTENT-TYPE", "TL_CONTENT_TYPE_UNKNOWN"); return "TL_CONTENT_TYPE_UNKNOWN"; } }
Example 10
Source File: TikaCallable.java From flink-crawler with Apache License 2.0 | 5 votes |
/** * See if a language was set by the parser, from meta tags. As a last resort falls back to the result from the * ProfilingHandler. * * @param metadata * @param profilingHandler * @return The first language found (two char lang code) or empty string if no language was detected. */ private static String detectLanguage(Metadata metadata, ProfilingHandler profilingHandler) { String result = null; String dubCoreLang = metadata.get(TikaCoreProperties.LANGUAGE); String httpEquivLang = metadata.get(Metadata.CONTENT_LANGUAGE); if (dubCoreLang != null) { result = dubCoreLang; } else if (httpEquivLang != null) { result = httpEquivLang; } result = getFirstLanguage(result); if (result == null) { // Language is still unspecified, so use ProfileHandler's result LanguageIdentifier langIdentifier = profilingHandler.getLanguage(); // FUTURE KKr - provide config for specifying required certainty level. if (langIdentifier.isReasonablyCertain()) { result = langIdentifier.getLanguage(); LOGGER.trace("Using language specified by profiling handler: " + result); } else { result = ""; } } return result; }
Example 11
Source File: TikaEntityProcessor.java From lucene-solr with Apache License 2.0 | 5 votes |
private void tryToAddLatLon(Metadata metadata, Map<String, Object> row) { if (spatialMetadataField == null) return; String latString = metadata.get(Metadata.LATITUDE); String lonString = metadata.get(Metadata.LONGITUDE); if (latString != null && lonString != null) { row.put(spatialMetadataField, String.format(Locale.ROOT, "%s,%s", latString, lonString)); } }
Example 12
Source File: TikaLambdaHandler.java From tika-lambda with Apache License 2.0 | 5 votes |
private String assembleExtractionResult(String bucket, String key, String extractedText, Metadata tikaMetadata) { JSONObject extractJson = new JSONObject(); String contentType = tikaMetadata.get("Content-Type"); contentType = contentType != null ? contentType : "content/unknown"; String contentLength = tikaMetadata.get("Content-Length"); contentLength = contentLength != null ? contentLength : "0"; extractJson.put("Exception", null); extractJson.put("FilePath", "s3://" + bucket + "/" + key); extractJson.put("Text", extractedText); extractJson.put("ContentType", contentType); extractJson.put("ContentLength", contentLength); JSONObject metadataJson = new JSONObject(); for( String name : tikaMetadata.names() ){ String[] elements = tikaMetadata.getValues(name); String joined = String.join(", ", elements); metadataJson.put(name, joined); } extractJson.put("Metadata", metadataJson); return extractJson.toJSONString(); }
Example 13
Source File: FallbackParser.java From extract with MIT License | 5 votes |
@Override public void parse(final InputStream stream, final ContentHandler handler, final Metadata metadata, final ParseContext context) throws SAXException, IOException, TikaException { final Parser parser; final long size; String value = metadata.get(Metadata.CONTENT_LENGTH); if (null != value && !value.isEmpty()) { size = Long.valueOf(value); } else { try (final TikaInputStream tis = TikaInputStream.get(stream)) { size = tis.getLength(); } metadata.set(Metadata.CONTENT_LENGTH, Long.toString(size)); } // If the file is not empty, throw a parse error. // Otherwise, output an empty document. if (size > 0) { parser = ErrorParser.INSTANCE; } else { metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream"); parser = EmptyParser.INSTANCE; } parser.parse(stream, handler, metadata, context); }
Example 14
Source File: EmbedParser.java From extract with MIT License | 5 votes |
void writeStart(final ContentHandler handler, final Metadata metadata) throws SAXException { final AttributesImpl attributes = new AttributesImpl(); final String name = metadata.get(Metadata.RESOURCE_NAME_KEY); attributes.addAttribute("", "class", "class", "CDATA", "package-entry"); handler.startElement(XHTML, "div", "div", attributes); if (name != null && name.length() > 0) { handler.startElement(XHTML, "h1", "h1", new AttributesImpl()); char[] chars = name.toCharArray(); handler.characters(chars, 0, chars.length); handler.endElement(XHTML, "h1", "h1"); } }
Example 15
Source File: EmbedSpawner.java From extract with MIT License | 5 votes |
private void writeEmbed(final TikaInputStream tis, final EmbeddedTikaDocument embed, final String name) throws IOException { final Path destination = outputPath.resolve(embed.getHash()); final Path source; final Metadata metadata = embed.getMetadata(); final Object container = tis.getOpenContainer(); // If the input is a container, write it to a temporary file so that it can then be copied atomically. // This happens with, for example, an Outlook Message that is an attachment of another Outlook Message. if (container instanceof DirectoryEntry) { try (final TemporaryResources tmp = new TemporaryResources(); final POIFSFileSystem fs = new POIFSFileSystem()) { source = tmp.createTempFile(); saveEntries((DirectoryEntry) container, fs.getRoot()); try (final OutputStream output = Files.newOutputStream(source)) { fs.writeFilesystem(output); } } } else { source = tis.getPath(); } // Set the content-length as it isn't (always?) set by Tika for embeds. if (null == metadata.get(Metadata.CONTENT_LENGTH)) { metadata.set(Metadata.CONTENT_LENGTH, Long.toString(Files.size(source))); } // To prevent massive duplication and because the disk is only a storage for underlying data, save using the // straight hash as a filename. try { Files.copy(source, destination); } catch (final FileAlreadyExistsException e) { if (Files.size(source) != Files.size(destination)) { Files.copy(source, destination, StandardCopyOption.REPLACE_EXISTING); } else { logger.info("Temporary file for document \"{}\" in \"{}\" already exists.", name, root); } } }
Example 16
Source File: DigestIdentifier.java From extract with MIT License | 5 votes |
@Override public String generateForEmbed(final EmbeddedTikaDocument embed) throws NoSuchAlgorithmException { final MessageDigest digest = MessageDigest.getInstance(algorithm); // Embedded documents in different files or the same file could have the same hash. Therefore, to avoid ID // collisions within the child document tree, the digest considers: // - the file digest hash // - the parent path // - the embedded relationship ID // - the embedded document name final Metadata metadata = embed.getMetadata(); final String embeddedRelationshipId = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID); final String name = metadata.get(Metadata.RESOURCE_NAME_KEY); final String hash = hash(embed); if (null == hash) { throw new IllegalStateException(String.format("No hash is available for the document with name \"%s\" at " + "path \"%s\".", name, embed.getPath())); } digest.update(hash.getBytes(charset)); digest.update(embed.getParent().getId().getBytes(charset)); if (null != embeddedRelationshipId) { digest.update(embeddedRelationshipId.getBytes(charset)); } if (null != name) { digest.update(name.getBytes(charset)); } return DatatypeConverter.printHexBinary(digest.digest()).toLowerCase(ENGLISH); }
Example 17
Source File: DataURIEncodingInputStream.java From extract with MIT License | 5 votes |
private static Charset detectCharset(final Path path, final Metadata metadata) throws IOException { final Charset charset; // Try to parse the character set from the content-encoding. String orig = metadata.get(Metadata.CONTENT_ENCODING); // Try to detect the character set. if (null != orig && Charset.isSupported(orig)) { return Charset.forName(orig); } try ( final InputStream input = new BufferedInputStream(Files.newInputStream(path)); final AutoDetectReader detector = new AutoDetectReader(input, metadata) ) { charset = detector.getCharset(); } catch (TikaException e) { throw new IOException("Unable to detect charset.", e); } return charset; }
Example 18
Source File: UniversalEncodingListener.java From onedev with MIT License | 5 votes |
public UniversalEncodingListener(Metadata metadata) { MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE)); if (type != null) { hint = type.getParameters().get("charset"); } if (hint == null) { hint = metadata.get(Metadata.CONTENT_ENCODING); } }
Example 19
Source File: DocUtils.java From geoportal-server-harvester with Apache License 2.0 | 4 votes |
/**
 * Extracts metadata from a file with Tika and renders it as Dublin Core XML.
 * Missing description/modified/title fields are filled with defaults (the
 * full properties dump, a dated TIKA_yyyy/MM/dd label, and the file name,
 * respectively).
 *
 * @param file_bytes the raw file content to extract metadata from
 * @param file_name the file name, used as the default title
 * @return the metadata XML as UTF-8 bytes, or {@code null} if extraction failed
 * @throws IOException if closing the input stream fails
 */
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException {

    // Input & Output Variables
    ByteArrayInputStream base_input = new ByteArrayInputStream(file_bytes);
    byte[] xml_bytes = null;

    // Tika Parser Objects
    Parser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    try {
        // Populate Metadata Object with Tika Parser
        parser.parse(base_input, handler, metadata, context);

        // Container & Writer for Metadata
        Properties meta_props = new Properties();
        StringWriter sw = new StringWriter();

        // Put Tika Metadata in Properties (empty values are dropped)
        for(String name : metadata.names()) {
            if (!metadata.get(name).isEmpty()) {
                meta_props.put(name, metadata.get(name));
            }
        }
        meta_props.store(sw, "Tika Values");

        // Expected Harvester Properties
        String meta_descr = metadata.get(TikaCoreProperties.DESCRIPTION);
        String meta_modif = metadata.get(TikaCoreProperties.MODIFIED);
        String meta_title = metadata.get(TikaCoreProperties.TITLE);

        // Default Label for Undefined Tika Properties
        DateFormat date_format = new SimpleDateFormat("yyyy/MM/dd");
        Date date = new Date();
        String date_today = date_format.format(date);
        String tika_label = String.format("TIKA_%s", date_today);

        // Check For Null Values & Set Defaults.
        // NOTE(review): with no description, the whole properties dump
        // (sw.toString()) is used as the description — confirm this is intended.
        if (meta_descr == null) {
            meta_props.put(WKAConstants.WKA_DESCRIPTION, "" + sw.toString());
        } else {
            meta_props.put(WKAConstants.WKA_DESCRIPTION, meta_descr);
        }
        if (meta_modif == null) {
            meta_props.put(WKAConstants.WKA_MODIFIED, tika_label);
        } else {
            meta_props.put(WKAConstants.WKA_MODIFIED, meta_modif);
        }
        if (meta_title == null) {
            meta_props.put(WKAConstants.WKA_TITLE, file_name);
        } else {
            meta_props.put(WKAConstants.WKA_TITLE, meta_title);
        }

        // Build XML as Bytes
        MapAttribute attr = AttributeUtils.fromProperties(meta_props);
        Document document = new SimpleDcMetaBuilder().create(attr);
        xml_bytes = XmlUtils.toString(document).getBytes("UTF-8");
    } catch (Exception ex) {
        // Extraction failures are logged and swallowed; caller receives null.
        LOG.error(String.format("Error reading data."), ex);
    } finally {
        base_input.close();
    }
    return xml_bytes;
}
Example 20
Source File: AbstractIdentifier.java From extract with MIT License | 4 votes |
@Override public String retrieveHash(final Metadata metadata) { return metadata.get(key); }