Java Code Examples for org.apache.tika.metadata.Metadata#names()
The following examples show how to use
org.apache.tika.metadata.Metadata#names().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TikaExtractor.java From ache with Apache License 2.0 | 6 votes |
/**
 * Parses {@code stream} and returns the extracted body text together with a map holding,
 * for every metadata property name reported after parsing, the value returned by
 * {@code Metadata#get}.
 *
 * @param stream      content to parse
 * @param fileName    file name handed to {@code createMetadata}
 * @param contentType content type handed to {@code createMetadata}
 * @return the parsed text and metadata, or {@code null} when parsing fails with an
 *         {@code IOException}, {@code SAXException} or {@code TikaException} (the failure is logged)
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    BodyContentHandler bodyHandler = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler boilerpipeHandler =
            new BoilerpipeContentHandler(bodyHandler, KeepEverythingExtractor.INSTANCE);
    Metadata tikaMetadata = createMetadata(fileName, contentType);
    ParseContext parseContext = new ParseContext();
    try {
        parser.parse(stream, boilerpipeHandler, tikaMetadata, parseContext);

        // Flatten the metadata into a plain name -> value map.
        Map<String, String> properties = new HashMap<String, String>();
        String[] propertyNames = tikaMetadata.names();
        for (int i = 0; i < propertyNames.length; i++) {
            properties.put(propertyNames[i], tikaMetadata.get(propertyNames[i]));
        }
        return new ParsedData(bodyHandler.toString(), properties);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }
}
Example 2
Source File: NodeTika.java From node-tika with MIT License | 6 votes |
/**
 * Parses the resource at {@code uri} and returns all of its metadata as a JSON object
 * mapping each metadata name to the array of values recorded for it.
 *
 * @param uri         location of the resource, passed to {@code fillMetadata} and {@code createInputStream}
 * @param contentType content type hint passed to {@code fillMetadata}
 * @return JSON text produced by Gson from the name -> values map
 * @throws Exception if creating the stream or parsing fails
 */
public static String extractMeta(String uri, String contentType) throws Exception {
    final AutoDetectParser parser = createParser();
    final Metadata metadata = new Metadata();
    fillMetadata(parser, metadata, contentType, uri);

    // try-with-resources guarantees the stream is closed even when parse() throws;
    // previously a parse failure leaked the stream.
    try (TikaInputStream inputStream = createInputStream(uri, metadata)) {
        parser.parse(inputStream, new DefaultHandler(), metadata);
    }

    // Typed map instead of a raw Map; the serialized JSON is unchanged.
    final Map<String, String[]> meta = new HashMap<String, String[]>();
    for (String name : metadata.names()) {
        meta.put(name, metadata.getValues(name));
    }
    return new Gson().toJson(meta);
}
Example 3
Source File: TikaTest.java From tika-server with Apache License 2.0 | 6 votes |
/** * Test that in at least one item in metadataList, all keys and values * in minExpected are contained. * <p> * The values in minExpected are tested for whether they are contained * within a value in the target. If minExpected=&dquot;text/vbasic&dquot; and * what was actually found in the target within metadatalist is * &dquot;text/vbasic; charset=windows-1252&dquot;, * that is counted as a hit. * * @param minExpected * @param metadataList */ public static void assertContainsAtLeast(Metadata minExpected, List<Metadata> metadataList) { for (Metadata m : metadataList) { int foundPropertyCount = 0; for (String n : minExpected.names()) { int foundValCount = 0; for (String foundVal : m.getValues(n)) { for (String expectedVal : minExpected.getValues(n)) { if (foundVal.contains(expectedVal)) { foundValCount++; } } } if (foundValCount == minExpected.getValues(n).length) { foundPropertyCount++; } } if (foundPropertyCount == minExpected.names().length) { //found everything! return; } } //TODO: figure out how to have more informative error message fail("Couldn't find everything within a single metadata item"); }
Example 4
Source File: TikaContentExtractor.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Extracts text and metadata from {@code stream} with an auto-detecting Tika parser, stores the
 * text as the document text of {@code jCas} and forwards every metadata name/value pair to
 * {@code addMetadata}.
 * <p>
 * When parsing fails with a {@code SAXException} or {@code TikaException} a warning is recorded
 * and, if the document text is still null or empty, it is set to {@code CORRUPT_FILE_TEXT}.
 *
 * @param stream the content to process
 * @param source identifier of the content, used in the warning message
 * @param jCas   the CAS receiving the text and metadata
 * @throws IOException if reading the stream fails
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);

    try {
        BodyContentHandler bodyHandler = new BodyContentHandler(Integer.MAX_VALUE);
        Metadata tikaMetadata = new Metadata();
        ParseContext parseContext = new ParseContext();
        AutoDetectParser detectingParser = new AutoDetectParser();
        detectingParser.parse(stream, bodyHandler, tikaMetadata, parseContext);

        jCas.setDocumentText(bodyHandler.toString());

        String[] metadataNames = tikaMetadata.names();
        for (int i = 0; i < metadataNames.length; i++) {
            addMetadata(jCas, metadataNames[i], tikaMetadata.get(metadataNames[i]));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
        boolean noTextExtracted = Strings.isNullOrEmpty(jCas.getDocumentText());
        if (noTextExtracted) {
            jCas.setDocumentText(CORRUPT_FILE_TEXT);
        }
    }
}
Example 5
Source File: FTConnector.java From openprodoc with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Extracts the text of a document with an auto-detecting Tika parser.
 * <p>
 * On success the {@code FileMetadata} field is replaced by one {@code name=value} line per
 * metadata name and the {@code FullText} field is replaced by the extracted text.
 *
 * @param Bytes stream with the document content
 * @return the current value of the {@code FullText} field
 * @throws PDException presumably raised by {@code PDException.GenPDException} when extraction
 *         fails -- TODO confirm against that helper
 */
protected String Convert(InputStream Bytes) throws PDException {
    try {
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);

        // Accumulate in a StringBuilder rather than re-copying the field's String on
        // every iteration (quadratic for documents with many metadata entries).
        StringBuilder metadataText = new StringBuilder();
        for (String key : metadata.names()) {
            metadataText.append(key).append('=').append(metadata.get(key)).append('\n');
        }
        FileMetadata = metadataText.toString();
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return (FullText);
}
Example 6
Source File: TikaIO.java From beam with Apache License 2.0 | 5 votes |
/**
 * Sets the input metadata for {@link Parser#parse}.
 *
 * <p>If input metadata is already present, each name in {@code metadata} is set on that
 * existing object with the value returned by {@code metadata.get(name)}; otherwise
 * {@code metadata} itself becomes the input metadata.
 */
public ParseFiles withInputMetadata(Metadata metadata) {
    Metadata existing = this.getInputMetadata();
    if (existing == null) {
        // Nothing to merge into: adopt the supplied metadata as-is.
        return toBuilder().setInputMetadata(metadata).build();
    }

    String[] names = metadata.names();
    for (int i = 0; i < names.length; i++) {
        existing.set(names[i], metadata.get(names[i]));
    }
    return toBuilder().setInputMetadata(existing).build();
}
Example 7
Source File: TikaProcessor.java From jesterj with Apache License 2.0 | 5 votes |
@Override public Document[] processDocument(Document document) { try { byte[] rawData = document.getRawData(); if (rawData == null) { log.debug("Skipping document without data in " + getName()); return new Document[]{document}; } Tika tika = new Tika(tikaConfig); tika.setMaxStringLength(document.getRawData().length); Metadata metadata = new Metadata(); try (ByteArrayInputStream bais = new ByteArrayInputStream(rawData)) { String textContent = tika.parseToString(bais, metadata, maxLength); if (replaceRaw) { document.setRawData(textContent.getBytes(StandardCharsets.UTF_8)); } if (destField != null) { document.put(destField,textContent); } for (String name : metadata.names()) { document.put(sanitize(name) + plusSuffix(), metadata.get(name)); } } catch (IOException | TikaException e) { log.debug("Tika processing failure!", e); // if tika can't parse it we certainly don't want random binary crap in the index document.setStatus(Status.ERROR); } } catch (Throwable t) { boolean isAccessControl = t instanceof AccessControlException; boolean isSecurity = t instanceof SecurityException; if (!isAccessControl && !isSecurity) { throw t; } } return new Document[]{document}; }
Example 8
Source File: TikaContentExtractor.java From cxf with Apache License 2.0 | 5 votes |
/**
 * Extracts only the metadata from the input stream and exposes it as a {@code SearchBean}.
 * Extraction itself is delegated to {@code extractMetadata(InputStream)}.
 *
 * @param in input stream to extract the metadata from
 * @return a SearchBean holding one entry per metadata name, or null if
 *         {@code extractMetadata} returned null
 */
public SearchBean extractMetadataToSearchBean(final InputStream in) {
    final TikaContent extracted = extractMetadata(in);
    if (extracted != null) {
        final Metadata tikaMetadata = extracted.getMetadata();
        final SearchBean searchBean = new SearchBean();
        final String[] properties = tikaMetadata.names();
        for (int i = 0; i < properties.length; i++) {
            searchBean.set(properties[i], tikaMetadata.get(properties[i]));
        }
        return searchBean;
    }
    return null;
}
Example 9
Source File: TearlineContentExtractor.java From baleen with Apache License 2.0 | 5 votes |
/**
 * Extracts text and metadata from {@code stream} with an auto-detecting Tika parser and stores
 * them on {@code jCas}.
 * <p>
 * If {@code tearlinePattern} matches the extracted text, only the part before the first match is
 * kept. The kept text is passed through {@code removeBoilerplate} and trimmed before it becomes
 * the document text. Every metadata name/value pair is forwarded to {@code addMetadata}.
 * A {@code SAXException} or {@code TikaException} is reported as a warning.
 *
 * @param stream the content to process
 * @param source identifier of the content, used in the warning message
 * @param jCas   the CAS receiving the text and metadata
 * @throws IOException if reading the stream fails
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);

    try {
        BodyContentHandler bodyHandler = new BodyContentHandler(Integer.MAX_VALUE);
        Metadata tikaMetadata = new Metadata();
        ParseContext parseContext = new ParseContext();
        AutoDetectParser detectingParser = new AutoDetectParser();
        detectingParser.parse(stream, bodyHandler, tikaMetadata, parseContext);

        String extracted = bodyHandler.toString();
        Matcher tearline = tearlinePattern.matcher(extracted);
        // Keep only what precedes the tearline, when there is one.
        String retained = tearline.find() ? extracted.substring(0, tearline.start()) : extracted;
        jCas.setDocumentText(removeBoilerplate(retained).trim());

        String[] metadataNames = tikaMetadata.names();
        for (int i = 0; i < metadataNames.length; i++) {
            addMetadata(jCas, metadataNames[i], tikaMetadata.get(metadataNames[i]));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    }
}
Example 10
Source File: TikaLuceneContentExtractor.java From cxf with Apache License 2.0 | 5 votes |
/**
 * Builds a Lucene document from the content and/or metadata extracted from {@code in}.
 *
 * @param in               stream to extract from
 * @param documentMetadata field configuration; {@code defaultDocumentMetadata} is used when null
 * @param extractContent   whether a text content handler is supplied to the extractor
 * @param extractMetadata  whether each metadata property is added as a field
 * @return the populated document, or null when the extractor returns null
 */
private Document extractAll(final InputStream in, LuceneDocumentMetadata documentMetadata,
        boolean extractContent, boolean extractMetadata) {
    ToTextContentHandler textHandler = null;
    if (extractContent) {
        textHandler = new ToTextContentHandler();
    }

    final TikaContent extracted = extractor.extract(in, textHandler);
    if (extracted == null) {
        return null;
    }

    final Document luceneDocument = new Document();
    final LuceneDocumentMetadata fieldConfig =
            documentMetadata != null ? documentMetadata : defaultDocumentMetadata;

    if (extracted.getContent() != null) {
        luceneDocument.add(getContentField(fieldConfig, extracted.getContent()));
    }

    if (extractMetadata) {
        final Metadata tikaMetadata = extracted.getMetadata();
        final String[] properties = tikaMetadata.names();
        for (int i = 0; i < properties.length; i++) {
            addField(luceneDocument, fieldConfig, properties[i], tikaMetadata.get(properties[i]));
        }
    }

    final boolean hasSource = !StringUtils.isEmpty(fieldConfig.getSource());
    if (hasSource) {
        luceneDocument.add(new StringField(fieldConfig.getSourceFieldName(),
                fieldConfig.getSource(), Store.YES));
    }

    return luceneDocument;
}
Example 11
Source File: ParseResult.java From beam with Apache License 2.0 | 5 votes |
private ParseResult(String fileLocation, String content, Metadata metadata, Throwable error) { checkArgument(fileLocation != null, "fileLocation can not be null"); checkArgument(content != null, "content can not be null"); checkArgument(metadata != null, "metadata can not be null"); this.fileLocation = fileLocation; this.content = content; this.metadata = metadata; this.metadataNames = metadata.names(); this.error = (error == null) ? null : new SerializableThrowable(error); }
Example 12
Source File: ElasticsearchSpewerTest.java From datashare with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Copies {@code metadata} into a new map keyed by metadata name, using the value returned by
 * {@code metadata.get(name)} for each name.
 *
 * @param metadata the metadata to copy
 * @return a new mutable map with one entry per metadata name
 */
private Map<String, Object> convert(Metadata metadata) {
    final Map<String, Object> converted = new HashMap<>();
    final String[] names = metadata.names();
    for (int i = 0; i < names.length; i++) {
        converted.put(names[i], metadata.get(names[i]));
    }
    return converted;
}
Example 13
Source File: TikaCallable.java From flink-crawler with Apache License 2.0 | 5 votes |
/**
 * Builds a name -> value map from {@code metadata}, using {@code metadata.get(name)} as the
 * value for each name.
 *
 * @param metadata the metadata to copy
 * @return a new mutable map with one entry per metadata name
 */
private static Map<String, String> makeMap(Metadata metadata) {
    final Map<String, String> valuesByName = new HashMap<String, String>();
    final String[] names = metadata.names();
    int index = 0;
    while (index < names.length) {
        final String name = names[index++];
        valuesByName.put(name, metadata.get(name));
    }
    return valuesByName;
}
Example 14
Source File: TikaAutoInterpreter.java From db with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Parses the file at {@code filePath} with an auto-detecting Tika parser and returns a JSON
 * object holding the extracted text under {@code "_txt"} plus one entry per metadata name.
 *
 * @param filePath path of the file to read
 * @return the JSON representation of the document
 * @throws OperationException with {@code UNRECOGNISED_DOCUMENT_FORMAT} when the file cannot be
 *         read or parsed, or when no text was extracted
 */
@Override
public JSONObject toJson(String filePath) throws OperationException {
    final AutoDetectParser detectingParser = new AutoDetectParser();
    final BodyContentHandler bodyHandler = new BodyContentHandler();
    final Metadata tikaMetadata = new Metadata();

    try (InputStream input = new FileInputStream(new File(filePath))) {
        detectingParser.parse(input, bodyHandler, tikaMetadata);
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }

    final String extractedText = bodyHandler.toString();
    final boolean hasText = extractedText != null && !extractedText.isEmpty();
    if (!hasText) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Attempting to import an empty document");
    }

    final JSONObject result = new JSONObject();
    result.put("_txt", extractedText);
    for (String name : tikaMetadata.names()) {
        result.put(name, tikaMetadata.get(name));
    }
    return result;
}
Example 15
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Prints every value of every metadata name to standard output, one {@code name : value}
 * line per value.
 *
 * @param metadata the metadata to dump
 */
public static void debug(Metadata metadata) {
    final String[] names = metadata.names();
    for (int i = 0; i < names.length; i++) {
        final String linePrefix = names[i] + " : ";
        final String[] values = metadata.getValues(names[i]);
        for (int j = 0; j < values.length; j++) {
            System.out.println(linePrefix + values[j]);
        }
    }
}
Example 16
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Prints every value of every metadata name of every list item to standard output, one
 * {@code index: name : value} line per value, where index is the item's zero-based position.
 *
 * @param list the metadata items to dump
 */
public static void debug(List<Metadata> list) {
    int position = 0;
    for (Metadata item : list) {
        final String[] names = item.names();
        for (int k = 0; k < names.length; k++) {
            final String linePrefix = position + ": " + names[k] + " : ";
            for (String value : item.getValues(names[k])) {
                System.out.println(linePrefix + value);
            }
        }
        position++;
    }
}
Example 17
Source File: ExtractMediaMetadata.java From nifi with Apache License 2.0 | 4 votes |
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs, Integer maxAttribLen) throws IOException, TikaException, SAXException { final Metadata metadata = new Metadata(); final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream); try { autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata); } finally { tikaInputStream.close(); } final Map<String, String> results = new HashMap<>(); final Pattern metadataKeyFilter = metadataKeyFilterRef.get(); final StringBuilder dataBuilder = new StringBuilder(); for (final String key : metadata.names()) { if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) { continue; } dataBuilder.setLength(0); if (metadata.isMultiValued(key)) { for (String val : metadata.getValues(key)) { if (dataBuilder.length() > 1) { dataBuilder.append(", "); } if (dataBuilder.length() + val.length() < maxAttribLen) { dataBuilder.append(val); } else { dataBuilder.append("..."); break; } } } else { dataBuilder.append(metadata.get(key)); } if (prefix == null) { results.put(key, dataBuilder.toString().trim()); } else { results.put(prefix + key, dataBuilder.toString().trim()); } // cutoff at max if provided if (maxAttribs != null && results.size() >= maxAttribs) { break; } } return results; }
Example 18
Source File: MP3Reader.java From red5-io with Apache License 2.0 | 4 votes |
/** * Creates reader from file input stream * * @param file * file input * @throws IOException * on IO error */ public MP3Reader(File file) throws IOException { this.file = file; fis = new FileInputStream(file); try { // parse the ID3 info BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); // MP3 parser Mp3Parser parser = new Mp3Parser(); parser.parse(fis, handler, metadata, null); log.debug("Contents of the document: {}", handler.toString()); // create meta data holder metaData = new MetaData(); String val = null; String[] metadataNames = metadata.names(); for (String name : metadataNames) { val = metadata.get(name); log.debug("Meta name: {} value: {}", name, val); if ("xmpDM:artist".equals(name)) { metaData.setArtist(val); } else if ("xmpDM:album".equals(name)) { metaData.setAlbum(val); } else if ("title".equals(name)) { metaData.setSongName(val); } else if ("xmpDM:genre".equals(name)) { metaData.setGenre(val); } else if ("xmpDM:logComment".equals(name)) { metaData.setComment(val); } else if ("xmpDM:trackNumber".equals(name)) { metaData.setTrack(val); } else if ("xmpDM:releaseDate".equals(name)) { metaData.setYear(val); } else if ("xmpDM:duration".equals(name) || "duration".equals(name)) { metaData.setDuration(val); } else if ("xmpDM:audioSampleRate".equals(name) || "samplerate".equals(name)) { metaData.setSampleRate(val); } else if ("channels".equals(name)) { metaData.setChannels(val); } } /* * //send album image if included List<Artwork> tagFieldList = idTag.getArtworkList(); if (tagFieldList == null || tagFieldList.isEmpty()) { log.debug("No cover art was found"); } * else { Artwork imageField = tagFieldList.get(0); log.debug("Picture type: {}", imageField.getPictureType()); FrameBodyAPIC imageFrameBody = new FrameBodyAPIC(); * imageFrameBody.setImageData(imageField.getBinaryData()); if (!imageFrameBody.isImageUrl()) { byte[] imageBuffer = (byte[]) * imageFrameBody.getObjectValue(DataTypes.OBJ_PICTURE_DATA); //set the 
cover image on the metadata metaData.setCovr(imageBuffer); // Create tag for onImageData event IoBuffer buf * = IoBuffer.allocate(imageBuffer.length); buf.setAutoExpand(true); Output out = new Output(buf); out.writeString("onImageData"); Map<Object, Object> props = new HashMap<Object, * Object>(); props.put("trackid", 1); props.put("data", imageBuffer); out.writeMap(props); buf.flip(); //Ugh i hate flash sometimes!! //Error #2095: flash.net.NetStream was unable * to invoke callback onImageData. ITag result = new Tag(IoConstants.TYPE_METADATA, 0, buf.limit(), null, 0); result.setBody(buf); //add to first frames firstTags.add(result); } } * } else { log.info("File did not contain ID3v2 data: {}", file.getName()); } */ } catch (Exception e) { log.error("MP3Reader {}", e); } // ensure we have a valid sample rate checkValidHeader(); // get the total bytes / file size fileSize = file.length(); log.debug("File size: {}", fileSize); // analyze keyframes data analyzeKeyFrames(); // create file metadata object firstTags.addFirst(createFileMeta()); log.trace("File input stream - open: {} position: {}", fis.getChannel().isOpen(), fis.getChannel().position()); // create a channel for reading fileChannel = fis.getChannel(); }
Example 19
Source File: S3River.java From es-amazon-s3-river with Apache License 2.0 | 4 votes |
/** Index an Amazon S3 file by retrieving its content and building the suitable Json content. */ private String indexFile(S3ObjectSummary summary){ if (logger.isDebugEnabled()){ logger.debug("Trying to index '{}'", summary.getKey()); } try{ // Build a unique id from S3 unique summary key. String fileId = buildIndexIdFromS3Key(summary.getKey()); if (feedDefinition.isJsonSupport()){ esIndex(indexName, typeName, summary.getKey(), s3.getContent(summary)); } else { byte[] fileContent = s3.getContent(summary); if (fileContent != null) { // Compute number of chars to index. // see https://github.com/lbroudoux/es-amazon-s3-river/issues/36 int indexedChars = 100000; if (feedDefinition.getIndexedCharsRatio() > 0) { indexedChars = (int) Math.round(fileContent.length * feedDefinition.getIndexedCharsRatio()); } // Parse content using Tika directly. Metadata fileMetadata = new Metadata(); String parsedContent = TikaHolder.tika().parseToString( new BytesStreamInput(fileContent), fileMetadata, indexedChars); // Store Tika metadatas into a map. 
Map<String, Object> fileMetadataMap = new HashMap<String, Object>(); for (String key : fileMetadata.names()) { fileMetadataMap.put(key, fileMetadata.get(key)); } esIndex(indexName, typeName, fileId, jsonBuilder() .startObject() .field(S3RiverUtil.DOC_FIELD_TITLE, summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1)) .field(S3RiverUtil.DOC_FIELD_MODIFIED_DATE, summary.getLastModified().getTime()) .field(S3RiverUtil.DOC_FIELD_SOURCE_URL, s3.getDownloadUrl(summary, feedDefinition)) .field(S3RiverUtil.DOC_FIELD_METADATA, s3.getS3UserMetadata(summary.getKey())) .startObject("file") .field("_name", summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1)) .field("title", summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1)) .field("file", parsedContent) .field("metadata", fileMetadataMap) .endObject() .endObject() ); return fileId; } } } catch (Exception e) { logger.warn("Can not index " + summary.getKey() + " : " + e.getMessage()); } return null; }
Example 20
Source File: DocUtils.java From geoportal-server-harvester with Apache License 2.0 | 4 votes |
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException { // Input & Output Variables ByteArrayInputStream base_input = new ByteArrayInputStream(file_bytes); byte[] xml_bytes = null; // Tika Parser Objects Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try { // Populate Metadata Object with Tika Parser parser.parse(base_input, handler, metadata, context); // Container & Writer for Metadata Properties meta_props = new Properties(); StringWriter sw = new StringWriter(); // Put Tika Metadata in Properties for(String name : metadata.names()) { if (!metadata.get(name).isEmpty()) { meta_props.put(name, metadata.get(name)); } } meta_props.store(sw, "Tika Values"); // Expected Harvester Properties String meta_descr = metadata.get(TikaCoreProperties.DESCRIPTION); String meta_modif = metadata.get(TikaCoreProperties.MODIFIED); String meta_title = metadata.get(TikaCoreProperties.TITLE); // Default Label for Undefined Tika Properties DateFormat date_format = new SimpleDateFormat("yyyy/MM/dd"); Date date = new Date(); String date_today = date_format.format(date); String tika_label = String.format("TIKA_%s", date_today); // Check For Null Values & Set Defaults if (meta_descr == null) { meta_props.put(WKAConstants.WKA_DESCRIPTION, "" + sw.toString()); } else { meta_props.put(WKAConstants.WKA_DESCRIPTION, meta_descr); } if (meta_modif == null) { meta_props.put(WKAConstants.WKA_MODIFIED, tika_label); } else { meta_props.put(WKAConstants.WKA_MODIFIED, meta_modif); } if (meta_title == null) { meta_props.put(WKAConstants.WKA_TITLE, file_name); } else { meta_props.put(WKAConstants.WKA_TITLE, meta_title); } // Build XML as Bytes MapAttribute attr = AttributeUtils.fromProperties(meta_props); Document document = new SimpleDcMetaBuilder().create(attr); xml_bytes = XmlUtils.toString(document).getBytes("UTF-8"); } catch 
(Exception ex) { LOG.error(String.format("Error reading data."), ex); } finally { base_input.close(); } return xml_bytes; }