Java Code Examples for org.apache.tika.metadata.Metadata#set()
The following examples show how to use org.apache.tika.metadata.Metadata#set().
You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
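Before the project examples, here is a minimal sketch of the call itself (the key and values below are invented for illustration): Metadata#set() stores exactly one value for a name, replacing whatever was there before, whereas Metadata#add() appends an additional value.

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

public class MetadataSetSketch {
    public static void main(String[] args) {
        Metadata metadata = new Metadata();

        // Free-form String keys: the second set() call replaces the first value.
        metadata.set("example-key", "first");
        metadata.set("example-key", "second");
        System.out.println(metadata.get("example-key")); // prints "second"

        // Typed Property keys from the Tika metadata interfaces behave the same way.
        metadata.set(TikaCoreProperties.TITLE, "An invented title");
        System.out.println(metadata.get(TikaCoreProperties.TITLE));
    }
}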
Example 1
Source File: ReplayCrawl.java From ache with Apache License 2.0 | 6 votes |
private Metadata createHeadersMetadata(Page page) {
    Map<String, List<String>> headers = page.getResponseHeaders();
    Metadata metadata = new Metadata();
    for (Entry<String, List<String>> header : headers.entrySet()) {
        for (String value : header.getValue()) {
            metadata.set(header.getKey(), value);
        }
    }
    return metadata;
}
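Because set() keeps only one value per name, the loop above ends up with just the last value of any multi-valued response header. If every value should be preserved, Metadata#add() is the multi-valued counterpart; a small sketch (the header name and values are made up):

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.tika.metadata.Metadata;

public class MultiValuedHeaderSketch {
    // Hypothetical helper: keep all values of each header instead of only the last one.
    static Metadata toMetadata(Map<String, List<String>> headers) {
        Metadata metadata = new Metadata();
        for (Map.Entry<String, List<String>> header : headers.entrySet()) {
            for (String value : header.getValue()) {
                metadata.add(header.getKey(), value); // add() appends; set() would overwrite
            }
        }
        return metadata;
    }

    public static void main(String[] args) {
        Metadata metadata = toMetadata(Map.of("Set-Cookie", Arrays.asList("a=1", "b=2")));
        System.out.println(Arrays.toString(metadata.getValues("Set-Cookie"))); // [a=1, b=2]
    }
}

Whether the single-value or multi-value behaviour is wanted depends on how the crawler uses the headers downstream.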
Example 2
Source File: UpdatableInputStreamDigester.java From extract with MIT License | 6 votes |
/**
 * @param is input stream to read from
 * @param metadata metadata for reporting the digest
 * @return whether or not this finished the input stream
 * @throws IOException
 */
private boolean digestStream(InputStream is, Metadata metadata) throws IOException {
    byte[] digestBytes;
    MessageDigest messageDigest = newMessageDigest();
    updateDigest(messageDigest, new ByteArrayInputStream(getDigestUpdateModifier().getBytes()));
    updateDigest(messageDigest, is);
    digestBytes = messageDigest.digest();
    if (is instanceof SimpleBoundedInputStream) {
        if (((SimpleBoundedInputStream) is).hasHitBound()) {
            return false;
        }
    }
    metadata.set(getMetadataKey(), encoder.encode(digestBytes));
    return true;
}
Example 3
Source File: SpewerTest.java From extract with MIT License | 6 votes |
@Test
public void testWritesISO8601Dates() throws IOException {
    final SpewerStub spewer = new SpewerStub();
    final TikaDocument tikaDocument = factory.create("test.txt");
    final Metadata metadata = tikaDocument.getMetadata();
    final FieldNames fields = spewer.getFields();

    // TODO: this should go in a separate test for the MetadataTransformer.
    final String[] dates = {"2011-12-03+01:00", "2015-06-03", "Tue Jan 27 17:03:21 2004", "19106-06-07T08:00:00Z"};
    final String[] isoDates = {"2011-12-03T12:00:00Z", "2015-06-03T12:00:00Z", "2004-01-27T17:03:21Z", "+19106-06-07T08:00:00Z"};
    int i = 0;

    for (String date: dates) {
        metadata.set(Office.CREATION_DATE, date);
        spewer.writeMetadata(tikaDocument);

        Assert.assertEquals(date, spewer.metadata.get(fields.forMetadata(Office.CREATION_DATE.getName())));
        Assert.assertEquals(isoDates[i++],
                spewer.metadata.get(fields.forMetadataISODate(Office.CREATION_DATE.getName())));

        // Reset the store of written metadata on each iteration.
        spewer.close();
    }
}
Example 4
Source File: DefaultResultsVisitor.java From allure2 with Apache License 2.0 | 6 votes |
public static String probeContentType(final InputStream is, final String name) {
    try (InputStream stream = new BufferedInputStream(is)) {
        final Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
        return getDefaultMimeTypes().detect(stream, metadata).toString();
    } catch (IOException e) {
        LOGGER.warn("Couldn't detect the media type of attachment {} {}", name, e);
        return WILDCARD;
    }
}
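Setting Metadata.RESOURCE_NAME_KEY before detection, as above, lets the extension-based (glob) rules contribute alongside magic-byte detection. A small sketch of the effect, with invented content and file name (the exact result can vary by Tika version):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeTypes;

public class ResourceNameHintSketch {
    public static void main(String[] args) throws IOException {
        byte[] bytes = "a,b,c\n1,2,3\n".getBytes(StandardCharsets.UTF_8);

        // Without a hint the detector can only inspect the bytes (likely text/plain).
        Metadata noHint = new Metadata();
        System.out.println(MimeTypes.getDefaultMimeTypes()
                .detect(new ByteArrayInputStream(bytes), noHint));

        // With a resource name, the .csv extension can refine the result (likely text/csv).
        Metadata withHint = new Metadata();
        withHint.set(Metadata.RESOURCE_NAME_KEY, "report.csv");
        System.out.println(MimeTypes.getDefaultMimeTypes()
                .detect(new ByteArrayInputStream(bytes), withHint));
    }
}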
Example 5
Source File: ImageConverter.java From openmeetings with Apache License 2.0 | 6 votes |
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
    ProcessResult res = new ProcessResult();
    res.setProcess("get image dimensions :: " + f.getId());
    final Parser parser = new ImageParser();
    try (InputStream is = new FileInputStream(img)) {
        Metadata metadata = new Metadata();
        metadata.set(CONTENT_TYPE, mime);
        parser.parse(is, new DefaultHandler(), metadata, new ParseContext());
        f.setWidth(Integer.valueOf(metadata.get(TIFF.IMAGE_WIDTH)));
        f.setHeight(Integer.valueOf(metadata.get(TIFF.IMAGE_LENGTH)));
        res.setExitCode(ZERO);
    } catch (Exception e) {
        log.error("Error while getting dimensions", e);
        res.setError("Error while getting dimensions");
        res.setException(e.getMessage());
        res.setExitCode(-1);
    }
    return res;
}
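Examples like this one use the same Metadata object in both directions: set() supplies hints before parsing, and the parser writes extracted properties back into the same object. A minimal round-trip sketch with AutoDetectParser (input bytes and declared type are invented; assumes the standard Tika parsers are on the classpath):

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.helpers.DefaultHandler;

public class ParseRoundTripSketch {
    public static void main(String[] args) throws Exception {
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // Hint in: declare the content type up front.
        metadata.set(Metadata.CONTENT_TYPE, "text/plain");

        try (InputStream in = new ByteArrayInputStream("hello".getBytes(StandardCharsets.UTF_8))) {
            parser.parse(in, new DefaultHandler(), metadata, new ParseContext());
        }

        // Values out: the parser has filled the same Metadata object during parsing.
        for (String name : metadata.names()) {
            System.out.println(name + " = " + metadata.get(name));
        }
    }
}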
Example 6
Source File: TransportAmazonLambdaS3.java From github-bucket with ISC License | 5 votes |
@Override
void writeFile(final String path, final byte[] data) throws IOException {
    ObjectMetadata bucketMetadata = new ObjectMetadata();
    bucketMetadata.setContentMD5(Md5Utils.md5AsBase64(data));
    bucketMetadata.setContentLength(data.length);
    // Give Tika a few hints for the content detection
    Metadata tikaMetadata = new Metadata();
    tikaMetadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(FilenameUtils.normalize(path)));
    // Fire!
    try (InputStream bis = TikaInputStream.get(data, tikaMetadata)) {
        bucketMetadata.setContentType(TIKA_DETECTOR.detect(bis, tikaMetadata).toString());
        s3.putObject(bucket, resolveKey(path), bis, bucketMetadata);
    }
}
Example 7
Source File: RepositoryS3.java From github-bucket with ISC License | 5 votes |
private boolean walk(Iterator<S3ObjectSummary> iter, ObjectId file, String path) throws IOException {
    byte[] content;
    byte[] newHash;
    LOG.debug("Start processing file: {}", path);
    try (DigestInputStream is = new DigestInputStream(repository.open(file).openStream(), DigestUtils.getMd5Digest())) {
        // Get content
        content = IOUtils.toByteArray(is);
        // Get hash
        newHash = is.getMessageDigest().digest();
    }
    if (isUploadFile(iter, path, Hex.encodeHexString(newHash))) {
        LOG.info("Uploading file: {}", path);
        ObjectMetadata bucketMetadata = new ObjectMetadata();
        bucketMetadata.setContentMD5(Base64.encodeAsString(newHash));
        bucketMetadata.setContentLength(content.length);
        // Give Tika a few hints for the content detection
        Metadata tikaMetadata = new Metadata();
        tikaMetadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(FilenameUtils.normalize(path)));
        // Fire!
        try (InputStream bis = TikaInputStream.get(content, tikaMetadata)) {
            bucketMetadata.setContentType(TIKA_DETECTOR.detect(bis, tikaMetadata).toString());
            s3.putObject(bucket.getName(), path, bis, bucketMetadata);
            return true;
        }
    }
    LOG.info("Skipping file (same checksum): {}", path);
    return false;
}
Example 8
Source File: TikaIO.java From beam with Apache License 2.0 | 5 votes |
/** Sets the input metadata for {@link Parser#parse}. */
public ParseFiles withInputMetadata(Metadata metadata) {
    Metadata inputMetadata = this.getInputMetadata();
    if (inputMetadata != null) {
        for (String name : metadata.names()) {
            inputMetadata.set(name, metadata.get(name));
        }
    } else {
        inputMetadata = metadata;
    }
    return toBuilder().setInputMetadata(inputMetadata).build();
}
Example 9
Source File: NodeTika.java From node-tika with MIT License | 5 votes |
private static void fillMetadata(Metadata metadata, String contentType, String uri) {
    // Set the file name.
    if (uri != null) {
        metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());
    }

    // Normalise the content-type.
    contentType = normalizeContentType(contentType);

    // Set the content-type.
    if (contentType != null) {
        metadata.add(HttpHeaders.CONTENT_TYPE, contentType);
    }
}
Example 10
Source File: TikaIOTest.java From beam with Apache License 2.0 | 5 votes |
@ProcessElement
public void processElement(ProcessContext c) {
    ParseResult result = c.element();
    Metadata m = new Metadata();
    // Files contain many metadata properties. This function drops all but the "Author"
    // property manually added to "apache-beam-tika.odt" resource only to make
    // the tests simpler
    if (result.getFileLocation().endsWith("valid/apache-beam-tika.odt")) {
        m.set("Author", result.getMetadata().get("Author"));
    }
    ParseResult newResult = ParseResult.success(result.getFileLocation(), result.getContent(), m);
    c.output(newResult);
}
Example 11
Source File: DirectoryScanner.java From importer-exporter with Apache License 2.0 | 5 votes |
private MediaType getMediaType(Path file) {
    try (InputStream stream = TikaInputStream.get(file)) {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, file.toString());
        return tikaConfig.getDetector().detect(stream, metadata);
    } catch (IOException e) {
        return MediaType.EMPTY;
    }
}
Example 12
Source File: FallbackParser.java From extract with MIT License | 5 votes |
@Override
public void parse(final InputStream stream, final ContentHandler handler, final Metadata metadata,
                  final ParseContext context) throws SAXException, IOException, TikaException {
    final Parser parser;
    final long size;
    String value = metadata.get(Metadata.CONTENT_LENGTH);

    if (null != value && !value.isEmpty()) {
        size = Long.valueOf(value);
    } else {
        try (final TikaInputStream tis = TikaInputStream.get(stream)) {
            size = tis.getLength();
        }
        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
    }

    // If the file is not empty, throw a parse error.
    // Otherwise, output an empty document.
    if (size > 0) {
        parser = ErrorParser.INSTANCE;
    } else {
        metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream");
        parser = EmptyParser.INSTANCE;
    }

    parser.parse(stream, handler, metadata, context);
}
Example 13
Source File: TikaFilePlace.java From emissary with Apache License 2.0 | 5 votes |
/**
 * Use filename to support the mime type detection, if not disabled in TikaFilePlace.cfg
 *
 * @param d the IBaseDataObject payload to evaluate
 * @param metadata from the file, for Tika to process
 */
private void appendFilenameMimeTypeSupport(IBaseDataObject d, Metadata metadata) {
    if (includeFilenameMimeType) {
        logger.debug("Filename support for Mime Type detection is enabled");
        metadata.set(Metadata.RESOURCE_NAME_KEY, d.getFilename());
    }
}
Example 14
Source File: TikaAutoMetadataExtracterTest.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Test MNT-15219 Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may
 * cause OutOfMemory in Tika
 * Note - doesn't use extractFromMimetype
 */
public void testParsingOfShapesInXLSXFiles() throws Exception {
    AutoDetectParser ap = new AutoDetectParser();
    String filename = "dmsu1332-reproduced.xlsx";
    URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
    File file = new File(url.getFile());

    // Cheat and ask Tika for the mime type!
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    MediaType mt = ap.getDetector().detect(TikaInputStream.get(file), metadata);
    String mimetype = mt.toString();

    if (logger.isDebugEnabled()) {
        logger.debug("Detected mimetype " + mimetype + " for quick test file " + filename);
    }

    // Have it processed
    // see MNT-15219 and REPO-3251
    Map<QName, Serializable> properties = extractFromFile(file, mimetype);

    // check we got something
    assertFalse("extractFromMimetype should return at least some properties, none found for " + mimetype +
            " - " + filename, properties.isEmpty());

    if (properties.containsKey(ContentModel.PROP_AUTHOR)) {
        assertEquals("Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
                "Udintsev, Anton (external - Project)",
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
    } else {
        fail("Expected one property out of " + ContentModel.PROP_CREATOR + " and " + ContentModel.PROP_AUTHOR +
                " but found neither of them for " + mimetype);
    }

    // Ensure that we can also get things which are standard
    // Tika metadata properties, if we so choose to
    assertTrue("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype,
            properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY));
    assertEquals("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
            mimetype,
            DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
}
Example 15
Source File: TikaAutoMetadataExtracterTest.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Test several different files
 * Note - doesn't use extractFromMimetype
 */
public void testSupportedMimetypes() throws Exception {
    String[] testFiles = new String[] {
        ".doc", ".docx", ".xls", ".xlsx",
        ".ppt", ".pptx",
        //".vsd", // Our sample file lacks suitable metadata
        "2010.dwg",
        "2003.mpp", "2007.mpp",
        ".pdf",
        ".odt",
    };

    AutoDetectParser ap = new AutoDetectParser();
    for (String fileBase : testFiles) {
        String filename = "quick" + fileBase;
        URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
        File file = new File(url.getFile());

        // Cheat and ask Tika for the mime type!
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
        MediaType mt = ap.getDetector().detect(TikaInputStream.get(file), metadata);
        String mimetype = mt.toString();

        if (logger.isDebugEnabled()) {
            logger.debug("Detected mimetype " + mimetype + " for quick test file " + filename);
        }

        // Have it processed
        Map<QName, Serializable> properties = extractFromFile(file, mimetype);

        // check we got something
        assertFalse("extractFromMimetype should return at least some properties, " +
                "none found for " + mimetype + " - " + filename, properties.isEmpty());

        // check common metadata
        testCommonMetadata(mimetype, properties);
        // check file-type specific metadata
        testFileSpecificMetadata(mimetype, properties);
    }
}
Example 16
Source File: HTMLRenderingEngine.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Asks Tika to translate the contents into HTML
 */
private void generateHTML(Parser p, RenderingContext context) {
    ContentReader contentReader = context.makeContentReader();

    // Setup things to parse with
    StringWriter sw = new StringWriter();
    ContentHandler handler = buildContentHandler(sw, context);

    // Tell Tika what we're dealing with
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, contentReader.getMimetype());
    metadata.set(Metadata.RESOURCE_NAME_KEY,
            nodeService.getProperty(context.getSourceNode(), ContentModel.PROP_NAME).toString());

    // Our parse context needs to extract images
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, new TikaImageExtractingParser(context));

    // Parse
    try {
        p.parse(contentReader.getContentInputStream(), handler, metadata, parseContext);
    } catch (Exception e) {
        throw new RenditionServiceException("Tika HTML Conversion Failed", e);
    }

    // As a string
    String html = sw.toString();

    // If we're doing body-only, remove all the html namespaces
    // that will otherwise clutter up the document
    boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false);
    if (bodyOnly) {
        html = html.replaceAll("<\\?xml.*?\\?>", "");
        html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"", "<p");
        html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"", "<h\\1");
        html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"", "<div");
        html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"", "<table");
        html = html.replaceAll(" ", "");
    }

    // Save it
    ContentWriter contentWriter = context.makeContentWriter();
    contentWriter.setMimetype("text/html");
    contentWriter.putContent(html);
}
Example 17
Source File: DocumentFactory.java From extract with MIT License | 4 votes |
public TikaDocument create(final String id, final Path path, final long size) {
    final Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
    return new TikaDocument(id, identifier, path, metadata);
}
Example 18
Source File: TikaIOTest.java From beam with Apache License 2.0 | 4 votes |
private static Metadata getOdtMetadata() {
    Metadata m = new Metadata();
    m.set("Author", "BeamTikaUser");
    return m;
}
Example 19
Source File: TikaTest.java From tika-server with Apache License 2.0 | 4 votes |
protected XMLResult getXML(String filePath, Parser parser) throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, filePath);
    return getXML(filePath, parser, metadata);
}