Java Code Examples for org.apache.tika.metadata.Metadata#set()
The following examples show how to use org.apache.tika.metadata.Metadata#set().
You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
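Before the project examples, here is a minimal sketch of the call itself (the key and values below are invented for illustration): Metadata#set() stores exactly one value for a name, replacing whatever was there before, whereas Metadata#add() appends an additional value.

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

public class MetadataSetSketch {
    public static void main(String[] args) {
        Metadata metadata = new Metadata();

        // Free-form String keys: the second set() call replaces the first value.
        metadata.set("example-key", "first");
        metadata.set("example-key", "second");
        System.out.println(metadata.get("example-key")); // prints "second"

        // Typed Property keys from the Tika metadata interfaces behave the same way.
        metadata.set(TikaCoreProperties.TITLE, "An invented title");
        System.out.println(metadata.get(TikaCoreProperties.TITLE));
    }
}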
Example 1
Source File: ReplayCrawl.java From ache with Apache License 2.0 | 6 votes |
private Metadata createHeadersMetadata(Page page) {
    Map<String, List<String>> headers = page.getResponseHeaders();
    Metadata metadata = new Metadata();
    for (Entry<String, List<String>> header : headers.entrySet()) {
        for (String value : header.getValue()) {
            metadata.set(header.getKey(), value);
        }
    }
    return metadata;
}
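Because set() keeps only one value per name, the loop above ends up with just the last value of any multi-valued response header. If every value should be preserved, Metadata#add() is the multi-valued counterpart; a small sketch (the header name and values are made up):

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.tika.metadata.Metadata;

public class MultiValuedHeaderSketch {
    // Hypothetical helper: keep all values of each header instead of only the last one.
    static Metadata toMetadata(Map<String, List<String>> headers) {
        Metadata metadata = new Metadata();
        for (Map.Entry<String, List<String>> header : headers.entrySet()) {
            for (String value : header.getValue()) {
                metadata.add(header.getKey(), value); // add() appends; set() would overwrite
            }
        }
        return metadata;
    }

    public static void main(String[] args) {
        Metadata metadata = toMetadata(Map.of("Set-Cookie", Arrays.asList("a=1", "b=2")));
        System.out.println(Arrays.toString(metadata.getValues("Set-Cookie"))); // [a=1, b=2]
    }
}

Whether the single-value or multi-value behaviour is wanted depends on how the crawler uses the headers downstream.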
Example 2
Source File: UpdatableInputStreamDigester.java From extract with MIT License | 6 votes |
/**
 * @param is input stream to read from
 * @param metadata metadata for reporting the digest
 * @return whether or not this finished the input stream
 * @throws IOException
 */
private boolean digestStream(InputStream is, Metadata metadata) throws IOException {
    byte[] digestBytes;
    MessageDigest messageDigest = newMessageDigest();
    updateDigest(messageDigest, new ByteArrayInputStream(getDigestUpdateModifier().getBytes()));
    updateDigest(messageDigest, is);
    digestBytes = messageDigest.digest();
    if (is instanceof SimpleBoundedInputStream) {
        if (((SimpleBoundedInputStream) is).hasHitBound()) {
            return false;
        }
    }
    metadata.set(getMetadataKey(), encoder.encode(digestBytes));
    return true;
}
Example 3
Source File: SpewerTest.java From extract with MIT License | 6 votes |
@Test
public void testWritesISO8601Dates() throws IOException {
    final SpewerStub spewer = new SpewerStub();
    final TikaDocument tikaDocument = factory.create("test.txt");
    final Metadata metadata = tikaDocument.getMetadata();
    final FieldNames fields = spewer.getFields();

    // TODO: this should go in a separate test for the MetadataTransformer.
    final String[] dates = {"2011-12-03+01:00", "2015-06-03", "Tue Jan 27 17:03:21 2004", "19106-06-07T08:00:00Z"};
    final String[] isoDates = {"2011-12-03T12:00:00Z", "2015-06-03T12:00:00Z", "2004-01-27T17:03:21Z", "+19106-06-07T08:00:00Z"};
    int i = 0;

    for (String date: dates) {
        metadata.set(Office.CREATION_DATE, date);
        spewer.writeMetadata(tikaDocument);

        Assert.assertEquals(date, spewer.metadata.get(fields.forMetadata(Office.CREATION_DATE.getName())));
        Assert.assertEquals(isoDates[i++],
                spewer.metadata.get(fields.forMetadataISODate(Office.CREATION_DATE.getName())));

        // Reset the store of written metadata on each iteration.
        spewer.close();
    }
}
Example 4
Source File: DefaultResultsVisitor.java From allure2 with Apache License 2.0 | 6 votes |
public static String probeContentType(final InputStream is, final String name) {
    try (InputStream stream = new BufferedInputStream(is)) {
        final Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
        return getDefaultMimeTypes().detect(stream, metadata).toString();
    } catch (IOException e) {
        LOGGER.warn("Couldn't detect the media type of attachment {} {}", name, e);
        return WILDCARD;
    }
}
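Setting Metadata.RESOURCE_NAME_KEY before detection, as above, lets the extension-based (glob) rules contribute alongside magic-byte detection. A small sketch of the effect, with invented content and file name (the exact result can vary by Tika version):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeTypes;

public class ResourceNameHintSketch {
    public static void main(String[] args) throws IOException {
        byte[] bytes = "a,b,c\n1,2,3\n".getBytes(StandardCharsets.UTF_8);

        // Without a hint the detector can only inspect the bytes (likely text/plain).
        Metadata noHint = new Metadata();
        System.out.println(MimeTypes.getDefaultMimeTypes()
                .detect(new ByteArrayInputStream(bytes), noHint));

        // With a resource name, the .csv extension can refine the result (likely text/csv).
        Metadata withHint = new Metadata();
        withHint.set(Metadata.RESOURCE_NAME_KEY, "report.csv");
        System.out.println(MimeTypes.getDefaultMimeTypes()
                .detect(new ByteArrayInputStream(bytes), withHint));
    }
}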
Example 5
Source File: ImageConverter.java From openmeetings with Apache License 2.0 | 6 votes |
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
    ProcessResult res = new ProcessResult();
    res.setProcess("get image dimensions :: " + f.getId());
    final Parser parser = new ImageParser();
    try (InputStream is = new FileInputStream(img)) {
        Metadata metadata = new Metadata();
        metadata.set(CONTENT_TYPE, mime);
        parser.parse(is, new DefaultHandler(), metadata, new ParseContext());
        f.setWidth(Integer.valueOf(metadata.get(TIFF.IMAGE_WIDTH)));
        f.setHeight(Integer.valueOf(metadata.get(TIFF.IMAGE_LENGTH)));
        res.setExitCode(ZERO);
    } catch (Exception e) {
        log.error("Error while getting dimensions", e);
        res.setError("Error while getting dimensions");
        res.setException(e.getMessage());
        res.setExitCode(-1);
    }
    return res;
}
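Examples like this one use the same Metadata object in both directions: set() supplies hints before parsing, and the parser writes extracted properties back into the same object. A minimal round-trip sketch with AutoDetectParser (input bytes and declared type are invented; assumes the standard Tika parsers are on the classpath):

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.helpers.DefaultHandler;

public class ParseRoundTripSketch {
    public static void main(String[] args) throws Exception {
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // Hint in: declare the content type up front.
        metadata.set(Metadata.CONTENT_TYPE, "text/plain");

        try (InputStream in = new ByteArrayInputStream("hello".getBytes(StandardCharsets.UTF_8))) {
            parser.parse(in, new DefaultHandler(), metadata, new ParseContext());
        }

        // Values out: the parser has filled the same Metadata object during parsing.
        for (String name : metadata.names()) {
            System.out.println(name + " = " + metadata.get(name));
        }
    }
}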
Example 6
Source File: TransportAmazonLambdaS3.java From github-bucket with ISC License | 5 votes |
@Override
void writeFile(final String path, final byte[] data) throws IOException {
    ObjectMetadata bucketMetadata = new ObjectMetadata();
    bucketMetadata.setContentMD5(Md5Utils.md5AsBase64(data));
    bucketMetadata.setContentLength(data.length);
    // Give Tika a few hints for the content detection
    Metadata tikaMetadata = new Metadata();
    tikaMetadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(FilenameUtils.normalize(path)));
    // Fire!
    try (InputStream bis = TikaInputStream.get(data, tikaMetadata)) {
        bucketMetadata.setContentType(TIKA_DETECTOR.detect(bis, tikaMetadata).toString());
        s3.putObject(bucket, resolveKey(path), bis, bucketMetadata);
    }
}
Example 7
Source File: RepositoryS3.java From github-bucket with ISC License | 5 votes |
private boolean walk(Iterator<S3ObjectSummary> iter, ObjectId file, String path) throws IOException {
    byte[] content;
    byte[] newHash;
    LOG.debug("Start processing file: {}", path);
    try (DigestInputStream is = new DigestInputStream(repository.open(file).openStream(), DigestUtils.getMd5Digest())) {
        // Get content
        content = IOUtils.toByteArray(is);
        // Get hash
        newHash = is.getMessageDigest().digest();
    }
    if (isUploadFile(iter, path, Hex.encodeHexString(newHash))) {
        LOG.info("Uploading file: {}", path);
        ObjectMetadata bucketMetadata = new ObjectMetadata();
        bucketMetadata.setContentMD5(Base64.encodeAsString(newHash));
        bucketMetadata.setContentLength(content.length);
        // Give Tika a few hints for the content detection
        Metadata tikaMetadata = new Metadata();
        tikaMetadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(FilenameUtils.normalize(path)));
        // Fire!
        try (InputStream bis = TikaInputStream.get(content, tikaMetadata)) {
            bucketMetadata.setContentType(TIKA_DETECTOR.detect(bis, tikaMetadata).toString());
            s3.putObject(bucket.getName(), path, bis, bucketMetadata);
            return true;
        }
    }
    LOG.info("Skipping file (same checksum): {}", path);
    return false;
}
Example 8
Source File: TikaIO.java From beam with Apache License 2.0 | 5 votes |
/** Sets the input metadata for {@link Parser#parse}. */
public ParseFiles withInputMetadata(Metadata metadata) {
    Metadata inputMetadata = this.getInputMetadata();
    if (inputMetadata != null) {
        for (String name : metadata.names()) {
            inputMetadata.set(name, metadata.get(name));
        }
    } else {
        inputMetadata = metadata;
    }
    return toBuilder().setInputMetadata(inputMetadata).build();
}
Example 9
Source File: NodeTika.java From node-tika with MIT License | 5 votes |
private static void fillMetadata(Metadata metadata, String contentType, String uri) {
    // Set the file name.
    if (uri != null) {
        metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());
    }

    // Normalise the content-type.
    contentType = normalizeContentType(contentType);

    // Set the content-type.
    if (contentType != null) {
        metadata.add(HttpHeaders.CONTENT_TYPE, contentType);
    }
}
Example 10
Source File: TikaIOTest.java From beam with Apache License 2.0 | 5 votes |
@ProcessElement
public void processElement(ProcessContext c) {
    ParseResult result = c.element();
    Metadata m = new Metadata();
    // Files contain many metadata properties. This function drops all but the "Author"
    // property manually added to "apache-beam-tika.odt" resource only to make
    // the tests simpler
    if (result.getFileLocation().endsWith("valid/apache-beam-tika.odt")) {
        m.set("Author", result.getMetadata().get("Author"));
    }
    ParseResult newResult = ParseResult.success(result.getFileLocation(), result.getContent(), m);
    c.output(newResult);
}
Example 11
Source File: DirectoryScanner.java From importer-exporter with Apache License 2.0 | 5 votes |
private MediaType getMediaType(Path file) {
    try (InputStream stream = TikaInputStream.get(file)) {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, file.toString());
        return tikaConfig.getDetector().detect(stream, metadata);
    } catch (IOException e) {
        return MediaType.EMPTY;
    }
}
Example 12
Source File: FallbackParser.java From extract with MIT License | 5 votes |
@Override
public void parse(final InputStream stream, final ContentHandler handler, final Metadata metadata,
                  final ParseContext context) throws SAXException, IOException, TikaException {
    final Parser parser;
    final long size;
    String value = metadata.get(Metadata.CONTENT_LENGTH);

    if (null != value && !value.isEmpty()) {
        size = Long.valueOf(value);
    } else {
        try (final TikaInputStream tis = TikaInputStream.get(stream)) {
            size = tis.getLength();
        }
        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
    }

    // If the file is not empty, throw a parse error.
    // Otherwise, output an empty document.
    if (size > 0) {
        parser = ErrorParser.INSTANCE;
    } else {
        metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream");
        parser = EmptyParser.INSTANCE;
    }

    parser.parse(stream, handler, metadata, context);
}
Example 13
Source File: TikaFilePlace.java From emissary with Apache License 2.0 | 5 votes |
/**
 * Use filename to support the mime type detection, if not disabled in TikaFilePlace.cfg
 *
 * @param d the IBaseDataObject payload to evaluate
 * @param metadata from the file, for Tika to process
 */
private void appendFilenameMimeTypeSupport(IBaseDataObject d, Metadata metadata) {
    if (includeFilenameMimeType) {
        logger.debug("Filename support for Mime Type detection is enabled");
        metadata.set(Metadata.RESOURCE_NAME_KEY, d.getFilename());
    }
}
Example 14
Source File: TikaAutoMetadataExtracterTest.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Test MNT-15219 Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may
 * cause OutOfMemory in Tika
 * Note - doesn't use extractFromMimetype
 */
public void testParsingOfShapesInXLSXFiles() throws Exception {
    AutoDetectParser ap = new AutoDetectParser();
    String filename = "dmsu1332-reproduced.xlsx";
    URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
    File file = new File(url.getFile());

    // Cheat and ask Tika for the mime type!
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    MediaType mt = ap.getDetector().detect(TikaInputStream.get(file), metadata);
    String mimetype = mt.toString();

    if (logger.isDebugEnabled()) {
        logger.debug("Detected mimetype " + mimetype + " for quick test file " + filename);
    }

    // Have it processed
    // see MNT-15219 and REPO-3251
    Map<QName, Serializable> properties = extractFromFile(file, mimetype);

    // check we got something
    assertFalse("extractFromMimetype should return at least some properties, none found for " + mimetype +
            " - " + filename, properties.isEmpty());

    if (properties.containsKey(ContentModel.PROP_AUTHOR)) {
        assertEquals("Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype,
                "Udintsev, Anton (external - Project)",
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
    } else {
        fail("Expected one property out of " + ContentModel.PROP_CREATOR + " and " + ContentModel.PROP_AUTHOR +
                " but found neither of them for " + mimetype);
    }

    // Ensure that we can also get things which are standard
    // Tika metadata properties, if we so choose to
    assertTrue("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype,
            properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY));
    assertEquals("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype,
            mimetype,
            DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
}
Example 15
Source File: TikaAutoMetadataExtracterTest.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Test several different files
 * Note - doesn't use extractFromMimetype
 */
public void testSupportedMimetypes() throws Exception {
    String[] testFiles = new String[] {
        ".doc", ".docx", ".xls", ".xlsx",
        ".ppt", ".pptx",
        //".vsd", // Our sample file lacks suitable metadata
        "2010.dwg",
        "2003.mpp", "2007.mpp",
        ".pdf",
        ".odt",
    };

    AutoDetectParser ap = new AutoDetectParser();
    for (String fileBase : testFiles) {
        String filename = "quick" + fileBase;
        URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
        File file = new File(url.getFile());

        // Cheat and ask Tika for the mime type!
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
        MediaType mt = ap.getDetector().detect(TikaInputStream.get(file), metadata);
        String mimetype = mt.toString();

        if (logger.isDebugEnabled()) {
            logger.debug("Detected mimetype " + mimetype + " for quick test file " + filename);
        }

        // Have it processed
        Map<QName, Serializable> properties = extractFromFile(file, mimetype);

        // check we got something
        assertFalse("extractFromMimetype should return at least some properties, " +
                "none found for " + mimetype + " - " + filename, properties.isEmpty());

        // check common metadata
        testCommonMetadata(mimetype, properties);
        // check file-type specific metadata
        testFileSpecificMetadata(mimetype, properties);
    }
}
Example 16
Source File: HTMLRenderingEngine.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Asks Tika to translate the contents into HTML
 */
private void generateHTML(Parser p, RenderingContext context) {
    ContentReader contentReader = context.makeContentReader();

    // Setup things to parse with
    StringWriter sw = new StringWriter();
    ContentHandler handler = buildContentHandler(sw, context);

    // Tell Tika what we're dealing with
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, contentReader.getMimetype());
    metadata.set(Metadata.RESOURCE_NAME_KEY,
            nodeService.getProperty(context.getSourceNode(), ContentModel.PROP_NAME).toString());

    // Our parse context needs to extract images
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, new TikaImageExtractingParser(context));

    // Parse
    try {
        p.parse(contentReader.getContentInputStream(), handler, metadata, parseContext);
    } catch (Exception e) {
        throw new RenditionServiceException("Tika HTML Conversion Failed", e);
    }

    // As a string
    String html = sw.toString();

    // If we're doing body-only, remove all the html namespaces
    // that will otherwise clutter up the document
    boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false);
    if (bodyOnly) {
        html = html.replaceAll("<\\?xml.*?\\?>", "");
        html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"", "<p");
        html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"", "<h\\1");
        html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"", "<div");
        html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"", "<table");
        html = html.replaceAll(" ", "");
    }

    // Save it
    ContentWriter contentWriter = context.makeContentWriter();
    contentWriter.setMimetype("text/html");
    contentWriter.putContent(html);
}
Example 17
Source File: DocumentFactory.java From extract with MIT License | 4 votes |
public TikaDocument create(final String id, final Path path, final long size) {
    final Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
    return new TikaDocument(id, identifier, path, metadata);
}
Example 18
Source File: TikaIOTest.java From beam with Apache License 2.0 | 4 votes |
private static Metadata getOdtMetadata() {
    Metadata m = new Metadata();
    m.set("Author", "BeamTikaUser");
    return m;
}
Example 19
Source File: TikaTest.java From tika-server with Apache License 2.0 | 4 votes |
protected XMLResult getXML(String filePath, Parser parser) throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, filePath);
    return getXML(filePath, parser, metadata);
}