org.apache.tika.io.TikaInputStream#get

Source File: TikaTest.java From tika-server with Apache License 2.0

7 votes

/**
 * Copies the complete content of the given stream into the {@code bytes}
 * collection and then rewinds the stream so it can be read again.
 *
 * @param filename  name of the resource (not used by this handler)
 * @param mediaType media type of the resource (not used by this handler)
 * @param stream    content to copy; wrapped in a {@link TikaInputStream}
 *                  when it does not support mark/reset itself
 */
@Override
public void handle(String filename, MediaType mediaType,
                   InputStream stream) {
    if (!stream.markSupported()) {
        stream = TikaInputStream.get(stream);
    }
    // The whole stream is consumed below, so the mark has to stay valid for
    // an unbounded number of bytes. A read limit of 0 permits the mark to be
    // dropped as soon as data is read, which makes reset() fail.
    stream.mark(Integer.MAX_VALUE);
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    try {
        IOUtils.copy(stream, os);
        bytes.add(os.toByteArray());
        stream.reset();
    } catch (IOException ignored) {
        // Deliberately best effort: a failed copy or rewind is not reported
        // to the caller, matching the handler's original contract.
    }
}

Source File: ExecUtil.java From ctsms with GNU Lesser General Public License v2.1

6 votes

/**
 * Detects the mime type of in-memory content, passing the file name to the
 * detector as a hint.
 *
 * @param data     the content to inspect
 * @param fileName resource name recorded in the detection metadata
 * @return the detected type in its string form
 * @throws Throwable whatever the detector throws; nothing is translated
 */
public static String getMimeType(byte[] data, String fileName) throws Throwable {
	final Metadata hints = new Metadata();
	hints.add(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
	final TikaInputStream content = TikaInputStream.get(data, hints);
	try {
		return detector.detect(content, hints).toString();
	} finally {
		try {
			content.close();
		} catch (IOException ignored) {
			// a failure while closing must not mask the detection outcome
		}
	}
}

Source File: DirectoryManifest.java From genie with Apache License 2.0

6 votes

/**
 * Determines the mime type to report for a file.
 *
 * @param name the file name, used to short-circuit a few well-known names
 * @param path the file whose content is inspected when the name is not special-cased
 * @return the mime type as a string; the octet-stream type when detection fails with an I/O error
 */
private String getMimeType(final String name, final Path path) {
    // TODO: Move configuration of special handling cases to external configuration for flexibility
    //       probably a map of filename -> type or extension -> type or produced mime-type -> desired mime-type
    final boolean alwaysPlainText = name.equals("stdout") || name.equals("stderr") || name.equals("run");
    if (alwaysPlainText) {
        return MediaType.TEXT_PLAIN.toString();
    }

    try (TikaInputStream content = TikaInputStream.get(path)) {
        final String detected = this.tikaConfig.getDetector().detect(content, this.metadata).toString();
        return detected;
    } catch (final IOException ioe) {
        log.error("Unable to detect mime type for {} due to error", path, ioe);
        return MediaType.OCTET_STREAM.toString();
    }
}

Source File: UnpackBuilder.java From kite with Apache License 2.0

6 votes

/**
 * Hands a single archive entry to the embedded extractor.
 *
 * @return the extractor's result, or {@code false} when the entry data cannot be read
 */
private boolean parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedExtractor extractor, Record record) {
  final String entryName = entry.getName();
  if (!archive.canReadEntryData(entry)) {
    return false;
  }

  final Record entryRecord = new Record(); // TODO: or pass myself?
  //Record entryRecord = record.copy();

  // ArchiveInputStream does not support mark/reset, which detectors
  // rely on, so it is wrapped in a TikaInputStream first.
  final TemporaryResources scratch = new TemporaryResources();
  try {
    final TikaInputStream wrapped = TikaInputStream.get(archive, scratch);
    return extractor.parseEmbedded(wrapped, entryRecord, entryName, getChild());
  } finally {
    try {
      scratch.dispose();
    } catch (TikaException e) {
      LOG.warn("Cannot dispose of tmp Tika resources", e);
    }
  }
}

Source File: TesseractOCRParser.java From CogStack-Pipeline with Apache License 2.0

6 votes

/**
 * Parses an in-memory image by rendering it to a temporary PNG file and
 * delegating to the stream-based {@code parse} overload.
 *
 * @param image    the image to parse
 * @param handler  receives the content produced by the stream-based parse
 * @param metadata metadata passed through to the stream-based parse
 * @param context  parse context passed through to the stream-based parse
 * @throws IOException   if the temporary file cannot be written or read
 * @throws SAXException  if the delegate parse reports a SAX error
 * @throws TikaException if the delegate parse fails or the temporary resources cannot be disposed
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
        SAXException, TikaException {

    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

        // A new BufferedImage is blank; the source pixels have to be drawn
        // into it, otherwise an empty picture is written out and parsed.
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }

        File file = tmp.createTemporaryFile();
        // Close the writer before the file is read back so the PNG is
        // complete and no handle is left open on it.
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }
        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        // Dispose last: both streams are already closed at this point, so a
        // failing dispose can no longer leak them.
        tmp.dispose();
    }

}

Source File: EmbedSpawner.java From extract with MIT License

6 votes

/**
 * Handles an embedded resource: inline embeds are parsed into the parent
 * handler, everything else is spawned from a {@link TikaInputStream}.
 *
 * @param input      content of the embedded resource
 * @param handler    content handler of the containing document
 * @param metadata   metadata of the embedded resource
 * @param outputHtml whether start/end markup is written around inline embeds
 */
@Override
public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata,
                          final boolean outputHtml) throws SAXException, IOException {
	final String resourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
	final boolean inline = TikaCoreProperties.EmbeddedResourceType.INLINE.toString().equals(resourceType);

	if (!inline) {
		try (final TikaInputStream tis = TikaInputStream.get(input)) {
			spawnEmbedded(tis, metadata);
		}
		return;
	}

	// Inline embeds, like images in PDFs, are not spawned. They are
	// concatenated to the main document as usual.
	final ContentHandler inlineHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));

	if (outputHtml) {
		writeStart(handler, metadata);
	}

	delegateParsing(input, inlineHandler, metadata);

	if (outputHtml) {
		writeEnd(handler);
	}
}

Source File: MimetypeMap.java From alfresco-data-model with GNU Lesser General Public License v3.0

5 votes

/**
 * Wraps the given stream, when there is one, in a {@link TikaInputStream}
 * and delegates to the {@code TikaInputStream}-based overload.
 *
 * @param filename name passed through to the delegate
 * @param input    content to inspect, may be {@code null}
 * @return whatever the delegate overload returns
 */
private MediaType detectType(String filename, InputStream input)
{
    final TikaInputStream wrapped = (input == null) ? null : TikaInputStream.get(input);
    return detectType(filename, wrapped);
}

Source File: TransportAmazonLambdaS3.java From github-bucket with ISC License

5 votes

/**
 * Uploads the given bytes to the bucket under the key resolved from
 * {@code path}, with MD5, length and a detected content type attached.
 *
 * @param path path of the file; its file name is passed to the detector as a hint
 * @param data the file content
 * @throws IOException if detection or closing the stream fails
 */
@Override
void writeFile(final String path, final byte[] data) throws IOException {
    final ObjectMetadata objectInfo = new ObjectMetadata();
    objectInfo.setContentMD5(Md5Utils.md5AsBase64(data));
    objectInfo.setContentLength(data.length);

    // The file name is a hint for the content detection
    final Metadata hints = new Metadata();
    final String fileName = FilenameUtils.getName(FilenameUtils.normalize(path));
    hints.set(Metadata.RESOURCE_NAME_KEY, fileName);

    // The same stream is used for detection first and for the upload afterwards
    try (InputStream content = TikaInputStream.get(data, hints)) {
        final String contentType = TIKA_DETECTOR.detect(content, hints).toString();
        objectInfo.setContentType(contentType);
        s3.putObject(bucket, resolveKey(path), content, objectInfo);
    }
}

Source File: RepositoryS3.java From github-bucket with ISC License

5 votes

/**
 * Reads one repository object, computes its MD5 and uploads it to the
 * bucket when {@code isUploadFile} says so.
 *
 * @param iter iterator over the bucket's object summaries, passed to {@code isUploadFile}
 * @param file id of the repository object to read
 * @param path path of the file, used as the object key
 * @return {@code true} when the file was uploaded, {@code false} when it was skipped
 * @throws IOException if reading the object or detecting its type fails
 */
private boolean walk(Iterator<S3ObjectSummary> iter, ObjectId file, String path) throws IOException {
    LOG.debug("Start processing file: {}", path);

    final byte[] payload;
    final byte[] md5;
    try (DigestInputStream digesting = new DigestInputStream(repository.open(file).openStream(), DigestUtils.getMd5Digest())) {
        // Reading the content also feeds the digest
        payload = IOUtils.toByteArray(digesting);
        md5 = digesting.getMessageDigest().digest();
    }

    if (!isUploadFile(iter, path, Hex.encodeHexString(md5))) {
        LOG.info("Skipping file (same checksum): {}", path);
        return false;
    }

    LOG.info("Uploading file: {}", path);
    final ObjectMetadata objectInfo = new ObjectMetadata();
    objectInfo.setContentMD5(Base64.encodeAsString(md5));
    objectInfo.setContentLength(payload.length);

    // The file name is a hint for the content detection
    final Metadata hints = new Metadata();
    hints.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(FilenameUtils.normalize(path)));

    try (InputStream content = TikaInputStream.get(payload, hints)) {
        objectInfo.setContentType(TIKA_DETECTOR.detect(content, hints).toString());
        s3.putObject(bucket.getName(), path, content, objectInfo);
    }
    return true;
}

Source File: TikaPoweredMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0

5 votes

/**
 * Chooses the stream that is used to read the content of the given reader.
 * <p>
 * There seems to be some sort of issue with some downstream
 *  3rd party libraries and input streams that come from
 *  a {@link ContentReader}, most often with JPEG and TIFF files.
 * For these types the content is buffered out to a local file,
 *  unless the reader is file based already.
 */
protected InputStream getInputStream(ContentReader reader) throws IOException
{
   // A file based reader hands over its file directly, it's generally quicker
   if(reader instanceof FileContentReader) 
   {
      FileContentReader fileReader = (FileContentReader)reader;
      return TikaInputStream.get( fileReader.getFile() );
   }
   
   // Everything else starts from the plain content stream
   InputStream content = reader.getContentInputStream();
   
   // Images currently always require a file
   boolean needsFile = MimetypeMap.MIMETYPE_IMAGE_JPEG.equals(reader.getMimetype()) ||
      MimetypeMap.MIMETYPE_IMAGE_TIFF.equals(reader.getMimetype());
   if(!needsFile)
   {
      // The regular Content InputStream should be fine
      return content;
   }
   
   TikaInputStream fileBacked = TikaInputStream.get(content, new TemporaryResources());
   fileBacked.getFile(); // Have it turned into File backed
   return fileBacked;
}

Source File: HtmlDetector.java From data-prep with Apache License 2.0

5 votes

/**
 * Reads the head of an input stream and checks if it has a HTML format.
 * 
 * The general contract of a detector is to not close the specified stream before returning; closing it is the
 * responsibility of the caller. The mark/reset feature of the specified {@link TikaInputStream} is used so that
 * the stream is rewound to where it was once the head has been read.
 * 
 * @param metadata the specified TIKA {@link Metadata}
 * @param inputStream the specified input stream, may be {@code null}
 * @return {@code null} when there is no stream or no charset is detected, otherwise an HTML format
 * @throws IOException if reading from or resetting the stream fails
 */
@Override
public Format detect(Metadata metadata, TikaInputStream inputStream) throws IOException {
    if (inputStream == null) {
        return null;
    }

    final int limit = FormatUtils.META_TAG_BUFFER_SIZE;
    inputStream.mark(limit);
    final byte[] head = new byte[limit];

    // Keep reading until the buffer is full or the stream ends
    int filled = 0;
    int count = inputStream.read(head);
    while (count != -1 && filled < head.length) {
        filled += count;
        count = inputStream.read(head, filled, head.length - filled);
    }
    inputStream.reset();

    final String text = FormatUtils.readFromBuffer(head, 0, filled);
    try (InputStream replay = TikaInputStream.get(IOUtils.toInputStream(text))) {
        final Charset charset = htmlEncodingDetector.detect(replay, metadata);
        return charset == null ? null : new Format(htmlFormatFamily, charset.name());
    }

}

Source File: TransportAmazonLambdaS3.java From github-bucket with ISC License

5 votes

/**
 * Stores the given content in the bucket, tagged with its MD5, length and
 * the content type the detector reports for it.
 *
 * @param path path of the file; resolved to the object key and used as a file name hint
 * @param data the content to store
 * @throws IOException if detection or closing the stream fails
 */
@Override
void writeFile(final String path, final byte[] data) throws IOException {
    final ObjectMetadata s3Metadata = new ObjectMetadata();
    s3Metadata.setContentMD5(Md5Utils.md5AsBase64(data));
    s3Metadata.setContentLength(data.length);

    // Tika gets the bare file name as a detection hint
    final Metadata detectionHints = new Metadata();
    final String normalizedPath = FilenameUtils.normalize(path);
    detectionHints.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(normalizedPath));

    try (InputStream body = TikaInputStream.get(data, detectionHints)) {
        final String detectedType = TIKA_DETECTOR.detect(body, detectionHints).toString();
        s3Metadata.setContentType(detectedType);
        s3.putObject(bucket, resolveKey(path), body, s3Metadata);
    }
}

Source File: MediaTypeValidator.java From iaf with Apache License 2.0

5 votes

/**
 * Detects the media type of the given content.
 * 
 * @param inputStream the content to inspect; it is wrapped in a {@link TikaInputStream} that is closed before returning
 * @param filename name passed to the detection call together with the content
 * @return the detected type parsed into a {@link MediaType}
 * @throws IOException if detection or closing the stream fails
 */
public MediaType getMediaType(InputStream inputStream, String filename) throws IOException {
	// A fresh instance per call, as TemporaryResources is not thread-safe
	final TemporaryResources scratch = new TemporaryResources();
	scratch.setTemporaryFileDirectory(Paths.get(pdfOutputlocation));

	try (TikaInputStream tis = TikaInputStream.get(inputStream, scratch)) {
		return MediaType.parse(tika.detect(tis, filename));
	}
}

Source File: TikaIO.java From beam with Apache License 2.0

5 votes

/**
 * Parses one readable file with Tika and outputs a {@code ParseResult}
 * describing either the extracted content or the failure.
 *
 * @param c the process context supplying the file and receiving the result
 * @throws Exception if the file cannot be opened or the stream cannot be closed
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  final ReadableFile readable = c.element();
  final InputStream raw = Channels.newInputStream(readable.open());
  try (InputStream tikaStream = TikaInputStream.get(raw)) {
    final Parser parser;
    if (tikaConfig == null) {
      parser = new AutoDetectParser();
    } else {
      parser = new AutoDetectParser(tikaConfig);
    }

    final ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, parser);

    Metadata tikaMetadata = spec.getInputMetadata();
    if (tikaMetadata == null) {
      tikaMetadata = new Metadata();
    }
    final String contentTypeHint = spec.getContentTypeHint();
    if (contentTypeHint != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, contentTypeHint);
    }

    final String location = readable.getMetadata().resourceId().toString();
    final ContentHandler textHandler = new ToTextContentHandler();

    // A parse failure is reported as a result, not thrown
    ParseResult outcome;
    try {
      parser.parse(tikaStream, textHandler, tikaMetadata, parseContext);
      outcome = ParseResult.success(location, textHandler.toString(), tikaMetadata);
    } catch (Exception e) {
      outcome = ParseResult.failure(location, textHandler.toString(), tikaMetadata, e);
    }

    c.output(outcome);
  }
}

Source File: EmbeddedExtractor.java From kite with Apache License 2.0

5 votes

/**
 * Passes an embedded stream to the child command as the attachment body of
 * a copy of the given record.
 *
 * @param stream content of the embedded entry
 * @param record record that is copied for the entry; the original is not modified here
 * @param name   attachment name, only recorded when non-empty
 * @param child  command that processes the copied record
 * @return the result of {@code child.process}
 */
public boolean parseEmbedded(InputStream stream, Record record, String name, Command child) {
    final TemporaryResources scratch = new TemporaryResources();
    try {
      // The incoming stream is shielded against close before it is wrapped
      final TikaInputStream body = TikaInputStream.get(new CloseShieldInputStream(stream), scratch);
      if (stream instanceof TikaInputStream) {
        // Carry an already opened container over to the new stream
        final Object openContainer = ((TikaInputStream) stream).getOpenContainer();
        if (openContainer != null) {
          body.setOpenContainer(openContainer);
        }
      }

      final Record attachment = record.copy();
      attachment.replaceValues(Fields.ATTACHMENT_BODY, body);
      attachment.removeAll(Fields.ATTACHMENT_MIME_TYPE);
      attachment.removeAll(Fields.ATTACHMENT_CHARSET);
      attachment.removeAll(Fields.ATTACHMENT_NAME);

      final boolean hasName = name != null && name.length() > 0;
      if (hasName) {
        attachment.put(Fields.ATTACHMENT_NAME, name);
      }

      // THIS IS THE DIFF WRT ParsingEmbeddedDocumentExtractor: runtime
      // exceptions are not caught here, they propagate to the caller.
      // TODO: can we log a warning somehow?
      return child.process(attachment);
    } finally {
      Closeables.closeQuietly(scratch);
    }

  }

Source File: TikaFilePlace.java From emissary with Apache License 2.0

5 votes

/**
 * Use the Tika mime type (magic) detector to identify the file type
 *
 * @param d the IBaseDataObject payload to evaluate
 * @return mediaType
 * @throws Exception if detection or closing the stream fails
 */
private MediaType detectType(IBaseDataObject d) throws Exception {
    Metadata metadata = new Metadata();
    // The stream is closed once detection is done so that anything it holds
    // on to is released instead of lingering until garbage collection.
    try (InputStream input = TikaInputStream.get(d.data(), metadata)) {
        appendFilenameMimeTypeSupport(d, metadata);
        MediaType mediaType = mimeTypes.detect(input, metadata);
        logger.debug("Tika type: " + mediaType.toString());
        return mediaType;
    }
}

Source File: CachingTesseractOCRParser.java From extract with MIT License

5 votes

/**
 * Wraps the given stream in a {@link TikaInputStream} and delegates to the
 * {@code TikaInputStream}-based {@code cachedParse} overload.
 *
 * @param in       content to parse; the wrapping stream is closed before returning
 * @param handler  content handler passed through to the delegate
 * @param metadata metadata passed through to the delegate
 * @param context  parse context passed through to the delegate
 * @param config   OCR configuration passed through to the delegate
 * @param inline   flag passed through to the delegate
 * @throws TikaException if the delegate fails or the thread is interrupted
 */
private void cachedParse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                         final ParseContext context, TesseractOCRConfig config, final boolean inline)
		throws IOException, SAXException, TikaException {
	try (final TikaInputStream tis = TikaInputStream.get(in)) {
		cachedParse(tis, handler, metadata, context, config, inline);
	} catch (final InterruptedException e) {
		// Restore the interrupt status: it is cleared when the exception is
		// thrown, and callers further up still need to see the interruption.
		Thread.currentThread().interrupt();
		throw new TikaException("Interrupted.", e);
	}
}

Source File: TesseractOCRParser.java From CogStack-Pipeline with Apache License 2.0

4 votes

/**
 * Spools the stream to a file, runs OCR on it when its size is within the
 * configured bounds, and then always passes the stream on to the image
 * metadata parser.
 *
 * @param stream   content to parse
 * @param handler  receives the extracted content
 * @param metadata metadata of the document being parsed
 * @param context  parse context; may carry a {@code TesseractOCRConfig}
 * @throws IOException   if spooling or reading the OCR output fails
 * @throws SAXException  if a handler reports a SAX error
 * @throws TikaException if parsing fails or the temporary resources cannot be disposed
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

    // If Tesseract is not on the path with the current config, do not try to run OCR
    // getSupportedTypes shouldn't have listed us as handling it, so this should only
    //  occur if someone directly calls this parser, not via DefaultParser or similar
    if (! hasTesseract(config))
        return;

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

    TemporaryResources tmp = new TemporaryResources();
    File ocrText = null;
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
        File input = tikaStream.getFile();
        long size = tikaStream.getLength();

        if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {

            File outputBase = tmp.createTemporaryFile();

            // Tesseract appends .txt to output file name. The resulting file
            // is remembered before OCR runs so that it is also removed when
            // OCR fails part way through.
            ocrText = new File(outputBase.getAbsolutePath() + ".txt");
            doOCR(input, outputBase, config);

            if (ocrText.exists()) {
                // Closed here regardless of what extractOutput does with
                // the stream; closing it a second time is harmless.
                try (InputStream ocrStream = new FileInputStream(ocrText)) {
                    extractOutput(ocrStream, xhtml);
                }
            }

        }

        // Temporary workaround for TIKA-1445 - until we can specify
        //  composite parsers with strategies (eg Composite, Try In Turn),
        //  always send the image onwards to the regular parser to have
        //  the metadata for them extracted as well
        _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
    } finally {
        try {
            tmp.dispose();
        } finally {
            // Runs even when dispose() throws, so the OCR text file is not left behind
            if (ocrText != null) {
                ocrText.delete();
            }
        }
    }
}

Source File: Extractor.java From extract with MIT License

4 votes

/**
 * Create a root document for the file at the given path and attach a pull-parsing reader to it.
 *
 * @param path the file to extract from
 * @return the root document created for {@code path}, with its reader set
 * @throws IOException if the file cannot be opened or the reader cannot be created
 */
public TikaDocument extract(final Path path) throws IOException {
	final TikaDocument rootDocument = documentFactory.create(path);
	final TikaInputStream source = TikaInputStream.get(path, rootDocument.getMetadata());
	final boolean htmlOutput = OutputFormat.HTML == outputFormat;

	// The fallback parser outputs an empty tikaDocument for empty files,
	// otherwise throws an exception.
	final AutoDetectParser detectingParser = new AutoDetectParser(defaultParser);
	detectingParser.setFallback(FallbackParser.INSTANCE);

	// Wrap with a digesting parser only when a digester is configured
	final Parser parser = (null != digester) ? new DigestingParser(detectingParser, digester) : detectingParser;

	final ParseContext context = new ParseContext();
	if (!ocrDisabled) {
		context.set(TesseractOCRConfig.class, ocrConfig);
	}
	context.set(PDFParserConfig.class, pdfConfig);

	// Only include "safe" tags in the HTML output from Tika's HTML parser.
	// This excludes script tags and objects.
	context.set(HtmlMapper.class, DefaultHtmlMapper.INSTANCE);

	final Function<Writer, ContentHandler> handler;
	if (htmlOutput) {
		handler = (writer) -> new ExpandedTitleContentHandler(new HTML5Serializer(writer));
	} else {

		// The default BodyContentHandler is used when constructing the ParsingReader for text output, but
		// because only the body of embeds is pushed to the content handler further down the line, we can't
		// expect a body tag.
		handler = WriteOutContentHandler::new;
	}

	// The parser is registered before the embed extractor is constructed from the context
	if (EmbedHandling.SPAWN == embedHandling) {
		context.set(Parser.class, parser);
		context.set(EmbeddedDocumentExtractor.class, new EmbedSpawner(rootDocument, context, embedOutput, handler));
	} else if (EmbedHandling.CONCATENATE == embedHandling) {
		context.set(Parser.class, parser);
		context.set(EmbeddedDocumentExtractor.class, new EmbedParser(rootDocument, context));
	} else {
		context.set(Parser.class, EmptyParser.INSTANCE);
		context.set(EmbeddedDocumentExtractor.class, new EmbedBlocker());
	}

	// the constructor of ParsingReader actually parses the document in background
	final Reader reader;
	if (htmlOutput) {
		reader = new ParsingReader(parser, source, rootDocument.getMetadata(), context, handler);
	} else {
		reader = new org.apache.tika.parser.ParsingReader(parser, source, rootDocument.getMetadata(), context);
	}
	rootDocument.setReader(reader);

	return rootDocument;
}

Source File: ExtractMediaMetadata.java From localization_nifi with Apache License 2.0

4 votes

/**
 * Parses the stream with the auto-detecting parser and returns the
 * extracted metadata as a map of strings.
 *
 * @param sourceStream content to parse; it is closed together with the wrapping
 *                     {@link TikaInputStream} once parsing has finished
 * @param prefix       prepended to every key when not {@code null}
 * @param maxAttribs   maximum number of entries to return, unlimited when {@code null}
 * @param maxAttribLen length bound applied while joining the values of a multi-valued key
 * @return metadata keys (optionally prefixed) mapped to their trimmed values
 * @throws IOException   if reading the stream fails
 * @throws TikaException if the parser fails
 * @throws SAXException  if the content handler reports an error
 */
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs,
                                       Integer maxAttribLen) throws IOException, TikaException, SAXException {
    final Metadata metadata = new Metadata();
    // Closing the TikaInputStream releases whatever it holds on to for the
    // parse; the content has been consumed by the parser at that point.
    try (TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream)) {
        autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata);
    }

    final Map<String, String> results = new HashMap<>();
    final Pattern metadataKeyFilter = metadataKeyFilterRef.get();
    final StringBuilder dataBuilder = new StringBuilder();
    for (final String key : metadata.names()) {
        if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) {
            continue;
        }
        dataBuilder.setLength(0);
        if (metadata.isMultiValued(key)) {
            for (String val : metadata.getValues(key)) {
                // Separate from anything already collected. Testing for
                // "> 1" here would drop the separator after a
                // one-character first value.
                if (dataBuilder.length() > 0) {
                    dataBuilder.append(", ");
                }
                if (dataBuilder.length() + val.length() < maxAttribLen) {
                    dataBuilder.append(val);
                } else {
                    dataBuilder.append("...");
                    break;
                }
            }
        } else {
            dataBuilder.append(metadata.get(key));
        }
        if (prefix == null) {
            results.put(key, dataBuilder.toString().trim());
        } else {
            results.put(prefix + key, dataBuilder.toString().trim());
        }

        // cutoff at max if provided
        if (maxAttribs != null && results.size() >= maxAttribs) {
            break;
        }
    }
    return results;
}

Java Code Examples for org.apache.tika.io.TikaInputStream#get()