Java Code Examples for org.apache.tika.io.TikaInputStream#get()
The following examples show how to use
org.apache.tika.io.TikaInputStream#get().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TikaTest.java From tika-server with Apache License 2.0 | 7 votes |
@Override public void handle(String filename, MediaType mediaType, InputStream stream) { ByteArrayOutputStream os = new ByteArrayOutputStream(); if (! stream.markSupported()) { stream = TikaInputStream.get(stream); } stream.mark(0); try { IOUtils.copy(stream, os); bytes.add(os.toByteArray()); stream.reset(); } catch (IOException e) { //swallow } }
Example 2
Source File: ExecUtil.java From ctsms with GNU Lesser General Public License v2.1 | 6 votes |
/**
 * Detects the media type of an in-memory document.
 *
 * @param data     raw bytes of the document
 * @param fileName resource name registered in the metadata handed to the detector
 * @return string form of the detected media type
 * @throws Throwable anything raised while wrapping the bytes or detecting the type
 */
public static String getMimeType(byte[] data, String fileName) throws Throwable {
    Metadata hints = new Metadata();
    hints.add(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);

    TikaInputStream in = null;
    try {
        in = TikaInputStream.get(data, hints);
        return detector.detect(in, hints).toString();
    } finally {
        // Close quietly; a failure to close does not affect the result.
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
            }
        }
    }
}
Example 3
Source File: DirectoryManifest.java From genie with Apache License 2.0 | 6 votes |
private String getMimeType(final String name, final Path path) { // TODO: Move configuration of special handling cases to external configuration for flexibility // probably a map of filename -> type or extension -> type or produced mime-type -> desired mime-type switch (name) { case "stdout": case "stderr": case "run": return MediaType.TEXT_PLAIN.toString(); default: try (TikaInputStream inputStream = TikaInputStream.get(path)) { return this.tikaConfig.getDetector().detect(inputStream, this.metadata).toString(); } catch (final IOException ioe) { log.error("Unable to detect mime type for {} due to error", path, ioe); return MediaType.OCTET_STREAM.toString(); } } }
Example 4
Source File: UnpackBuilder.java From kite with Apache License 2.0 | 6 votes |
private boolean parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedExtractor extractor, Record record) { String name = entry.getName(); if (archive.canReadEntryData(entry)) { Record entrydata = new Record(); // TODO: or pass myself? //Record entrydata = record.copy(); // For detectors to work, we need a mark/reset supporting // InputStream, which ArchiveInputStream isn't, so wrap TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(archive, tmp); return extractor.parseEmbedded(tis, entrydata, name, getChild()); } finally { try { tmp.dispose(); } catch (TikaException e) { LOG.warn("Cannot dispose of tmp Tika resources", e); } } } else { return false; } }
Example 5
Source File: TesseractOCRParser.java From CogStack-Pipeline with Apache License 2.0 | 6 votes |
/**
 * Runs the stream-based {@code parse} on an in-memory image by first writing
 * it to a temporary PNG file.
 *
 * @param image    image to process
 * @param handler  receiver of the parse output
 * @param metadata metadata passed through to the stream-based parse
 * @param context  parse context passed through to the stream-based parse
 * @throws IOException   if the temporary file cannot be written or read
 * @throws SAXException  if the content handler fails
 * @throws TikaException if parsing or temporary-resource cleanup fails
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

        // Paint the source image onto the buffer; without this step the PNG
        // written below would contain only the buffer's initial blank pixels.
        java.awt.Graphics graphics = bImage.getGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }

        File file = tmp.createTemporaryFile();
        // Close the writer before the file is read back.
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }
        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        // Both streams are already closed here, so temporary files can be removed.
        tmp.dispose();
    }
}
Example 6
Source File: EmbedSpawner.java From extract with MIT License | 6 votes |
@Override public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata, final boolean outputHtml) throws SAXException, IOException { // There's no need to spawn inline embeds, like images in PDFs. These should be concatenated to the main // document as usual. if (TikaCoreProperties.EmbeddedResourceType.INLINE.toString().equals(metadata .get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) { final ContentHandler embedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler)); if (outputHtml) { writeStart(handler, metadata); } delegateParsing(input, embedHandler, metadata); if (outputHtml) { writeEnd(handler); } } else { try (final TikaInputStream tis = TikaInputStream.get(input)) { spawnEmbedded(tis, metadata); } } }
Example 7
Source File: MimetypeMap.java From alfresco-data-model with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Wraps a plain stream as a {@link TikaInputStream} and delegates to the
 * overload that accepts one.
 *
 * @param filename name passed through to the delegate
 * @param input    content stream; may be {@code null}, in which case {@code null} is passed on
 * @return the delegate's result
 */
private MediaType detectType(String filename, InputStream input) {
    TikaInputStream inp = (input == null) ? null : TikaInputStream.get(input);
    return detectType(filename, inp);
}
Example 8
Source File: TransportAmazonLambdaS3.java From github-bucket with ISC License | 5 votes |
@Override void writeFile(final String path, final byte[] data) throws IOException { ObjectMetadata bucketMetadata = new ObjectMetadata(); bucketMetadata.setContentMD5(Md5Utils.md5AsBase64(data)); bucketMetadata.setContentLength(data.length); // Give Tika a few hints for the content detection Metadata tikaMetadata = new Metadata(); tikaMetadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(FilenameUtils.normalize(path))); // Fire! try (InputStream bis = TikaInputStream.get(data, tikaMetadata)) { bucketMetadata.setContentType(TIKA_DETECTOR.detect(bis, tikaMetadata).toString()); s3.putObject(bucket, resolveKey(path), bis, bucketMetadata); } }
Example 9
Source File: RepositoryS3.java From github-bucket with ISC License | 5 votes |
private boolean walk(Iterator<S3ObjectSummary> iter, ObjectId file, String path) throws IOException { byte[] content; byte[] newHash; LOG.debug("Start processing file: {}", path); try (DigestInputStream is = new DigestInputStream(repository.open(file).openStream(), DigestUtils.getMd5Digest())) { // Get content content = IOUtils.toByteArray(is); // Get hash newHash = is.getMessageDigest().digest(); } if (isUploadFile(iter, path, Hex.encodeHexString(newHash))) { LOG.info("Uploading file: {}", path); ObjectMetadata bucketMetadata = new ObjectMetadata(); bucketMetadata.setContentMD5(Base64.encodeAsString(newHash)); bucketMetadata.setContentLength(content.length); // Give Tika a few hints for the content detection Metadata tikaMetadata = new Metadata(); tikaMetadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(FilenameUtils.normalize(path))); // Fire! try (InputStream bis = TikaInputStream.get(content, tikaMetadata)) { bucketMetadata.setContentType(TIKA_DETECTOR.detect(bis, tikaMetadata).toString()); s3.putObject(bucket.getName(), path, bis, bucketMetadata); return true; } } LOG.info("Skipping file (same checksum): {}", path); return false; }
Example 10
Source File: TikaPoweredMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
/** * There seems to be some sort of issue with some downstream * 3rd party libraries, and input streams that come from * a {@link ContentReader}. This happens most often with * JPEG and Tiff files. * For these cases, buffer out to a local file if not * already there */ protected InputStream getInputStream(ContentReader reader) throws IOException { // Prefer the File if available, it's generally quicker if(reader instanceof FileContentReader) { return TikaInputStream.get( ((FileContentReader)reader).getFile() ); } // Grab the InputStream for the Content InputStream input = reader.getContentInputStream(); // Images currently always require a file if(MimetypeMap.MIMETYPE_IMAGE_JPEG.equals(reader.getMimetype()) || MimetypeMap.MIMETYPE_IMAGE_TIFF.equals(reader.getMimetype())) { TemporaryResources tmp = new TemporaryResources(); TikaInputStream stream = TikaInputStream.get(input, tmp); stream.getFile(); // Have it turned into File backed return stream; } else { // The regular Content InputStream should be fine return input; } }
Example 11
Source File: HtmlDetector.java From data-prep with Apache License 2.0 | 5 votes |
/**
 * Reads the head of an input stream and checks whether it has an HTML format.
 *
 * The general contract of a detector is to not close the specified stream before returning; closing it is the
 * responsibility of the caller. The stream is marked before reading and reset afterwards so that it keeps
 * returning the same bytes.
 *
 * @param metadata the specified TIKA {@link Metadata}
 * @param inputStream the specified {@link TikaInputStream}, may be null
 * @return either null or an HTML format
 * @throws IOException if reading from or resetting the stream fails
 */
@Override
public Format detect(Metadata metadata, TikaInputStream inputStream) throws IOException {
    if (inputStream == null) {
        return null;
    }

    inputStream.mark(FormatUtils.META_TAG_BUFFER_SIZE);
    byte[] buffer = new byte[FormatUtils.META_TAG_BUFFER_SIZE];

    // Fill the buffer until it is full or the stream ends.
    int n = 0;
    int m = inputStream.read(buffer);
    while (m != -1 && n < buffer.length) {
        n += m;
        m = inputStream.read(buffer, n, buffer.length - n);
    }
    inputStream.reset();

    String head = FormatUtils.readFromBuffer(buffer, 0, n);
    try (InputStream stream = TikaInputStream.get(IOUtils.toInputStream(head))) {
        Charset charset = htmlEncodingDetector.detect(stream, metadata);
        if (charset != null) {
            return new Format(htmlFormatFamily, charset.name());
        }
    }
    return null;
}
Example 12
Source File: TransportAmazonLambdaS3.java From github-bucket with ISC License | 5 votes |
@Override void writeFile(final String path, final byte[] data) throws IOException { ObjectMetadata bucketMetadata = new ObjectMetadata(); bucketMetadata.setContentMD5(Md5Utils.md5AsBase64(data)); bucketMetadata.setContentLength(data.length); // Give Tika a few hints for the content detection Metadata tikaMetadata = new Metadata(); tikaMetadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(FilenameUtils.normalize(path))); // Fire! try (InputStream bis = TikaInputStream.get(data, tikaMetadata)) { bucketMetadata.setContentType(TIKA_DETECTOR.detect(bis, tikaMetadata).toString()); s3.putObject(bucket, resolveKey(path), bis, bucketMetadata); } }
Example 13
Source File: MediaTypeValidator.java From iaf with Apache License 2.0 | 5 votes |
/** * Detects media type from input stream * * @param inputStream * @param filename * @return * @throws IOException */ public MediaType getMediaType(InputStream inputStream, String filename) throws IOException { // Create every time as TemporaryResources is not thread-safe TemporaryResources tmp = new TemporaryResources(); tmp.setTemporaryFileDirectory(Paths.get(pdfOutputlocation)); try (TikaInputStream tis = TikaInputStream.get(inputStream, tmp)) { String type = tika.detect(tis, filename); return MediaType.parse(type); } }
Example 14
Source File: TikaIO.java From beam with Apache License 2.0 | 5 votes |
/**
 * Parses one readable file with Tika and emits a {@code ParseResult}: a
 * success result when parsing completes, a failure result carrying the
 * exception otherwise.
 *
 * @param c process context supplying the file and receiving the result
 * @throws Exception if the file cannot be opened or the stream cannot be closed
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    ReadableFile file = c.element();
    InputStream stream = Channels.newInputStream(file.open());
    try (InputStream tikaStream = TikaInputStream.get(stream)) {
        final Parser parser;
        if (tikaConfig == null) {
            parser = new AutoDetectParser();
        } else {
            parser = new AutoDetectParser(tikaConfig);
        }

        ParseContext context = new ParseContext();
        context.set(Parser.class, parser);

        final Metadata tikaMetadata;
        if (spec.getInputMetadata() != null) {
            tikaMetadata = spec.getInputMetadata();
        } else {
            tikaMetadata = new Metadata();
        }
        if (spec.getContentTypeHint() != null) {
            tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
        }

        String location = file.getMetadata().resourceId().toString();
        ContentHandler tikaHandler = new ToTextContentHandler();

        ParseResult res;
        try {
            parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
            res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
        } catch (Exception e) {
            // A parse failure is reported as a result rather than propagated.
            res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
        }
        c.output(res);
    }
}
Example 15
Source File: EmbeddedExtractor.java From kite with Apache License 2.0 | 5 votes |
public boolean parseEmbedded(InputStream stream, Record record, String name, Command child) { // Use the delegate parser to parse this entry TemporaryResources tmp = new TemporaryResources(); try { final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp); if (stream instanceof TikaInputStream) { final Object container = ((TikaInputStream) stream).getOpenContainer(); if (container != null) { newStream.setOpenContainer(container); } } record = record.copy(); record.replaceValues(Fields.ATTACHMENT_BODY, newStream); record.removeAll(Fields.ATTACHMENT_MIME_TYPE); record.removeAll(Fields.ATTACHMENT_CHARSET); record.removeAll(Fields.ATTACHMENT_NAME); if (name != null && name.length() > 0) { record.put(Fields.ATTACHMENT_NAME, name); } return child.process(record); // } catch (RuntimeException e) { // // // THIS IS THE DIFF WRT ParsingEmbeddedDocumentExtractor // throw new MorphlineRuntimeException(e); // // // TODO: can we log a warning somehow? // // Could not parse the entry, just skip the content } finally { Closeables.closeQuietly(tmp); } }
Example 16
Source File: TikaFilePlace.java From emissary with Apache License 2.0 | 5 votes |
/**
 * Use the Tika mime type (magic) detector to identify the file type.
 *
 * @param d the IBaseDataObject payload to evaluate
 * @return mediaType the detected media type
 * @throws Exception if detection or closing the stream fails
 */
private MediaType detectType(IBaseDataObject d) throws Exception {
    Metadata metadata = new Metadata();
    // try-with-resources releases whatever the TikaInputStream holds once
    // detection is done; the stream was previously left open.
    try (InputStream input = TikaInputStream.get(d.data(), metadata)) {
        appendFilenameMimeTypeSupport(d, metadata);
        MediaType mediaType = mimeTypes.detect(input, metadata);
        logger.debug("Tika type: " + mediaType.toString());
        return mediaType;
    }
}
Example 17
Source File: CachingTesseractOCRParser.java From extract with MIT License | 5 votes |
/**
 * Wraps {@code in} in a {@link TikaInputStream} and delegates to the
 * overload that takes one.
 *
 * @param in       content to parse; the wrapping stream is closed on return
 * @param handler  receiver of the parse output
 * @param metadata metadata passed through to the delegate
 * @param context  parse context passed through to the delegate
 * @param config   OCR configuration passed through to the delegate
 * @param inline   flag passed through to the delegate
 * @throws IOException   if reading or closing the stream fails
 * @throws SAXException  if the content handler fails
 * @throws TikaException if parsing fails or the thread is interrupted
 */
private void cachedParse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                         final ParseContext context, TesseractOCRConfig config, final boolean inline)
        throws IOException, SAXException, TikaException {
    try (final TikaInputStream tis = TikaInputStream.get(in)) {
        cachedParse(tis, handler, metadata, context, config, inline);
    } catch (final InterruptedException e) {
        // Restore the interrupt flag so callers further up the stack can
        // still observe the interruption after it is translated.
        Thread.currentThread().interrupt();
        throw new TikaException("Interrupted.", e);
    }
}
Example 18
Source File: TesseractOCRParser.java From CogStack-Pipeline with Apache License 2.0 | 4 votes |
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG); // If Tesseract is not on the path with the current config, do not try to run OCR // getSupportedTypes shouldn't have listed us as handling it, so this should only // occur if someone directly calls this parser, not via DefaultParser or similar if (! hasTesseract(config)) return; XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); TemporaryResources tmp = new TemporaryResources(); File output = null; try { TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); File input = tikaStream.getFile(); long size = tikaStream.getLength(); if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) { output = tmp.createTemporaryFile(); doOCR(input, output, config); // Tesseract appends .txt to output file name output = new File(output.getAbsolutePath() + ".txt"); if (output.exists()) extractOutput(new FileInputStream(output), xhtml); } // Temporary workaround for TIKA-1445 - until we can specify // composite parsers with strategies (eg Composite, Try In Turn), // always send the image onwards to the regular parser to have // the metadata for them extracted as well _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context); } finally { tmp.dispose(); if (output != null) { output.delete(); } } }
Example 19
Source File: Extractor.java From extract with MIT License | 4 votes |
/** * Create a pull-parser from the given {@link TikaInputStream}. * * @param path the stream to extract from * @return A pull-parsing reader. */ public TikaDocument extract(final Path path) throws IOException { final TikaDocument rootDocument = documentFactory.create(path); TikaInputStream tikaInputStream = TikaInputStream.get(path, rootDocument.getMetadata()); final ParseContext context = new ParseContext(); final AutoDetectParser autoDetectParser = new AutoDetectParser(defaultParser); // Set a fallback parser that outputs an empty tikaDocument for empty files, // otherwise throws an exception. autoDetectParser.setFallback(FallbackParser.INSTANCE); final Parser parser; if (null != digester) { parser = new DigestingParser(autoDetectParser, digester); } else { parser = autoDetectParser; } if (!ocrDisabled) { context.set(TesseractOCRConfig.class, ocrConfig); } context.set(PDFParserConfig.class, pdfConfig); // Only include "safe" tags in the HTML output from Tika's HTML parser. // This excludes script tags and objects. context.set(HtmlMapper.class, DefaultHtmlMapper.INSTANCE); final Reader reader; final Function<Writer, ContentHandler> handler; if (OutputFormat.HTML == outputFormat) { handler = (writer) -> new ExpandedTitleContentHandler(new HTML5Serializer(writer)); } else { // The default BodyContentHandler is used when constructing the ParsingReader for text output, but // because only the body of embeds is pushed to the content handler further down the line, we can't // expect a body tag. 
handler = WriteOutContentHandler::new; } if (EmbedHandling.SPAWN == embedHandling) { context.set(Parser.class, parser); context.set(EmbeddedDocumentExtractor.class, new EmbedSpawner(rootDocument, context, embedOutput, handler)); } else if (EmbedHandling.CONCATENATE == embedHandling) { context.set(Parser.class, parser); context.set(EmbeddedDocumentExtractor.class, new EmbedParser(rootDocument, context)); } else { context.set(Parser.class, EmptyParser.INSTANCE); context.set(EmbeddedDocumentExtractor.class, new EmbedBlocker()); } // the constructor of ParsingReader actually parses the document in background if (OutputFormat.HTML == outputFormat) { reader = new ParsingReader(parser, tikaInputStream, rootDocument.getMetadata(), context, handler); } else { reader = new org.apache.tika.parser.ParsingReader(parser, tikaInputStream, rootDocument.getMetadata(), context); } rootDocument.setReader(reader); return rootDocument; }
Example 20
Source File: ExtractMediaMetadata.java From localization_nifi with Apache License 2.0 | 4 votes |
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs, Integer maxAttribLen) throws IOException, TikaException, SAXException { final Metadata metadata = new Metadata(); final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream); autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata); final Map<String, String> results = new HashMap<>(); final Pattern metadataKeyFilter = metadataKeyFilterRef.get(); final StringBuilder dataBuilder = new StringBuilder(); for (final String key : metadata.names()) { if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) { continue; } dataBuilder.setLength(0); if (metadata.isMultiValued(key)) { for (String val : metadata.getValues(key)) { if (dataBuilder.length() > 1) { dataBuilder.append(", "); } if (dataBuilder.length() + val.length() < maxAttribLen) { dataBuilder.append(val); } else { dataBuilder.append("..."); break; } } } else { dataBuilder.append(metadata.get(key)); } if (prefix == null) { results.put(key, dataBuilder.toString().trim()); } else { results.put(prefix + key, dataBuilder.toString().trim()); } // cutoff at max if provided if (maxAttribs != null && results.size() >= maxAttribs) { break; } } return results; }