org.apache.tika.parser.Parser Java Examples
The following examples show how to use
org.apache.tika.parser.Parser.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SolrCellBuilder.java From kite with Apache License 2.0 | 6 votes |
private Parser detectParser(Record record) { if (!hasAtLeastOneMimeType(record)) { return null; } String mediaTypeStr = (String) record.getFirstValue(Fields.ATTACHMENT_MIME_TYPE); //ExtractingParams.STREAM_TYPE); assert mediaTypeStr != null; MediaType mediaType = parseMediaType(mediaTypeStr).getBaseType(); Parser parser = mediaTypeToParserMap.get(mediaType); // fast path if (parser != null) { return parser; } // wildcard matching for (Map.Entry<MediaType, Parser> entry : mediaTypeToParserMap.entrySet()) { if (isMediaTypeMatch(mediaType, entry.getKey())) { return entry.getValue(); } } if (LOG.isDebugEnabled()) { LOG.debug("No supported MIME type parser found for " + Fields.ATTACHMENT_MIME_TYPE + "=" + mediaTypeStr); } return null; }
Example #2
Source File: TikaAutoMetadataExtracterTest.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
/**
 * Checks that the extracter reports support for every media type advertised by
 * the Office, OpenDocument, MP3 and OOXML Tika parsers.
 */
public void testSupports() throws Exception {
    ArrayList<String> advertised = new ArrayList<String>();
    Parser[] parsers = { new OfficeParser(), new OpenDocumentParser(), new Mp3Parser(), new OOXMLParser() };

    // Phase 1: collect the string form of every supported media type.
    for (Parser candidate : parsers) {
        for (MediaType type : candidate.getSupportedTypes(new ParseContext())) {
            advertised.add(type.toString());
        }
    }

    // Phase 2: each collected type must be accepted by the extracter.
    for (String mimetype : advertised) {
        assertTrue("Mimetype should be supported: " + mimetype, extracter.isSupported(mimetype));
    }
}
Example #3
Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0 | 6 votes |
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException { WriteOutContentHandler handler = new WriteOutContentHandler(maxStringLength); try { ParseContext context = new ParseContext(); context.set(Parser.class, txtParser); txtParser.parse(stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!handler.isWriteLimitReached(e)) { // This should never happen with BodyContentHandler... throw new TikaException("Unexpected SAX processing failure", e); } } finally { stream.close(); } return handler.toString(); }
Example #4
Source File: ImageConverter.java From openmeetings with Apache License 2.0 | 6 votes |
/**
 * Reads the pixel dimensions of an image file with Tika's {@code ImageParser}
 * and stores them on the given file item.
 *
 * @param f    file item that receives width and height
 * @param img  image file to inspect
 * @param mime content type put into the metadata before parsing
 * @return a result with exit code {@code ZERO} on success, or {@code -1} plus
 *         error details when anything fails
 */
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
    ProcessResult result = new ProcessResult();
    result.setProcess("get image dimensions :: " + f.getId());
    final Parser imageParser = new ImageParser();
    try (InputStream in = new FileInputStream(img)) {
        Metadata imageMeta = new Metadata();
        imageMeta.set(CONTENT_TYPE, mime);
        imageParser.parse(in, new DefaultHandler(), imageMeta, new ParseContext());

        String widthText = imageMeta.get(TIFF.IMAGE_WIDTH);
        f.setWidth(Integer.valueOf(widthText));
        String heightText = imageMeta.get(TIFF.IMAGE_LENGTH);
        f.setHeight(Integer.valueOf(heightText));

        result.setExitCode(ZERO);
    } catch (Exception failure) {
        // Any failure (I/O, parsing, missing or non-numeric dimension) is reported, not thrown.
        log.error("Error while getting dimensions", failure);
        result.setError("Error while getting dimensions");
        result.setException(failure.getMessage());
        result.setExitCode(-1);
    }
    return result;
}
Example #5
Source File: FTConnector.java From openprodoc with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Extracts the text and the metadata of a document with Tika's auto-detecting parser.
 * <p>
 * On success the extracted text is stored in {@code FullText} and the metadata in
 * {@code FileMetadata}, formatted as one {@code key=value} pair per line.
 *
 * @param Bytes stream with the document content
 * @return the current value of {@code FullText}
 * @throws PDException declared for failures reported through
 *         {@code PDException.GenPDException} (presumably thrown there - confirm)
 */
protected String Convert(InputStream Bytes) throws PDException {
    try {
        // A write limit of -1 disables BodyContentHandler's character limit (per Tika's documentation).
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);

        // Accumulate in a builder: repeated String += in a loop is quadratic.
        StringBuilder metadataText = new StringBuilder();
        for (String key : metadata.names()) {
            metadataText.append(key).append('=').append(metadata.get(key)).append('\n');
        }
        FileMetadata = metadataText.toString();
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return (FullText);
}
Example #6
Source File: TikaProcessor.java From quarkus with Apache License 2.0 | 6 votes |
/**
 * Builds the map of supported parser class names to their configured parameters.
 * <p>
 * Provider names listed in {@code NOT_NATIVE_READY_PARSERS} are always excluded.
 * When a Tika config path is given, or no required parsers are named, every remaining
 * provider is returned with an empty parameter list. Otherwise only the providers named
 * (by abbreviation, comma separated) in {@code requiredParsers} are returned, each with
 * the configuration produced by {@code getParserConfig}.
 *
 * @param tikaConfigPath      optional path of a Tika configuration resource
 * @param requiredParsers     optional comma-separated list of parser abbreviations
 * @param parserParamMaps     parameter maps keyed by parser abbreviation
 * @param parserAbbreviations mapping consulted by {@code getParserNameFromConfig}
 * @return parser class name to parameter list
 * @throws Exception if the provider names cannot be obtained
 */
public static Map<String, List<TikaParserParameter>> getSupportedParserConfig(Optional<String> tikaConfigPath,
        Optional<String> requiredParsers,
        Map<String, Map<String, String>> parserParamMaps,
        Map<String, String> parserAbbreviations) throws Exception {
    Predicate<String> nativeReady = name -> !NOT_NATIVE_READY_PARSERS.contains(name);
    List<String> providerNames = getProviderNames(Parser.class.getName());

    if (tikaConfigPath.isPresent() || !requiredParsers.isPresent()) {
        return providerNames.stream()
                .filter(nativeReady)
                .collect(Collectors.toMap(Function.identity(),
                        name -> Collections.<TikaParserParameter> emptyList()));
    }

    List<String> abbreviations = Arrays.stream(requiredParsers.get().split(","))
            .map(String::trim)
            .collect(Collectors.toList());

    // Full parser class name -> abbreviation used in the configuration.
    Map<String, String> abbreviationByFullName = abbreviations.stream()
            .collect(Collectors.toMap(abbr -> getParserNameFromConfig(abbr, parserAbbreviations),
                    Function.identity()));

    return providerNames.stream()
            .filter(nativeReady)
            .filter(abbreviationByFullName::containsKey)
            .collect(Collectors.toMap(Function.identity(),
                    name -> getParserConfig(name, parserParamMaps.get(abbreviationByFullName.get(name)))));
}
Example #7
Source File: CachingTesseractOCRParserTest.java From extract with MIT License | 5 votes |
@Test public void testWriteToCache() throws Throwable { final Path simple = Paths.get(this.simple.toURI()); Writer writer = new StringWriter(); final AtomicInteger hit = new AtomicInteger(), miss = new AtomicInteger(); final Parser parser = new CachingTesseractOCRParser(tmpDir) { private static final long serialVersionUID = 6551690243986921730L; @Override public void cacheHit() { hit.incrementAndGet(); } @Override public void cacheMiss() { miss.incrementAndGet(); } }; try (final InputStream in = Files.newInputStream(simple)) { parser.parse(in, new WriteOutContentHandler(writer), new Metadata(), new ParseContext()); } Assert.assertEquals("HEAVY\nMETAL", writer.toString().trim()); Assert.assertEquals(0, hit.get()); Assert.assertEquals(1, miss.get()); // Try again from the cache. writer = new StringWriter(); try (final InputStream in = Files.newInputStream(simple)) { parser.parse(in, new WriteOutContentHandler(writer), new Metadata(), new ParseContext()); } Assert.assertEquals("HEAVY\nMETAL", writer.toString().trim()); Assert.assertEquals(1, hit.get()); Assert.assertEquals(1, miss.get()); }
Example #8
Source File: TikaAnalysis.java From tutorials with MIT License | 5 votes |
/**
 * Parses the stream with an {@code AutoDetectParser} and returns the metadata
 * object that was passed to the parser.
 *
 * @param stream document content; not closed by this method
 * @return the metadata object filled during parsing
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the content handler fails
 * @throws TikaException if the document cannot be parsed
 */
public static Metadata extractMetadatatUsingParser(InputStream stream)
        throws IOException, SAXException, TikaException {
    Parser autoDetect = new AutoDetectParser();
    ContentHandler discardedBody = new BodyContentHandler();
    Metadata extracted = new Metadata();

    autoDetect.parse(stream, discardedBody, extracted, new ParseContext());
    return extracted;
}
Example #9
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Parses the stream into an {@code OriginalBodyContentHandler} and returns the
 * handler's string form.
 * <p>
 * The stream is closed after processing. It is managed by try-with-resources so
 * that a failure while closing is attached as a suppressed exception instead of
 * replacing the exception thrown by the parser.
 *
 * @param is       document content; closed by this method
 * @param parser   parser to run
 * @param context  parse context handed to the parser
 * @param metadata metadata object handed to the parser
 * @return the text collected by the handler
 * @throws Exception if parsing or closing the stream fails
 */
public String getTextWoDoublebreaks(InputStream is, Parser parser, ParseContext context, Metadata metadata)
        throws Exception {
    ContentHandler handler = new OriginalBodyContentHandler();
    try (InputStream in = is) {
        parser.parse(in, handler, metadata, context);
    }
    return handler.toString();
}
Example #10
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Basic text extraction.
 * <p>
 * Parses the stream into a {@code BodyContentHandler} created with a write limit
 * of 1000000 and returns the handler's string form.
 * <p>
 * Closes the input stream after processing. The stream is managed by
 * try-with-resources so that a failure while closing is attached as a suppressed
 * exception instead of replacing the exception thrown by the parser.
 *
 * @param is       document content; closed by this method
 * @param parser   parser to run
 * @param context  parse context handed to the parser
 * @param metadata metadata object handed to the parser
 * @return the text collected by the handler
 * @throws Exception if parsing or closing the stream fails
 */
public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception {
    ContentHandler handler = new BodyContentHandler(1000000);
    try (InputStream in = is) {
        parser.parse(in, handler, metadata, context);
    }
    return handler.toString();
}
Example #11
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Parses a test document through a {@code RecursiveParserWrapper} around the given
 * parser and returns the metadata list gathered by the wrapper handler.
 *
 * @param filePath     resource name below {@code /test-documents/}
 * @param parserToWrap parser wrapped for recursive parsing
 * @param handlerType  handler type passed to the content handler factory
 * @return the metadata list collected by the handler
 * @throws Exception if the resource cannot be read or parsed
 */
protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap,
        BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception {
    RecursiveParserWrapper recursive = new RecursiveParserWrapper(parserToWrap);
    BasicContentHandlerFactory handlerFactory = new BasicContentHandlerFactory(handlerType, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(handlerFactory);

    String resource = "/test-documents/" + filePath;
    try (InputStream is = getResourceAsStream(resource)) {
        recursive.parse(is, collector, new Metadata(), new ParseContext());
    }
    return collector.getMetadataList();
}
Example #12
Source File: TikaAnalysis.java From tutorials with MIT License | 5 votes |
/**
 * Parses the stream with an {@code AutoDetectParser} and returns the string form
 * of the {@code BodyContentHandler} that received the content.
 *
 * @param stream document content; not closed by this method
 * @return the text collected by the handler
 * @throws IOException   if the stream cannot be read
 * @throws TikaException if the document cannot be parsed
 * @throws SAXException  if the content handler fails
 */
public static String extractContentUsingParser(InputStream stream)
        throws IOException, TikaException, SAXException {
    Parser autoDetect = new AutoDetectParser();
    ContentHandler body = new BodyContentHandler();
    Metadata ignoredMetadata = new Metadata();

    autoDetect.parse(stream, body, ignoredMetadata, new ParseContext());
    return body.toString();
}
Example #13
Source File: TikaProcessor.java From quarkus with Apache License 2.0 | 5 votes |
/**
 * Build step that registers the Tika service providers and initialises the parser
 * through the recorder.
 * <p>
 * Produces service provider items for {@code Parser} (the supported parsers only),
 * {@code Detector} and {@code EncodingDetector}, then passes the generated Tika XML
 * configuration to {@code recorder.initTikaParser}.
 *
 * @param beanContainer   provides the bean container value handed to the recorder
 * @param recorder        recorder that initialises the parser
 * @param serviceProvider producer for the service provider build items
 * @param configuration   Tika extension configuration
 * @throws Exception if the supported parser configuration cannot be determined
 */
@BuildStep
@Record(ExecutionTime.STATIC_INIT)
void initializeTikaParser(BeanContainerBuildItem beanContainer, TikaRecorder recorder,
        BuildProducer<ServiceProviderBuildItem> serviceProvider, TikaConfiguration configuration)
        throws Exception {
    Map<String, List<TikaParserParameter>> parsers = getSupportedParserConfig(
            configuration.tikaConfigPath,
            configuration.parsers,
            configuration.parserOptions,
            configuration.parser);
    String tikaXmlConfiguration = generateTikaXmlConfiguration(parsers);

    String parserService = Parser.class.getName();
    serviceProvider.produce(new ServiceProviderBuildItem(parserService, new ArrayList<>(parsers.keySet())));

    String detectorService = Detector.class.getName();
    serviceProvider.produce(new ServiceProviderBuildItem(detectorService, getProviderNames(detectorService)));

    String encodingDetectorService = EncodingDetector.class.getName();
    serviceProvider.produce(
            new ServiceProviderBuildItem(encodingDetectorService, getProviderNames(encodingDetectorService)));

    recorder.initTikaParser(beanContainer.getValue(), configuration, tikaXmlConfiguration);
}
Example #14
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Parses a test document recursively with an {@code AutoDetectParser}, using XML
 * content handlers, and returns the metadata list gathered by the wrapper handler.
 *
 * @param filePath resource name below {@code /test-documents/}
 * @param context  parse context handed to the wrapper
 * @param metadata metadata object handed to the wrapper
 * @return the metadata list collected by the handler
 * @throws Exception if the resource cannot be read or parsed
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata)
        throws Exception {
    Parser autoDetect = new AutoDetectParser();
    RecursiveParserWrapper recursive = new RecursiveParserWrapper(autoDetect);
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    String resource = "/test-documents/" + filePath;
    try (InputStream is = getResourceAsStream(resource)) {
        recursive.parse(is, collector, metadata, context);
    }
    return collector.getMetadataList();
}
Example #15
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Parses a test document recursively with an {@code AutoDetectParser}, using XML
 * content handlers and a fresh {@code Metadata} object, and returns the metadata
 * list gathered by the wrapper handler.
 *
 * @param filePath resource name below {@code /test-documents/}
 * @param context  parse context handed to the wrapper
 * @return the metadata list collected by the handler
 * @throws Exception if the resource cannot be read or parsed
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    Parser autoDetect = new AutoDetectParser();
    RecursiveParserWrapper recursive = new RecursiveParserWrapper(autoDetect);
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    String resource = "/test-documents/" + filePath;
    try (InputStream is = getResourceAsStream(resource)) {
        recursive.parse(is, collector, new Metadata(), context);
    }
    return collector.getMetadataList();
}
Example #16
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Parses a test document through a {@code RecursiveParserWrapper} around the given
 * parser, using XML content handlers and a fresh {@code Metadata} object, and
 * returns the metadata list gathered by the wrapper handler.
 *
 * @param filePath     resource name below {@code /test-documents/}
 * @param parserToWrap parser wrapped for recursive parsing
 * @param parseContext parse context handed to the wrapper
 * @return the metadata list collected by the handler
 * @throws Exception if the resource cannot be read or parsed
 */
protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap, ParseContext parseContext)
        throws Exception {
    RecursiveParserWrapper recursive = new RecursiveParserWrapper(parserToWrap);
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    String resource = "/test-documents/" + filePath;
    try (InputStream is = getResourceAsStream(resource)) {
        recursive.parse(is, collector, new Metadata(), parseContext);
    }
    return collector.getMetadataList();
}
Example #17
Source File: ParsingReader.java From extract with MIT License | 5 votes |
/** * Creates a reader for the content of the given binary stream * with the given document metadata. The given parser is used for the * parsing task that is run with the given executor. * * The created reader will be responsible for closing the given stream. * The stream and any associated resources will be closed at or before * the time when the {@link #close()} method is called on this reader. * * @param parser parser instance * @param input binary stream * @param metadata document metadata * @param context parsing context * @throws IOException if the document can not be parsed */ public ParsingReader(final Parser parser, final InputStream input, final Metadata metadata, final ParseContext context, final Function<Writer, ContentHandler> handler) throws IOException { final PipedReader pipedReader = new PipedReader(); this.parser = parser; reader = new BufferedReader(pipedReader); try { writer = new PipedWriter(pipedReader); } catch (IOException e) { throw new IllegalStateException(e); // Should never happen. } this.input = input; this.metadata = metadata; this.context = context; // Generate the handler. this.handler = handler.apply(writer); parse(); // TIKA-203: Buffer first character to force metadata extraction. reader.mark(1); //noinspection ResultOfMethodCallIgnored reader.read(); reader.reset(); }
Example #18
Source File: EmbeddedDocumentMemoryExtractor.java From extract with MIT License | 5 votes |
/**
 * Re-parses the root document and returns the embedded document captured by a
 * {@code DigestEmbeddedDocumentExtractor} created for the given digest.
 * <p>
 * The root document's file stream is opened here and closed when parsing finishes,
 * whether it succeeds or fails; per Tika's {@code Parser.parse} documentation the
 * parser consumes the stream but leaves closing it to the caller.
 *
 * @param rootDocument           document whose file is parsed
 * @param embeddedDocumentDigest digest handed to the embedded-document extractor
 * @return whatever the extractor's {@code getDocument()} yields after parsing
 * @throws SAXException  if the content handler fails
 * @throws TikaException if the document cannot be parsed
 * @throws IOException   if the file cannot be opened, read or closed
 */
public TikaDocumentSource extract(final TikaDocument rootDocument, final String embeddedDocumentDigest)
        throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    ContentHandler handler = new BodyContentHandler(-1);

    context.set(Parser.class, parser);
    DigestEmbeddedDocumentExtractor extractor = new DigestEmbeddedDocumentExtractor(rootDocument,
            embeddedDocumentDigest, context, digester, algorithm);
    context.set(org.apache.tika.extractor.EmbeddedDocumentExtractor.class, extractor);

    // try-with-resources: the stream was previously never closed (file handle leak).
    try (FileInputStream rootStream = new FileInputStream(rootDocument.getPath().toFile())) {
        parser.parse(rootStream, handler, rootDocument.getMetadata(), context);
    }

    return extractor.getDocument();
}
Example #19
Source File: EmbeddingHTMLParsingReader.java From extract with MIT License | 5 votes |
/**
 * Creates a parsing reader whose output is serialised through an
 * {@code HTML5Serializer} wrapped in an {@code ExpandedTitleContentHandler} and a
 * {@code SubstitutingContentHandler}, and whose text is then read through a
 * {@code TokenReplacingReader}.
 * <p>
 * For each token the replacing reader asks the parent document for the matching
 * embed; a missing embed maps to {@code null}, otherwise to a reader created by
 * {@code DataURIEncodingInputStream.createReader} from the embed's path and metadata.
 *
 * @param parent   document that owns the embeds
 * @param open     opening delimiter passed to the substituting handler and the replacing reader
 * @param close    closing delimiter passed to the substituting handler and the replacing reader
 * @param parser   parser instance
 * @param input    stream to parse
 * @param metadata document metadata
 * @param context  parsing context
 * @throws IOException if the superclass constructor fails
 */
public EmbeddingHTMLParsingReader(final TikaDocument parent, final String open, final String close,
        final Parser parser, final TikaInputStream input, final Metadata metadata,
        final ParseContext context) throws IOException {
    super(parser, input, metadata, context, (out) -> {
        final HTML5Serializer serializer = new HTML5Serializer(out);
        final ExpandedTitleContentHandler titled = new ExpandedTitleContentHandler(serializer);
        return new SubstitutingContentHandler(parent, open, close, titled);
    });

    this.replacer = new TokenReplacingReader((token) -> {
        final EmbeddedTikaDocument embed = parent.getEmbed(token);
        if (embed != null) {
            return DataURIEncodingInputStream.createReader(embed.getPath(), embed.getMetadata());
        }
        return null;
    }, reader, open, close);
}
Example #20
Source File: TikaLambdaHandler.java From tika-lambda with Apache License 2.0 | 5 votes |
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException { _logger.log("Extracting text with Tika"); String extractedText = ""; SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); StringWriter sw = new StringWriter(); handler.setResult(new StreamResult(sw)); AutoDetectParser parser = new AutoDetectParser(); ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, parser); Tika tika = new Tika(); Metadata tikaMetadata = new Metadata(); try { // for synthetic transactions if( key.toLowerCase().endsWith("tika.exception.testing.pdf")) { throw new TikaException("Test Tika Exception"); } parser.parse(objectData, handler, tikaMetadata, parseContext); extractedText = sw.toString(); } catch( TikaException e) { _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage()); return assembleExceptionResult(bucket, key, e); } _logger.log("Tika parsing success"); return assembleExtractionResult(bucket, key, extractedText, tikaMetadata); }
Example #21
Source File: TikaIO.java From beam with Apache License 2.0 | 5 votes |
/**
 * Parses the file of the current element with Tika and outputs one {@code ParseResult}.
 * <p>
 * The parser is built from {@code tikaConfig} when one is set, otherwise with Tika's
 * defaults. Input metadata and the content type hint come from {@code spec} when
 * present. A parse failure does not propagate: it is emitted as a failure result
 * carrying whatever text and metadata were collected.
 *
 * @param c process context providing the element and receiving the output
 * @throws Exception if the file cannot be opened or the stream cannot be closed
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    ReadableFile file = c.element();
    InputStream stream = Channels.newInputStream(file.open());
    try (InputStream tikaStream = TikaInputStream.get(stream)) {
        Parser parser;
        if (tikaConfig == null) {
            parser = new AutoDetectParser();
        } else {
            parser = new AutoDetectParser(tikaConfig);
        }
        ParseContext context = new ParseContext();
        context.set(Parser.class, parser);

        Metadata tikaMetadata;
        if (spec.getInputMetadata() == null) {
            tikaMetadata = new Metadata();
        } else {
            tikaMetadata = spec.getInputMetadata();
        }
        if (spec.getContentTypeHint() != null) {
            tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
        }

        String location = file.getMetadata().resourceId().toString();
        ContentHandler tikaHandler = new ToTextContentHandler();

        ParseResult outcome;
        try {
            parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
            outcome = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
        } catch (Exception parseFailure) {
            outcome = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, parseFailure);
        }

        c.output(outcome);
    }
}
Example #22
Source File: TikaCallable.java From flink-crawler with Apache License 2.0 | 5 votes |
/**
 * Captures everything needed for a later parse; the constructor only stores its arguments.
 *
 * @param parser           parser to use
 * @param contentExtractor content extractor to use
 * @param linkExtractor    link extractor to use
 * @param input            stream with the content to parse
 * @param metadata         metadata for the content
 * @param extractLanguage  flag stored in {@code _extractLanguage}
 * @param parseContext     parse context to use
 */
public TikaCallable(Parser parser, BaseContentExtractor contentExtractor, BaseLinkExtractor linkExtractor,
        InputStream input, Metadata metadata, boolean extractLanguage, ParseContext parseContext) {
    // Parsing collaborators.
    this._parser = parser;
    this._parseContext = parseContext;
    this._contentExtractor = contentExtractor;
    this._linkExtractor = linkExtractor;

    // Input and its description.
    this._input = input;
    this._metadata = metadata;
    this._extractLanguage = extractLanguage;
}
Example #23
Source File: ArchiveContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
@Override protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options) { ParseContext context = super.buildParseContext(metadata, targetMimeType, options); boolean recurse = includeContents; if(options.getIncludeEmbedded() != null) { recurse = options.getIncludeEmbedded(); } if(recurse) { // Use an auto detect parser to handle the contents if(tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); } context.set(Parser.class, new AutoDetectParser(tikaConfig)); } else { // REPO-1066: an AutoDetectParser is the default in Tika after: https://issues.apache.org/jira/browse/TIKA-2096 // so we need to specify an empty one if we don't want the recurse parsing to happen context.set(Parser.class, new EmptyParser()); } return context; }
Example #24
Source File: TikaAudioMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Builds a new {@code CompositeParser} from the media type registry of
 * {@code tikaConfig} and the {@code parsers} held by this extracter.
 *
 * @return a freshly created composite parser
 */
@Override
protected Parser getParser() {
    Parser composite = new CompositeParser(tikaConfig.getMediaTypeRegistry(), parsers);
    return composite;
}
Example #25
Source File: TikaSpringConfiguredMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
/** * Injects the name of the Tika parser to use * @param className */ @SuppressWarnings("unchecked") public void setTikaParserName(String className) { tikaParserClassName = className; // Load the class try { tikaParserClass = (Class<? extends Parser>)Class.forName(tikaParserClassName); setTikaParser(getParser()); } catch(ClassNotFoundException e) { throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found"); } }
Example #26
Source File: TikaSpringConfiguredMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
/** * Injects the Tika parser to use * @param tikaParser */ public void setTikaParser(Parser tikaParser) { this.tikaParser = tikaParser; // Build the mime types, updating the copy our parent // holds for us as we go along ArrayList<String> mimetypes = new ArrayList<String>(); for(MediaType mt : tikaParser.getSupportedTypes(new ParseContext())) { mimetypes.add( mt.toString() ); } super.setSupportedMimetypes(mimetypes); }
Example #27
Source File: MP3MetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Supplies the parser for this extracter.
 *
 * @return a new {@code Mp3Parser} on every call
 */
@Override
protected Parser getParser() {
    Parser mp3Parser = new Mp3Parser();
    return mp3Parser;
}
Example #28
Source File: DocUtils.java From geoportal-server-harvester with Apache License 2.0 | 4 votes |
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException { // Input & Output Variables ByteArrayInputStream base_input = new ByteArrayInputStream(file_bytes); byte[] xml_bytes = null; // Tika Parser Objects Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try { // Populate Metadata Object with Tika Parser parser.parse(base_input, handler, metadata, context); // Container & Writer for Metadata Properties meta_props = new Properties(); StringWriter sw = new StringWriter(); // Put Tika Metadata in Properties for(String name : metadata.names()) { if (!metadata.get(name).isEmpty()) { meta_props.put(name, metadata.get(name)); } } meta_props.store(sw, "Tika Values"); // Expected Harvester Properties String meta_descr = metadata.get(TikaCoreProperties.DESCRIPTION); String meta_modif = metadata.get(TikaCoreProperties.MODIFIED); String meta_title = metadata.get(TikaCoreProperties.TITLE); // Default Label for Undefined Tika Properties DateFormat date_format = new SimpleDateFormat("yyyy/MM/dd"); Date date = new Date(); String date_today = date_format.format(date); String tika_label = String.format("TIKA_%s", date_today); // Check For Null Values & Set Defaults if (meta_descr == null) { meta_props.put(WKAConstants.WKA_DESCRIPTION, "" + sw.toString()); } else { meta_props.put(WKAConstants.WKA_DESCRIPTION, meta_descr); } if (meta_modif == null) { meta_props.put(WKAConstants.WKA_MODIFIED, tika_label); } else { meta_props.put(WKAConstants.WKA_MODIFIED, meta_modif); } if (meta_title == null) { meta_props.put(WKAConstants.WKA_TITLE, file_name); } else { meta_props.put(WKAConstants.WKA_TITLE, meta_title); } // Build XML as Bytes MapAttribute attr = AttributeUtils.fromProperties(meta_props); Document document = new SimpleDcMetaBuilder().create(attr); xml_bytes = XmlUtils.toString(document).getBytes("UTF-8"); } catch 
(Exception ex) { LOG.error(String.format("Error reading data."), ex); } finally { base_input.close(); } return xml_bytes; }
Example #29
Source File: PdfBoxMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Supplies the parser for this extracter.
 *
 * @return a new {@code PDFParser} on every call
 */
@Override
protected Parser getParser() {
    Parser pdfParser = new PDFParser();
    return pdfParser;
}
Example #30
Source File: TikaAutoMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Supplies the parser held in {@code parser}, which does auto-detection to select
 * the best Tika Parser.
 *
 * @return the value of {@code parser}
 */
@Override
protected Parser getParser() {
    Parser autoDetecting = parser;
    return autoDetecting;
}