org.apache.tika.parser.AutoDetectParser Java Examples
The following examples show how to use
org.apache.tika.parser.AutoDetectParser.
You can vote up the examples you find useful or vote down those you don't,
and follow the link above each example to its original project or source file. Related API usage is listed on the sidebar.
Example #1
Source File: NodeTika.java From node-tika with MIT License | 6 votes |
/**
 * Extracts metadata from the resource at the given URI and returns it serialized as JSON.
 *
 * @param uri         location of the resource to inspect
 * @param contentType optional content-type hint used to pre-fill the Tika metadata
 * @return a JSON object mapping each metadata name to its array of values
 * @throws Exception if the stream cannot be opened or Tika parsing fails
 */
public static String extractMeta(String uri, String contentType) throws Exception {
    final AutoDetectParser parser = createParser();
    final Metadata metadata = new Metadata();
    fillMetadata(parser, metadata, contentType, uri);

    final TikaInputStream inputStream = createInputStream(uri, metadata);
    try {
        // DefaultHandler discards body content; we only want the metadata side effects.
        parser.parse(inputStream, new DefaultHandler(), metadata);
    } finally {
        // Previously closed only on the happy path; close even when parse() throws.
        inputStream.close();
    }

    // Parameterized map instead of the raw Map/HashMap types.
    final Map<String, String[]> meta = new HashMap<>();
    for (String name : metadata.names()) {
        meta.put(name, metadata.getValues(name));
    }
    return new Gson().toJson(meta);
}
Example #2
Source File: FileParserSingleton.java From scava with Eclipse Public License 2.0 | 6 votes |
/**
 * Private singleton constructor: loads the list of supported file types and
 * initialises the shared Tika auto-detect parser.
 */
private FileParserSingleton() {
    logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.preprocessor.fileparser");
    try {
        BufferedReader fileList = loadFile();
        readSupportedFilesList(fileList);
        logger.info("List of supported files has been sucessfully loaded");
        parser = new AutoDetectParser();
    } catch (IOException e) {
        // Report through the project logger only; the previous extra
        // e.printStackTrace() duplicated the error on stderr, bypassing
        // the logging framework.
        logger.error("Error while loading the List of supported files:", e);
    }
}
Example #3
Source File: TikaDocumentItemProcessor.java From CogStack-Pipeline with Apache License 2.0 | 6 votes |
/**
 * Initialises the Tika parsing stack after dependency injection completes:
 * loads the Tika configuration from the bundled tika-config.xml, applies the
 * configured Tesseract OCR and ImageMagick timeouts, and builds the shared
 * auto-detect parser from that configuration.
 *
 * @throws IOException   if tika-config.xml cannot be read from the classpath
 * @throws SAXException  if tika-config.xml is malformed
 * @throws TikaException if the Tika configuration is invalid
 */
@PostConstruct public void init() throws IOException, SAXException, TikaException{
    setFieldName(tikaFieldName);
    // load tika configuration
    tikaConfig = new TikaConfig(this.getClass().getClassLoader()
            .getResourceAsStream("tika-config.xml"));
    // load tesseract ocr configuration
    tesseractConfig = new TesseractOCRConfig();
    // a non-positive timeout leaves the Tesseract default in place
    if (tesseractTimeout > 0) {
        tesseractConfig.setTimeout(tesseractTimeout);
    }
    // load image magick configuration -- used for tiff conversion
    imgConfig = new ImageMagickConfig();
    // likewise, only override the conversion timeout when explicitly set
    if (convertTimeout > 0) {
        imgConfig.setTimeout(convertTimeout);
    }
    parser = new AutoDetectParser(tikaConfig);
}
Example #4
Source File: TikaUtil.java From scipio-erp with Apache License 2.0 | 6 votes |
/**
 * Finds the media type (through the Apache Tika library), based on filename
 * and magic numbers.
 * <p>
 * NOTE: the supplied stream is wrapped and closed by this method.
 *
 * @param is       stream positioned at the start of the content to sniff
 * @param fileName original file name, used as a detection hint
 * @return the detected media type
 * @throws IOException if reading the stream fails during detection
 */
public static MediaType findMediaType(InputStream is, String fileName) throws IOException {
    BufferedInputStream bis = new BufferedInputStream(is);
    try {
        AutoDetectParser parser = new AutoDetectParser();
        Detector detector = parser.getDetector();
        Metadata md = new Metadata();
        md.add(Metadata.RESOURCE_NAME_KEY, fileName);
        MediaType mediaType = detector.detect(bis, md);
        return mediaType;
    } finally {
        try {
            bis.close();
        } catch (IOException ignored) {
            // Best-effort close: a close failure must not mask the
            // detection result (or the primary exception) above.
        }
    }
}
Example #5
Source File: TikaContentExtractor.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Extracts the document body and metadata from the stream with Tika and
 * copies both into the JCas. On a parse failure the problem is logged and,
 * if no text was recovered at all, a corrupt-file placeholder is set.
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        // Integer.MAX_VALUE lifts the default body-size cap.
        BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
        new AutoDetectParser().parse(stream, textHandler, metadata, context);

        jCas.setDocumentText(textHandler.toString());
        for (String name : metadata.names()) {
            addMetadata(jCas, name, metadata.get(name));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
        if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
            jCas.setDocumentText(CORRUPT_FILE_TEXT);
        }
    }
}
Example #6
Source File: FTConnector.java From openprodoc with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Extracts the full text and the metadata of a document using Tika's
 * auto-detecting parser, storing both in the corresponding fields.
 *
 * @param Bytes stream containing the document to convert
 * @return the extracted plain text of the document
 * @throws PDException wrapping any extraction failure
 */
protected String Convert(InputStream Bytes) throws PDException {
    try {
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        // -1 removes the default body-size limit.
        ContentHandler textHandler = new BodyContentHandler(-1);
        Parser parser = new AutoDetectParser();
        parser.parse(Bytes, textHandler, metadata, context);

        // Render the metadata as one "key=value" entry per line.
        StringBuilder metaText = new StringBuilder();
        for (String key : metadata.names()) {
            metaText.append(key).append('=').append(metadata.get(key)).append('\n');
        }
        FileMetadata = metaText.toString();
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return (FullText);
}
Example #7
Source File: HTMLRenderingEngine.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
@Override protected void render(RenderingContext context) { ContentReader contentReader = context.makeContentReader(); String sourceMimeType = contentReader.getMimetype(); // Check that Tika supports the supplied file AutoDetectParser p = new AutoDetectParser(tikaConfig); MediaType sourceMediaType = MediaType.parse(sourceMimeType); if(! p.getParsers().containsKey(sourceMediaType)) { throw new RenditionServiceException( "Source mime type of " + sourceMimeType + " is not supported by Tika for HTML conversions" ); } // Make the HTML Version using Tika // This will also extract out any images as found generateHTML(p, context); }
Example #8
Source File: TikaAutoMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig) { config = tikaConfig; parser = new AutoDetectParser(config); SUPPORTED_MIMETYPES = new ArrayList<String>(); for(MediaType mt : parser.getParsers().keySet()) { // Add the canonical mime type SUPPORTED_MIMETYPES.add( mt.toString() ); // And add any aliases of the mime type too - Alfresco uses some // non canonical forms of various mimetypes, so we need all of them for(MediaType alias : config.getMediaTypeRegistry().getAliases(mt)) { SUPPORTED_MIMETYPES.add( alias.toString() ); } } return SUPPORTED_MIMETYPES; }
Example #9
Source File: ExtractingDocumentLoader.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Wires up a document loader for Solr's extracting request handler: copies the
 * request collaborators into fields, prepares the reusable add-command
 * template, and builds the Tika auto-detect parser.
 *
 * @param req                the Solr request being served
 * @param processor          downstream update processor chain
 * @param config             Tika configuration for the parser
 * @param parseContextConfig parse-context configuration to apply per document
 * @param factory            factory for Solr content handlers
 */
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
        TikaConfig config, ParseContextConfig parseContextConfig,
        SolrContentHandlerFactory factory) {
    this.params = req.getParams();
    this.core = req.getCore();
    this.config = config;
    this.parseContextConfig = parseContextConfig;
    this.processor = processor;

    // Template command reused for every extracted document; overwrite and
    // commitWithin come from the request parameters (with defaults).
    templateAdd = new AddUpdateCommand(req);
    templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true);
    templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);

    //this is lightweight
    autoDetectParser = new AutoDetectParser(config);
    this.factory = factory;

    ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
}
Example #10
Source File: TikaLambdaHandler.java From tika-lambda with Apache License 2.0 | 5 votes |
/**
 * Extracts plain text from an S3 object's stream with Tika and assembles the
 * extraction result. A key ending in "tika.exception.testing.pdf" forces a
 * TikaException so synthetic transactions can exercise the failure path.
 *
 * @param bucket     S3 bucket name (used only for result assembly/reporting)
 * @param key        S3 object key; also the synthetic-failure trigger
 * @param objectData stream of the object's content
 * @return the assembled extraction (or exception) result
 * @throws IOException                       on stream failures during parsing
 * @throws TransformerConfigurationException if the SAX transformer cannot be built
 * @throws SAXException                      on SAX failures during parsing
 */
private String doTikaStuff(String bucket, String key, InputStream objectData)
        throws IOException, TransformerConfigurationException, SAXException {
    _logger.log("Extracting text with Tika");
    String extractedText = "";

    // SAX transformer configured for plain-text output, captured in a StringWriter.
    SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    TransformerHandler handler = factory.newTransformerHandler();
    handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
    handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    StringWriter sw = new StringWriter();
    handler.setResult(new StreamResult(sw));

    AutoDetectParser parser = new AutoDetectParser();
    ParseContext parseContext = new ParseContext();
    // Registering the parser in the context lets embedded documents recurse.
    parseContext.set(Parser.class, parser);
    // NOTE(review): this Tika facade instance appears unused below — confirm
    // before removing.
    Tika tika = new Tika();
    Metadata tikaMetadata = new Metadata();
    try {
        // for synthetic transactions
        if (key.toLowerCase().endsWith("tika.exception.testing.pdf")) {
            throw new TikaException("Test Tika Exception");
        }
        parser.parse(objectData, handler, tikaMetadata, parseContext);
        extractedText = sw.toString();
    } catch (TikaException e) {
        _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
        return assembleExceptionResult(bucket, key, e);
    }
    _logger.log("Tika parsing success");
    return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}
Example #11
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testEncryptedWordDoc() throws Exception { System.out.println("testEncryptedWordDoc"); InputStream stream = getClass().getClassLoader().getResourceAsStream("encryptedWordDocx.docx"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); try { parser.parse(stream, body, metadata); } catch (Exception ex) { //donowt } assertFalse(body.toString().contains("Word doc Encrypted")); }
Example #12
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testParseRequiringNotRequiringOCR() throws Exception { System.out.println("parse"); InputStream stream = getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_nonOCR_test.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //AutoDetectParser parser = new AutoDetectParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); try { parser.parse(stream, body, metadata); } finally { stream.close(); } assertTrue(body.toString().contains("An Example Paper")); }
Example #13
Source File: SimplePageParser.java From flink-crawler with Apache License 2.0 | 5 votes |
/**
 * Initialises this page parser: configures the link extractor from the
 * active parser policy and creates the Tika parser used for extraction.
 */
@Override
public void open(RuntimeContext context) throws Exception {
    super.open(context);
    _linkExtractor.setLinkTags(getParserPolicy().getLinkTags());
    _linkExtractor.setLinkAttributeTypes(getParserPolicy().getLinkAttributeTypes());
    _parser = new AutoDetectParser();
}
Example #14
Source File: TikaIO.java From beam with Apache License 2.0 | 5 votes |
/**
 * Parses one input file with Tika and emits a ParseResult — success with the
 * extracted text, or failure carrying the exception — for downstream
 * processing. Content-type hints and input metadata come from the transform's
 * spec when present.
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    ReadableFile file = c.element();
    InputStream stream = Channels.newInputStream(file.open());
    // TikaInputStream wraps (and on close, closes) the channel stream.
    try (InputStream tikaStream = TikaInputStream.get(stream)) {
        // Use the configured TikaConfig when one was supplied.
        Parser parser =
            tikaConfig == null ? new AutoDetectParser() : new AutoDetectParser(tikaConfig);
        ParseContext context = new ParseContext();
        context.set(Parser.class, parser);
        // Seed metadata from the spec if provided, otherwise start empty.
        Metadata tikaMetadata =
            spec.getInputMetadata() != null ? spec.getInputMetadata() : new Metadata();
        if (spec.getContentTypeHint() != null) {
            tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
        }
        String location = file.getMetadata().resourceId().toString();
        ParseResult res;
        ContentHandler tikaHandler = new ToTextContentHandler();
        try {
            parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
            res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
        } catch (Exception e) {
            // Partial text extracted before the failure is still reported.
            res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
        }
        c.output(res);
    }
}
Example #15
Source File: TikaAnalysis.java From tutorials with MIT License | 5 votes |
/**
 * Extracts a document's metadata by running it through Tika's auto-detecting
 * parser (body content is discarded).
 * Note: the method name's spelling is kept for source compatibility.
 *
 * @param stream the document content
 * @return the metadata populated during parsing
 */
public static Metadata extractMetadatatUsingParser(InputStream stream)
        throws IOException, SAXException, TikaException {
    Metadata extracted = new Metadata();
    new AutoDetectParser().parse(stream, new BodyContentHandler(), extracted, new ParseContext());
    return extracted;
}
Example #16
Source File: TearlineContentExtractor.java From baleen with Apache License 2.0 | 5 votes |
/**
 * Extracts document text with Tika, truncates it at the first tearline match
 * (if any), strips boilerplate, and copies the metadata into the JCas.
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        Metadata metadata = new Metadata();
        // Integer.MAX_VALUE lifts the default body-size cap.
        BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
        new AutoDetectParser().parse(stream, textHandler, metadata, new ParseContext());

        String fullContent = textHandler.toString();
        Matcher tearline = tearlinePattern.matcher(fullContent);
        // Keep only the text above the tearline when one is present.
        String retained = tearline.find()
                ? fullContent.substring(0, tearline.start())
                : fullContent;
        jCas.setDocumentText(removeBoilerplate(retained).trim());

        for (String name : metadata.names()) {
            addMetadata(jCas, name, metadata.get(name));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    }
}
Example #17
Source File: JATEUtil.java From jate with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Converts a document stream to plain text via Tika auto-detection.
 * Extraction failures are logged at debug level and yield an empty string.
 *
 * @param fileStream the document content
 * @return the extracted text, or "" when parsing fails
 */
public static String parseToPlainText(InputStream fileStream) {
    BodyContentHandler handler = new BodyContentHandler();
    try {
        new AutoDetectParser().parse(fileStream, handler, new Metadata());
        return handler.toString();
    } catch (IOException | SAXException | TikaException e) {
        // Best-effort: an unreadable document simply contributes no text.
        LOG.debug("Parsing Exception while extracting content from current file. " + e.toString());
        return "";
    }
}
Example #18
Source File: AttachAttribute.java From entando-components with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Builds the indexable text for this attachment attribute: the superclass's
 * indexable value (if any) followed by the full text Tika extracts from the
 * attached resource's stream. Extraction is best-effort — any failure is
 * logged and only the superclass value is indexed.
 *
 * @return the combined indexable text, never null
 */
@Override
public String getIndexeableFieldValue() {
    StringBuilder buffer = new StringBuilder();
    if (null != super.getIndexeableFieldValue()) {
        buffer.append(super.getIndexeableFieldValue());
    }
    String extraValue = null;
    ResourceInterface resource = this.getResource();
    if (resource != null) {
        InputStream is = ((AttachResource) resource).getResourceStream();
        if (null != is) {
            AutoDetectParser parser = new AutoDetectParser();
            // -1 removes the default body-size limit.
            BodyContentHandler handler = new BodyContentHandler(-1);
            Metadata metadata = new Metadata();
            try {
                parser.parse(is, handler, metadata);
                extraValue = handler.toString();
            } catch (Throwable t) {
                // Deliberately broad: indexing must not fail because one
                // attachment cannot be parsed.
                _logger.error("Error while processing the parsing", t);
            } finally {
                try {
                    is.close();
                } catch (IOException ex) {
                    _logger.error("Error closing stream", ex);
                }
            }
        }
    }
    if (null != extraValue) {
        buffer.append(" ").append(extraValue);
    }
    return buffer.toString();
}
Example #19
Source File: ExtractMediaMetadata.java From nifi with Apache License 2.0 | 5 votes |
@SuppressWarnings("unused") @OnScheduled public void onScheduled(ProcessContext context) { String metadataKeyFilterInput = context.getProperty(METADATA_KEY_FILTER).getValue(); if (metadataKeyFilterInput != null && metadataKeyFilterInput.length() > 0) { metadataKeyFilterRef.set(Pattern.compile(metadataKeyFilterInput)); } else { metadataKeyFilterRef.set(null); } autoDetectParser = new AutoDetectParser(); }
Example #20
Source File: TikaAnalysis.java From tutorials with MIT License | 5 votes |
/**
 * Extracts a document's body text by running it through Tika's
 * auto-detecting parser.
 *
 * @param stream the document content
 * @return the extracted plain text
 */
public static String extractContentUsingParser(InputStream stream)
        throws IOException, TikaException, SAXException {
    ContentHandler textHandler = new BodyContentHandler();
    new AutoDetectParser().parse(stream, textHandler, new Metadata(), new ParseContext());
    return textHandler.toString();
}
Example #21
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Ignore @Test public void testMassiveOCRDoc() throws Exception { System.out.println("testMassiveOCRDoc"); InputStream stream = getClass().getClassLoader().getResourceAsStream("long_OCR_doc.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); parser.parse(stream, body, metadata); assertTrue(body.toString().contains("Saliva-derived genomic DNA samples were genotyped using")); }
Example #22
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testParseRequiringOCR() throws Exception { System.out.println("parse"); InputStream stream = getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_ocr_test.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); parser.parse(stream, body, metadata); String parsedString = body.toString(); // From first page assertTrue(parsedString.contains("Father or mother")); // From second (last) page assertTrue(parsedString.contains("how you have determined who is the Nearest")); }
Example #23
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testEncryptedPDFDoc() throws Exception { System.out.println("testEncryptedPDFDoc"); InputStream stream = getClass().getClassLoader().getResourceAsStream("pdf_encrypted_test.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); try { parser.parse(stream, body, metadata); } catch (Exception ex) { //donowt } assertFalse(body.toString().contains("PDF Encrypted")); }
Example #24
Source File: ArchiveContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
@Override protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options) { ParseContext context = super.buildParseContext(metadata, targetMimeType, options); boolean recurse = includeContents; if(options.getIncludeEmbedded() != null) { recurse = options.getIncludeEmbedded(); } if(recurse) { // Use an auto detect parser to handle the contents if(tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); } context.set(Parser.class, new AutoDetectParser(tikaConfig)); } else { // REPO-1066: an AutoDetectParser is the default in Tika after: https://issues.apache.org/jira/browse/TIKA-2096 // so we need to specify an empty one if we don't want the recurse parsing to happen context.set(Parser.class, new EmptyParser()); } return context; }
Example #25
Source File: TikaPoweredContainerExtractor.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
/** * Injects the TikaConfig to use * * @param tikaConfig The Tika Config to use */ public void setTikaConfig(TikaConfig tikaConfig) { this.config = tikaConfig; // Setup the detector and parser detector = new DefaultDetector(config.getMimeRepository()); parser = new AutoDetectParser(detector); }
Example #26
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Parses a test document recursively with Tika and returns one Metadata
 * object per (embedded) document, with XML content handlers and no
 * body-size limit.
 *
 * @param filePath path under /test-documents/ on the test classpath
 * @param context  parse context to use
 * @param metadata seed metadata for the container document
 * @return metadata for the container and every embedded document
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser());
    RecursiveParserWrapperHandler handler =
            new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(
                    BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
    try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
        wrapper.parse(is, handler, metadata, context);
    }
    return handler.getMetadataList();
}
Example #27
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Convenience overload of
 * {@link #getRecursiveMetadata(String, ParseContext, Metadata)} that starts
 * from an empty Metadata object.
 *
 * @param filePath path under /test-documents/ on the test classpath
 * @param context  parse context to use
 * @return metadata for the container and every embedded document
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    // Delegate instead of duplicating the wrapper/handler setup verbatim.
    return getRecursiveMetadata(filePath, context, new Metadata());
}
Example #28
Source File: TikaAutoInterpreter.java From db with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Reads the file at the given path with Tika and returns a JSON object whose
 * "_txt" key holds the extracted text and whose remaining keys hold the
 * document metadata.
 *
 * @param filePath path of the document to import
 * @return the populated JSON object
 * @throws OperationException when the document cannot be parsed or is empty
 */
@Override
public JSONObject toJson(String filePath) throws OperationException {
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = new FileInputStream(new File(filePath))) {
        new AutoDetectParser().parse(stream, handler, metadata);
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }

    final String fileText = handler.toString();
    if (fileText == null || fileText.isEmpty()) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Attempting to import an empty document");
    }

    JSONObject jsonObject = new JSONObject();
    jsonObject.put("_txt", fileText);
    for (String name : metadata.names()) {
        jsonObject.put(name, metadata.get(name));
    }
    return jsonObject;
}
Example #29
Source File: TikaAutoInterpreter.java From db with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Reads the file at the given path with Tika and returns only its extracted
 * plain text.
 *
 * @param filePath path of the document to read
 * @return the extracted text
 * @throws OperationException when the document cannot be parsed
 */
@Override
public String toText(String filePath) throws OperationException {
    BodyContentHandler handler = new BodyContentHandler();
    try (InputStream stream = new FileInputStream(new File(filePath))) {
        new AutoDetectParser().parse(stream, handler, new Metadata());
        return handler.toString();
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }
}
Example #30
Source File: ExtractMediaMetadata.java From localization_nifi with Apache License 2.0 | 5 votes |
@SuppressWarnings("unused") @OnScheduled public void onScheduled(ProcessContext context) { String metadataKeyFilterInput = context.getProperty(METADATA_KEY_FILTER).getValue(); if (metadataKeyFilterInput != null && metadataKeyFilterInput.length() > 0) { metadataKeyFilterRef.set(Pattern.compile(metadataKeyFilterInput)); } else { metadataKeyFilterRef.set(null); } autoDetectParser = new AutoDetectParser(); }