Java Code Examples for org.apache.tika.parser.AutoDetectParser#parse()
The following examples show how to use
org.apache.tika.parser.AutoDetectParser#parse() .
You can vote up the examples you like or vote down the ones you don't like,
and follow the links above each example to visit the original project or source file. You may check out the related API usage on the sidebar.
Example 1
Source File: TikaContentExtractor.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Extracts the document body and metadata from the stream with Tika's
 * auto-detecting parser and stores both on the CAS.
 *
 * <p>On a parse failure the error is logged and, if no text was recovered,
 * a sentinel body is set so downstream annotators still receive text.
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);
  try {
    // Unlimited write limit so large documents are not truncated.
    final BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
    final Metadata docMetadata = new Metadata();
    new AutoDetectParser().parse(stream, handler, docMetadata, new ParseContext());

    jCas.setDocumentText(handler.toString());
    for (final String key : docMetadata.names()) {
      addMetadata(jCas, key, docMetadata.get(key));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    // Fall back to a sentinel body when nothing at all was extracted.
    if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
      jCas.setDocumentText(CORRUPT_FILE_TEXT);
    }
  }
}
Example 2
Source File: NodeTika.java From node-tika with MIT License | 6 votes |
/**
 * Extracts only the metadata of the resource at {@code uri} and returns it as
 * a JSON object mapping each metadata name to its array of values.
 *
 * <p>Fixes over the previous version: the input stream is now closed even when
 * parsing throws (it used to leak on error), and the raw {@code Map}/{@code
 * HashMap} types are replaced with their generic forms.
 *
 * @param uri         location of the document to inspect
 * @param contentType optional content-type hint; may be {@code null}
 * @return JSON string of metadata name → values
 * @throws Exception if the stream cannot be opened or parsed
 */
public static String extractMeta(String uri, String contentType) throws Exception {
  final AutoDetectParser parser = createParser();
  final Metadata metadata = new Metadata();
  fillMetadata(parser, metadata, contentType, uri);

  final TikaInputStream inputStream = createInputStream(uri, metadata);
  try {
    // DefaultHandler discards the body; we only want the metadata side effects.
    parser.parse(inputStream, new DefaultHandler(), metadata);
  } finally {
    inputStream.close();
  }

  final Map<String, String[]> meta = new HashMap<>();
  for (final String name : metadata.names()) {
    meta.put(name, metadata.getValues(name));
  }
  return new Gson().toJson(meta);
}
Example 3
Source File: TikaAutoInterpreter.java From db with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Reads the file at {@code filePath} and returns its plain-text content,
 * letting Tika auto-detect the document format.
 *
 * @param filePath path of the document to convert
 * @return extracted plain text
 * @throws OperationException if the file cannot be read or its format parsed
 */
@Override
public String toText(String filePath) throws OperationException {
  final BodyContentHandler textHandler = new BodyContentHandler();
  final Metadata docMetadata = new Metadata();
  try (InputStream in = new FileInputStream(new File(filePath))) {
    new AutoDetectParser().parse(in, textHandler, docMetadata);
    return textHandler.toString();
  } catch (IOException | SAXException | TikaException e) {
    // NOTE(review): the original cause is dropped; confirm whether
    // OperationException offers a (code, message, cause) constructor.
    throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
  }
}
Example 4
Source File: TikaAutoInterpreter.java From db with GNU Affero General Public License v3.0 | 5 votes |
@Override public JSONObject toJson(String filePath) throws OperationException { AutoDetectParser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); try (InputStream stream = new FileInputStream(new File(filePath))) { parser.parse(stream, handler, metadata); } catch (IOException | SAXException | TikaException e) { throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading"); } final String fileText = handler.toString(); if(fileText == null || fileText.isEmpty()) { throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Attempting to import an empty document"); } JSONObject jsonObject = new JSONObject(); jsonObject.put("_txt", fileText); String[] metadataNames = metadata.names(); for(String name : metadataNames) { jsonObject.put(name, metadata.get(name)); } return jsonObject; }
Example 5
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testParseRequiringOCR() throws Exception { System.out.println("parse"); InputStream stream = getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_ocr_test.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); parser.parse(stream, body, metadata); String parsedString = body.toString(); // From first page assertTrue(parsedString.contains("Father or mother")); // From second (last) page assertTrue(parsedString.contains("how you have determined who is the Nearest")); }
Example 6
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Ignore @Test public void testMassiveOCRDoc() throws Exception { System.out.println("testMassiveOCRDoc"); InputStream stream = getClass().getClassLoader().getResourceAsStream("long_OCR_doc.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); parser.parse(stream, body, metadata); assertTrue(body.toString().contains("Saliva-derived genomic DNA samples were genotyped using")); }
Example 7
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testEncryptedPDFDoc() throws Exception { System.out.println("testEncryptedPDFDoc"); InputStream stream = getClass().getClassLoader().getResourceAsStream("pdf_encrypted_test.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); try { parser.parse(stream, body, metadata); } catch (Exception ex) { //donowt } assertFalse(body.toString().contains("PDF Encrypted")); }
Example 8
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testEncryptedWordDoc() throws Exception { System.out.println("testEncryptedWordDoc"); InputStream stream = getClass().getClassLoader().getResourceAsStream("encryptedWordDocx.docx"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); try { parser.parse(stream, body, metadata); } catch (Exception ex) { //donowt } assertFalse(body.toString().contains("Word doc Encrypted")); }
Example 9
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testParseRequiringNotRequiringOCR() throws Exception { System.out.println("parse"); InputStream stream = getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_nonOCR_test.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //AutoDetectParser parser = new AutoDetectParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); try { parser.parse(stream, body, metadata); } finally { stream.close(); } assertTrue(body.toString().contains("An Example Paper")); }
Example 10
Source File: TikaLambdaHandler.java From tika-lambda with Apache License 2.0 | 5 votes |
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException { _logger.log("Extracting text with Tika"); String extractedText = ""; SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); StringWriter sw = new StringWriter(); handler.setResult(new StreamResult(sw)); AutoDetectParser parser = new AutoDetectParser(); ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, parser); Tika tika = new Tika(); Metadata tikaMetadata = new Metadata(); try { // for synthetic transactions if( key.toLowerCase().endsWith("tika.exception.testing.pdf")) { throw new TikaException("Test Tika Exception"); } parser.parse(objectData, handler, tikaMetadata, parseContext); extractedText = sw.toString(); } catch( TikaException e) { _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage()); return assembleExceptionResult(bucket, key, e); } _logger.log("Tika parsing success"); return assembleExtractionResult(bucket, key, extractedText, tikaMetadata); }
Example 11
Source File: TearlineContentExtractor.java From baleen with Apache License 2.0 | 5 votes |
/**
 * Extracts text with Tika, discards everything at and below the first
 * tearline match, strips boilerplate, and stores the result plus metadata on
 * the CAS. Parse failures are logged but not propagated.
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);
  try {
    // Unlimited write limit so large documents are not truncated.
    final BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
    final Metadata docMetadata = new Metadata();
    new AutoDetectParser().parse(stream, handler, docMetadata, new ParseContext());

    final String fullContent = handler.toString();
    final Matcher tearline = tearlinePattern.matcher(fullContent);
    // Keep only the text above the tearline when one is present.
    final String kept = tearline.find() ? fullContent.substring(0, tearline.start()) : fullContent;
    jCas.setDocumentText(removeBoilerplate(kept).trim());

    for (final String name : docMetadata.names()) {
      addMetadata(jCas, name, docMetadata.get(name));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
  }
}
Example 12
Source File: JATEUtil.java From jate with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Extracts plain text from the given stream using Tika's auto-detecting
 * parser. Best-effort: on any parse error the exception is logged at debug
 * level and an empty string is returned.
 *
 * @param fileStream document content; not closed by this method
 * @return extracted text, or "" when extraction fails
 */
public static String parseToPlainText(InputStream fileStream) {
  final BodyContentHandler textHandler = new BodyContentHandler();
  String text = "";
  try {
    new AutoDetectParser().parse(fileStream, textHandler, new Metadata());
    text = textHandler.toString();
  } catch (IOException | SAXException | TikaException e) {
    LOG.debug("Parsing Exception while extracting content from current file. " + e.toString());
  }
  return text;
}
Example 13
Source File: AttachAttribute.java From entando-components with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Builds the indexable value for this attachment attribute: the superclass
 * value (when present) followed by the text Tika extracts from the attached
 * resource. Extraction failures are logged and skipped; the stream is always
 * closed.
 */
@Override
public String getIndexeableFieldValue() {
  final StringBuilder sb = new StringBuilder();
  if (null != super.getIndexeableFieldValue()) {
    sb.append(super.getIndexeableFieldValue());
  }
  String parsedText = null;
  final ResourceInterface res = this.getResource();
  if (res != null) {
    final InputStream resourceStream = ((AttachResource) res).getResourceStream();
    if (null != resourceStream) {
      try {
        // -1 disables the write limit so large attachments are fully indexed.
        final BodyContentHandler textHandler = new BodyContentHandler(-1);
        new AutoDetectParser().parse(resourceStream, textHandler, new Metadata());
        parsedText = textHandler.toString();
      } catch (Throwable t) {
        // Deliberately broad: indexing must not fail because one attachment
        // could not be parsed.
        _logger.error("Error while processing the parsing", t);
      } finally {
        try {
          resourceStream.close();
        } catch (IOException ex) {
          _logger.error("Error closing stream", ex);
        }
      }
    }
  }
  if (null != parsedText) {
    sb.append(" ").append(parsedText);
  }
  return sb.toString();
}
Example 14
Source File: NodeTika.java From node-tika with MIT License | 4 votes |
public static String extractText(String uri, Map<String, Object> options) throws Exception { final AutoDetectParser parser = createParser(); final Metadata metadata = new Metadata(); final ParseContext context = new ParseContext(); String outputEncoding = null; String contentType = null; int maxLength = -1; if (options != null) { Object option; option = options.get("outputEncoding"); if (option != null) { outputEncoding = option.toString(); } option = options.get("contentType"); if (option != null) { contentType = option.toString(); } option = options.get("maxLength"); if (option != null) { maxLength = (int)Float.parseFloat(option.toString()); } } if (outputEncoding == null) { outputEncoding = "UTF-8"; } fillMetadata(parser, metadata, contentType, uri); fillParseContext(context, options); final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); final OutputStreamWriter writer = new OutputStreamWriter(outputStream, outputEncoding); final WriteOutContentHandler contentHandler = new WriteOutContentHandler(writer, maxLength); final TikaInputStream inputStream = createInputStream(uri, metadata); // Set up recursive parsing of archives. // See: http://wiki.apache.org/tika/RecursiveMetadata context.set(Parser.class, parser); context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context)); try { parser.parse(inputStream, new BodyContentHandler(contentHandler), metadata, context); } catch (Throwable e) { if (!contentHandler.isWriteLimitReached(e)) { throw e; } else { writer.close(); } } finally { inputStream.close(); } return outputStream.toString(outputEncoding); }