Java Code Examples for org.apache.tika.parser.Parser#parse()
The following examples show how to use
org.apache.tika.parser.Parser#parse().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: FTConnector.java From openprodoc with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Extracts the text and the metadata of a document with Tika's auto-detecting parser.
 * <p>
 * On success the extracted body text is stored in {@code FullText} and the metadata is
 * stored in {@code FileMetadata} as one {@code key=value} line per metadata name.
 *
 * @param Bytes stream with the content of the document to parse
 * @return the value of {@code FullText} once the extraction attempt has finished
 * @throws PDException declared for failures reported through
 *         {@code PDException.GenPDException}
 */
protected String Convert(InputStream Bytes) throws PDException {
    try {
        // Per the Tika API documentation, a write limit of -1 disables the limit.
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);

        // Accumulate in a builder instead of repeated String concatenation in the loop.
        StringBuilder collected = new StringBuilder();
        for (String key : metadata.names()) {
            collected.append(key).append('=').append(metadata.get(key)).append('\n');
        }
        FileMetadata = collected.toString();
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return (FullText);
}
Example 2
Source File: ImageConverter.java From openmeetings with Apache License 2.0 | 6 votes |
/**
 * Reads the dimensions of an image file through Tika's {@code ImageParser} and stores
 * them on the given file item.
 *
 * @param f    item that receives the width and height, and whose id is used in the process label
 * @param img  image file to inspect
 * @param mime content type passed to the parser through the metadata
 * @return a result with exit code {@code ZERO} on success, or exit code {@code -1} plus
 *         error details when anything fails
 */
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
    final ProcessResult result = new ProcessResult();
    result.setProcess("get image dimensions :: " + f.getId());
    final Parser imageParser = new ImageParser();
    try (InputStream in = new FileInputStream(img)) {
        final Metadata meta = new Metadata();
        meta.set(CONTENT_TYPE, mime);
        imageParser.parse(in, new DefaultHandler(), meta, new ParseContext());

        final String width = meta.get(TIFF.IMAGE_WIDTH);
        f.setWidth(Integer.valueOf(width));
        final String height = meta.get(TIFF.IMAGE_LENGTH);
        f.setHeight(Integer.valueOf(height));

        result.setExitCode(ZERO);
    } catch (Exception e) {
        log.error("Error while getting dimensions", e);
        result.setError("Error while getting dimensions");
        result.setException(e.getMessage());
        result.setExitCode(-1);
    }
    return result;
}
Example 3
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Basic text extraction.
 * <p>
 * Closes the input stream after processing, whether or not parsing succeeds.
 *
 * @param is       stream with the document to parse; closed by this method
 * @param parser   parser used for the extraction
 * @param context  parse context handed to the parser
 * @param metadata metadata object handed to the parser
 * @return the text collected by the body content handler
 * @throws Exception if parsing or closing the stream fails
 */
public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception {
    ContentHandler handler = new BodyContentHandler(1000000);
    // try-with-resources keeps a parse failure as the primary exception; a failure while
    // closing is attached as suppressed instead of replacing it.
    try (InputStream in = is) {
        parser.parse(in, handler, metadata, context);
    }
    return handler.toString();
}
Example 4
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Text extraction that collects the output in an {@code OriginalBodyContentHandler}.
 * <p>
 * Closes the input stream after processing, whether or not parsing succeeds.
 *
 * @param is       stream with the document to parse; closed by this method
 * @param parser   parser used for the extraction
 * @param context  parse context handed to the parser
 * @param metadata metadata object handed to the parser
 * @return the text collected by the content handler
 * @throws Exception if parsing or closing the stream fails
 */
public String getTextWoDoublebreaks(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception {
    ContentHandler handler = new OriginalBodyContentHandler();
    // try-with-resources keeps a parse failure as the primary exception; a failure while
    // closing is attached as suppressed instead of replacing it.
    try (InputStream in = is) {
        parser.parse(in, handler, metadata, context);
    }
    return handler.toString();
}
Example 5
Source File: TikaIO.java From beam with Apache License 2.0 | 5 votes |
/**
 * Parses the file carried by the current element with Tika and outputs one
 * {@code ParseResult} for it.
 * <p>
 * A parse failure does not abort processing: it is turned into a failure result that
 * carries whatever text and metadata were collected before the exception.
 *
 * @param c context providing the input element and receiving the output
 * @throws Exception if opening or closing the underlying stream fails
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    ReadableFile readable = c.element();
    InputStream raw = Channels.newInputStream(readable.open());
    try (InputStream tikaStream = TikaInputStream.get(raw)) {
        // Use the configured Tika setup when one was supplied, the defaults otherwise.
        final Parser autoParser;
        if (tikaConfig == null) {
            autoParser = new AutoDetectParser();
        } else {
            autoParser = new AutoDetectParser(tikaConfig);
        }

        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, autoParser);

        final Metadata meta;
        if (spec.getInputMetadata() != null) {
            meta = spec.getInputMetadata();
        } else {
            meta = new Metadata();
        }
        if (spec.getContentTypeHint() != null) {
            meta.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
        }

        String location = readable.getMetadata().resourceId().toString();
        ContentHandler textHandler = new ToTextContentHandler();
        ParseResult outcome;
        try {
            autoParser.parse(tikaStream, textHandler, meta, parseContext);
            outcome = ParseResult.success(location, textHandler.toString(), meta);
        } catch (Exception e) {
            outcome = ParseResult.failure(location, textHandler.toString(), meta, e);
        }
        c.output(outcome);
    }
}
Example 6
Source File: CachingTesseractOCRParserTest.java From extract with MIT License | 5 votes |
@Test public void testWriteToCache() throws Throwable { final Path simple = Paths.get(this.simple.toURI()); Writer writer = new StringWriter(); final AtomicInteger hit = new AtomicInteger(), miss = new AtomicInteger(); final Parser parser = new CachingTesseractOCRParser(tmpDir) { private static final long serialVersionUID = 6551690243986921730L; @Override public void cacheHit() { hit.incrementAndGet(); } @Override public void cacheMiss() { miss.incrementAndGet(); } }; try (final InputStream in = Files.newInputStream(simple)) { parser.parse(in, new WriteOutContentHandler(writer), new Metadata(), new ParseContext()); } Assert.assertEquals("HEAVY\nMETAL", writer.toString().trim()); Assert.assertEquals(0, hit.get()); Assert.assertEquals(1, miss.get()); // Try again from the cache. writer = new StringWriter(); try (final InputStream in = Files.newInputStream(simple)) { parser.parse(in, new WriteOutContentHandler(writer), new Metadata(), new ParseContext()); } Assert.assertEquals("HEAVY\nMETAL", writer.toString().trim()); Assert.assertEquals(1, hit.get()); Assert.assertEquals(1, miss.get()); }
Example 7
Source File: TikaAnalysis.java From tutorials with MIT License | 5 votes |
/**
 * Extracts the body text of a document using an auto-detecting Tika parser.
 * The stream is not closed by this method.
 *
 * @param stream document content to parse
 * @return the text collected by the body content handler
 * @throws IOException   if the stream cannot be read
 * @throws TikaException if the document cannot be parsed
 * @throws SAXException  if the content handler fails to process an event
 */
public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
    final Parser detector = new AutoDetectParser();
    final ContentHandler body = new BodyContentHandler();
    detector.parse(stream, body, new Metadata(), new ParseContext());
    return body.toString();
}
Example 8
Source File: TikaAnalysis.java From tutorials with MIT License | 5 votes |
/**
 * Extracts the metadata of a document using an auto-detecting Tika parser.
 * The stream is not closed by this method.
 *
 * @param stream document content to parse
 * @return the metadata object populated during parsing
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the content handler fails to process an event
 * @throws TikaException if the document cannot be parsed
 */
public static Metadata extractMetadatatUsingParser(InputStream stream) throws IOException, SAXException, TikaException {
    final Parser detector = new AutoDetectParser();
    final ContentHandler discardedBody = new BodyContentHandler();
    final Metadata collected = new Metadata();
    detector.parse(stream, discardedBody, collected, new ParseContext());
    return collected;
}
Example 9
Source File: HTMLRenderingEngine.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/** * Asks Tika to translate the contents into HTML */ private void generateHTML(Parser p, RenderingContext context) { ContentReader contentReader = context.makeContentReader(); // Setup things to parse with StringWriter sw = new StringWriter(); ContentHandler handler = buildContentHandler(sw, context); // Tell Tika what we're dealing with Metadata metadata = new Metadata(); metadata.set( Metadata.CONTENT_TYPE, contentReader.getMimetype() ); metadata.set( Metadata.RESOURCE_NAME_KEY, nodeService.getProperty( context.getSourceNode(), ContentModel.PROP_NAME ).toString() ); // Our parse context needs to extract images ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, new TikaImageExtractingParser(context)); // Parse try { p.parse( contentReader.getContentInputStream(), handler, metadata, parseContext ); } catch(Exception e) { throw new RenditionServiceException("Tika HTML Conversion Failed", e); } // As a string String html = sw.toString(); // If we're doing body-only, remove all the html namespaces // that will otherwise clutter up the document boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false); if(bodyOnly) { html = html.replaceAll("<\\?xml.*?\\?>", ""); html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"","<p"); html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"","<h\\1"); html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"","<div"); html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"","<table"); html = html.replaceAll(" ",""); } // Save it ContentWriter contentWriter = context.makeContentWriter(); contentWriter.setMimetype("text/html"); contentWriter.putContent( html ); }
Example 10
Source File: DocUtils.java From geoportal-server-harvester with Apache License 2.0 | 4 votes |
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException { // Input & Output Variables ByteArrayInputStream base_input = new ByteArrayInputStream(file_bytes); byte[] xml_bytes = null; // Tika Parser Objects Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try { // Populate Metadata Object with Tika Parser parser.parse(base_input, handler, metadata, context); // Container & Writer for Metadata Properties meta_props = new Properties(); StringWriter sw = new StringWriter(); // Put Tika Metadata in Properties for(String name : metadata.names()) { if (!metadata.get(name).isEmpty()) { meta_props.put(name, metadata.get(name)); } } meta_props.store(sw, "Tika Values"); // Expected Harvester Properties String meta_descr = metadata.get(TikaCoreProperties.DESCRIPTION); String meta_modif = metadata.get(TikaCoreProperties.MODIFIED); String meta_title = metadata.get(TikaCoreProperties.TITLE); // Default Label for Undefined Tika Properties DateFormat date_format = new SimpleDateFormat("yyyy/MM/dd"); Date date = new Date(); String date_today = date_format.format(date); String tika_label = String.format("TIKA_%s", date_today); // Check For Null Values & Set Defaults if (meta_descr == null) { meta_props.put(WKAConstants.WKA_DESCRIPTION, "" + sw.toString()); } else { meta_props.put(WKAConstants.WKA_DESCRIPTION, meta_descr); } if (meta_modif == null) { meta_props.put(WKAConstants.WKA_MODIFIED, tika_label); } else { meta_props.put(WKAConstants.WKA_MODIFIED, meta_modif); } if (meta_title == null) { meta_props.put(WKAConstants.WKA_TITLE, file_name); } else { meta_props.put(WKAConstants.WKA_TITLE, meta_title); } // Build XML as Bytes MapAttribute attr = AttributeUtils.fromProperties(meta_props); Document document = new SimpleDcMetaBuilder().create(attr); xml_bytes = XmlUtils.toString(document).getBytes("UTF-8"); } catch 
(Exception ex) { LOG.error(String.format("Error reading data."), ex); } finally { base_input.close(); } return xml_bytes; }