org.apache.tika.sax.WriteOutContentHandler Java Examples

The following examples show how to use org.apache.tika.sax.WriteOutContentHandler. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ContentExtractor.java — from the jate project (GNU Lesser General Public License v3.0)
/**
 * Parses a plain-text stream with the configured text parser and returns the
 * extracted content, truncated to {@code maxStringLength} characters.
 * The input stream is always closed before returning.
 */
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
	// Collects character output and signals (via a SAXException) once the limit is hit.
	WriteOutContentHandler writeOutHandler = new WriteOutContentHandler(maxStringLength);
	try {
		ParseContext parseContext = new ParseContext();
		parseContext.set(Parser.class, txtParser);
		txtParser.parse(stream, new BodyContentHandler(writeOutHandler), metadata, parseContext);
	} catch (SAXException e) {
		// The only SAXException expected from BodyContentHandler is the
		// write-limit signal; anything else is a genuine processing failure.
		if (!writeOutHandler.isWriteLimitReached(e)) {
			throw new TikaException("Unexpected SAX processing failure", e);
		}
	} finally {
		stream.close();
	}
	return writeOutHandler.toString();
}
 
Example #2
Source File: CachingTesseractOCRParser.java — from the extract project (MIT License)
/**
 * Runs OCR on the given stream while simultaneously duplicating all SAX events
 * to the supplied cache {@code writer}, so the result can be replayed later
 * without re-running OCR.
 */
private void parseToCache(final TikaInputStream tis, final ContentHandler handler, final Metadata metadata,
                          final ParseContext context, final TesseractOCRConfig config, final boolean inline,
                          final Writer writer) throws SAXException, IOException, TikaException {
	// Tee every event to both the caller's handler and the cache writer.
	final ContentHandler cachingHandler = new TeeContentHandler(handler, new WriteOutContentHandler(writer));

	if (!inline) {
		super.parse(tis, cachingHandler, metadata, context);
	} else {
		super.parseInline(tis, new XHTMLContentHandler(cachingHandler, metadata), context, config);
	}
}
 
Example #3
Source File: CachingTesseractOCRParserTest.java — from the extract project (MIT License)
@Test
public void testWriteToCache() throws Throwable {
	final Path simple = Paths.get(this.simple.toURI());

	final AtomicInteger hit = new AtomicInteger();
	final AtomicInteger miss = new AtomicInteger();

	// Subclass the parser only to count cache hits and misses.
	final Parser parser = new CachingTesseractOCRParser(tmpDir) {

		private static final long serialVersionUID = 6551690243986921730L;

		@Override
		public void cacheHit() {
			hit.incrementAndGet();
		}

		@Override
		public void cacheMiss() {
			miss.incrementAndGet();
		}
	};

	// First pass: nothing is cached yet, so this must register exactly one miss.
	Writer output = new StringWriter();
	try (final InputStream in = Files.newInputStream(simple)) {
		parser.parse(in, new WriteOutContentHandler(output), new Metadata(), new ParseContext());
	}

	Assert.assertEquals("HEAVY\nMETAL", output.toString().trim());
	Assert.assertEquals(0, hit.get());
	Assert.assertEquals(1, miss.get());

	// Second pass: the same result should now come from the cache (one hit, no new miss).
	output = new StringWriter();
	try (final InputStream in = Files.newInputStream(simple)) {
		parser.parse(in, new WriteOutContentHandler(output), new Metadata(), new ParseContext());
	}

	Assert.assertEquals("HEAVY\nMETAL", output.toString().trim());
	Assert.assertEquals(1, hit.get());
	Assert.assertEquals(1, miss.get());
}
 
Example #4
Source File: Extractor.java — from the extract project (MIT License)
/**
 * Creates a {@link TikaDocument} for the file at the given path, with a reader
 * that pull-parses the file's content on demand.
 *
 * @param path the path of the file to extract from (not a stream — the
 *             {@link TikaInputStream} is created internally from this path)
 * @return a TikaDocument whose reader produces the extracted content
 */
public TikaDocument extract(final Path path) throws IOException {
	final TikaDocument rootDocument = documentFactory.create(path);
	TikaInputStream tikaInputStream = TikaInputStream.get(path, rootDocument.getMetadata());
	final ParseContext context = new ParseContext();
	final AutoDetectParser autoDetectParser = new AutoDetectParser(defaultParser);

	// Set a fallback parser that outputs an empty tikaDocument for empty files,
	// otherwise throws an exception.
	autoDetectParser.setFallback(FallbackParser.INSTANCE);
	final Parser parser;

	// Optionally wrap the parser so content digests are computed during parsing.
	if (null != digester) {
		parser = new DigestingParser(autoDetectParser, digester);
	} else {
		parser = autoDetectParser;
	}

	if (!ocrDisabled) {
		context.set(TesseractOCRConfig.class, ocrConfig);
	}

	context.set(PDFParserConfig.class, pdfConfig);

	// Only include "safe" tags in the HTML output from Tika's HTML parser.
	// This excludes script tags and objects.
	context.set(HtmlMapper.class, DefaultHtmlMapper.INSTANCE);

	final Reader reader;
	final Function<Writer, ContentHandler> handler;

	// Choose the content-handler factory by output format; the handler is
	// constructed later (per writer) by the ParsingReader.
	if (OutputFormat.HTML == outputFormat) {
		handler = (writer) -> new ExpandedTitleContentHandler(new HTML5Serializer(writer));
	} else {

		// The default BodyContentHandler is used when constructing the ParsingReader for text output, but
		// because only the body of embeds is pushed to the content handler further down the line, we can't
		// expect a body tag.
		handler = WriteOutContentHandler::new;
	}

	// Embedded-document policy: SPAWN emits each embed separately, CONCATENATE
	// appends embeds to the root document, otherwise embeds are blocked entirely.
	if (EmbedHandling.SPAWN == embedHandling) {
		context.set(Parser.class, parser);
		context.set(EmbeddedDocumentExtractor.class, new EmbedSpawner(rootDocument, context, embedOutput, handler));
	} else if (EmbedHandling.CONCATENATE == embedHandling) {
		context.set(Parser.class, parser);
		context.set(EmbeddedDocumentExtractor.class, new EmbedParser(rootDocument, context));
	} else {
		context.set(Parser.class, EmptyParser.INSTANCE);
		context.set(EmbeddedDocumentExtractor.class, new EmbedBlocker());
	}

	// the constructor of ParsingReader actually parses the document in background
	// (the reader takes ownership of tikaInputStream and closes it when done).
	if (OutputFormat.HTML == outputFormat) {
		reader = new ParsingReader(parser, tikaInputStream, rootDocument.getMetadata(), context, handler);
	} else {
		reader = new org.apache.tika.parser.ParsingReader(parser, tikaInputStream, rootDocument.getMetadata(), context);
	}
	rootDocument.setReader(reader);

	return rootDocument;
}
 
Example #5
Source File: NodeTika.java — from the node-tika project (MIT License)
/**
 * Extracts the text content of the resource at {@code uri}.
 *
 * <p>Recognized {@code options} keys: {@code outputEncoding} (defaults to
 * UTF-8), {@code contentType} (content-type hint for detection), and
 * {@code maxLength} (character limit; -1 means unlimited).
 *
 * @param uri     the resource to extract from
 * @param options optional extraction settings; may be {@code null}
 * @return the extracted text, encoded with the requested output encoding
 * @throws Exception on any parse failure other than reaching the write limit
 */
public static String extractText(String uri, Map<String, Object> options) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();
	final ParseContext context = new ParseContext();

	String outputEncoding = null;
	String contentType = null;
	int maxLength = -1;

	if (options != null) {
		Object option;

		option = options.get("outputEncoding");
		if (option != null) {
			outputEncoding = option.toString();
		}

		option = options.get("contentType");
		if (option != null) {
			contentType = option.toString();
		}

		option = options.get("maxLength");
		if (option != null) {
			// Parsed as a float first so fractional strings (e.g. "100.5") are
			// accepted and truncated, matching the original lenient behavior.
			maxLength = (int) Float.parseFloat(option.toString());
		}
	}

	if (outputEncoding == null) {
		outputEncoding = "UTF-8";
	}

	fillMetadata(parser, metadata, contentType, uri);
	fillParseContext(context, options);

	final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
	final OutputStreamWriter writer = new OutputStreamWriter(outputStream, outputEncoding);
	final WriteOutContentHandler contentHandler = new WriteOutContentHandler(writer, maxLength);

	final TikaInputStream inputStream = createInputStream(uri, metadata);

	// Set up recursive parsing of archives.
	// See: http://wiki.apache.org/tika/RecursiveMetadata
	context.set(Parser.class, parser);
	context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context));

	try {
		parser.parse(inputStream, new BodyContentHandler(contentHandler), metadata, context);
	} catch (Throwable e) {
		// A write-limit exception just means maxLength was reached and the
		// partial output is still valid; anything else is a real failure.
		if (!contentHandler.isWriteLimitReached(e)) {
			throw e;
		}
	} finally {
		inputStream.close();
	}

	// BUG FIX: always close (and thereby flush) the writer before draining the
	// byte stream. OutputStreamWriter buffers its encoder output, so the
	// original code — which only closed the writer on the write-limit path —
	// could return output missing its buffered tail on normal completion.
	writer.close();

	return outputStream.toString(outputEncoding);
}