org.apache.tika.parser.ocr.TesseractOCRConfig Java Examples

The following examples show how to use org.apache.tika.parser.ocr.TesseractOCRConfig. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TikaDocumentItemProcessor.java    From CogStack-Pipeline with Apache License 2.0 6 votes vote down vote up
/**
 * Initialises the processor after construction: applies the configured field name,
 * loads the Tika configuration from {@code tika-config.xml} on the classpath, builds
 * the Tesseract and ImageMagick configurations and creates the auto-detecting parser.
 *
 * @throws IOException   if the Tika configuration cannot be read
 * @throws SAXException  if the Tika configuration cannot be parsed
 * @throws TikaException if Tika rejects the configuration
 */
@PostConstruct
public void init() throws IOException, SAXException, TikaException {
    setFieldName(tikaFieldName);

    // Load the Tika configuration. The classpath stream is owned by this method, so it is
    // closed here once TikaConfig has consumed it (a null resource is tolerated by
    // try-with-resources and handed to TikaConfig exactly as before).
    try (java.io.InputStream tikaConfigStream =
                 this.getClass().getClassLoader().getResourceAsStream("tika-config.xml")) {
        tikaConfig = new TikaConfig(tikaConfigStream);
    }

    // Tesseract OCR configuration: only override the timeout when a positive value is set.
    tesseractConfig = new TesseractOCRConfig();
    if (tesseractTimeout > 0) {
        tesseractConfig.setTimeout(tesseractTimeout);
    }

    // ImageMagick configuration -- used for tiff conversion; same positive-only override.
    imgConfig = new ImageMagickConfig();
    if (convertTimeout > 0) {
        imgConfig.setTimeout(convertTimeout);
    }

    parser = new AutoDetectParser(tikaConfig);
}
 
Example #2
Source File: NodeTika.java    From node-tika with MIT License 6 votes vote down vote up
/**
 * Populates the parse context with OCR, PDF and password settings taken from the
 * caller-supplied options. When no options are given, OCR is disabled and nothing
 * else is registered.
 *
 * @param parseContext the context to populate
 * @param options      user options, or {@code null} when none were specified
 */
private static void fillParseContext(ParseContext parseContext, Map<String, Object> options) {
	final TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
	final boolean noOptions = (options == null);

	// Without options OCR is switched off; otherwise it is configured from them.
	if (noOptions) {
		disableOcr(ocrConfig);
	} else {
		fillOcrOptions(ocrConfig, options);
	}

	// The OCR config is registered on the context in both cases.
	parseContext.set(TesseractOCRConfig.class, ocrConfig);

	if (noOptions) {
		return;
	}

	final PDFParserConfig pdfParserConfig = new PDFParserConfig();
	fillPdfOptions(pdfParserConfig, options);
	parseContext.set(PDFParserConfig.class, pdfParserConfig);

	// Allow a password to be specified for encrypted files.
	fillPassword(parseContext, options);
}
 
Example #3
Source File: AlterPDFParser.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Runs Tika's {@code org.apache.tika.parser.pdf.OCR2XHTML.process} on the document via
 * reflection, forcing an OCR-only strategy with inline image extraction for the duration
 * of the call. The caller's {@link PDFParserConfig} settings are restored afterwards,
 * even when the OCR call fails.
 *
 * @param document the PDF document to process
 * @param handler  the handler receiving the generated content
 * @param context  the parse context; a fresh {@link TesseractOCRConfig} is registered on it
 * @param metadata the document metadata
 * @param config   the PDF parser configuration, temporarily modified and then restored
 * @throws ClassNotFoundException    if the OCR2XHTML class is not available
 * @throws NoSuchMethodException     if the expected {@code process} method is missing
 * @throws InvocationTargetException if the OCR process itself throws
 * @throws IllegalAccessException    if the method cannot be accessed
 */
private void callOCR2XHTMLProcess(PDDocument document, ContentHandler handler,
                                  ParseContext context, Metadata metadata,
                                  PDFParserConfig config) throws
        ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException {

    TesseractOCRConfig cfg = new TesseractOCRConfig();
    // Default timeout of 2 hours (per the original author's note).
    // The calling process should check the parsing process and terminate it by timeout.
    cfg.setTimeout(60 * 60 * 2);
    context.set(TesseractOCRConfig.class, cfg);

    // Resolve the reflective target before touching the shared config, so a lookup
    // failure leaves the PDF parser configuration untouched.
    Class<?> ocr2Xhtml = Class.forName("org.apache.tika.parser.pdf.OCR2XHTML");
    Method process = ocr2Xhtml.getDeclaredMethod("process",
            PDDocument.class, ContentHandler.class, ParseContext.class, Metadata.class,
            PDFParserConfig.class);
    process.setAccessible(true);

    PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy();
    boolean oldExtractInlineImages = config.getExtractInlineImages();
    boolean oldExtractUniqueInlineImagesOnly = config.getExtractUniqueInlineImagesOnly();

    try {
        // explicitly tells Tika to use OCR
        config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
        config.setExtractInlineImages(true);
        config.setExtractUniqueInlineImagesOnly(false);

        process.invoke(null, document, handler, context, metadata, config);
    } finally {
        // Always hand the configuration back in its original state, even when OCR throws.
        config.setOcrStrategy(oldOcrStrategy);
        config.setExtractInlineImages(oldExtractInlineImages);
        config.setExtractUniqueInlineImagesOnly(oldExtractUniqueInlineImagesOnly);
    }
}
 
Example #4
Source File: TesseractOCRParser.java    From CogStack-Pipeline with Apache License 2.0 5 votes vote down vote up
/**
 * Reports the media types this parser handles for the given context.
 *
 * @param context the parse context, consulted for a {@link TesseractOCRConfig}
 *                (falling back to {@code DEFAULT_CONFIG})
 * @return {@code SUPPORTED_TYPES} when {@code hasTesseract} accepts the configuration,
 *         otherwise an empty set
 */
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

    if (!hasTesseract(ocrConfig)) {
        // Don't advertise anything, so the other image parsers can be selected instead.
        return Collections.emptySet();
    }

    // Tesseract is installed: offer our supported image types.
    return SUPPORTED_TYPES;
}
 
Example #5
Source File: TesseractOCRParser.java    From CogStack-Pipeline with Apache License 2.0 5 votes vote down vote up
/**
 * Sets the {@code TESSDATA_PREFIX} environment variable on the process builder.
 * The configured tessdata path is preferred; when it is empty the Tesseract path is
 * used instead; when both are empty the environment is left unchanged.
 *
 * @param config the OCR configuration supplying the paths
 * @param pb     the process builder whose environment is updated
 */
private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
    Map<String, String> environment = pb.environment();

    String prefixValue = config.getTessdataPath();
    if (prefixValue.isEmpty()) {
        // No explicit tessdata location: fall back to the Tesseract path.
        prefixValue = config.getTesseractPath();
    }

    if (!prefixValue.isEmpty()) {
        environment.put("TESSDATA_PREFIX", prefixValue);
    }
}
 
Example #6
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Parses the stream, going through {@code cachedParse} when an output path is set
 * and delegating to the superclass otherwise.
 *
 * @param in       the input to parse
 * @param handler  the handler receiving the content
 * @param metadata the document metadata
 * @param context  the parse context, consulted for a {@link TesseractOCRConfig}
 */
@Override
public void parse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                  final ParseContext context)
		throws IOException, SAXException, TikaException {
	if (null == outputPath) {
		// No output path configured: behave exactly like the parent parser.
		super.parse(in, handler, metadata, context);
		return;
	}

	final TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
	cachedParse(in, handler, metadata, context, ocrConfig, false);
}
 
Example #7
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Inline variant of the parse entry point. With an output path set, the work goes
 * through {@code cachedParse} using fresh metadata; otherwise the superclass handles it.
 *
 * @param in      the input to parse
 * @param xhtml   the XHTML handler receiving the content
 * @param context the parse context
 * @param config  the OCR configuration, or {@code null} to take it from the context
 *                (falling back to {@code DEFAULT_CONFIG})
 */
@Override
public void parseInline(final InputStream in, final XHTMLContentHandler xhtml, final ParseContext context,
                        final TesseractOCRConfig config)
		throws IOException, SAXException, TikaException {
	if (null == outputPath) {
		super.parseInline(in, xhtml, context, config);
		return;
	}

	final Metadata freshMetadata = new Metadata();
	final TesseractOCRConfig effectiveConfig;
	if (null != config) {
		effectiveConfig = config;
	} else {
		effectiveConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
	}

	cachedParse(in, xhtml, freshMetadata, context, effectiveConfig, true);
}
 
Example #8
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Runs the superclass parser while duplicating everything sent to {@code handler}
 * into {@code writer}.
 *
 * @param tis      the input to parse
 * @param handler  the caller's content handler
 * @param metadata the document metadata
 * @param context  the parse context
 * @param config   the OCR configuration (used by the inline path only)
 * @param inline   {@code true} to use {@code parseInline}, {@code false} for {@code parse}
 * @param writer   the writer receiving a copy of the output
 */
private void parseToCache(final TikaInputStream tis, final ContentHandler handler, final Metadata metadata,
                          final ParseContext context, final TesseractOCRConfig config, final boolean inline,
                          final Writer writer) throws SAXException, IOException, TikaException {
	// Every SAX event goes both to the caller's handler and to the writer-backed handler.
	final ContentHandler writerSink = new WriteOutContentHandler(writer);
	final ContentHandler tee = new TeeContentHandler(handler, writerSink);

	if (!inline) {
		super.parse(tis, tee, metadata, context);
		return;
	}

	final XHTMLContentHandler inlineHandler = new XHTMLContentHandler(tee, metadata);
	super.parseInline(tis, inlineHandler, context, config);
}
 
Example #9
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Tries to acquire the cache lock by atomically creating the lock file, retrying once
 * per second while another holder's lock file exists.
 *
 * <p>Up to {@code config.getTimeout() + 1} attempts are made, with a one-second pause
 * between attempts (so the timeout is treated as a number of seconds to wait). A
 * negative timeout results in no attempt at all, as before.
 *
 * @param config    the OCR configuration supplying the timeout
 * @param cacheLock the path of the lock file
 * @return {@code true} if the lock file was created, {@code false} if every attempt
 *         found it already present
 * @throws IOException          if the lock file cannot be created for any other reason
 * @throws InterruptedException if the thread is interrupted while waiting
 */
private boolean acquireLock(final TesseractOCRConfig config, final Path cacheLock)
		throws IOException, InterruptedException {
	final int timeout = config.getTimeout();

	// A long counter avoids the int overflow of "timeout + 1" for very large timeouts.
	for (long attempt = 0; attempt <= timeout; attempt++) {
		try {
			Files.createFile(cacheLock);
			return true;
		} catch (final FileAlreadyExistsException ignored) {
			// Someone else holds the lock. Wait before retrying, but don't waste a
			// second sleeping after the final attempt.
			if (attempt < timeout) {
				TimeUnit.SECONDS.sleep(1);
			}
		}
	}

	return false;
}
 
Example #10
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Wraps the raw input in a {@link TikaInputStream} and delegates to the
 * {@code TikaInputStream}-based {@code cachedParse} overload. The wrapper stream is
 * closed when parsing finishes.
 *
 * @param in       the input to parse
 * @param handler  the handler receiving the content
 * @param metadata the document metadata
 * @param context  the parse context
 * @param config   the OCR configuration
 * @param inline   whether the inline parsing path should be used
 * @throws TikaException if parsing fails or the thread is interrupted while parsing
 */
private void cachedParse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                         final ParseContext context, TesseractOCRConfig config, final boolean inline)
		throws IOException, SAXException, TikaException {
	try (final TikaInputStream tis = TikaInputStream.get(in)) {
		cachedParse(tis, handler, metadata, context, config, inline);
	} catch (final InterruptedException e) {
		// Restore the interrupt flag so callers further up can still observe the
		// interruption after it has been translated into a TikaException.
		Thread.currentThread().interrupt();
		throw new TikaException("Interrupted.", e);
	}
}
 
Example #11
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Parses the input using a cache keyed by the SHA-256 hash of its content.
 *
 * <p>On a cache hit the cached output is replayed to the handler. On a miss the input
 * is parsed while the output is written to a temporary file, which is moved to the
 * cache path only after parsing completes. If the cache lock cannot be acquired the
 * input is parsed through {@code fallbackParse} without caching.
 *
 * @param tis      the input to parse; its backing file is read to compute the hash
 * @param handler  the handler receiving the content
 * @param metadata the document metadata
 * @param context  the parse context
 * @param config   the OCR configuration
 * @param inline   whether the inline parsing path should be used
 * @throws InterruptedException if the thread is interrupted while waiting for the lock
 */
private void cachedParse(final TikaInputStream tis, final ContentHandler handler, final Metadata metadata,
                        final ParseContext context, final TesseractOCRConfig config, final boolean inline)
		throws IOException, SAXException, TikaException, InterruptedException {
	final String hash;

	try (final InputStream buffered = Files.newInputStream(tis.getPath())) {
		hash = DigestUtils.sha256Hex(buffered);
	}

	final Path cachePath = outputPath.resolve(hash);
	final Path cacheLock = outputPath.resolve(hash + ".lock");

	// Acquire a lock both for reading and for writing.
	// If the lock can't be acquired, parse without caching.
	if (!acquireLock(config, cacheLock)) {
		fallbackParse(tis, handler, metadata, context, config, inline);
		return;
	}

	// You won't know for sure until you try....
	try (final Reader reader = Files.newBufferedReader(cachePath, UTF_8)) {
		cacheHit();
		readFromCache(reader, handler, metadata);
	} catch (final NoSuchFileException e) {
		final Path cacheTemp = outputPath.resolve(hash + ".tmp");
		boolean cached = false;

		try {
			// Write to a temporary file and only move to the final path if parsing completes
			// successfully. This way we ensure that we don't cache partial results from
			// Tesseract if there's an error.
			// TRUNCATE_EXISTING matters: with CREATE alone, a longer temp file left behind
			// by an earlier failed run would keep its trailing bytes and corrupt the cache.
			try (final Writer writer = Files.newBufferedWriter(cacheTemp, UTF_8,
					StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
				cacheMiss();
				parseToCache(tis, handler, metadata, context, config, inline, writer);
			}

			Files.move(cacheTemp, cachePath, StandardCopyOption.ATOMIC_MOVE);
			cached = true;
		} finally {
			if (!cached) {
				try {
					Files.deleteIfExists(cacheTemp);
				} catch (final IOException ignored) {
					// Best-effort cleanup of the partial result; the exception that
					// aborted parsing is the one the caller needs to see.
				}
			}
		}
	} finally {
		Files.deleteIfExists(cacheLock);
	}
}
 
Example #12
Source File: NodeTika.java    From node-tika with MIT License 5 votes vote down vote up
/**
 * Copies OCR-related entries from the options map onto the OCR configuration.
 * When {@code ocrLanguage} is absent, OCR is disabled and no other option is read.
 * Recognised keys: {@code ocrLanguage}, {@code ocrPath}, {@code ocrMaxFileSize},
 * {@code ocrMinFileSize}, {@code ocrPageSegmentationMode} and {@code ocrTimeout}.
 *
 * @param ocrConfig the configuration to update
 * @param options   the user-supplied options; must not be {@code null}
 * @throws NumberFormatException if a numeric option cannot be parsed as an integer
 */
private static void fillOcrOptions(TesseractOCRConfig ocrConfig, Map<String, Object> options) {
	// OCR is disabled by default as it can give unexpected results;
	// it is only configured when a language is specified.
	final Object language = options.get("ocrLanguage");
	if (language == null) {
		disableOcr(ocrConfig);
		return;
	}
	ocrConfig.setLanguage(language.toString());

	final Object tesseractPath = options.get("ocrPath");
	if (tesseractPath != null) {
		ocrConfig.setTesseractPath(tesseractPath.toString());
	}

	final Object maxFileSize = options.get("ocrMaxFileSize");
	if (maxFileSize != null) {
		final int maxBytes = Integer.parseInt(maxFileSize.toString());
		ocrConfig.setMaxFileSizeToOcr(maxBytes);
	}

	final Object minFileSize = options.get("ocrMinFileSize");
	if (minFileSize != null) {
		final int minBytes = Integer.parseInt(minFileSize.toString());
		ocrConfig.setMinFileSizeToOcr(minBytes);
	}

	final Object pageSegMode = options.get("ocrPageSegmentationMode");
	if (pageSegMode != null) {
		ocrConfig.setPageSegMode(pageSegMode.toString());
	}

	final Object timeout = options.get("ocrTimeout");
	if (timeout != null) {
		ocrConfig.setTimeout(Integer.parseInt(timeout.toString()));
	}
}
 
Example #13
Source File: NodeTika.java    From node-tika with MIT License 5 votes vote down vote up
/**
 * Disables OCR by pointing the Tesseract path at an OS-specific null-device style
 * location (presumably so that no Tesseract binary can be found there).
 *
 * @param ocrConfig the configuration whose Tesseract path is overwritten
 */
private static void disableOcr(TesseractOCRConfig ocrConfig) {
	// This is necessary until Tika introduces a way to blacklist parsers.
	// See https://issues.apache.org/jira/browse/TIKA-1557
	final boolean onWindows = System.getProperty("os.name").startsWith("Windows");
	final String unusablePath = onWindows ? "\\Device\\Null\\" : "/dev/null/";

	ocrConfig.setTesseractPath(unusablePath);
}
 
Example #14
Source File: TesseractOCRParser.java    From CogStack-Pipeline with Apache License 2.0 4 votes vote down vote up
/**
 * Runs Tesseract OCR over the input (when its size is within the configured bounds),
 * emits the recognised text as XHTML, and then forwards the image to the regular
 * image metadata parser.
 *
 * @param stream   the image to parse
 * @param handler  the handler receiving the content
 * @param metadata the document metadata
 * @param context  the parse context, consulted for a {@link TesseractOCRConfig}
 *                 (falling back to {@code DEFAULT_CONFIG})
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

    // If Tesseract is not on the path with the current config, do not try to run OCR
    // getSupportedTypes shouldn't have listed us as handling it, so this should only
    //  occur if someone directly calls this parser, not via DefaultParser or similar
    if (!hasTesseract(config)) {
        return;
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

    TemporaryResources tmp = new TemporaryResources();
    File output = null;
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
        File input = tikaStream.getFile();
        long size = tikaStream.getLength();

        if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {

            File outputBase = tmp.createTemporaryFile();

            // Tesseract appends .txt to output file name. Record that path before
            // running OCR so the file is cleaned up even when doOCR fails part-way.
            output = new File(outputBase.getAbsolutePath() + ".txt");
            doOCR(input, outputBase, config);

            if (output.exists()) {
                // Close the OCR result stream here rather than relying on the callee.
                try (InputStream ocrText = new FileInputStream(output)) {
                    extractOutput(ocrText, xhtml);
                }
            }

        }

        // Temporary workaround for TIKA-1445 - until we can specify
        //  composite parsers with strategies (eg Composite, Try In Turn),
        //  always send the image onwards to the regular parser to have
        //  the metadata for them extracted as well
        _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
    } finally {
        try {
            tmp.dispose();
        } finally {
            // Delete the Tesseract output even if disposing the temporary resources fails.
            if (output != null) {
                output.delete();
            }
        }
    }
}
 
Example #15
Source File: Extractor.java    From extract with MIT License 4 votes vote down vote up
/**
 * Create a pull-parsing reader for the file at the given path.
 *
 * <p>The returned document's reader is backed by a stream opened on {@code path}. If
 * setting up the parser or the reader fails, that stream is closed before the
 * exception is propagated.
 *
 * @param path the path of the file to extract from
 * @return the root document, with its reader set to a pull-parsing reader
 * @throws IOException if the stream or the reader cannot be created
 */
public TikaDocument extract(final Path path) throws IOException {
	final TikaDocument rootDocument = documentFactory.create(path);
	final TikaInputStream tikaInputStream = TikaInputStream.get(path, rootDocument.getMetadata());

	try {
		final ParseContext context = new ParseContext();
		final AutoDetectParser autoDetectParser = new AutoDetectParser(defaultParser);

		// Set a fallback parser that outputs an empty tikaDocument for empty files,
		// otherwise throws an exception.
		autoDetectParser.setFallback(FallbackParser.INSTANCE);
		final Parser parser;

		if (null != digester) {
			parser = new DigestingParser(autoDetectParser, digester);
		} else {
			parser = autoDetectParser;
		}

		if (!ocrDisabled) {
			context.set(TesseractOCRConfig.class, ocrConfig);
		}

		context.set(PDFParserConfig.class, pdfConfig);

		// Only include "safe" tags in the HTML output from Tika's HTML parser.
		// This excludes script tags and objects.
		context.set(HtmlMapper.class, DefaultHtmlMapper.INSTANCE);

		final Reader reader;
		final Function<Writer, ContentHandler> handler;

		if (OutputFormat.HTML == outputFormat) {
			handler = (writer) -> new ExpandedTitleContentHandler(new HTML5Serializer(writer));
		} else {

			// The default BodyContentHandler is used when constructing the ParsingReader for text output, but
			// because only the body of embeds is pushed to the content handler further down the line, we can't
			// expect a body tag.
			handler = WriteOutContentHandler::new;
		}

		if (EmbedHandling.SPAWN == embedHandling) {
			context.set(Parser.class, parser);
			context.set(EmbeddedDocumentExtractor.class, new EmbedSpawner(rootDocument, context, embedOutput, handler));
		} else if (EmbedHandling.CONCATENATE == embedHandling) {
			context.set(Parser.class, parser);
			context.set(EmbeddedDocumentExtractor.class, new EmbedParser(rootDocument, context));
		} else {
			context.set(Parser.class, EmptyParser.INSTANCE);
			context.set(EmbeddedDocumentExtractor.class, new EmbedBlocker());
		}

		// the constructor of ParsingReader actually parses the document in background
		if (OutputFormat.HTML == outputFormat) {
			reader = new ParsingReader(parser, tikaInputStream, rootDocument.getMetadata(), context, handler);
		} else {
			reader = new org.apache.tika.parser.ParsingReader(parser, tikaInputStream, rootDocument.getMetadata(), context);
		}
		rootDocument.setReader(reader);
	} catch (final IOException | RuntimeException e) {
		// Nothing will ever read (and therefore close) the stream if setup failed,
		// so release it here and keep the original failure as the primary exception.
		try {
			tikaInputStream.close();
		} catch (final IOException closeFailure) {
			e.addSuppressed(closeFailure);
		}
		throw e;
	}

	return rootDocument;
}