org.apache.tika.parser.ParseContext#set

Source File: NodeTika.java From node-tika with MIT License

6 votes

/**
 * Populates the parse context with the OCR, PDF and password settings derived from the options.
 *
 * @param parseContext the context that receives the configuration objects
 * @param options the caller-supplied options; when {@code null}, OCR is disabled and
 *            no PDF or password configuration is registered
 */
private static void fillParseContext(ParseContext parseContext, Map<String, Object> options) {
	final TesseractOCRConfig tesseractConfig = new TesseractOCRConfig();
	final boolean optionsGiven = options != null;

	// Without options OCR is disabled; otherwise it is configured from the options.
	if (optionsGiven) {
		fillOcrOptions(tesseractConfig, options);
	} else {
		disableOcr(tesseractConfig);
	}
	parseContext.set(TesseractOCRConfig.class, tesseractConfig);

	// Nothing else can be configured when no options were passed.
	if (!optionsGiven) {
		return;
	}

	final PDFParserConfig pdfConfig = new PDFParserConfig();
	fillPdfOptions(pdfConfig, options);
	parseContext.set(PDFParserConfig.class, pdfConfig);

	// Allow a password to be specified for encrypted files.
	fillPassword(parseContext, options);
}

Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0

6 votes

/**
 * Parses the stream with {@code txtParser} and returns the text collected by the handler.
 * The stream is closed before this method returns, whether or not parsing succeeded.
 *
 * @param stream the content to parse; closed by this method
 * @param metadata the metadata object handed to the parser
 * @return the text gathered by the write-out handler
 * @throws IOException if the parser or the stream close fails with an I/O error
 * @throws TikaException if parsing fails, including a SAX failure that is not the write limit
 */
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
	WriteOutContentHandler textSink = new WriteOutContentHandler(maxStringLength);
	try {
		ParseContext parseContext = new ParseContext();
		parseContext.set(Parser.class, txtParser);
		BodyContentHandler bodyHandler = new BodyContentHandler(textSink);
		txtParser.parse(stream, bodyHandler, metadata, parseContext);
	} catch (SAXException e) {
		boolean limitReached = textSink.isWriteLimitReached(e);
		if (!limitReached) {
			// This should never happen with BodyContentHandler...
			throw new TikaException("Unexpected SAX processing failure", e);
		}
		// Write limit reached: keep whatever text was collected so far.
	} finally {
		stream.close();
	}
	return textSink.toString();
}

Source File: TikaCallable.java From flink-crawler with Apache License 2.0

6 votes

/**
 * Builds the parse context, registering our own HtmlMapper when the link extractor uses at
 * least one tag that the default mapper does not map.
 * 
 * @return a new parse context, carrying a {@link CustomHtmlMapper} only if one is needed
 */
private ParseContext makeParseContext() {
    ParseContext context = new ParseContext();

    Set<String> linkTags = _linkExtractor.getLinkTags();
    HtmlMapper standardMapper = DefaultHtmlMapper.INSTANCE;

    // One unmapped tag is enough to require the custom mapper.
    boolean needsCustomMapper = false;
    for (String linkTag : linkTags) {
        if (standardMapper.mapSafeElement(linkTag) == null) {
            needsCustomMapper = true;
            break;
        }
    }

    if (needsCustomMapper) {
        context.set(HtmlMapper.class,
                new CustomHtmlMapper(linkTags, _linkExtractor.getLinkAttributeTypes()));
    }

    return context;
}

Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0

6 votes

/**
 * With an ImageMagick path that does not exist, the preprocessor parser must offer no
 * supported types, and DefaultParser must resolve PDF to the plain PDFParser.
 */
@Test
public void offersNoTypesIfNotFound() throws Exception {
    PDFPreprocessorParser preprocessor = new PDFPreprocessorParser();
    DefaultParser fallback = new DefaultParser();

    // With an invalid path, will offer no types
    ImageMagickConfig brokenConfig = new ImageMagickConfig();
    brokenConfig.setImageMagickPath("/made/up/path");

    ParseContext context = new ParseContext();
    context.set(ImageMagickConfig.class, brokenConfig);

    // No types offered
    int offeredTypes = preprocessor.getSupportedTypes(context).size();
    assertEquals(0, offeredTypes);

    // And DefaultParser won't use us
    Object pdfHandler = fallback.getParsers(context).get(MediaType.application("pdf"));
    assertEquals(PDFParser.class, pdfHandler.getClass());
}

Source File: AlterPDFParserTest.java From tika-server with Apache License 2.0

5 votes

/**
 * Extracts the text of the double-spaced test PDF with AlterPDFParser and checks that
 * more than 100 characters come back.
 */
@Test
public void testDoubleSpacedText() throws Exception {
    PDFParser pdfParser = new AlterPDFParser();
    ParseContext context = new ParseContext();
    PDFParserConfig config = new PDFParserConfig();
    context.set(PDFParserConfig.class, config);

    String text;
    // try-with-resources closes the stream even when text extraction throws.
    try (InputStream stream = AlterPDFParserTest.class.getResourceAsStream("/test-documents/double_space_test.pdf")) {
        text = getText(stream, pdfParser, context);
    }

    assertTrue(text.length() > 100);
}

Source File: NodeTika.java From node-tika with MIT License

5 votes

/**
 * Registers a {@link PasswordProvider} in the parse context when the options carry a
 * {@code "password"} entry; does nothing otherwise.
 *
 * @param parseContext the context that receives the provider
 * @param options the caller-supplied options; must not be {@code null}
 */
private static void fillPassword(ParseContext parseContext, Map<String, Object> options) {
	final Object passwordOption = options.get("password");

	if (passwordOption != null) {
		// The option is converted to a string each time a password is requested.
		final PasswordProvider provider = new PasswordProvider() {

			@Override
			public String getPassword(Metadata metadata) {
				return passwordOption.toString();
			}
		};

		parseContext.set(PasswordProvider.class, provider);
	}
}

Source File: EmbeddedDocumentMemoryExtractor.java From extract with MIT License

5 votes

/**
 * Parses the root document with a digest-matching embedded-document extractor installed and
 * returns whatever document that extractor captured.
 *
 * @param rootDocument the document whose file is parsed
 * @param embeddedDocumentDigest the digest handed to the embedded-document extractor
 * @return the document held by the extractor after parsing
 * @throws SAXException if the content handler fails
 * @throws TikaException if the parser fails
 * @throws IOException if the root document's file cannot be opened, read or closed
 */
public TikaDocumentSource extract(final TikaDocument rootDocument, final String embeddedDocumentDigest) throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    ContentHandler handler = new BodyContentHandler(-1);
    context.set(Parser.class, parser);

    DigestEmbeddedDocumentExtractor extractor = new DigestEmbeddedDocumentExtractor(rootDocument, embeddedDocumentDigest, context, digester, algorithm);
    context.set(org.apache.tika.extractor.EmbeddedDocumentExtractor.class, extractor);

    // Close the file handle once parsing finishes, whether it succeeds or throws.
    try (FileInputStream input = new FileInputStream(rootDocument.getPath().toFile())) {
        parser.parse(input, handler, rootDocument.getMetadata(), context);
    }

    return extractor.getDocument();
}

Source File: TikaLambdaHandler.java From tika-lambda with Apache License 2.0

5 votes

/**
 * Extracts the text of an object with an AutoDetectParser and assembles the result document.
 * A TikaException is logged and turned into an exception result instead of being rethrown.
 *
 * @param bucket the bucket name, passed through to the result assembly
 * @param key the object key; a key ending in {@code tika.exception.testing.pdf}
 *            (case-insensitive) deliberately triggers a TikaException
 * @param objectData the object content to parse; not closed by this method
 * @return the assembled extraction result, or the assembled exception result on TikaException
 * @throws IOException if reading the content fails
 * @throws TransformerConfigurationException if the transformer handler cannot be created
 * @throws SAXException if the content handler fails
 */
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException {
  _logger.log("Extracting text with Tika");

  // Serialize the SAX events produced by the parser as plain text into a string buffer.
  SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
  TransformerHandler handler = factory.newTransformerHandler();
  handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
  handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
  StringWriter sw = new StringWriter();
  handler.setResult(new StreamResult(sw));
  AutoDetectParser parser = new AutoDetectParser();
  ParseContext parseContext = new ParseContext();
  parseContext.set(Parser.class, parser);

  Metadata tikaMetadata = new Metadata();
  String extractedText;
  try {
    // for synthetic transactions
    if( key.toLowerCase().endsWith("tika.exception.testing.pdf")) {
      throw new TikaException("Test Tika Exception");
    }
    parser.parse(objectData, handler, tikaMetadata, parseContext);
    extractedText = sw.toString();
  } catch( TikaException e) {
    _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
    return assembleExceptionResult(bucket, key, e);
  }
  _logger.log("Tika parsing success");
  return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}

Source File: TikaIO.java From beam with Apache License 2.0

5 votes

/**
 * Parses one readable file with Tika and outputs a ParseResult: a success carrying the
 * extracted text and metadata, or a failure carrying whatever was collected plus the exception.
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  ReadableFile readableFile = c.element();
  InputStream rawStream = Channels.newInputStream(readableFile.open());
  try (InputStream tikaStream = TikaInputStream.get(rawStream)) {
    // Use the configured TikaConfig when one was supplied.
    Parser parser;
    if (tikaConfig == null) {
      parser = new AutoDetectParser();
    } else {
      parser = new AutoDetectParser(tikaConfig);
    }

    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, parser);

    // NOTE(review): a non-null input metadata object from the spec is used as-is rather
    // than copied - confirm that sharing it across elements is intended.
    Metadata tikaMetadata;
    if (spec.getInputMetadata() != null) {
      tikaMetadata = spec.getInputMetadata();
    } else {
      tikaMetadata = new Metadata();
    }
    if (spec.getContentTypeHint() != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
    }

    String location = readableFile.getMetadata().resourceId().toString();
    ContentHandler textHandler = new ToTextContentHandler();
    ParseResult outcome;
    try {
      parser.parse(tikaStream, textHandler, tikaMetadata, parseContext);
      outcome = ParseResult.success(location, textHandler.toString(), tikaMetadata);
    } catch (Exception e) {
      // A parse failure is reported as a result element instead of failing the bundle.
      outcome = ParseResult.failure(location, textHandler.toString(), tikaMetadata, e);
    }

    c.output(outcome);
  }
}

Source File: ParseContextConfig.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Creates a ParseContext holding every configured entry, each registered under its class key.
 *
 * @return a new parse context populated from {@code entries}
 */
@SuppressWarnings({"rawtypes", "unchecked"})
public ParseContext create() {
  final ParseContext context = new ParseContext();

  for (Map.Entry<Class<?>, Object> configured : entries.entrySet()){
    // The raw key type is needed because the value's static type is only Object.
    final Class key = configured.getKey();
    final Object value = configured.getValue();
    context.set(key, value);
  }

  return context;
}

Source File: TikaPoweredMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0

5 votes

/**
 * By default returns a new ParseContext, with the document selector registered
 * when {@code getDocumentSelector} supplies one.
 * 
 * @param metadata the metadata passed on to {@code getDocumentSelector}
 * @param sourceMimeType the source mimetype passed on to {@code getDocumentSelector}
 * @return the parse context
 */
protected ParseContext buildParseContext(Metadata metadata, String sourceMimeType)
{
    ParseContext parseContext = new ParseContext();
    DocumentSelector documentSelector = getDocumentSelector(metadata, sourceMimeType);
    if (documentSelector == null)
    {
        // No selector available: hand back the empty context.
        return parseContext;
    }
    parseContext.set(DocumentSelector.class, documentSelector);
    return parseContext;
}

Source File: ArchiveContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0

5 votes

/**
 * Extends the inherited parse context with the parser used for embedded entries:
 * an AutoDetectParser when contents should be included, an EmptyParser otherwise.
 * The {@code includeEmbedded} option, when set, overrides the transformer's own default.
 */
@Override
protected ParseContext buildParseContext(Metadata metadata,
     String targetMimeType, TransformationOptions options) {
  ParseContext parseContext = super.buildParseContext(metadata, targetMimeType, options);
  
  // Start from the transformer default, then apply the per-transformation override.
  boolean parseEmbedded = includeContents;
  if(options.getIncludeEmbedded() != null)
  {
     parseEmbedded = options.getIncludeEmbedded();
  }
  
  if(!parseEmbedded)
  {
      // REPO-1066: an AutoDetectParser is the default in Tika after: https://issues.apache.org/jira/browse/TIKA-2096
      // so we need to specify an empty one if we don't want the recurse parsing to happen
      parseContext.set(Parser.class, new EmptyParser());
      return parseContext;
  }
  
  // Use an auto detect parser to handle the contents
  if(tikaConfig == null)
  {
      tikaConfig = TikaConfig.getDefaultConfig();
  }
  parseContext.set(Parser.class, new AutoDetectParser(tikaConfig));
  return parseContext;
}

Source File: PdfBoxContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0

5 votes

/**
 * Extends the inherited parse context with the PDF parser configuration, if one is set,
 * after applying the bookmark-text extraction flag to it.
 */
@Override
protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options)
{
    ParseContext parseContext = super.buildParseContext(metadata, targetMimeType, options);
    // TODO: Possibly extend TransformationOptions to allow for per-transform PDFParserConfig?
    if (pdfParserConfig == null)
    {
        return parseContext;
    }
    pdfParserConfig.setExtractBookmarksText(extractBookmarksText);
    parseContext.set(PDFParserConfig.class, pdfParserConfig);
    return parseContext;
}

Source File: TikaPoweredContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0

5 votes

/**
 * By default returns a ParseContext that does not recurse; a document selector is
 * registered in it when {@code getDocumentSelector} supplies one.
 */
protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options)
{
   ParseContext parseContext = new ParseContext();
   DocumentSelector documentSelector = getDocumentSelector(metadata, targetMimeType, options);
   if (documentSelector == null)
   {
       // No selector available: hand back the empty context.
       return parseContext;
   }
   parseContext.set(DocumentSelector.class, documentSelector);
   return parseContext;
}

Source File: AlterPDFParserTest.java From tika-server with Apache License 2.0

5 votes

/**
 * Extracts the text of a classpath test document with an AlterPDFParser set to the given mode.
 *
 * @param docPath the classpath location of the document
 * @param parseMode the parse mode assigned to the parser's {@code defaultParseMode}
 * @return the extracted text
 * @throws Exception if extraction or closing the stream fails
 */
private String getTextFromDoc(String docPath,
                              AlterPDFParser.ParsePdfMode parseMode) throws Exception {
    AlterPDFParser pdfParser = new AlterPDFParser();
    pdfParser.defaultParseMode = parseMode;
    ParseContext context = new ParseContext();
    PDFParserConfig config = new PDFParserConfig();
    context.set(PDFParserConfig.class, config);

    // try-with-resources closes the stream even when text extraction throws.
    try (InputStream stream = AlterPDFParserTest.class.getResourceAsStream(docPath)) {
        return getText(stream, pdfParser, context);
    }
}

Source File: AlterPDFParser.java From tika-server with Apache License 2.0

5 votes

/**
 * Runs Tika's {@code OCR2XHTML.process} on the document via reflection, forcing an
 * OCR-only strategy for the duration of the call.
 * The three PDF config settings changed here are restored afterwards, even if the call throws.
 *
 * @param document the PDF document to process
 * @param handler the handler receiving the generated content
 * @param context the parse context; a fresh TesseractOCRConfig is registered in it
 * @param metadata the metadata passed to the OCR process
 * @param config the PDF parser configuration, temporarily switched to OCR-only
 * @throws ClassNotFoundException if the OCR2XHTML class cannot be loaded
 * @throws NoSuchMethodException if its {@code process} method is not found
 * @throws InvocationTargetException if the invoked method itself throws
 * @throws IllegalAccessException if the method cannot be accessed
 */
private void callOCR2XHTMLProcess(PDDocument document, ContentHandler handler,
                                  ParseContext context, Metadata metadata,
                                  PDFParserConfig config) throws
        ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException {

    TesseractOCRConfig cfg = new TesseractOCRConfig();
    // here I set default timeout of 2 hours
    // The calling process should check parsing process and terminate it by timeout
    cfg.setTimeout(60 * 60 * 2);
    context.set(TesseractOCRConfig.class, cfg);

    PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy();
    boolean oldExtractInlineImages = config.getExtractInlineImages();
    boolean oldExtractUniqueInlineImagesOnly = config.getExtractUniqueInlineImagesOnly();

    try {
        // explicitly tells Tika to use OCR
        config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
        config.setExtractInlineImages(true);
        config.setExtractUniqueInlineImagesOnly(false);

        // The method is looked up with getDeclaredMethod and made accessible before the call.
        Class<?> c = Class.forName("org.apache.tika.parser.pdf.OCR2XHTML");
        Method m = c.getDeclaredMethod("process",
                PDDocument.class, ContentHandler.class, ParseContext.class, Metadata.class,
                PDFParserConfig.class);
        m.setAccessible(true);
        m.invoke(null, document, handler, context, metadata, config);
    } finally {
        // Always hand the caller's config back in its original state.
        config.setOcrStrategy(oldOcrStrategy);
        config.setExtractInlineImages(oldExtractInlineImages);
        config.setExtractUniqueInlineImagesOnly(oldExtractUniqueInlineImagesOnly);
    }
}

Source File: HTMLRenderingEngine.java From alfresco-repository with GNU Lesser General Public License v3.0

4 votes

/**
 * Asks Tika to translate the contents into HTML and writes the result through the
 * rendering context's content writer with mimetype {@code text/html}.
 *
 * @param p the parser used for the conversion
 * @param context the rendering context supplying the source content, parameters and writer
 * @throws RenditionServiceException if the Tika parse fails for any reason
 */
private void generateHTML(Parser p, RenderingContext context)
{
   ContentReader contentReader = context.makeContentReader();
   
   // Setup things to parse with
   StringWriter sw = new StringWriter();
   ContentHandler handler = buildContentHandler(sw, context);
   
   // Tell Tika what we're dealing with
   Metadata metadata = new Metadata();
   metadata.set(
         Metadata.CONTENT_TYPE, 
         contentReader.getMimetype()
   );
   metadata.set(
         Metadata.RESOURCE_NAME_KEY, 
         nodeService.getProperty( 
               context.getSourceNode(),
               ContentModel.PROP_NAME
         ).toString()
   );

   // Our parse context needs to extract images
   ParseContext parseContext = new ParseContext();
   parseContext.set(Parser.class, new TikaImageExtractingParser(context));
   
   // Parse
   // NOTE(review): the stream from getContentInputStream() is not closed here -
   //  confirm whether the parser or the reader is expected to close it.
   try {
      p.parse(
            contentReader.getContentInputStream(),
            handler, metadata, parseContext
      );
   } catch(Exception e) {
      throw new RenditionServiceException("Tika HTML Conversion Failed", e);
   }
   
   // As a string
   String html = sw.toString();
   
   // If we're doing body-only, remove all the html namespaces
   //  that will otherwise clutter up the document
   boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false);
   if(bodyOnly) {
      html = html.replaceAll("<\\?xml.*?\\?>", "");
      html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"","<p");
      // Java replacement strings reference groups as $1; "\\1" would emit a literal '1'
      //  and turn every heading into <h1
      html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"","<h$1");
      html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"","<div");
      html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"","<table");
      html = html.replaceAll("&#13;","");
   }
   
   // Save it
   ContentWriter contentWriter = context.makeContentWriter();
   contentWriter.setMimetype("text/html");
   contentWriter.putContent( html );
}

Source File: TikaParser.java From quarkus with Apache License 2.0

4 votes

/**
 * Parses the stream and returns its text and metadata.
 * When embedded content is appended, the handler's text is returned as a single document;
 * otherwise the master content and each embedded item with text are returned separately.
 *
 * @param entityStream the content to parse
 * @param contentType the content type hint, or {@code null} when unknown
 * @param tikaHandler the handler receiving the parsed content
 * @return the parsed content
 * @throws TikaParseException if anything goes wrong while parsing
 */
protected TikaContent parseStream(InputStream entityStream, String contentType, ContentHandler tikaHandler)
        throws TikaParseException {
    try {
        ParseContext parseContext = new ParseContext();
        // AutoDetectParser must be set in the context to enable the parsing of the embedded content
        Parser embeddedParser;
        if (this.appendEmbeddedContent) {
            embeddedParser = parser;
        } else {
            embeddedParser = ((RecursiveParserWrapper) parser).getWrappedParser();
        }
        parseContext.set(Parser.class, embeddedParser);

        org.apache.tika.metadata.Metadata tikaMetadata = new org.apache.tika.metadata.Metadata();
        if (contentType != null) {
            tikaMetadata.set(HttpHeaders.CONTENT_TYPE, contentType);
        }

        try (InputStream tikaStream = TikaInputStream.get(entityStream)) {
            parser.parse(tikaStream, tikaHandler, tikaMetadata, parseContext);

            if (this.appendEmbeddedContent) {
                // the embedded content if any has already been appended to the master content
                String text = null;
                if (tikaHandler != null) {
                    text = tikaHandler.toString().trim();
                }
                return new TikaContent(text, convert(tikaMetadata));
            }

            // The metadata list represents the master and embedded content (text and metadata)
            // The first metadata in the list represents the master (outer) content
            List<org.apache.tika.metadata.Metadata> metadataList = ((RecursiveParserWrapperHandler) tikaHandler)
                    .getMetadataList();
            String masterText = metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);

            // Embedded (inner) content starts from the index 1.
            List<TikaContent> embeddedItems = new LinkedList<>();
            for (int index = 1; index < metadataList.size(); index++) {
                org.apache.tika.metadata.Metadata innerMetadata = metadataList.get(index);
                String innerText = innerMetadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
                // the embedded text can be null if the given document is an image
                // and no text recognition parser is enabled
                if (innerText == null) {
                    continue;
                }
                embeddedItems.add(new TikaContent(innerText.trim(), convert(innerMetadata)));
            }
            return new TikaContent(masterText, convert(metadataList.get(0)), embeddedItems);
        }
    } catch (Exception e) {
        final String contentTypeSuffix = contentType == null ? "" : " for content-type: " + contentType;
        throw new TikaParseException("Unable to parse the stream" + contentTypeSuffix, e);
    }
}

Source File: NodeTika.java From node-tika with MIT License

4 votes

/**
 * Extracts the text of the resource at the given URI.
 *
 * @param uri the location of the resource to parse
 * @param options optional settings, may be {@code null}; the keys read here are
 *            {@code outputEncoding} (defaults to UTF-8), {@code contentType} and
 *            {@code maxLength} (defaults to -1)
 * @return the extracted text, decoded with the output encoding
 * @throws Exception if parsing fails for any reason other than the write limit being reached
 */
public static String extractText(String uri, Map<String, Object> options) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();
	final ParseContext context = new ParseContext();

	String outputEncoding = null;
	String contentType = null;
	int maxLength = -1;

	if (options != null) {
		final Object encodingOption = options.get("outputEncoding");
		if (encodingOption != null) {
			outputEncoding = encodingOption.toString();
		}

		final Object contentTypeOption = options.get("contentType");
		if (contentTypeOption != null) {
			contentType = contentTypeOption.toString();
		}

		// The value is parsed as a float and then truncated to an int.
		final Object maxLengthOption = options.get("maxLength");
		if (maxLengthOption != null) {
			maxLength = (int)Float.parseFloat(maxLengthOption.toString());
		}
	}

	if (outputEncoding == null) {
		outputEncoding = "UTF-8";
	}

	fillMetadata(parser, metadata, contentType, uri);
	fillParseContext(context, options);

	final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
	final OutputStreamWriter writer = new OutputStreamWriter(buffer, outputEncoding);
	final WriteOutContentHandler limitedHandler = new WriteOutContentHandler(writer, maxLength);

	final TikaInputStream inputStream = createInputStream(uri, metadata);

	// Set up recursive parsing of archives.
	// See: http://wiki.apache.org/tika/RecursiveMetadata
	context.set(Parser.class, parser);
	context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context));

	try {
		parser.parse(inputStream, new BodyContentHandler(limitedHandler), metadata, context);
	} catch (Throwable e) {
		if (limitedHandler.isWriteLimitReached(e)) {
			// Hitting the write limit is not an error: close the writer and return what was written.
			writer.close();
		} else {
			throw e;
		}
	} finally {
		inputStream.close();
	}

	return buffer.toString(outputEncoding);
}

Source File: SimplePageParser.java From flink-crawler with Apache License 2.0

3 votes

/**
 * Creates a page parser from explicit content and link extractors.
 * When {@code includeMarkup} is set, a parse context using {@link IdentityHtmlMapper} is installed.
 * 
 * @param contentExtractor
 *            stored as this parser's content extractor
 * @param linkExtractor
 *            stored as this parser's link extractor
 * @param parserPolicy
 *            to customize operation of the parser
 * @param pageScorer
 *            to score importance of page (priority of its outlinks)
 * @param includeMarkup
 *            true if output should be raw HTML, versus extracted text <BR>
 * <BR>
 *            <B>Note:</B> There is no need to construct your own {@link SimpleLinkExtractor}
 *            simply to control the set of link tags and attributes it processes. Instead, use
 *            {@link ParserPolicy#setLinkTags} and {@link ParserPolicy#setLinkAttributeTypes},
 *            and then pass this policy to {@code SimplePageParser(ParserPolicy)}.
 */
public SimplePageParser(BaseContentExtractor contentExtractor, BaseLinkExtractor linkExtractor,
        ParserPolicy parserPolicy, BasePageScorer pageScorer, boolean includeMarkup) {
    super(parserPolicy, pageScorer);

    _linkExtractor = linkExtractor;
    _contentExtractor = contentExtractor;

    if (includeMarkup) {
        // Configure the context fully before storing it in the field.
        ParseContext markupContext = new ParseContext();
        markupContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
        _parseContext = markupContext;
    }
}

Java Code Examples for org.apache.tika.parser.ParseContext#set()