Java Code Examples for org.apache.tika.parser.ParseContext#set()
The following examples show how to use
org.apache.tika.parser.ParseContext#set().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: NodeTika.java From node-tika with MIT License | 6 votes |
private static void fillParseContext(ParseContext parseContext, Map<String, Object> options) { final TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); if (options == null) { // Disable OCR and return if no options are specified. disableOcr(ocrConfig); parseContext.set(TesseractOCRConfig.class, ocrConfig); return; } fillOcrOptions(ocrConfig, options); parseContext.set(TesseractOCRConfig.class, ocrConfig); final PDFParserConfig pdfParserConfig = new PDFParserConfig(); fillPdfOptions(pdfParserConfig, options); parseContext.set(PDFParserConfig.class, pdfParserConfig); // Allow a password to be specified for encrypted files. fillPassword(parseContext, options); }
Example 2
Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0 | 6 votes |
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException { WriteOutContentHandler handler = new WriteOutContentHandler(maxStringLength); try { ParseContext context = new ParseContext(); context.set(Parser.class, txtParser); txtParser.parse(stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!handler.isWriteLimitReached(e)) { // This should never happen with BodyContentHandler... throw new TikaException("Unexpected SAX processing failure", e); } } finally { stream.close(); } return handler.toString(); }
Example 3
Source File: TikaCallable.java From flink-crawler with Apache License 2.0 | 6 votes |
/**
 * Builds the parse context, installing our own HtmlMapper only when the link extractor
 * uses at least one tag for which the default mapper's {@code mapSafeElement} returns
 * {@code null}.
 *
 * @return a new parse context, with a {@code CustomHtmlMapper} registered when needed
 */
private ParseContext makeParseContext() {
    final ParseContext context = new ParseContext();
    final Set<String> linkTags = _linkExtractor.getLinkTags();
    final HtmlMapper stockMapper = DefaultHtmlMapper.INSTANCE;

    // Stop scanning at the first tag the default mapper does not know about.
    boolean needsCustomMapper = false;
    for (String linkTag : linkTags) {
        if (stockMapper.mapSafeElement(linkTag) == null) {
            needsCustomMapper = true;
            break;
        }
    }

    if (needsCustomMapper) {
        context.set(HtmlMapper.class,
                new CustomHtmlMapper(linkTags, _linkExtractor.getLinkAttributeTypes()));
    }
    return context;
}
Example 4
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 6 votes |
@Test public void offersNoTypesIfNotFound() throws Exception { PDFPreprocessorParser parser = new PDFPreprocessorParser(); DefaultParser defaultParser = new DefaultParser(); MediaType pdf = MediaType.application("pdf"); // With an invalid path, will offer no types ImageMagickConfig invalidConfig = new ImageMagickConfig(); invalidConfig.setImageMagickPath("/made/up/path"); ParseContext parseContext = new ParseContext(); parseContext.set(ImageMagickConfig.class, invalidConfig); // No types offered assertEquals(0, parser.getSupportedTypes(parseContext).size()); // And DefaultParser won't use us assertEquals(PDFParser.class, defaultParser.getParsers(parseContext).get(pdf).getClass()); }
Example 5
Source File: AlterPDFParserTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Extracts text from the double-spaced sample PDF with {@link AlterPDFParser} and checks
 * that more than 100 characters come back.
 */
@Test
public void testDoubleSpacedText() throws Exception {
    PDFParser pdfParser = new AlterPDFParser();
    ParseContext context = new ParseContext();
    PDFParserConfig config = new PDFParserConfig();
    context.set(PDFParserConfig.class, config);

    String text;
    // try-with-resources: the stream is released even when extraction throws,
    // instead of only on the success path.
    try (InputStream stream =
            AlterPDFParserTest.class.getResourceAsStream("/test-documents/double_space_test.pdf")) {
        text = getText(stream, pdfParser, context);
    }

    assertTrue(text.length() > 100);
}
Example 6
Source File: NodeTika.java From node-tika with MIT License | 5 votes |
/**
 * Registers a {@link PasswordProvider} in the parse context when the options carry a
 * non-null {@code "password"} entry; otherwise leaves the context untouched.
 *
 * @param parseContext the context to register the provider in
 * @param options      option map; dereferenced without a null check
 */
private static void fillPassword(ParseContext parseContext, Map<String, Object> options) {
    final Object secret = options.get("password");
    if (secret != null) {
        final PasswordProvider provider = new PasswordProvider() {
            @Override
            public String getPassword(Metadata metadata) {
                // The same password is returned whatever the metadata says.
                return secret.toString();
            }
        };
        parseContext.set(PasswordProvider.class, provider);
    }
}
Example 7
Source File: EmbeddedDocumentMemoryExtractor.java From extract with MIT License | 5 votes |
/**
 * Parses the root document's file with a {@code DigestEmbeddedDocumentExtractor}
 * registered in the parse context and returns the document that extractor holds once
 * parsing is done.
 *
 * @param rootDocument           the document whose file is parsed
 * @param embeddedDocumentDigest digest passed to the extractor to identify the embedded document
 * @return the result of {@code extractor.getDocument()} after parsing
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 * @throws IOException   if the file cannot be opened, read or closed
 */
public TikaDocumentSource extract(final TikaDocument rootDocument, final String embeddedDocumentDigest)
        throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    // NOTE(review): -1 is understood to disable BodyContentHandler's write limit — confirm.
    ContentHandler handler = new BodyContentHandler(-1);
    context.set(Parser.class, parser);

    DigestEmbeddedDocumentExtractor extractor = new DigestEmbeddedDocumentExtractor(
            rootDocument, embeddedDocumentDigest, context, digester, algorithm);
    context.set(org.apache.tika.extractor.EmbeddedDocumentExtractor.class, extractor);

    // The stream is opened here, so it is closed here; previously it was never closed
    // and the file handle leaked on every call.
    try (FileInputStream input = new FileInputStream(rootDocument.getPath().toFile())) {
        parser.parse(input, handler, rootDocument.getMetadata(), context);
    }

    return extractor.getDocument();
}
Example 8
Source File: TikaLambdaHandler.java From tika-lambda with Apache License 2.0 | 5 votes |
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException { _logger.log("Extracting text with Tika"); String extractedText = ""; SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); StringWriter sw = new StringWriter(); handler.setResult(new StreamResult(sw)); AutoDetectParser parser = new AutoDetectParser(); ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, parser); Tika tika = new Tika(); Metadata tikaMetadata = new Metadata(); try { // for synthetic transactions if( key.toLowerCase().endsWith("tika.exception.testing.pdf")) { throw new TikaException("Test Tika Exception"); } parser.parse(objectData, handler, tikaMetadata, parseContext); extractedText = sw.toString(); } catch( TikaException e) { _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage()); return assembleExceptionResult(bucket, key, e); } _logger.log("Tika parsing success"); return assembleExtractionResult(bucket, key, extractedText, tikaMetadata); }
Example 9
Source File: TikaIO.java From beam with Apache License 2.0 | 5 votes |
/**
 * Parses one readable file with Tika and emits a single {@code ParseResult}: a success
 * carrying the extracted text, or a failure carrying whatever text was collected plus
 * the exception.
 *
 * @param c the process context supplying the file and receiving the result
 * @throws Exception if opening the file or closing the stream fails; parse errors are
 *                   captured in the emitted result instead
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    ReadableFile readable = c.element();
    InputStream rawStream = Channels.newInputStream(readable.open());

    try (InputStream tikaStream = TikaInputStream.get(rawStream)) {
        Parser parser;
        if (tikaConfig == null) {
            parser = new AutoDetectParser();
        } else {
            parser = new AutoDetectParser(tikaConfig);
        }

        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);

        Metadata tikaMetadata;
        if (spec.getInputMetadata() == null) {
            tikaMetadata = new Metadata();
        } else {
            tikaMetadata = spec.getInputMetadata();
        }
        if (spec.getContentTypeHint() != null) {
            tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
        }

        String location = readable.getMetadata().resourceId().toString();
        ContentHandler textHandler = new ToTextContentHandler();

        ParseResult outcome;
        try {
            parser.parse(tikaStream, textHandler, tikaMetadata, parseContext);
            outcome = ParseResult.success(location, textHandler.toString(), tikaMetadata);
        } catch (Exception parseFailure) {
            // Keep the partial text alongside the failure.
            outcome = ParseResult.failure(location, textHandler.toString(), tikaMetadata, parseFailure);
        }

        c.output(outcome);
    }
}
Example 10
Source File: ParseContextConfig.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates a fresh {@link ParseContext} and registers every configured entry in it,
 * keyed by the entry's class.
 *
 * @return a new parse context holding all entries
 */
@SuppressWarnings({"rawtypes", "unchecked"})
public ParseContext create() {
    final ParseContext context = new ParseContext();
    // The raw cast is needed because the map's key type is Class<?>.
    entries.forEach((type, instance) -> context.set((Class) type, instance));
    return context;
}
Example 11
Source File: TikaPoweredMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Builds the parse context for an extraction. By default this is a new ParseContext,
 * with a {@link DocumentSelector} registered when {@code getDocumentSelector} supplies one.
 *
 * @param metadata       metadata handed to {@code getDocumentSelector}
 * @param sourceMimeType source MIME type handed to {@code getDocumentSelector}
 * @return the parse context
 */
protected ParseContext buildParseContext(Metadata metadata, String sourceMimeType) {
    final ParseContext parseContext = new ParseContext();
    final DocumentSelector documentSelector = getDocumentSelector(metadata, sourceMimeType);
    if (documentSelector == null) {
        return parseContext;
    }
    parseContext.set(DocumentSelector.class, documentSelector);
    return parseContext;
}
Example 12
Source File: ArchiveContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
@Override protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options) { ParseContext context = super.buildParseContext(metadata, targetMimeType, options); boolean recurse = includeContents; if(options.getIncludeEmbedded() != null) { recurse = options.getIncludeEmbedded(); } if(recurse) { // Use an auto detect parser to handle the contents if(tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); } context.set(Parser.class, new AutoDetectParser(tikaConfig)); } else { // REPO-1066: an AutoDetectParser is the default in Tika after: https://issues.apache.org/jira/browse/TIKA-2096 // so we need to specify an empty one if we don't want the recurse parsing to happen context.set(Parser.class, new EmptyParser()); } return context; }
Example 13
Source File: PdfBoxContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
@Override protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options) { ParseContext context = super.buildParseContext(metadata, targetMimeType, options); if (pdfParserConfig != null) { pdfParserConfig.setExtractBookmarksText(extractBookmarksText); context.set(PDFParserConfig.class, pdfParserConfig); } // TODO: Possibly extend TransformationOptions to allow for per-transform PDFParserConfig? return context; }
Example 14
Source File: TikaPoweredContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * By default returns a ParseContext that does not recurse. A {@link DocumentSelector}
 * is registered when {@code getDocumentSelector} supplies one.
 */
protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options) {
    final ParseContext parseContext = new ParseContext();
    final DocumentSelector documentSelector = getDocumentSelector(metadata, targetMimeType, options);
    if (documentSelector == null) {
        return parseContext;
    }
    parseContext.set(DocumentSelector.class, documentSelector);
    return parseContext;
}
Example 15
Source File: AlterPDFParserTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Extracts the text of a classpath test document using an {@link AlterPDFParser}
 * configured with the given parse mode.
 *
 * @param docPath   classpath location of the document
 * @param parseMode value assigned to the parser's {@code defaultParseMode}
 * @return the text returned by {@code getText}
 * @throws Exception propagated from extraction or from closing the stream
 */
private String getTextFromDoc(String docPath, AlterPDFParser.ParsePdfMode parseMode) throws Exception {
    AlterPDFParser pdfParser = new AlterPDFParser();
    pdfParser.defaultParseMode = parseMode;

    ParseContext context = new ParseContext();
    PDFParserConfig config = new PDFParserConfig();
    context.set(PDFParserConfig.class, config);

    // try-with-resources: the stream is released even when extraction throws,
    // instead of only on the success path.
    try (InputStream stream = AlterPDFParserTest.class.getResourceAsStream(docPath)) {
        return getText(stream, pdfParser, context);
    }
}
Example 16
Source File: AlterPDFParser.java From tika-server with Apache License 2.0 | 5 votes |
private void callOCR2XHTMLProcess(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException { TesseractOCRConfig cfg = new TesseractOCRConfig(); // here I set default timeout of 2 hours // The calling process should check parsing process and terminate it by timeout cfg.setTimeout(60 * 60 * 2); context.set(TesseractOCRConfig.class, cfg); PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy(); boolean oldExtractInlineImages = config.getExtractInlineImages(); boolean oldExtractUniqueInlineImagesOnly = config.getExtractUniqueInlineImagesOnly(); // explicitly tells Tika to use OCR config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY); config.setExtractInlineImages(true); config.setExtractUniqueInlineImagesOnly(false); Class c = Class.forName("org.apache.tika.parser.pdf.OCR2XHTML"); Method m = c.getDeclaredMethod("process", PDDocument.class, ContentHandler.class, ParseContext.class, Metadata.class, PDFParserConfig.class); m.setAccessible(true); m.invoke(null, document, handler, context, metadata, config); config.setOcrStrategy(oldOcrStrategy); config.setExtractInlineImages(oldExtractInlineImages); config.setExtractUniqueInlineImagesOnly(oldExtractUniqueInlineImagesOnly); }
Example 17
Source File: HTMLRenderingEngine.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/** * Asks Tika to translate the contents into HTML */ private void generateHTML(Parser p, RenderingContext context) { ContentReader contentReader = context.makeContentReader(); // Setup things to parse with StringWriter sw = new StringWriter(); ContentHandler handler = buildContentHandler(sw, context); // Tell Tika what we're dealing with Metadata metadata = new Metadata(); metadata.set( Metadata.CONTENT_TYPE, contentReader.getMimetype() ); metadata.set( Metadata.RESOURCE_NAME_KEY, nodeService.getProperty( context.getSourceNode(), ContentModel.PROP_NAME ).toString() ); // Our parse context needs to extract images ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, new TikaImageExtractingParser(context)); // Parse try { p.parse( contentReader.getContentInputStream(), handler, metadata, parseContext ); } catch(Exception e) { throw new RenditionServiceException("Tika HTML Conversion Failed", e); } // As a string String html = sw.toString(); // If we're doing body-only, remove all the html namespaces // that will otherwise clutter up the document boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false); if(bodyOnly) { html = html.replaceAll("<\\?xml.*?\\?>", ""); html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"","<p"); html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"","<h\\1"); html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"","<div"); html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"","<table"); html = html.replaceAll(" ",""); } // Save it ContentWriter contentWriter = context.makeContentWriter(); contentWriter.setMimetype("text/html"); contentWriter.putContent( html ); }
Example 18
Source File: TikaParser.java From quarkus with Apache License 2.0 | 4 votes |
protected TikaContent parseStream(InputStream entityStream, String contentType, ContentHandler tikaHandler) throws TikaParseException { try { ParseContext context = new ParseContext(); // AutoDetectParser must be set in the context to enable the parsing of the embedded content Parser contextParser = this.appendEmbeddedContent ? parser : ((RecursiveParserWrapper) parser).getWrappedParser(); context.set(Parser.class, contextParser); org.apache.tika.metadata.Metadata tikaMetadata = new org.apache.tika.metadata.Metadata(); if (contentType != null) { tikaMetadata.set(HttpHeaders.CONTENT_TYPE, contentType); } try (InputStream tikaStream = TikaInputStream.get(entityStream)) { parser.parse(tikaStream, tikaHandler, tikaMetadata, context); if (this.appendEmbeddedContent) { // the embedded content if any has already been appended to the master content return new TikaContent(tikaHandler == null ? null : tikaHandler.toString().trim(), convert(tikaMetadata)); } else { RecursiveParserWrapperHandler rHandler = (RecursiveParserWrapperHandler) tikaHandler; // The metadata list represents the master and embedded content (text and metadata) // The first metadata in the list represents the master (outer) content List<org.apache.tika.metadata.Metadata> allMetadata = rHandler.getMetadataList(); String masterText = allMetadata.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); // Embedded (inner) content starts from the index 1. 
List<TikaContent> embeddedContent = new LinkedList<>(); for (int i = 1; i < allMetadata.size(); i++) { String embeddedText = allMetadata.get(i).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); // the embedded text can be null if the given document is an image // and no text recognition parser is enabled if (embeddedText != null) { embeddedContent.add(new TikaContent(embeddedText.trim(), convert(allMetadata.get(i)))); } } return new TikaContent(masterText, convert(allMetadata.get(0)), embeddedContent); } } } catch (Exception e) { final String errorMessage = "Unable to parse the stream" + (contentType == null ? "" : " for content-type: " + contentType); throw new TikaParseException(errorMessage, e); } }
Example 19
Source File: NodeTika.java From node-tika with MIT License | 4 votes |
public static String extractText(String uri, Map<String, Object> options) throws Exception { final AutoDetectParser parser = createParser(); final Metadata metadata = new Metadata(); final ParseContext context = new ParseContext(); String outputEncoding = null; String contentType = null; int maxLength = -1; if (options != null) { Object option; option = options.get("outputEncoding"); if (option != null) { outputEncoding = option.toString(); } option = options.get("contentType"); if (option != null) { contentType = option.toString(); } option = options.get("maxLength"); if (option != null) { maxLength = (int)Float.parseFloat(option.toString()); } } if (outputEncoding == null) { outputEncoding = "UTF-8"; } fillMetadata(parser, metadata, contentType, uri); fillParseContext(context, options); final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); final OutputStreamWriter writer = new OutputStreamWriter(outputStream, outputEncoding); final WriteOutContentHandler contentHandler = new WriteOutContentHandler(writer, maxLength); final TikaInputStream inputStream = createInputStream(uri, metadata); // Set up recursive parsing of archives. // See: http://wiki.apache.org/tika/RecursiveMetadata context.set(Parser.class, parser); context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context)); try { parser.parse(inputStream, new BodyContentHandler(contentHandler), metadata, context); } catch (Throwable e) { if (!contentHandler.isWriteLimitReached(e)) { throw e; } else { writer.close(); } } finally { inputStream.close(); } return outputStream.toString(outputEncoding); }
Example 20
Source File: SimplePageParser.java From flink-crawler with Apache License 2.0 | 3 votes |
/** * @param parserPolicy * to customize operation of the parser * @param pageScorer * to score importance of page (priority of its outlinks) * @param includeMarkup * true if output should be raw HTML, versus extracted text <BR> * <BR> * <B>Note:</B> There is no need to construct your own {@link SimpleLinkExtractor} * simply to control the set of link tags and attributes it processes. Instead, use * {@link ParserPolicy#setLinkTags} and {@link ParserPolicy#setLinkAttributeTypes}, * and then pass this policy to {@link SimplePageParser#SimpleParser(ParserPolicy)}. */ public SimplePageParser(BaseContentExtractor contentExtractor, BaseLinkExtractor linkExtractor, ParserPolicy parserPolicy, BasePageScorer pageScorer, boolean includeMarkup) { super(parserPolicy, pageScorer); _contentExtractor = contentExtractor; _linkExtractor = linkExtractor; if (includeMarkup) { _parseContext = new ParseContext(); _parseContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE); } }