org.apache.tika.parser.Parser Java Examples

Source File: SolrCellBuilder.java From kite with Apache License 2.0

6 votes

/**
 * Picks the parser registered for the record's attachment MIME type.
 *
 * The base media type is looked up directly first; if that misses, every registered
 * media type is checked with isMediaTypeMatch and the first match wins.
 *
 * @param record the record whose Fields.ATTACHMENT_MIME_TYPE value is inspected
 * @return the selected parser, or null when the record carries no MIME type or
 *         no registered media type matches
 */
private Parser detectParser(Record record) {
  if (!hasAtLeastOneMimeType(record)) {
    return null;
  }
  String mimeTypeValue = (String) record.getFirstValue(Fields.ATTACHMENT_MIME_TYPE); //ExtractingParams.STREAM_TYPE);
  assert mimeTypeValue != null;

  MediaType baseType = parseMediaType(mimeTypeValue).getBaseType();

  // Fast path: direct map lookup.
  Parser exactMatch = mediaTypeToParserMap.get(baseType);
  if (exactMatch != null) {
    return exactMatch;
  }

  // Slow path: wildcard matching against every registered media type.
  for (Map.Entry<MediaType, Parser> candidate : mediaTypeToParserMap.entrySet()) {
    if (isMediaTypeMatch(baseType, candidate.getKey())) {
      return candidate.getValue();
    }
  }

  if (LOG.isDebugEnabled()) {
    LOG.debug("No supported MIME type parser found for " + Fields.ATTACHMENT_MIME_TYPE + "=" + mimeTypeValue);
  }
  return null;
}

Source File: TikaAutoMetadataExtracterTest.java From alfresco-repository with GNU Lesser General Public License v3.0

6 votes

/**
 * Verifies that every media type advertised by the Office, OpenDocument, MP3 and OOXML
 * parsers is reported as supported by the extracter under test.
 */
public void testSupports() throws Exception
{
    Parser[] parsers = {
        new OfficeParser(), new OpenDocumentParser(),
        new Mp3Parser(), new OOXMLParser()
    };

    // Collect the string form of each advertised media type.
    ArrayList<String> advertised = new ArrayList<String>();
    for (Parser candidate : parsers)
    {
        for (MediaType type : candidate.getSupportedTypes(new ParseContext()))
        {
            advertised.add(type.toString());
        }
    }

    for (String mimetype : advertised)
    {
        assertTrue("Mimetype should be supported: " + mimetype, extracter.isSupported(mimetype));
    }
}

Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0

6 votes

/**
 * Runs txtParser over the stream and returns the text collected by a
 * WriteOutContentHandler created with maxStringLength.
 *
 * A SAXException is ignored when the handler reports that its write limit was reached;
 * any other SAXException is rethrown wrapped in a TikaException. The stream is always closed.
 *
 * @param stream   content to parse; closed before this method returns
 * @param metadata metadata object handed to the parser
 * @return the string form of the write-out handler
 */
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
	WriteOutContentHandler limitedSink = new WriteOutContentHandler(maxStringLength);
	try {
		ParseContext parseContext = new ParseContext();
		parseContext.set(Parser.class, txtParser);
		BodyContentHandler bodyOnly = new BodyContentHandler(limitedSink);
		txtParser.parse(stream, bodyOnly, metadata, parseContext);
	} catch (SAXException e) {
		boolean limitReached = limitedSink.isWriteLimitReached(e);
		if (!limitReached) {
			// This should never happen with BodyContentHandler...
			throw new TikaException("Unexpected SAX processing failure", e);
		}
	} finally {
		stream.close();
	}
	return limitedSink.toString();
}

Source File: ImageConverter.java From openmeetings with Apache License 2.0

6 votes

/**
 * Reads the pixel dimensions of an image file with Tika's ImageParser and stores them
 * on the given file item.
 *
 * @param f    file item that receives the width and height
 * @param img  image file to read
 * @param mime content type set on the metadata before parsing
 * @return a process result with exit code ZERO on success, or -1 plus error details
 *         when anything fails
 */
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
	final ProcessResult result = new ProcessResult();
	result.setProcess("get image dimensions :: " + f.getId());
	final Parser imageParser = new ImageParser();
	try (InputStream in = new FileInputStream(img)) {
		final Metadata imageMeta = new Metadata();
		imageMeta.set(CONTENT_TYPE, mime);
		imageParser.parse(in, new DefaultHandler(), imageMeta, new ParseContext());

		// Width is applied before height is even read, so a failure on height
		// leaves the already-set width in place.
		final String width = imageMeta.get(TIFF.IMAGE_WIDTH);
		f.setWidth(Integer.valueOf(width));
		final String height = imageMeta.get(TIFF.IMAGE_LENGTH);
		f.setHeight(Integer.valueOf(height));

		result.setExitCode(ZERO);
	} catch (Exception e) {
		log.error("Error while getting dimensions", e);
		result.setError("Error while getting dimensions");
		result.setException(e.getMessage());
		result.setExitCode(-1);
	}
	return result;
}

Source File: FTConnector.java From openprodoc with GNU Affero General Public License v3.0

6 votes

/**
 * Extracts text and metadata from a document with Tika's AutoDetectParser.
 *
 * On success the metadata is stored in FileMetadata as one "key=value" line per
 * metadata name, and the body text is stored in FullText.
 *
 * @param Bytes stream with the raw document content
 * @return the value of FullText after the extraction attempt
 * @throws PDException declared by this method; any exception raised during extraction
 *         is handed to PDException.GenPDException together with its localized message
 */
protected String Convert(InputStream Bytes) throws PDException
{
try {
    // -1 is passed as the BodyContentHandler argument (no write limit per the Tika docs).
    ContentHandler textHandler=new BodyContentHandler(-1);
    Metadata metadata=new Metadata();
    Parser parser=new AutoDetectParser();
    ParseContext context=new ParseContext();
    parser.parse(Bytes, textHandler, metadata, context);
    // Build the metadata dump in a StringBuilder instead of repeated String
    // concatenation, which copies the whole text on every iteration.
    StringBuilder metadataText=new StringBuilder();
    for (String key : metadata.names())
        {
        metadataText.append(key).append('=').append(metadata.get(key)).append('\n');
        }
    FileMetadata=metadataText.toString();
    FullText=textHandler.toString();
    } catch (Exception ex)
    {
    PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }

return FullText;
}

Source File: TikaProcessor.java From quarkus with Apache License 2.0

6 votes

/**
 * Builds the map of parser class names to their configured parameters.
 *
 * Provider names are obtained for the Parser service and filtered against
 * NOT_NATIVE_READY_PARSERS. When a Tika config path is present, or no required parsers
 * are listed, every remaining provider is mapped to an empty parameter list. Otherwise
 * only the providers named (by abbreviation or full name) in the comma-separated
 * requiredParsers value are kept, each mapped to the result of getParserConfig.
 *
 * @param tikaConfigPath      optional path to a Tika configuration
 * @param requiredParsers     optional comma-separated list of parser names/abbreviations
 * @param parserParamMaps     per-parser parameter maps, looked up by the listed name
 * @param parserAbbreviations map used by getParserNameFromConfig to resolve names
 * @return parser class name to parameter list
 */
public static Map<String, List<TikaParserParameter>> getSupportedParserConfig(Optional<String> tikaConfigPath,
        Optional<String> requiredParsers,
        Map<String, Map<String, String>> parserParamMaps,
        Map<String, String> parserAbbreviations) throws Exception {
    Predicate<String> nativeReady = name -> !NOT_NATIVE_READY_PARSERS.contains(name);
    List<String> providerNames = getProviderNames(Parser.class.getName());

    boolean useAllProviders = tikaConfigPath.isPresent() || !requiredParsers.isPresent();
    if (useAllProviders) {
        return providerNames.stream()
                .filter(nativeReady)
                .collect(Collectors.toMap(Function.identity(),
                        name -> Collections.<TikaParserParameter> emptyList()));
    }

    List<String> listedNames = Arrays.stream(requiredParsers.get().split(","))
            .map(String::trim)
            .collect(Collectors.toList());
    // Key: resolved full parser name; value: the name exactly as listed in the config.
    Map<String, String> listedNameByFullName = listedNames.stream()
            .collect(Collectors.toMap(listed -> getParserNameFromConfig(listed, parserAbbreviations),
                    Function.identity()));
    return providerNames.stream()
            .filter(nativeReady)
            .filter(listedNameByFullName::containsKey)
            .collect(Collectors.toMap(Function.identity(),
                    name -> getParserConfig(name, parserParamMaps.get(listedNameByFullName.get(name)))));
}

Source File: CachingTesseractOCRParserTest.java From extract with MIT License

5 votes

/**
 * Parses the same document twice with a CachingTesseractOCRParser and checks that the
 * first pass is counted as a cache miss and the second as a cache hit, with identical
 * extracted text both times.
 */
@Test
public void testWriteToCache() throws Throwable {
	final Path document = Paths.get(this.simple.toURI());
	final AtomicInteger hits = new AtomicInteger();
	final AtomicInteger misses = new AtomicInteger();

	// Subclass the parser only to count its cache callbacks.
	final Parser cachingParser = new CachingTesseractOCRParser(tmpDir) {

		private static final long serialVersionUID = 6551690243986921730L;

		@Override
		public void cacheHit() {
			hits.incrementAndGet();
		}

		@Override
		public void cacheMiss() {
			misses.incrementAndGet();
		}
	};

	// First pass: one miss, no hit expected.
	final Writer firstPass = new StringWriter();
	try (final InputStream in = Files.newInputStream(document)) {
		cachingParser.parse(in, new WriteOutContentHandler(firstPass), new Metadata(), new ParseContext());
	}

	Assert.assertEquals("HEAVY\nMETAL", firstPass.toString().trim());
	Assert.assertEquals(0, hits.get());
	Assert.assertEquals(1, misses.get());

	// Second pass: served from the cache, so the hit counter moves and the miss counter does not.
	final Writer secondPass = new StringWriter();
	try (final InputStream in = Files.newInputStream(document)) {
		cachingParser.parse(in, new WriteOutContentHandler(secondPass), new Metadata(), new ParseContext());
	}

	Assert.assertEquals("HEAVY\nMETAL", secondPass.toString().trim());
	Assert.assertEquals(1, hits.get());
	Assert.assertEquals(1, misses.get());
}

Source File: TikaAnalysis.java From tutorials with MIT License

5 votes

/**
 * Parses the stream with a fresh AutoDetectParser and returns the metadata it filled in.
 *
 * @param stream content to parse; not closed by this method
 * @return the Metadata instance passed to the parser
 */
public static Metadata extractMetadatatUsingParser(InputStream stream) throws IOException, SAXException, TikaException {
    Metadata extracted = new Metadata();
    new AutoDetectParser().parse(stream, new BodyContentHandler(), extracted, new ParseContext());
    return extracted;
}

Source File: TikaTest.java From tika-server with Apache License 2.0

5 votes

/**
 * Parses the stream into an OriginalBodyContentHandler and returns that handler's
 * string form. The input stream is closed whether or not parsing succeeds.
 *
 * @param is       stream to parse; closed by this method
 * @param parser   parser to run
 * @param context  parse context handed to the parser
 * @param metadata metadata object handed to the parser
 * @return the text accumulated by the handler
 */
public String getTextWoDoublebreaks(InputStream is, Parser parser,
                                    ParseContext context, Metadata metadata) throws Exception{
    final ContentHandler bodyText = new OriginalBodyContentHandler();
    try {
        parser.parse(is, bodyText, metadata, context);
    }
    finally {
        is.close();
    }
    return bodyText.toString();
}

Source File: TikaTest.java From tika-server with Apache License 2.0

5 votes

/**
 * Basic text extraction into a BodyContentHandler created with the argument 1000000.
 * <p>
 * The input stream is closed in a finally block, so it is closed even when parsing fails.
 *
 * @param is       stream to parse; closed by this method
 * @param parser   parser to run
 * @param context  parse context handed to the parser
 * @param metadata metadata object handed to the parser
 * @return the text accumulated by the handler
 */
public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{
    final ContentHandler bodyText = new BodyContentHandler(1000000);
    try {
        parser.parse(is, bodyText, metadata, context);
    }
    finally {
        is.close();
    }
    return bodyText.toString();
}

Source File: TikaTest.java From tika-server with Apache License 2.0

5 votes

/**
 * Parses a test document through a RecursiveParserWrapper around the given parser and
 * returns the metadata list gathered by the wrapper handler.
 *
 * @param filePath     resource name appended to "/test-documents/"
 * @param parserToWrap parser wrapped by the recursive wrapper
 * @param handlerType  handler type given to the BasicContentHandlerFactory
 * @return the handler's metadata list
 */
protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap, BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception {
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(parserToWrap);
    BasicContentHandlerFactory handlerFactory = new BasicContentHandlerFactory(handlerType, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(handlerFactory);

    String resource = "/test-documents/" + filePath;
    try (InputStream in = getResourceAsStream(resource)) {
        recursiveParser.parse(in, collector, new Metadata(), new ParseContext());
    }
    return collector.getMetadataList();
}

Source File: TikaAnalysis.java From tutorials with MIT License

5 votes

/**
 * Parses the stream with a fresh AutoDetectParser and returns the text collected by a
 * default BodyContentHandler.
 *
 * @param stream content to parse; not closed by this method
 * @return the handler's string form
 */
public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
    ContentHandler bodyText = new BodyContentHandler();
    new AutoDetectParser().parse(stream, bodyText, new Metadata(), new ParseContext());
    return bodyText.toString();
}

Source File: TikaProcessor.java From quarkus with Apache License 2.0

5 votes

/**
 * Build step that resolves the supported parser configuration, generates the Tika XML
 * configuration from it, registers service providers for Parser, Detector and
 * EncodingDetector, and finally hands the XML configuration to the recorder.
 */
@BuildStep
@Record(ExecutionTime.STATIC_INIT)
void initializeTikaParser(BeanContainerBuildItem beanContainer, TikaRecorder recorder,
        BuildProducer<ServiceProviderBuildItem> serviceProvider, TikaConfiguration configuration)
        throws Exception {
    Map<String, List<TikaParserParameter>> supportedParsers = getSupportedParserConfig(
            configuration.tikaConfigPath,
            configuration.parsers,
            configuration.parserOptions,
            configuration.parser);
    String tikaXml = generateTikaXmlConfiguration(supportedParsers);

    // Parser providers are limited to the supported set computed above.
    String parserService = Parser.class.getName();
    serviceProvider.produce(new ServiceProviderBuildItem(parserService, new ArrayList<>(supportedParsers.keySet())));

    // Detector and EncodingDetector providers are registered as returned by getProviderNames.
    String detectorService = Detector.class.getName();
    serviceProvider.produce(new ServiceProviderBuildItem(detectorService, getProviderNames(detectorService)));

    String encodingDetectorService = EncodingDetector.class.getName();
    serviceProvider.produce(
            new ServiceProviderBuildItem(encodingDetectorService, getProviderNames(encodingDetectorService)));

    recorder.initTikaParser(beanContainer.getValue(), configuration, tikaXml);
}

Source File: TikaTest.java From tika-server with Apache License 2.0

5 votes

/**
 * Parses a test document through a RecursiveParserWrapper around a new AutoDetectParser,
 * using an XML handler factory, and returns the collected metadata list.
 *
 * @param filePath resource name appended to "/test-documents/"
 * @param context  parse context handed to the wrapper
 * @param metadata metadata object handed to the wrapper
 * @return the handler's metadata list
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(new AutoDetectParser());
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    String resource = "/test-documents/" + filePath;
    try (InputStream in = getResourceAsStream(resource)) {
        recursiveParser.parse(in, collector, metadata, context);
    }
    return collector.getMetadataList();
}

Source File: TikaTest.java From tika-server with Apache License 2.0

5 votes

/**
 * Parses a test document through a RecursiveParserWrapper around a new AutoDetectParser,
 * using an XML handler factory and a fresh Metadata object, and returns the collected
 * metadata list.
 *
 * @param filePath resource name appended to "/test-documents/"
 * @param context  parse context handed to the wrapper
 * @return the handler's metadata list
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(new AutoDetectParser());
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    String resource = "/test-documents/" + filePath;
    try (InputStream in = getResourceAsStream(resource)) {
        recursiveParser.parse(in, collector, new Metadata(), context);
    }
    return collector.getMetadataList();
}

Source File: TikaTest.java From tika-server with Apache License 2.0

5 votes

/**
 * Parses a test document through a RecursiveParserWrapper around the given parser,
 * using an XML handler factory and a fresh Metadata object, and returns the collected
 * metadata list.
 *
 * @param filePath     resource name appended to "/test-documents/"
 * @param parserToWrap parser wrapped by the recursive wrapper
 * @param parseContext parse context handed to the wrapper
 * @return the handler's metadata list
 */
protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap, ParseContext parseContext) throws Exception {
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(parserToWrap);
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    String resource = "/test-documents/" + filePath;
    try (InputStream in = getResourceAsStream(resource)) {
        recursiveParser.parse(in, collector, new Metadata(), parseContext);
    }
    return collector.getMetadataList();
}

Source File: ParsingReader.java From extract with MIT License

5 votes

/**
 * Creates a reader for the content of the given binary stream
 * with the given document metadata. The given parser is used for the
 * parsing task, which is started by the call to {@code parse()} below
 * (that method is defined outside this constructor).
 *
 * The created reader will be responsible for closing the given stream.
 * The stream and any associated resources will be closed at or before
 * the time when the {@link #close()} method is called on this reader.
 *
 * @param parser parser instance
 * @param input binary stream
 * @param metadata document metadata
 * @param context parsing context
 * @param handler function that is applied to the internal pipe writer to
 *                produce the content handler stored on this reader
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(final Parser parser, final InputStream input, final Metadata metadata, final ParseContext
		context, final Function<Writer, ContentHandler> handler) throws IOException {
	final PipedReader pipedReader = new PipedReader();

	this.parser = parser;
	// The buffered reader wraps the reading end of the pipe.
	reader = new BufferedReader(pipedReader);

	try {
		// Connect the writing end of the pipe to the reading end created above.
		writer = new PipedWriter(pipedReader);
	} catch (IOException e) {
		throw new IllegalStateException(e); // Should never happen.
	}

	this.input = input;
	this.metadata = metadata;
	this.context = context;

	// Generate the handler.
	this.handler = handler.apply(writer);

	// Must run after all fields above are assigned and before the first read below.
	parse();
	
	// TIKA-203: Buffer first character to force metadata extraction.
	reader.mark(1);

	//noinspection ResultOfMethodCallIgnored
	reader.read();
	// Rewind to the mark so the character read above is not lost to callers.
	reader.reset();
}

Source File: EmbeddedDocumentMemoryExtractor.java From extract with MIT License

5 votes

/**
 * Parses the root document with a DigestEmbeddedDocumentExtractor registered in the
 * parse context and returns the document held by that extractor afterwards.
 *
 * @param rootDocument           document whose file is opened and parsed
 * @param embeddedDocumentDigest digest passed to the embedded document extractor
 * @return the value of the extractor's getDocument() after parsing
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 * @throws IOException   if the file cannot be opened, read or closed
 */
public TikaDocumentSource extract(final TikaDocument rootDocument, final String embeddedDocumentDigest) throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    ContentHandler handler = new BodyContentHandler(-1);
    context.set(Parser.class, parser);

    DigestEmbeddedDocumentExtractor extractor = new DigestEmbeddedDocumentExtractor(rootDocument, embeddedDocumentDigest, context, digester, algorithm);
    context.set(org.apache.tika.extractor.EmbeddedDocumentExtractor.class, extractor);

    // The stream is opened here, so it is closed here: previously it was never closed
    // and the file handle leaked on every call, including when parsing failed.
    try (FileInputStream input = new FileInputStream(rootDocument.getPath().toFile())) {
        parser.parse(input, handler, rootDocument.getMetadata(), context);
    }

    return extractor.getDocument();
}

Source File: EmbeddingHTMLParsingReader.java From extract with MIT License

5 votes

/**
 * Creates a parsing reader whose output is serialized through an HTML5Serializer wrapped
 * in an ExpandedTitleContentHandler and a SubstitutingContentHandler, and sets up a
 * TokenReplacingReader over the parent's reader.
 *
 * For each token, the replacer looks up the embed on the parent document; a missing
 * embed yields null, otherwise a reader from DataURIEncodingInputStream.createReader.
 *
 * @param parent   document whose embeds are looked up by token
 * @param open     opening delimiter passed to the handler and the replacer
 * @param close    closing delimiter passed to the handler and the replacer
 * @param parser   parser passed to the superclass
 * @param input    input stream passed to the superclass
 * @param metadata metadata passed to the superclass
 * @param context  parse context passed to the superclass
 * @throws IOException propagated from the superclass constructor
 */
public EmbeddingHTMLParsingReader(final TikaDocument parent, final String open, final String close, final Parser
		parser, final TikaInputStream input, final Metadata metadata, final ParseContext context) throws
		IOException {
	super(parser, input, metadata, context, (out) -> {
		final HTML5Serializer serializer = new HTML5Serializer(out);
		return new SubstitutingContentHandler(parent, open, close, new ExpandedTitleContentHandler(serializer));
	});
	this.replacer = new TokenReplacingReader((placeholder) -> {
		final EmbeddedTikaDocument embedded = parent.getEmbed(placeholder);
		return null == embedded
				? null
				: DataURIEncodingInputStream.createReader(embedded.getPath(), embedded.getMetadata());
	}, reader, open, close);
}

Source File: TikaLambdaHandler.java From tika-lambda with Apache License 2.0

5 votes

/**
 * Extracts the text of an object with Tika's AutoDetectParser and assembles the result.
 *
 * SAX events from the parser are serialized by a TransformerHandler configured with the
 * "text" output method into a StringWriter. A key ending in
 * "tika.exception.testing.pdf" (case-insensitive) deliberately raises a TikaException
 * so the failure path can be exercised.
 *
 * @param bucket     bucket name, passed through to the result assembly
 * @param key        object key, used for the synthetic-failure check and the result
 * @param objectData stream with the object content; not closed by this method
 * @return the value of assembleExtractionResult on success, or of assembleExceptionResult
 *         when a TikaException is raised
 * @throws IOException                       propagated from the parser
 * @throws TransformerConfigurationException if the transformer handler cannot be created
 * @throws SAXException                      propagated from the parser
 */
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException {
  _logger.log("Extracting text with Tika");
  String extractedText = "";

  SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
  TransformerHandler handler = factory.newTransformerHandler();
  handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
  handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
  StringWriter sw = new StringWriter();
  handler.setResult(new StreamResult(sw));
  AutoDetectParser parser = new AutoDetectParser();
  ParseContext parseContext = new ParseContext();
  parseContext.set(Parser.class, parser);

  // The unused Tika facade instance that used to be created here was removed:
  // nothing read it, and building it costs a configuration load on every call.
  Metadata tikaMetadata = new Metadata();
  try {
    // for synthetic transactions
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale
    // (e.g. Turkish dotless-i rules would otherwise break the suffix match).
    if( key.toLowerCase(java.util.Locale.ROOT).endsWith("tika.exception.testing.pdf")) {
      throw new TikaException("Test Tika Exception");
    }
    parser.parse(objectData, handler, tikaMetadata, parseContext);
    extractedText = sw.toString();
  } catch( TikaException e) {
    _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
    return assembleExceptionResult(bucket, key, e);
  }
  _logger.log("Tika parsing success");
  return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}

Source File: TikaIO.java From beam with Apache License 2.0

5 votes

/**
 * Parses one readable file with Tika and emits a ParseResult for it.
 *
 * A success result carries the extracted text and metadata; if parsing throws, a failure
 * result carrying whatever text and metadata were gathered plus the exception is emitted
 * instead, so the exception does not escape from the parse itself.
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  ReadableFile file = c.element();
  InputStream rawStream = Channels.newInputStream(file.open());
  try (InputStream tikaStream = TikaInputStream.get(rawStream)) {
    Parser parser;
    if (tikaConfig == null) {
      parser = new AutoDetectParser();
    } else {
      parser = new AutoDetectParser(tikaConfig);
    }

    ParseContext context = new ParseContext();
    context.set(Parser.class, parser);

    // NOTE(review): when spec.getInputMetadata() is non-null that same instance is
    // handed to the parser for each element — confirm the sharing is intended.
    Metadata tikaMetadata;
    if (spec.getInputMetadata() == null) {
      tikaMetadata = new Metadata();
    } else {
      tikaMetadata = spec.getInputMetadata();
    }
    if (spec.getContentTypeHint() != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
    }

    String location = file.getMetadata().resourceId().toString();
    ContentHandler tikaHandler = new ToTextContentHandler();
    ParseResult outcome;
    try {
      parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
      outcome = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
    } catch (Exception e) {
      outcome = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
    }

    // Emitted outside the inner try so an output failure is not turned into a parse failure.
    c.output(outcome);
  }
}

Source File: TikaCallable.java From flink-crawler with Apache License 2.0

5 votes

/**
 * Stores the collaborators and inputs for a later parse; no work is done here.
 *
 * @param parser           parser to use
 * @param contentExtractor content extractor to use
 * @param linkExtractor    link extractor to use
 * @param input            stream to be parsed
 * @param metadata         metadata for the parse
 * @param extractLanguage  flag stored in _extractLanguage
 * @param parseContext     parse context for the parse
 */
public TikaCallable(Parser parser, BaseContentExtractor contentExtractor,
        BaseLinkExtractor linkExtractor, InputStream input, Metadata metadata,
        boolean extractLanguage, ParseContext parseContext) {
    // Parsing machinery.
    _parser = parser;
    _parseContext = parseContext;

    // Extractors.
    _contentExtractor = contentExtractor;
    _linkExtractor = linkExtractor;
    _extractLanguage = extractLanguage;

    // Document being processed.
    _input = input;
    _metadata = metadata;
}

Source File: ArchiveContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0

5 votes

/**
 * Extends the parent's parse context with the parser used for embedded contents.
 *
 * Recursion follows includeContents unless the options carry a non-null
 * include-embedded value, which then takes precedence.
 *
 * @param metadata       passed to the parent implementation
 * @param targetMimeType passed to the parent implementation
 * @param options        transformation options; may override the recursion default
 * @return the parent's context with Parser.class set to an AutoDetectParser (recursing)
 *         or an EmptyParser (not recursing)
 */
@Override
protected ParseContext buildParseContext(Metadata metadata,
     String targetMimeType, TransformationOptions options) {
  ParseContext context = super.buildParseContext(metadata, targetMimeType, options);

  boolean recurse = includeContents;
  if(options.getIncludeEmbedded() != null)
  {
     recurse = options.getIncludeEmbedded();
  }

  if(!recurse)
  {
      // REPO-1066: an AutoDetectParser is the default in Tika after: https://issues.apache.org/jira/browse/TIKA-2096
      // so we need to specify an empty one if we don't want the recurse parsing to happen
      context.set(Parser.class, new EmptyParser());
      return context;
  }

  // Recursing: use an auto detect parser to handle the contents,
  // initialising the shared config on first use.
  if(tikaConfig == null)
  {
      tikaConfig = TikaConfig.getDefaultConfig();
  }
  context.set(Parser.class, new AutoDetectParser(tikaConfig));
  return context;
}

Source File: TikaAudioMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0

5 votes

/**
 * Builds a new CompositeParser from the Tika config's media type registry and the
 * {@code parsers} value on every call.
 *
 * @return a freshly created composite parser
 */
@Override
protected Parser getParser()
{
   final Parser composite = new CompositeParser(tikaConfig.getMediaTypeRegistry(), parsers);
   return composite;
}

Source File: TikaSpringConfiguredMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0

5 votes

/**
 * Injects the name of the Tika parser to use.
 *
 * The class is loaded by name, stored in {@code tikaParserClass}, and the parser returned
 * by {@code getParser()} is then installed through {@code setTikaParser}.
 *
 * @param className fully qualified name of the parser class
 * @throws AlfrescoRuntimeException if the class cannot be found; the original
 *         ClassNotFoundException is attached as the cause
 */
@SuppressWarnings("unchecked")
public void setTikaParserName(String className)
{
   tikaParserClassName = className;
   
   // Load the class
   try {
      tikaParserClass = (Class<? extends Parser>)Class.forName(tikaParserClassName);
      setTikaParser(getParser());
   } catch(ClassNotFoundException e) {
      // Keep the cause so the stack trace shows where class loading failed.
      throw new AlfrescoRuntimeException("Specified Tika Parser '" + tikaParserClassName + "' not found", e);
   }
}

Source File: TikaSpringConfiguredMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0

5 votes

/**
 * Injects the Tika parser to use and republishes its supported media types, as strings,
 * to the parent through {@code setSupportedMimetypes}.
 *
 * @param tikaParser parser to store and to query for supported types
 */
public void setTikaParser(Parser tikaParser)
{
   this.tikaParser = tikaParser;

   // Rebuild the mimetype list from what the parser advertises for an empty context.
   ArrayList<String> supported = new ArrayList<String>();
   for(MediaType type : tikaParser.getSupportedTypes(new ParseContext()))
   {
      String asText = type.toString();
      supported.add(asText);
   }
   super.setSupportedMimetypes(supported);
}

Source File: MP3MetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0

4 votes

/**
 * Supplies the parser for this extracter.
 *
 * @return a new Mp3Parser on every call
 */
@Override
protected Parser getParser()
{
   final Parser mp3 = new Mp3Parser();
   return mp3;
}

Source File: DocUtils.java From geoportal-server-harvester with Apache License 2.0

4 votes

/**
 * Parses a file's bytes with Tika and renders the resulting metadata as an XML document.
 *
 * All non-empty Tika metadata values are copied into a property set. The description,
 * modified and title entries expected by the harvester are then added, falling back to
 * the dumped property text, a "TIKA_" + today's date label, and the file name
 * respectively when Tika did not supply them.
 *
 * @param file_bytes raw content of the file
 * @param file_name  name used as the title when the metadata has none
 * @return the XML encoded as UTF-8 bytes, or null if anything failed (the failure is logged)
 * @throws IOException if closing the input stream fails
 */
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException {

	ByteArrayInputStream source = new ByteArrayInputStream(file_bytes);
	byte[] result = null;

	Parser tika = new AutoDetectParser();
	BodyContentHandler body = new BodyContentHandler();
	Metadata extracted = new Metadata();
	ParseContext parseContext = new ParseContext();

	try {
		// Let Tika populate the metadata object.
		tika.parse(source, body, extracted, parseContext);

		// Copy every non-empty metadata value into the property set.
		Properties props = new Properties();
		for (String name : extracted.names()) {
			String value = extracted.get(name);
			if (!value.isEmpty()) {
				props.put(name, value);
			}
		}

		// Snapshot of the raw Tika values; used as the description fallback below.
		StringWriter dump = new StringWriter();
		props.store(dump, "Tika Values");

		String description = extracted.get(TikaCoreProperties.DESCRIPTION);
		String modified = extracted.get(TikaCoreProperties.MODIFIED);
		String title = extracted.get(TikaCoreProperties.TITLE);

		// Label used when Tika reports no modification date.
		String today = new SimpleDateFormat("yyyy/MM/dd").format(new Date());
		String fallbackLabel = String.format("TIKA_%s", today);

		props.put(WKAConstants.WKA_DESCRIPTION, description == null ? dump.toString() : description);
		props.put(WKAConstants.WKA_MODIFIED, modified == null ? fallbackLabel : modified);
		props.put(WKAConstants.WKA_TITLE, title == null ? file_name : title);

		// Build the XML and encode it.
		MapAttribute attributes = AttributeUtils.fromProperties(props);
		Document xml = new SimpleDcMetaBuilder().create(attributes);
		result = XmlUtils.toString(xml).getBytes("UTF-8");

	} catch (Exception ex) {
		LOG.error("Error reading data.", ex);
	} finally {
		source.close();
	}

	return result;

}

Source File: PdfBoxMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0

4 votes

/**
 * Supplies the parser for this extracter.
 *
 * @return a new PDFParser on every call
 */
@Override
protected Parser getParser()
{
   final Parser pdf = new PDFParser();
   return pdf;
}

Source File: TikaAutoMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0

4 votes

/**
 * Does auto-detection to select the best Tika
 *  Parser.
 *
 * @return the value of {@code parser}, which is declared outside this method;
 *         no new parser is created here
 */
@Override
protected Parser getParser() 
{
   return parser;
}

org.apache.tika.parser.Parser Java Examples