org.apache.tika.parser.AutoDetectParser Java Examples
The following examples show how to use
org.apache.tika.parser.AutoDetectParser.
You can vote up the examples you find useful or vote down those you don't,
and follow the link above each example to its original project or source file. Related API usage is listed on the sidebar.
Example #1
Source File: NodeTika.java From node-tika with MIT License | 6 votes |
/**
 * Extracts metadata from the resource at the given URI and returns it serialized as JSON.
 *
 * @param uri         location of the resource to inspect
 * @param contentType optional content-type hint used to pre-fill the Tika metadata
 * @return a JSON object mapping each metadata name to its array of values
 * @throws Exception if the stream cannot be opened or Tika parsing fails
 */
public static String extractMeta(String uri, String contentType) throws Exception {
    final AutoDetectParser parser = createParser();
    final Metadata metadata = new Metadata();
    fillMetadata(parser, metadata, contentType, uri);

    final TikaInputStream inputStream = createInputStream(uri, metadata);
    try {
        // DefaultHandler discards body content; we only want the metadata side effects.
        parser.parse(inputStream, new DefaultHandler(), metadata);
    } finally {
        // Previously closed only on the happy path; close even when parse() throws.
        inputStream.close();
    }

    // Parameterized map instead of the raw Map/HashMap types.
    final Map<String, String[]> meta = new HashMap<>();
    for (String name : metadata.names()) {
        meta.put(name, metadata.getValues(name));
    }
    return new Gson().toJson(meta);
}
Example #2
Source File: FileParserSingleton.java From scava with Eclipse Public License 2.0 | 6 votes |
/**
 * Private singleton constructor: loads the list of supported file types and
 * initialises the shared Tika auto-detect parser.
 */
private FileParserSingleton() {
    logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.preprocessor.fileparser");
    try {
        BufferedReader fileList = loadFile();
        readSupportedFilesList(fileList);
        logger.info("List of supported files has been sucessfully loaded");
        parser = new AutoDetectParser();
    } catch (IOException e) {
        // Report through the project logger only; the previous extra
        // e.printStackTrace() duplicated the error on stderr, bypassing
        // the logging framework.
        logger.error("Error while loading the List of supported files:", e);
    }
}
Example #3
Source File: TikaDocumentItemProcessor.java From CogStack-Pipeline with Apache License 2.0 | 6 votes |
/**
 * Initialises the Tika parsing stack after dependency injection completes:
 * loads the Tika configuration from the bundled tika-config.xml, applies the
 * configured Tesseract OCR and ImageMagick timeouts, and builds the shared
 * auto-detect parser from that configuration.
 *
 * @throws IOException   if tika-config.xml cannot be read from the classpath
 * @throws SAXException  if tika-config.xml is malformed
 * @throws TikaException if the Tika configuration is invalid
 */
@PostConstruct public void init() throws IOException, SAXException, TikaException{
    setFieldName(tikaFieldName);
    // load tika configuration
    tikaConfig = new TikaConfig(this.getClass().getClassLoader()
            .getResourceAsStream("tika-config.xml"));
    // load tesseract ocr configuration
    tesseractConfig = new TesseractOCRConfig();
    // a non-positive timeout leaves the Tesseract default in place
    if (tesseractTimeout > 0) {
        tesseractConfig.setTimeout(tesseractTimeout);
    }
    // load image magick configuration -- used for tiff conversion
    imgConfig = new ImageMagickConfig();
    // likewise, only override the conversion timeout when explicitly set
    if (convertTimeout > 0) {
        imgConfig.setTimeout(convertTimeout);
    }
    parser = new AutoDetectParser(tikaConfig);
}
Example #4
Source File: TikaUtil.java From scipio-erp with Apache License 2.0 | 6 votes |
/**
 * Finds the media type (through the Apache Tika library), based on filename
 * and magic numbers.
 * <p>
 * NOTE: the supplied stream is wrapped and closed by this method.
 *
 * @param is       stream positioned at the start of the content to sniff
 * @param fileName original file name, used as a detection hint
 * @return the detected media type
 * @throws IOException if reading the stream fails during detection
 */
public static MediaType findMediaType(InputStream is, String fileName) throws IOException {
    BufferedInputStream bis = new BufferedInputStream(is);
    try {
        AutoDetectParser parser = new AutoDetectParser();
        Detector detector = parser.getDetector();
        Metadata md = new Metadata();
        md.add(Metadata.RESOURCE_NAME_KEY, fileName);
        MediaType mediaType = detector.detect(bis, md);
        return mediaType;
    } finally {
        try {
            bis.close();
        } catch (IOException ignored) {
            // Best-effort close: a close failure must not mask the
            // detection result (or the primary exception) above.
        }
    }
}
Example #5
Source File: TikaContentExtractor.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Extracts the document body and metadata from the stream with Tika and
 * copies both into the JCas. On a parse failure the problem is logged and,
 * if no text was recovered at all, a corrupt-file placeholder is set.
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        // Integer.MAX_VALUE lifts the default body-size cap.
        BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
        new AutoDetectParser().parse(stream, textHandler, metadata, context);

        jCas.setDocumentText(textHandler.toString());
        for (String name : metadata.names()) {
            addMetadata(jCas, name, metadata.get(name));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
        if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
            jCas.setDocumentText(CORRUPT_FILE_TEXT);
        }
    }
}
Example #6
Source File: FTConnector.java From openprodoc with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Extracts the full text and the metadata of a document using Tika's
 * auto-detecting parser, storing both in the corresponding fields.
 *
 * @param Bytes stream containing the document to convert
 * @return the extracted plain text of the document
 * @throws PDException wrapping any extraction failure
 */
protected String Convert(InputStream Bytes) throws PDException {
    try {
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        // -1 removes the default body-size limit.
        ContentHandler textHandler = new BodyContentHandler(-1);
        Parser parser = new AutoDetectParser();
        parser.parse(Bytes, textHandler, metadata, context);

        // Render the metadata as one "key=value" entry per line.
        StringBuilder metaText = new StringBuilder();
        for (String key : metadata.names()) {
            metaText.append(key).append('=').append(metadata.get(key)).append('\n');
        }
        FileMetadata = metaText.toString();
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return (FullText);
}
Example #7
Source File: HTMLRenderingEngine.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
@Override protected void render(RenderingContext context) { ContentReader contentReader = context.makeContentReader(); String sourceMimeType = contentReader.getMimetype(); // Check that Tika supports the supplied file AutoDetectParser p = new AutoDetectParser(tikaConfig); MediaType sourceMediaType = MediaType.parse(sourceMimeType); if(! p.getParsers().containsKey(sourceMediaType)) { throw new RenditionServiceException( "Source mime type of " + sourceMimeType + " is not supported by Tika for HTML conversions" ); } // Make the HTML Version using Tika // This will also extract out any images as found generateHTML(p, context); }
Example #8
Source File: TikaAutoMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig) { config = tikaConfig; parser = new AutoDetectParser(config); SUPPORTED_MIMETYPES = new ArrayList<String>(); for(MediaType mt : parser.getParsers().keySet()) { // Add the canonical mime type SUPPORTED_MIMETYPES.add( mt.toString() ); // And add any aliases of the mime type too - Alfresco uses some // non canonical forms of various mimetypes, so we need all of them for(MediaType alias : config.getMediaTypeRegistry().getAliases(mt)) { SUPPORTED_MIMETYPES.add( alias.toString() ); } } return SUPPORTED_MIMETYPES; }
Example #9
Source File: ExtractingDocumentLoader.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Wires up a document loader for Solr's extracting request handler: copies the
 * request collaborators into fields, prepares the reusable add-command
 * template, and builds the Tika auto-detect parser.
 *
 * @param req                the Solr request being served
 * @param processor          downstream update processor chain
 * @param config             Tika configuration for the parser
 * @param parseContextConfig parse-context configuration to apply per document
 * @param factory            factory for Solr content handlers
 */
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
        TikaConfig config, ParseContextConfig parseContextConfig,
        SolrContentHandlerFactory factory) {
    this.params = req.getParams();
    this.core = req.getCore();
    this.config = config;
    this.parseContextConfig = parseContextConfig;
    this.processor = processor;

    // Template command reused for every extracted document; overwrite and
    // commitWithin come from the request parameters (with defaults).
    templateAdd = new AddUpdateCommand(req);
    templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true);
    templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);

    //this is lightweight
    autoDetectParser = new AutoDetectParser(config);
    this.factory = factory;

    ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
}
Example #10
Source File: TikaLambdaHandler.java From tika-lambda with Apache License 2.0 | 5 votes |
/**
 * Extracts plain text from an S3 object's stream with Tika and assembles the
 * extraction result. A key ending in "tika.exception.testing.pdf" forces a
 * TikaException so synthetic transactions can exercise the failure path.
 *
 * @param bucket     S3 bucket name (used only for result assembly/reporting)
 * @param key        S3 object key; also the synthetic-failure trigger
 * @param objectData stream of the object's content
 * @return the assembled extraction (or exception) result
 * @throws IOException                       on stream failures during parsing
 * @throws TransformerConfigurationException if the SAX transformer cannot be built
 * @throws SAXException                      on SAX failures during parsing
 */
private String doTikaStuff(String bucket, String key, InputStream objectData)
        throws IOException, TransformerConfigurationException, SAXException {
    _logger.log("Extracting text with Tika");
    String extractedText = "";

    // SAX transformer configured for plain-text output, captured in a StringWriter.
    SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    TransformerHandler handler = factory.newTransformerHandler();
    handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
    handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    StringWriter sw = new StringWriter();
    handler.setResult(new StreamResult(sw));

    AutoDetectParser parser = new AutoDetectParser();
    ParseContext parseContext = new ParseContext();
    // Registering the parser in the context lets embedded documents recurse.
    parseContext.set(Parser.class, parser);
    // NOTE(review): this Tika facade instance appears unused below — confirm
    // before removing.
    Tika tika = new Tika();
    Metadata tikaMetadata = new Metadata();
    try {
        // for synthetic transactions
        if (key.toLowerCase().endsWith("tika.exception.testing.pdf")) {
            throw new TikaException("Test Tika Exception");
        }
        parser.parse(objectData, handler, tikaMetadata, parseContext);
        extractedText = sw.toString();
    } catch (TikaException e) {
        _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
        return assembleExceptionResult(bucket, key, e);
    }
    _logger.log("Tika parsing success");
    return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}
Example #11
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testEncryptedWordDoc() throws Exception { System.out.println("testEncryptedWordDoc"); InputStream stream = getClass().getClassLoader().getResourceAsStream("encryptedWordDocx.docx"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); try { parser.parse(stream, body, metadata); } catch (Exception ex) { //donowt } assertFalse(body.toString().contains("Word doc Encrypted")); }
Example #12
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testParseRequiringNotRequiringOCR() throws Exception { System.out.println("parse"); InputStream stream = getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_nonOCR_test.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //AutoDetectParser parser = new AutoDetectParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); try { parser.parse(stream, body, metadata); } finally { stream.close(); } assertTrue(body.toString().contains("An Example Paper")); }
Example #13
Source File: SimplePageParser.java From flink-crawler with Apache License 2.0 | 5 votes |
/**
 * Initialises this page parser: configures the link extractor from the
 * active parser policy and creates the Tika parser used for extraction.
 */
@Override
public void open(RuntimeContext context) throws Exception {
    super.open(context);
    _linkExtractor.setLinkTags(getParserPolicy().getLinkTags());
    _linkExtractor.setLinkAttributeTypes(getParserPolicy().getLinkAttributeTypes());
    _parser = new AutoDetectParser();
}
Example #14
Source File: TikaIO.java From beam with Apache License 2.0 | 5 votes |
/**
 * Parses one input file with Tika and emits a ParseResult — success with the
 * extracted text, or failure carrying the exception — for downstream
 * processing. Content-type hints and input metadata come from the transform's
 * spec when present.
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    ReadableFile file = c.element();
    InputStream stream = Channels.newInputStream(file.open());
    // TikaInputStream wraps (and on close, closes) the channel stream.
    try (InputStream tikaStream = TikaInputStream.get(stream)) {
        // Use the configured TikaConfig when one was supplied.
        Parser parser =
            tikaConfig == null ? new AutoDetectParser() : new AutoDetectParser(tikaConfig);
        ParseContext context = new ParseContext();
        context.set(Parser.class, parser);
        // Seed metadata from the spec if provided, otherwise start empty.
        Metadata tikaMetadata =
            spec.getInputMetadata() != null ? spec.getInputMetadata() : new Metadata();
        if (spec.getContentTypeHint() != null) {
            tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
        }
        String location = file.getMetadata().resourceId().toString();
        ParseResult res;
        ContentHandler tikaHandler = new ToTextContentHandler();
        try {
            parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
            res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
        } catch (Exception e) {
            // Partial text extracted before the failure is still reported.
            res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
        }
        c.output(res);
    }
}
Example #15
Source File: TikaAnalysis.java From tutorials with MIT License | 5 votes |
/**
 * Extracts a document's metadata by running it through Tika's auto-detecting
 * parser (body content is discarded).
 * Note: the method name's spelling is kept for source compatibility.
 *
 * @param stream the document content
 * @return the metadata populated during parsing
 */
public static Metadata extractMetadatatUsingParser(InputStream stream)
        throws IOException, SAXException, TikaException {
    Metadata extracted = new Metadata();
    new AutoDetectParser().parse(stream, new BodyContentHandler(), extracted, new ParseContext());
    return extracted;
}
Example #16
Source File: TearlineContentExtractor.java From baleen with Apache License 2.0 | 5 votes |
/**
 * Extracts document text with Tika, truncates it at the first tearline match
 * (if any), strips boilerplate, and copies the metadata into the JCas.
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        Metadata metadata = new Metadata();
        // Integer.MAX_VALUE lifts the default body-size cap.
        BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
        new AutoDetectParser().parse(stream, textHandler, metadata, new ParseContext());

        String fullContent = textHandler.toString();
        Matcher tearline = tearlinePattern.matcher(fullContent);
        // Keep only the text above the tearline when one is present.
        String retained = tearline.find()
                ? fullContent.substring(0, tearline.start())
                : fullContent;
        jCas.setDocumentText(removeBoilerplate(retained).trim());

        for (String name : metadata.names()) {
            addMetadata(jCas, name, metadata.get(name));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    }
}
Example #17
Source File: JATEUtil.java From jate with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Converts a document stream to plain text via Tika auto-detection.
 * Extraction failures are logged at debug level and yield an empty string.
 *
 * @param fileStream the document content
 * @return the extracted text, or "" when parsing fails
 */
public static String parseToPlainText(InputStream fileStream) {
    BodyContentHandler handler = new BodyContentHandler();
    try {
        new AutoDetectParser().parse(fileStream, handler, new Metadata());
        return handler.toString();
    } catch (IOException | SAXException | TikaException e) {
        // Best-effort: an unreadable document simply contributes no text.
        LOG.debug("Parsing Exception while extracting content from current file. " + e.toString());
        return "";
    }
}
Example #18
Source File: AttachAttribute.java From entando-components with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Builds the indexable text for this attachment attribute: the superclass's
 * indexable value (if any) followed by the full text Tika extracts from the
 * attached resource's stream. Extraction is best-effort — any failure is
 * logged and only the superclass value is indexed.
 *
 * @return the combined indexable text, never null
 */
@Override
public String getIndexeableFieldValue() {
    StringBuilder buffer = new StringBuilder();
    if (null != super.getIndexeableFieldValue()) {
        buffer.append(super.getIndexeableFieldValue());
    }
    String extraValue = null;
    ResourceInterface resource = this.getResource();
    if (resource != null) {
        InputStream is = ((AttachResource) resource).getResourceStream();
        if (null != is) {
            AutoDetectParser parser = new AutoDetectParser();
            // -1 removes the default body-size limit.
            BodyContentHandler handler = new BodyContentHandler(-1);
            Metadata metadata = new Metadata();
            try {
                parser.parse(is, handler, metadata);
                extraValue = handler.toString();
            } catch (Throwable t) {
                // Deliberately broad: indexing must not fail because one
                // attachment cannot be parsed.
                _logger.error("Error while processing the parsing", t);
            } finally {
                try {
                    is.close();
                } catch (IOException ex) {
                    _logger.error("Error closing stream", ex);
                }
            }
        }
    }
    if (null != extraValue) {
        buffer.append(" ").append(extraValue);
    }
    return buffer.toString();
}
Example #19
Source File: ExtractMediaMetadata.java From nifi with Apache License 2.0 | 5 votes |
@SuppressWarnings("unused") @OnScheduled public void onScheduled(ProcessContext context) { String metadataKeyFilterInput = context.getProperty(METADATA_KEY_FILTER).getValue(); if (metadataKeyFilterInput != null && metadataKeyFilterInput.length() > 0) { metadataKeyFilterRef.set(Pattern.compile(metadataKeyFilterInput)); } else { metadataKeyFilterRef.set(null); } autoDetectParser = new AutoDetectParser(); }
Example #20
Source File: TikaAnalysis.java From tutorials with MIT License | 5 votes |
/**
 * Extracts a document's body text by running it through Tika's
 * auto-detecting parser.
 *
 * @param stream the document content
 * @return the extracted plain text
 */
public static String extractContentUsingParser(InputStream stream)
        throws IOException, TikaException, SAXException {
    ContentHandler textHandler = new BodyContentHandler();
    new AutoDetectParser().parse(stream, textHandler, new Metadata(), new ParseContext());
    return textHandler.toString();
}
Example #21
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Ignore @Test public void testMassiveOCRDoc() throws Exception { System.out.println("testMassiveOCRDoc"); InputStream stream = getClass().getClassLoader().getResourceAsStream("long_OCR_doc.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); parser.parse(stream, body, metadata); assertTrue(body.toString().contains("Saliva-derived genomic DNA samples were genotyped using")); }
Example #22
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testParseRequiringOCR() throws Exception { System.out.println("parse"); InputStream stream = getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_ocr_test.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); parser.parse(stream, body, metadata); String parsedString = body.toString(); // From first page assertTrue(parsedString.contains("Father or mother")); // From second (last) page assertTrue(parsedString.contains("how you have determined who is the Nearest")); }
Example #23
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testEncryptedPDFDoc() throws Exception { System.out.println("testEncryptedPDFDoc"); InputStream stream = getClass().getClassLoader().getResourceAsStream("pdf_encrypted_test.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); try { parser.parse(stream, body, metadata); } catch (Exception ex) { //donowt } assertFalse(body.toString().contains("PDF Encrypted")); }
Example #24
Source File: ArchiveContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
@Override protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options) { ParseContext context = super.buildParseContext(metadata, targetMimeType, options); boolean recurse = includeContents; if(options.getIncludeEmbedded() != null) { recurse = options.getIncludeEmbedded(); } if(recurse) { // Use an auto detect parser to handle the contents if(tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); } context.set(Parser.class, new AutoDetectParser(tikaConfig)); } else { // REPO-1066: an AutoDetectParser is the default in Tika after: https://issues.apache.org/jira/browse/TIKA-2096 // so we need to specify an empty one if we don't want the recurse parsing to happen context.set(Parser.class, new EmptyParser()); } return context; }
Example #25
Source File: TikaPoweredContainerExtractor.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
/** * Injects the TikaConfig to use * * @param tikaConfig The Tika Config to use */ public void setTikaConfig(TikaConfig tikaConfig) { this.config = tikaConfig; // Setup the detector and parser detector = new DefaultDetector(config.getMimeRepository()); parser = new AutoDetectParser(detector); }
Example #26
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Parses a test document recursively with Tika and returns one Metadata
 * object per (embedded) document, with XML content handlers and no
 * body-size limit.
 *
 * @param filePath path under /test-documents/ on the test classpath
 * @param context  parse context to use
 * @param metadata seed metadata for the container document
 * @return metadata for the container and every embedded document
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser());
    RecursiveParserWrapperHandler handler =
            new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(
                    BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
    try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
        wrapper.parse(is, handler, metadata, context);
    }
    return handler.getMetadataList();
}
Example #27
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Convenience overload of
 * {@link #getRecursiveMetadata(String, ParseContext, Metadata)} that starts
 * from an empty Metadata object.
 *
 * @param filePath path under /test-documents/ on the test classpath
 * @param context  parse context to use
 * @return metadata for the container and every embedded document
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    // Delegate instead of duplicating the wrapper/handler setup verbatim.
    return getRecursiveMetadata(filePath, context, new Metadata());
}
Example #28
Source File: TikaAutoInterpreter.java From db with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Reads the file at the given path with Tika and returns a JSON object whose
 * "_txt" key holds the extracted text and whose remaining keys hold the
 * document metadata.
 *
 * @param filePath path of the document to import
 * @return the populated JSON object
 * @throws OperationException when the document cannot be parsed or is empty
 */
@Override
public JSONObject toJson(String filePath) throws OperationException {
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = new FileInputStream(new File(filePath))) {
        new AutoDetectParser().parse(stream, handler, metadata);
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }

    final String fileText = handler.toString();
    if (fileText == null || fileText.isEmpty()) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Attempting to import an empty document");
    }

    JSONObject jsonObject = new JSONObject();
    jsonObject.put("_txt", fileText);
    for (String name : metadata.names()) {
        jsonObject.put(name, metadata.get(name));
    }
    return jsonObject;
}
Example #29
Source File: TikaAutoInterpreter.java From db with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Reads the file at the given path with Tika and returns only its extracted
 * plain text.
 *
 * @param filePath path of the document to read
 * @return the extracted text
 * @throws OperationException when the document cannot be parsed
 */
@Override
public String toText(String filePath) throws OperationException {
    BodyContentHandler handler = new BodyContentHandler();
    try (InputStream stream = new FileInputStream(new File(filePath))) {
        new AutoDetectParser().parse(stream, handler, new Metadata());
        return handler.toString();
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }
}
Example #30
Source File: ExtractMediaMetadata.java From localization_nifi with Apache License 2.0 | 5 votes |
@SuppressWarnings("unused") @OnScheduled public void onScheduled(ProcessContext context) { String metadataKeyFilterInput = context.getProperty(METADATA_KEY_FILTER).getValue(); if (metadataKeyFilterInput != null && metadataKeyFilterInput.length() > 0) { metadataKeyFilterRef.set(Pattern.compile(metadataKeyFilterInput)); } else { metadataKeyFilterRef.set(null); } autoDetectParser = new AutoDetectParser(); }