org.apache.tika.sax.BodyContentHandler Java Examples
The following examples show how to use
org.apache.tika.sax.BodyContentHandler.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0 | 6 votes |
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException { WriteOutContentHandler handler = new WriteOutContentHandler(maxStringLength); try { ParseContext context = new ParseContext(); context.set(Parser.class, txtParser); txtParser.parse(stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!handler.isWriteLimitReached(e)) { // This should never happen with BodyContentHandler... throw new TikaException("Unexpected SAX processing failure", e); } } finally { stream.close(); } return handler.toString(); }
Example #2
Source File: TikaExtractor.java From ache with Apache License 2.0 | 6 votes |
/**
 * Parses the stream with Tika + Boilerpipe and returns the extracted text
 * together with a copy of every metadata entry.
 *
 * @return the parsed data, or {@code null} if extraction fails
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    final BodyContentHandler bodyHandler = new BodyContentHandler(MAX_CHARACTERS);
    // KeepEverythingExtractor: Boilerpipe is used only as a text pipeline here.
    final BoilerpipeContentHandler boilerpipeHandler =
            new BoilerpipeContentHandler(bodyHandler, KeepEverythingExtractor.INSTANCE);
    final Metadata metadata = createMetadata(fileName, contentType);
    final ParseContext parseContext = new ParseContext();
    try {
        parser.parse(stream, boilerpipeHandler, metadata, parseContext);
        final Map<String, String> metadataMap = new HashMap<String, String>();
        for (final String name : metadata.names()) {
            metadataMap.put(name, metadata.get(name));
        }
        return new ParsedData(bodyHandler.toString(), metadataMap);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }
}
Example #3
Source File: TikaContentExtractor.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Extracts text and metadata from the stream with Tika auto-detection and
 * stores both on the CAS. On parse failure the document text falls back to
 * {@code CORRUPT_FILE_TEXT} when nothing was extracted.
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        final BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
        final Metadata metadata = new Metadata();
        final AutoDetectParser autoParser = new AutoDetectParser();
        autoParser.parse(stream, textHandler, metadata, new ParseContext());

        jCas.setDocumentText(textHandler.toString());
        for (final String key : metadata.names()) {
            addMetadata(jCas, key, metadata.get(key));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
        // Only substitute the placeholder when no text was recovered at all.
        if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
            jCas.setDocumentText(CORRUPT_FILE_TEXT);
        }
    }
}
Example #4
Source File: FTConnector.java From openprodoc with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Runs Tika auto-detection over the supplied stream, storing the document's
 * metadata (one {@code key=value} line per entry) in {@code FileMetadata} and
 * its body text in {@code FullText}.
 *
 * @param Bytes document content to convert
 * @return the extracted full text
 * @throws PDException wrapping any extraction failure
 */
protected String Convert(InputStream Bytes) throws PDException {
    try {
        ContentHandler textHandler = new BodyContentHandler(-1); // -1 = no write limit
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);
        StringBuilder metaLines = new StringBuilder();
        for (String key : metadata.names()) {
            metaLines.append(key).append("=").append(metadata.get(key)).append("\n");
        }
        FileMetadata = metaLines.toString();
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return (FullText);
}
Example #5
Source File: EmbedSpawner.java From extract with MIT License | 6 votes |
@Override public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata, final boolean outputHtml) throws SAXException, IOException { // There's no need to spawn inline embeds, like images in PDFs. These should be concatenated to the main // document as usual. if (TikaCoreProperties.EmbeddedResourceType.INLINE.toString().equals(metadata .get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) { final ContentHandler embedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler)); if (outputHtml) { writeStart(handler, metadata); } delegateParsing(input, embedHandler, metadata); if (outputHtml) { writeEnd(handler); } } else { try (final TikaInputStream tis = TikaInputStream.get(input)) { spawnEmbedded(tis, metadata); } } }
Example #6
Source File: EmbedParser.java From extract with MIT License | 5 votes |
/**
 * Parses an embedded document inline, delegating to the wrapped parser and
 * optionally surrounding the output with start/end markup.
 */
@Override
public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata,
        final boolean outputHtml) throws SAXException, IOException {
    if (outputHtml) {
        writeStart(handler, metadata);
    }
    // BodyContentHandler strips html/body wrappers from the embedded content.
    final ContentHandler embedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));
    delegateParsing(input, embedHandler, metadata);
    if (outputHtml) {
        writeEnd(handler);
    }
}
Example #7
Source File: TikaAnalysis.java From tutorials with MIT License | 5 votes |
/**
 * Extracts document metadata from a stream via Tika's auto-detect parser.
 * (The method name carries a historical typo; it is kept for API stability.)
 *
 * @param stream document bytes to inspect; not closed by this method
 * @return the populated metadata
 */
public static Metadata extractMetadatatUsingParser(InputStream stream)
        throws IOException, SAXException, TikaException {
    final Metadata extracted = new Metadata();
    final Parser autoParser = new AutoDetectParser();
    autoParser.parse(stream, new BodyContentHandler(), extracted, new ParseContext());
    return extracted;
}
Example #8
Source File: TikaAnalysis.java From tutorials with MIT License | 5 votes |
/**
 * Extracts the plain-text body of a document stream via Tika's auto-detect parser.
 *
 * @param stream document bytes to read; not closed by this method
 * @return the extracted body text
 */
public static String extractContentUsingParser(InputStream stream)
        throws IOException, TikaException, SAXException {
    final ContentHandler textHandler = new BodyContentHandler();
    final Parser autoParser = new AutoDetectParser();
    autoParser.parse(stream, textHandler, new Metadata(), new ParseContext());
    return textHandler.toString();
}
Example #9
Source File: AttachAttribute.java From entando-components with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Builds the value indexed for this attachment attribute: the superclass's
 * indexable value plus the full text Tika can extract from the attached
 * resource's stream.
 */
@Override
public String getIndexeableFieldValue() {
    StringBuilder buffer = new StringBuilder();
    if (null != super.getIndexeableFieldValue()) {
        buffer.append(super.getIndexeableFieldValue());
    }
    String extraValue = null;
    ResourceInterface resource = this.getResource();
    if (resource != null) {
        InputStream is = ((AttachResource) resource).getResourceStream();
        if (null != is) {
            AutoDetectParser parser = new AutoDetectParser();
            // -1 disables BodyContentHandler's write limit (full document text).
            BodyContentHandler handler = new BodyContentHandler(-1);
            Metadata metadata = new Metadata();
            try {
                parser.parse(is, handler, metadata);
                extraValue = handler.toString();
            } catch (Throwable t) {
                // Broad catch is deliberate: indexing must survive any parser failure.
                _logger.error("Error while processing the parsing", t);
            } finally {
                try {
                    is.close();
                } catch (IOException ex) {
                    _logger.error("Error closing stream", ex);
                }
            }
        }
    }
    if (null != extraValue) {
        // Separate the extracted text from the base value with a single space.
        buffer.append(" ").append(extraValue);
    }
    return buffer.toString();
}
Example #10
Source File: JATEUtil.java From jate with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Extracts plain text from the given stream using Tika auto-detection.
 *
 * @param fileStream document bytes to read; not closed by this method
 * @return the extracted text, or an empty string when extraction fails
 */
public static String parseToPlainText(InputStream fileStream) {
    final BodyContentHandler textHandler = new BodyContentHandler();
    final AutoDetectParser autoParser = new AutoDetectParser();
    String plainText = "";
    try {
        autoParser.parse(fileStream, textHandler, new Metadata());
        plainText = textHandler.toString();
    } catch (IOException | SAXException | TikaException e) {
        LOG.debug("Parsing Exception while extracting content from current file. " + e.toString());
    }
    return plainText;
}
Example #11
Source File: TearlineContentExtractor.java From baleen with Apache License 2.0 | 5 votes |
/**
 * Extracts text with Tika, truncates it at the first tearline match (if any),
 * removes boilerplate, and stores the result plus all metadata on the CAS.
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);
    try {
        final BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
        final Metadata metadata = new Metadata();
        final AutoDetectParser autoParser = new AutoDetectParser();
        autoParser.parse(stream, textHandler, metadata, new ParseContext());

        final String fullContent = textHandler.toString();
        final Matcher tearline = tearlinePattern.matcher(fullContent);
        // Keep only the text above the tearline when one is present.
        final String kept = tearline.find() ? fullContent.substring(0, tearline.start()) : fullContent;
        jCas.setDocumentText(removeBoilerplate(kept).trim());

        for (final String name : metadata.names()) {
            addMetadata(jCas, name, metadata.get(name));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    }
}
Example #12
Source File: PdfParser.java From superword with Apache License 2.0 | 5 votes |
/**
 * Parses a PDF file into plain text.
 *
 * @param file relative or absolute path of the local PDF file
 * @return the extracted text, or an empty string on failure
 */
public static String parsePdfFileToPlainText(String file) {
    try (InputStream stream = new FileInputStream(file)) {
        final BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
        PARSER.parse(stream, textHandler, new Metadata());
        return textHandler.toString();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return "";
}
Example #13
Source File: EmbeddedDocumentMemoryExtractor.java From extract with MIT License | 5 votes |
/**
 * Parses the root document and extracts the embedded document whose digest
 * matches {@code embeddedDocumentDigest}.
 *
 * @param rootDocument           document whose embedded resources are searched
 * @param embeddedDocumentDigest digest identifying the target embedded document
 * @return the matching embedded document's content and metadata
 * @throws IOException   on stream failure
 * @throws SAXException  on content-handler failure
 * @throws TikaException on parser failure
 */
public TikaDocumentSource extract(final TikaDocument rootDocument, final String embeddedDocumentDigest)
        throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    ContentHandler handler = new BodyContentHandler(-1); // -1 = no write limit
    context.set(Parser.class, parser);

    DigestEmbeddedDocumentExtractor extractor = new DigestEmbeddedDocumentExtractor(
            rootDocument, embeddedDocumentDigest, context, digester, algorithm);
    context.set(org.apache.tika.extractor.EmbeddedDocumentExtractor.class, extractor);

    // Fix: the original opened this FileInputStream without ever closing it,
    // leaking a file descriptor on both success and failure.
    try (InputStream input = new FileInputStream(rootDocument.getPath().toFile())) {
        parser.parse(input, handler, rootDocument.getMetadata(), context);
    }
    return extractor.getDocument();
}
Example #14
Source File: TikaTest.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Basic text extraction, capped at 1,000,000 characters.
 * <p>
 * Closes the input stream after processing.
 */
public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception {
    final ContentHandler textHandler = new BodyContentHandler(1000000);
    try {
        parser.parse(is, textHandler, metadata, context);
    } finally {
        is.close();
    }
    return textHandler.toString();
}
Example #15
Source File: WidgetMacroLibraryTests.java From scipio-erp with Apache License 2.0 | 5 votes |
/**
 * Renders the "Fop" screen over HTTP, verifies the response is a PDF, then
 * extracts its text with Tika's PDFParser and asserts no FreeMarker macro
 * error leaked into the rendered output.
 */
public void testFopMacroLibrary() throws Exception {
    String screentextUrl = screenUrl.concat("Fop");
    HttpClient http = initHttpClient();
    http.setUrl(screentextUrl.concat(authentificationQuery));
    //FIXME need to check if the stream is an application-pdf that don't contains ftl stack trace
    InputStream screenInputStream = (InputStream) http.postStream();
    assertNotNull("Response failed from ofbiz", screenInputStream);
    assertEquals("Response contentType isn't good : " + http.getResponseContentType(),
            "application/pdf;charset=UTF-8", http.getResponseContentType());
    String screenOutString = "";
    try {
        // Integer.MAX_VALUE disables the default 100k character write limit.
        BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
        Metadata metadata = new Metadata();
        new PDFParser().parse(screenInputStream, handler, metadata, new ParseContext());
        screenOutString = handler.toString();
    } finally {
        screenInputStream.close();
    }
    //Test if a ftl macro error is present in the rendered PDF text
    assertFalse("Fop Screen contains Macro on error : see " + screentextUrl + " for more detail",
            screenOutString.contains("FreeMarker template error:"));
}
Example #16
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testParseRequiringNotRequiringOCR() throws Exception { System.out.println("parse"); InputStream stream = getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_nonOCR_test.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //AutoDetectParser parser = new AutoDetectParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); try { parser.parse(stream, body, metadata); } finally { stream.close(); } assertTrue(body.toString().contains("An Example Paper")); }
Example #17
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testEncryptedWordDoc() throws Exception { System.out.println("testEncryptedWordDoc"); InputStream stream = getClass().getClassLoader().getResourceAsStream("encryptedWordDocx.docx"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); try { parser.parse(stream, body, metadata); } catch (Exception ex) { //donowt } assertFalse(body.toString().contains("Word doc Encrypted")); }
Example #18
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testEncryptedPDFDoc() throws Exception { System.out.println("testEncryptedPDFDoc"); InputStream stream = getClass().getClassLoader().getResourceAsStream("pdf_encrypted_test.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); try { parser.parse(stream, body, metadata); } catch (Exception ex) { //donowt } assertFalse(body.toString().contains("PDF Encrypted")); }
Example #19
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Ignore @Test public void testMassiveOCRDoc() throws Exception { System.out.println("testMassiveOCRDoc"); InputStream stream = getClass().getClassLoader().getResourceAsStream("long_OCR_doc.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); parser.parse(stream, body, metadata); assertTrue(body.toString().contains("Saliva-derived genomic DNA samples were genotyped using")); }
Example #20
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Test public void testParseRequiringOCR() throws Exception { System.out.println("parse"); InputStream stream = getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_ocr_test.pdf"); AutoDetectParser parser = new AutoDetectParser(config); //PDFPreprocessorParser parser = new PDFPreprocessorParser(); BodyContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); parser.parse(stream, body, metadata); String parsedString = body.toString(); // From first page assertTrue(parsedString.contains("Father or mother")); // From second (last) page assertTrue(parsedString.contains("how you have determined who is the Nearest")); }
Example #21
Source File: TikaPoweredContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Returns an appropriate Tika ContentHandler for the
 * requested content type. Normally you'll let this
 * work as default, but if you need fine-grained
 * control of how the Tika events become text then
 * override and supply your own.
 *
 * @param targetMimeType the mimetype the transformation should produce
 * @param output         where the transformed content is written
 * @throws TransformerConfigurationException if the SAX transformer cannot be built
 */
protected ContentHandler getContentHandler(String targetMimeType, Writer output) throws TransformerConfigurationException {
    // Plain text: BodyContentHandler streams body text straight to the writer.
    if(MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimeType)) {
        return new BodyContentHandler(output);
    }
    // All other targets serialize the SAX events through a TransformerHandler.
    SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    TransformerHandler handler = factory.newTransformerHandler();
    handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
    handler.setResult(new StreamResult(output));
    if(MimetypeMap.MIMETYPE_HTML.equals(targetMimeType)) {
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        // ExpandedTitleContentHandler works around empty <title/> rendering issues.
        return new ExpandedTitleContentHandler(handler);
    } else if(MimetypeMap.MIMETYPE_XHTML.equals(targetMimeType) || MimetypeMap.MIMETYPE_XML.equals(targetMimeType)) {
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    } else {
        // Unsupported target mimetype: surface a transformer configuration error.
        throw new TransformerInfoException(
                WRONG_FORMAT_MESSAGE_ID,
                new IllegalArgumentException("Requested target type " + targetMimeType + " not supported")
        );
    }
    return handler;
}
Example #22
Source File: TikaAutoInterpreter.java From db with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Reads the file at {@code filePath} with Tika auto-detection and returns a
 * JSON object containing the extracted text under {@code "_txt"} plus every
 * metadata entry as a top-level key.
 *
 * @throws OperationException if the document cannot be read or yields no text
 */
@Override
public JSONObject toJson(String filePath) throws OperationException {
    final AutoDetectParser autoParser = new AutoDetectParser();
    final BodyContentHandler textHandler = new BodyContentHandler();
    final Metadata metadata = new Metadata();
    try (InputStream stream = new FileInputStream(new File(filePath))) {
        autoParser.parse(stream, textHandler, metadata);
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }

    final String fileText = textHandler.toString();
    if (fileText == null || fileText.isEmpty()) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Attempting to import an empty document");
    }

    final JSONObject result = new JSONObject();
    result.put("_txt", fileText);
    for (final String name : metadata.names()) {
        result.put(name, metadata.get(name));
    }
    return result;
}
Example #23
Source File: TikaAutoInterpreter.java From db with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Reads the file at {@code filePath} with Tika auto-detection and returns its
 * plain-text content.
 *
 * @throws OperationException if the document cannot be read
 */
@Override
public String toText(String filePath) throws OperationException {
    final AutoDetectParser autoParser = new AutoDetectParser();
    final BodyContentHandler textHandler = new BodyContentHandler();
    try (InputStream stream = new FileInputStream(new File(filePath))) {
        autoParser.parse(stream, textHandler, new Metadata());
        return textHandler.toString();
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }
}
Example #24
Source File: ParsingReader.java From extract with MIT License | 4 votes |
/**
 * Creates a reader over the text produced by parsing {@code input}, using
 * {@link BodyContentHandler} as the handler factory so only body text
 * (no markup) is exposed through the reader.
 *
 * @throws IOException NOTE(review): presumably propagated from the delegated
 *         constructor's stream setup — confirm against that constructor
 */
public ParsingReader(final Parser parser, final InputStream input, final Metadata metadata, final ParseContext context) throws IOException {
    this(parser, input, metadata, context, BodyContentHandler::new);
}
Example #25
Source File: TikaEntityProcessor.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Builds a plain-text content handler that streams body text to the writer.
 */
private static ContentHandler getTextContentHandler(Writer writer) {
    final ContentHandler textHandler = new BodyContentHandler(writer);
    return textHandler;
}
Example #26
Source File: DocUtils.java From geoportal-server-harvester with Apache License 2.0 | 4 votes |
/**
 * Generates Dublin Core metadata XML for an arbitrary document by running it
 * through Tika and mapping the extracted properties onto the harvester's
 * well-known attributes (title, description, modified date).
 *
 * @param file_bytes raw document content
 * @param file_name  original file name, used as the fallback title
 * @return UTF-8 encoded XML bytes, or {@code null} if extraction failed
 * @throws IOException declared for stream handling; body failures are caught and logged
 */
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException {
    // Input & Output Variables
    ByteArrayInputStream base_input = new ByteArrayInputStream(file_bytes);
    byte[] xml_bytes = null;
    // Tika Parser Objects
    Parser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    try {
        // Populate Metadata Object with Tika Parser
        parser.parse(base_input, handler, metadata, context);
        // Container & Writer for Metadata
        Properties meta_props = new Properties();
        StringWriter sw = new StringWriter();
        // Put non-empty Tika metadata entries into Properties
        for(String name : metadata.names()) {
            if (!metadata.get(name).isEmpty()) {
                meta_props.put(name, metadata.get(name));
            }
        }
        // Serialized properties are used as the fallback description below.
        meta_props.store(sw, "Tika Values");
        // Expected Harvester Properties
        String meta_descr = metadata.get(TikaCoreProperties.DESCRIPTION);
        String meta_modif = metadata.get(TikaCoreProperties.MODIFIED);
        String meta_title = metadata.get(TikaCoreProperties.TITLE);
        // Default Label for Undefined Tika Properties, e.g. "TIKA_2024/01/31"
        DateFormat date_format = new SimpleDateFormat("yyyy/MM/dd");
        Date date = new Date();
        String date_today = date_format.format(date);
        String tika_label = String.format("TIKA_%s", date_today);
        // Check For Null Values & Set Defaults
        if (meta_descr == null) {
            // Fallback: dump of all extracted properties as the description.
            meta_props.put(WKAConstants.WKA_DESCRIPTION, "" + sw.toString());
        } else {
            meta_props.put(WKAConstants.WKA_DESCRIPTION, meta_descr);
        }
        if (meta_modif == null) {
            meta_props.put(WKAConstants.WKA_MODIFIED, tika_label);
        } else {
            meta_props.put(WKAConstants.WKA_MODIFIED, meta_modif);
        }
        if (meta_title == null) {
            // Fallback: use the file name when the document has no title.
            meta_props.put(WKAConstants.WKA_TITLE, file_name);
        } else {
            meta_props.put(WKAConstants.WKA_TITLE, meta_title);
        }
        // Build XML as Bytes
        MapAttribute attr = AttributeUtils.fromProperties(meta_props);
        Document document = new SimpleDcMetaBuilder().create(attr);
        xml_bytes = XmlUtils.toString(document).getBytes("UTF-8");
    } catch (Exception ex) {
        LOG.error(String.format("Error reading data."), ex);
    } finally {
        base_input.close();
    }
    return xml_bytes;
}
Example #27
Source File: PDFPreprocessorParser.java From CogStack-Pipeline with Apache License 2.0 | 4 votes |
/**
 * Parses a PDF, falling back to OCR (ImageMagick + Tesseract) when the
 * embedded text layer is essentially empty. Sets the
 * {@code X-PDFPREPROC-OCR-APPLIED} metadata key to NA, FAIL or SUCCESS to
 * record which path was taken.
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    ImageMagickConfig config = context.get(ImageMagickConfig.class, DEFAULT_IMAGEMAGICK_CONFIG);
    // If ImageMagick is not on the path with the current config, do not try to run OCR
    // getSupportedTypes shouldn't have listed us as handling it, so this should only
    // occur if someone directly calls this parser, not via DefaultParser or similar
    // TemporaryResources tmp = new TemporaryResources();
    //TikaInputStream pdfStream = TikaInputStream.get(stream);
    PDFParser pdfParser = new PDFParser();
    //create temp handlers to investigate object
    BodyContentHandler body = new BodyContentHandler();
    Metadata pdfMetadata = new Metadata();
    //needed to reset stream after the probe parse below
    if (stream.markSupported()) {
        stream.mark(Integer.MAX_VALUE);
    }
    //first do initial parse to see if there's subsantial content in pdf metadata already
    pdfParser.parse(stream, body, pdfMetadata, context);
    stream.reset();
    // If there's real text content (>100 chars) — or OCR isn't even possible —
    // reparse with the official handlers/metadata and skip OCR entirely.
    if (body.toString().length() > 100 || !hasImageMagick(config)) {
        pdfParser.parse(stream, handler, metadata, context);
        metadata.set("X-PDFPREPROC-OCR-APPLIED", "NA");
        return;
    }
    metadata.set("X-PDFPREPROC-ORIGINAL", body.toString());
    metadata.set("X-PDFPREPROC-OCR-APPLIED", "FAIL"); // "FAIL" will be overwritten if it succeeds later
    //add the PDF metadata to the official metadata object
    Arrays.asList(pdfMetadata.names()).stream().forEach(name -> {
        metadata.add(name, pdfMetadata.get(name));
    });
    //objects to hold file references for manipulation outside of Java
    File tiffFileOfPDF = null;
    File pdfFileFromStream = File.createTempFile("tempPDF", ".pdf");
    try {
        FileUtils.copyInputStreamToFile(stream, pdfFileFromStream);
        tiffFileOfPDF = File.createTempFile("tempTIFF", ".tiff");
        // Render PDF pages to TIFF via ImageMagick, then OCR the TIFF.
        makeTiffFromPDF(pdfFileFromStream,tiffFileOfPDF, config);
        if (tiffFileOfPDF.exists()) {
            long tessStartTime = System.currentTimeMillis();
            TesseractOCRParser tesseract = new TesseractOCRParser();
            tesseract.parse(FileUtils.openInputStream(tiffFileOfPDF), handler, metadata, context);
            metadata.set("X-PDFPREPROC-OCR-APPLIED", "SUCCESS");
            LOG.debug("Document parsing -- OCR processing time: {} ms", System.currentTimeMillis() - tessStartTime);
        }
    } finally {
        // Always clean up the temp files, whether OCR succeeded or not.
        if (tiffFileOfPDF.exists()) {
            tiffFileOfPDF.delete();
        }
        if (pdfFileFromStream.exists()) {
            pdfFileFromStream.delete();
        }
    }
}
Example #28
Source File: NodeTika.java From node-tika with MIT License | 4 votes |
/**
 * Extracts text from the document at {@code uri}, honouring optional settings
 * in {@code options}: {@code outputEncoding} (default UTF-8),
 * {@code contentType} (hint for detection) and {@code maxLength} (character
 * cap; -1 = unlimited). Archives are parsed recursively.
 *
 * @return the extracted text in the requested encoding
 * @throws Exception on any extraction failure other than hitting the length cap
 */
public static String extractText(String uri, Map<String, Object> options) throws Exception {
    final AutoDetectParser parser = createParser();
    final Metadata metadata = new Metadata();
    final ParseContext context = new ParseContext();
    String outputEncoding = null;
    String contentType = null;
    int maxLength = -1; // -1 disables the write limit
    if (options != null) {
        Object option;
        option = options.get("outputEncoding");
        if (option != null) {
            outputEncoding = option.toString();
        }
        option = options.get("contentType");
        if (option != null) {
            contentType = option.toString();
        }
        option = options.get("maxLength");
        if (option != null) {
            // Parsed via Float to tolerate numeric strings like "100.0".
            maxLength = (int)Float.parseFloat(option.toString());
        }
    }
    if (outputEncoding == null) {
        outputEncoding = "UTF-8";
    }
    fillMetadata(parser, metadata, contentType, uri);
    fillParseContext(context, options);
    final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    final OutputStreamWriter writer = new OutputStreamWriter(outputStream, outputEncoding);
    final WriteOutContentHandler contentHandler = new WriteOutContentHandler(writer, maxLength);
    final TikaInputStream inputStream = createInputStream(uri, metadata);
    // Set up recursive parsing of archives.
    // See: http://wiki.apache.org/tika/RecursiveMetadata
    context.set(Parser.class, parser);
    context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context));
    try {
        parser.parse(inputStream, new BodyContentHandler(contentHandler), metadata, context);
    } catch (Throwable e) {
        // Hitting the write limit is expected and non-fatal; anything else propagates.
        if (!contentHandler.isWriteLimitReached(e)) {
            throw e;
        } else {
            writer.close();
        }
    } finally {
        inputStream.close();
    }
    return outputStream.toString(outputEncoding);
}
Example #29
Source File: MP3Reader.java From red5-io with Apache License 2.0 | 4 votes |
/**
 * Creates reader from file input stream. Parses the MP3's ID3 metadata with
 * Tika's Mp3Parser, maps the recognised XMP-DM fields onto this reader's
 * {@code MetaData} holder, then prepares the file channel, keyframe analysis
 * and the initial metadata tag for streaming.
 *
 * @param file
 *            file input
 * @throws IOException
 *             on IO error
 */
public MP3Reader(File file) throws IOException {
    this.file = file;
    fis = new FileInputStream(file);
    try {
        // parse the ID3 info
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        // MP3 parser
        Mp3Parser parser = new Mp3Parser();
        parser.parse(fis, handler, metadata, null);
        log.debug("Contents of the document: {}", handler.toString());
        // create meta data holder
        metaData = new MetaData();
        String val = null;
        String[] metadataNames = metadata.names();
        // Map each recognised Tika/XMP-DM property onto the MetaData holder.
        for (String name : metadataNames) {
            val = metadata.get(name);
            log.debug("Meta name: {} value: {}", name, val);
            if ("xmpDM:artist".equals(name)) {
                metaData.setArtist(val);
            } else if ("xmpDM:album".equals(name)) {
                metaData.setAlbum(val);
            } else if ("title".equals(name)) {
                metaData.setSongName(val);
            } else if ("xmpDM:genre".equals(name)) {
                metaData.setGenre(val);
            } else if ("xmpDM:logComment".equals(name)) {
                metaData.setComment(val);
            } else if ("xmpDM:trackNumber".equals(name)) {
                metaData.setTrack(val);
            } else if ("xmpDM:releaseDate".equals(name)) {
                metaData.setYear(val);
            } else if ("xmpDM:duration".equals(name) || "duration".equals(name)) {
                metaData.setDuration(val);
            } else if ("xmpDM:audioSampleRate".equals(name) || "samplerate".equals(name)) {
                metaData.setSampleRate(val);
            } else if ("channels".equals(name)) {
                metaData.setChannels(val);
            }
        }
        // NOTE: a legacy block that extracted embedded album art (APIC frame)
        // and queued an onImageData metadata tag was disabled here; see VCS
        // history if it needs to be restored.
    } catch (Exception e) {
        log.error("MP3Reader {}", e);
    }
    // ensure we have a valid sample rate
    checkValidHeader();
    // get the total bytes / file size
    fileSize = file.length();
    log.debug("File size: {}", fileSize);
    // analyze keyframes data
    analyzeKeyFrames();
    // create file metadata object
    firstTags.addFirst(createFileMeta());
    log.trace("File input stream - open: {} position: {}", fis.getChannel().isOpen(), fis.getChannel().position());
    // create a channel for reading
    fileChannel = fis.getChannel();
}