org.apache.poi.hslf.extractor.PowerPointExtractor Java Examples

The following examples show how to use org.apache.poi.hslf.extractor.PowerPointExtractor. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: MsPowerPointTextExtractor.java    From document-management-system with GNU General Public License v2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * <p>Reads the given stream as a PowerPoint document and returns its text.
 * The stream is always closed before this method returns, whether extraction
 * succeeded or failed. The {@code type} and {@code encoding} arguments are
 * not used by this implementation.
 *
 * @throws IOException if the extractor fails with a {@link RuntimeException};
 *         the original exception is preserved as the cause
 */
public String extractText(InputStream stream, String type, String encoding) throws IOException {
	try {
		// Both flags are enabled; presumably slide text and notes text — confirm against the POI version in use.
		return new PowerPointExtractor(stream).getText(true, true);
	} catch (RuntimeException failure) {
		// Unchecked parser failures are logged and surfaced to callers as IOException.
		logger.warn("Failed to extract PowerPoint text content", failure);
		throw new IOException(failure.getMessage(), failure);
	} finally {
		try {
			stream.close();
		} catch (IOException ignored) {
			// Best-effort close: a failure here must not mask the extraction result or error.
		}
	}
}
 
Example #2
Source File: PowerPointFormatModule.java    From ontopia with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the text of a PowerPoint document held in {@code cc} and reports it
 * to {@code handler} as a single region named "document".
 *
 * @param cc      source of the raw document bytes (via {@code getContent()})
 * @param handler receiver of the region start, the extracted characters, and the region end
 * @throws OntopiaRuntimeException wrapping any exception raised while reading or reporting
 */
@Override
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
  try {
    // The whole document is already in memory; wrap the bytes as a stream for the extractor.
    ByteArrayInputStream bytes = new ByteArrayInputStream(cc.getContent());
    BufferedInputStream buffered = new BufferedInputStream(bytes);

    char[] text = new PowerPointExtractor(buffered).getText().toCharArray();

    // Emit everything as one region covering the full character array.
    handler.startRegion("document");
    handler.text(text, 0, text.length);
    handler.endRegion();
  } catch (Exception failure) {
    throw new OntopiaRuntimeException(failure);
  }
}
 
Example #3
Source File: MetadataExtractor.java    From document-management-system with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Extract summary metadata from a Microsoft Office document (Word, Excel or PowerPoint).
 *
 * <p>The stream is opened as a {@link POIFSFileSystem} and the extractor is chosen by
 * comparing {@code mimeType} against {@code MimeTypeConfig.MIME_MS_WORD},
 * {@code MIME_MS_EXCEL} and {@code MIME_MS_POWERPOINT}. For any other MIME type, or when
 * the document carries no summary information, the returned object is a freshly
 * constructed {@code OfficeMetadata} with no fields set by this method.
 *
 * <p>The creation, last-save and last-printed timestamps are optional in the summary
 * information (for example, a document that was never printed has no last-printed
 * date). A missing timestamp is skipped and the corresponding metadata field is left
 * untouched, instead of failing with a {@link NullPointerException}.
 *
 * @param is       stream positioned at the start of the Office document
 * @param mimeType MIME type used to select the matching extractor
 * @return the populated metadata; never {@code null}
 * @throws IOException if the stream cannot be opened as a POIFS file system
 */
public static OfficeMetadata officeExtractor(InputStream is, String mimeType) throws IOException {
	POIFSFileSystem fs = new POIFSFileSystem(is);
	OfficeMetadata md = new OfficeMetadata();
	SummaryInformation si = null;

	if (MimeTypeConfig.MIME_MS_WORD.equals(mimeType)) {
		si = new WordExtractor(fs).getSummaryInformation();
	} else if (MimeTypeConfig.MIME_MS_EXCEL.equals(mimeType)) {
		si = new ExcelExtractor(fs).getSummaryInformation();
	} else if (MimeTypeConfig.MIME_MS_POWERPOINT.equals(mimeType)) {
		si = new PowerPointExtractor(fs).getSummaryInformation();
	}

	if (si != null) {
		md.setTitle(si.getTitle());
		md.setSubject(si.getSubject());
		md.setAuthor(si.getAuthor());
		md.setLastAuthor(si.getLastAuthor());
		md.setKeywords(si.getKeywords());
		md.setComments(si.getComments());
		md.setTemplate(si.getTemplate());
		md.setRevNumber(si.getRevNumber());
		md.setApplicationName(si.getApplicationName());
		md.setEditTime(si.getEditTime());
		md.setPageCount(si.getPageCount());
		md.setWordCount(si.getWordCount());
		md.setCharCount(si.getCharCount());
		md.setSecurity(si.getSecurity());

		// Each timestamp may be absent; Calendar.setTime(null) would throw, so guard every one.
		// java.util.Date is fully qualified so this block does not rely on an extra import.
		final java.util.Date created = si.getCreateDateTime();
		if (created != null) {
			Calendar createDateTime = Calendar.getInstance();
			createDateTime.setTime(created);
			md.setCreateDateTime(createDateTime);
		}

		final java.util.Date lastSaved = si.getLastSaveDateTime();
		if (lastSaved != null) {
			Calendar lastSaveDateTime = Calendar.getInstance();
			lastSaveDateTime.setTime(lastSaved);
			md.setLastSaveDateTime(lastSaveDateTime);
		}

		final java.util.Date printed = si.getLastPrinted();
		if (printed != null) {
			Calendar lastPrinted = Calendar.getInstance();
			lastPrinted.setTime(printed);
			md.setLastPrinted(lastPrinted);
		}
	}

	log.info("officeExtractor: {}", md);
	return md;
}
 
Example #4
Source File: MSPowerpointIndexerTest.java    From carbon-apimgt with Apache License 2.0 4 votes vote down vote up
/**
 * Verifies the media-type field of the document produced by
 * {@code MSPowerpointIndexer.getIndexedDocument} across three consecutive calls,
 * with every POI constructor replaced by a PowerMockito stub.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws Exception {
    // Mocked collaborators that the stubbed constructors below hand back.
    POIFSFileSystem poiFileSystem = Mockito.mock(POIFSFileSystem.class);
    PowerPointExtractor hslfExtractor = Mockito.mock(PowerPointExtractor.class);
    XMLSlideShow slideShow = Mockito.mock(XMLSlideShow.class);
    XSLFPowerPointExtractor xslfExtractor = Mockito.mock(XSLFPowerPointExtractor.class);

    // new POIFSFileSystem(InputStream): 1st call throws OfficeXmlFileException,
    // 2nd call returns the mock, 3rd call throws APIManagementException.
    PowerMockito.whenNew(POIFSFileSystem.class).withParameterTypes(InputStream.class)
            .withArguments(Mockito.any(InputStream.class))
            .thenThrow(OfficeXmlFileException.class)
            .thenReturn(poiFileSystem)
            .thenThrow(APIManagementException.class);
    PowerMockito.whenNew(PowerPointExtractor.class).withParameterTypes(POIFSFileSystem.class)
            .withArguments(poiFileSystem).thenReturn(hslfExtractor);
    PowerMockito.whenNew(XMLSlideShow.class).withParameterTypes(InputStream.class)
            .withArguments(Mockito.any())
            .thenReturn(slideShow);
    PowerMockito.whenNew(XSLFPowerPointExtractor.class).withArguments(slideShow).thenReturn(xslfExtractor);

    // Both extractor flavours yield empty text; only the media type is under test.
    Mockito.when(hslfExtractor.getText()).thenReturn("");
    Mockito.when(xslfExtractor.getText()).thenReturn("");

    MSPowerpointIndexer indexer = new MSPowerpointIndexer();

    // Call 1: should return the default media type when media type is not defined in file2Index
    IndexDocument ppDoc = indexer.getIndexedDocument(file2Index);
    Object defaultMediaType = ppDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0);
    if (!"application/vnd.ms-powerpoint".equals(defaultMediaType)) {
        Assert.fail();
    }

    // Call 2: should return the media type we have set in the file2Index
    file2Index.mediaType = "text/html";
    ppDoc = indexer.getIndexedDocument(file2Index);
    Object explicitMediaType = ppDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0);
    if (!"text/html".equals(explicitMediaType)) {
        Assert.fail();
    }

    // Call 3: should return the media type we have set in the file2Index
    // even if exception occurred while reading the file
    ppDoc = indexer.getIndexedDocument(file2Index);
    Object mediaTypeAfterFailure = ppDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0);
    if (!"text/html".equals(mediaTypeAfterFailure)) {
        Assert.fail();
    }
}