org.apache.pdfbox.pdfparser.PDFParser Java Exaples

Source File: PdfBleachSession.java From DocBleach with MIT License

6 votes

@SuppressFBWarnings(
    value = "EXS_EXCEPTION_SOFTENING_RETURN_FALSE",
    justification = "This method is an helper to check the password")
private PDDocument testPassword(ScratchFile inFile, RandomAccessRead source, String password)
    throws IOException {
  PDFParser parser = new PDFParser(source, password, inFile);
  try {
    parser.parse();
    return parser.getPDDocument();
  } catch (InvalidPasswordException e) {
    LOGGER.error("The tested password is invalid");
    return null;
  } finally {
    source.rewind((int) source.getPosition());
  }
}

Source File: PDDocument.java From gcs with Mozilla Public License 2.0

6 votes

private static PDDocument load(RandomAccessBufferedFileInputStream raFile, String password,
                               InputStream keyStore, String alias,
                               MemoryUsageSetting memUsageSetting) throws IOException
{
    ScratchFile scratchFile = new ScratchFile(memUsageSetting);
    try
    {
        PDFParser parser = new PDFParser(raFile, password, keyStore, alias, scratchFile);
        parser.parse();
        return parser.getPDDocument();
    }
    catch (IOException ioe)
    {
        IOUtils.closeQuietly(scratchFile);
        throw ioe;
    }
}

Source File: PDDocument.java From gcs with Mozilla Public License 2.0

6 votes

/**
 * Parses a PDF. Depending on the memory settings parameter the given input stream is either
 * copied to memory or to a temporary file to enable random access to the pdf.
 *
 * @param input stream that contains the document. Don't forget to close it after loading.
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security 
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering input stream and PDF streams 
 * 
 * @return loaded document
 * 
 * @throws InvalidPasswordException If the password is incorrect.
 * @throws IOException In case of a reading or parsing error.
 */
public static PDDocument load(InputStream input, String password, InputStream keyStore, 
                              String alias, MemoryUsageSetting memUsageSetting) throws IOException
{
    ScratchFile scratchFile = new ScratchFile(memUsageSetting);
    try
    {
        RandomAccessRead source = scratchFile.createBuffer(input);
        PDFParser parser = new PDFParser(source, password, keyStore, alias, scratchFile);
        parser.parse();
        return parser.getPDDocument();
    }
    catch (IOException ioe)
    {
        IOUtils.closeQuietly(scratchFile);
        throw ioe;
    }
}

Source File: PDFReader.java From swcv with MIT License

6 votes

private boolean getFile(String url)
{
    try
    {
        URL u = new URL(url);
        URLConnection con = u.openConnection();
        InputStream in = con.getInputStream();
        PDFParser p = new PDFParser(in);
        p.parse();
        PDDocument pdoc = new PDDocument(p.getDocument());
        PDFTextStripper pts = new PDFTextStripper();
        text = pts.getText(pdoc);
        pdoc.close();

        return true;
    }
    catch (Exception e)
    {
        e.printStackTrace();
        return false;
    }
}

Source File: ExtractTextTools.java From o2oa with GNU Affero General Public License v3.0

5 votes

public static String pdf(byte[] bytes) {
	try {
		PDFParser parser = new PDFParser(new RandomAccessBuffer(bytes));
		parser.parse();
		try (COSDocument cos = parser.getDocument(); PDDocument pd = new PDDocument(cos)) {
			PDFTextStripper stripper = new PDFTextStripper();
			stripper.setStartPage(1);
			stripper.setEndPage(pd.getNumberOfPages());
			return stripper.getText(pd);
		}
	} catch (Exception e) {
		logger.error(e);
	}
	return null;
}

Source File: ExtractTextHelper.java From o2oa with GNU Affero General Public License v3.0

5 votes

public static String pdf(byte[] bytes) {
	try {
		PDFParser parser = new PDFParser(new RandomAccessBuffer(bytes));
		parser.parse();
		try (COSDocument cos = parser.getDocument(); PDDocument pd = new PDDocument(cos)) {
			PDFTextStripper stripper = new PDFTextStripper();
			stripper.setStartPage(1);
			stripper.setEndPage(pd.getNumberOfPages());
			return stripper.getText(pd);
		}
	} catch (Exception e) {
		logger.error(e);
	}
	return null;
}

Source File: ExtractTextHelper.java From o2oa with GNU Affero General Public License v3.0

5 votes

public static String pdf(byte[] bytes) {
	try {
		PDFParser parser = new PDFParser(new RandomAccessBuffer(bytes));
		parser.parse();
		try (COSDocument cos = parser.getDocument(); PDDocument pd = new PDDocument(cos)) {
			PDFTextStripper stripper = new PDFTextStripper();
			stripper.setStartPage(1);
			stripper.setEndPage(pd.getNumberOfPages());
			return stripper.getText(pd);
		}
	} catch (Exception e) {
		logger.error(e);
	}
	return null;
}

Source File: SignatureOptions.java From gcs with Mozilla Public License 2.0

5 votes

private void initFromRandomAccessRead(RandomAccessRead rar) throws IOException
{
    pdfSource = rar;
    PDFParser parser = new PDFParser(pdfSource);
    parser.parse();
    visualSignature = parser.getDocument();
}

Source File: PDFIndexer.java From carbon-apimgt with Apache License 2.0

5 votes

protected PDFParser getPdfParser(File2Index fileData) throws IOException {
	return new PDFParser(new ByteArrayInputStream(fileData.data));
}

Source File: PDFIndexerTest.java From carbon-apimgt with Apache License 2.0

5 votes

@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws IOException {
    String mediaType = "application/pdf+test";
    final String MEDIA_TYPE = "mediaType";
    PDFParser parser = Mockito.mock(PDFParser.class);
    COSDocument cosDoc = Mockito.mock(COSDocument.class);
    PDFTextStripper pdfTextStripper = Mockito.mock(PDFTextStripper.class);
    Mockito.doThrow(IOException.class).when(cosDoc).close();
    Mockito.when(parser.getDocument()).thenReturn(new COSDocument()).thenReturn(cosDoc);
    Mockito.when(pdfTextStripper.getText(new PDDocument())).thenReturn("");
    PDFIndexer pdfIndexer = new PDFIndexerWrapper(parser, pdfTextStripper);

    // should return the default media type when media type is not defined in file2Index
    IndexDocument pdf = pdfIndexer.getIndexedDocument(file2Index);
    if (!"application/pdf".equals(pdf.getFields().get(MEDIA_TYPE).get(0))) {
        Assert.fail();
    }

    // should return the media type we have set in the file2Index even if error occurs in finally block
    file2Index.mediaType = mediaType;
    pdf = pdfIndexer.getIndexedDocument(file2Index);
    if (!mediaType.equals(pdf.getFields().get(MEDIA_TYPE).get(0))) {
        Assert.fail();
    }

}

Source File: ShrinkPDF.java From shrink-pdf with MIT License

5 votes

/**
 * Shrink a PDF
 * @param f {@code File} pointing to the PDF to shrink
 * @param compQual Compression quality parameter. 0 is
 *                 smallest file, 1 is highest quality.
 * @return The compressed {@code PDDocument}
 * @throws FileNotFoundException
 * @throws IOException 
 */
private PDDocument shrinkMe() 
        throws FileNotFoundException, IOException {
     if(compQual < 0)
         compQual = compQualDefault;
     final RandomAccessBufferedFileInputStream rabfis = 
             new RandomAccessBufferedFileInputStream(input);
     final PDFParser parser = new PDFParser(rabfis);
     parser.parse();
     final PDDocument doc = parser.getPDDocument();
     final PDPageTree pages = doc.getPages();
     final ImageWriter imgWriter;
     final ImageWriteParam iwp;
     if(tiff) {
         final Iterator<ImageWriter> tiffWriters =
               ImageIO.getImageWritersBySuffix("png");
         imgWriter = tiffWriters.next();
         iwp = imgWriter.getDefaultWriteParam();
         //iwp.setCompressionMode(ImageWriteParam.MODE_DISABLED);
     } else {
         final Iterator<ImageWriter> jpgWriters = 
               ImageIO.getImageWritersByFormatName("jpeg");
         imgWriter = jpgWriters.next();
         iwp = imgWriter.getDefaultWriteParam();
         iwp.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
         iwp.setCompressionQuality(compQual);
     }
     for(PDPage p : pages) {
          scanResources(p.getResources(), doc, imgWriter, iwp);
     }
     return doc;
}

Source File: PDF2TextExample.java From tutorials with MIT License

5 votes

private static void generateTxtFromPDF(String filename) throws IOException {
	File f = new File(filename);
	String parsedText;
	PDFParser parser = new PDFParser(new RandomAccessFile(f, "r"));
	parser.parse();

	COSDocument cosDoc = parser.getDocument();

	PDFTextStripper pdfStripper = new PDFTextStripper();
	PDDocument pdDoc = new PDDocument(cosDoc);

	parsedText = pdfStripper.getText(pdDoc);

	if (cosDoc != null)
		cosDoc.close();
	if (pdDoc != null)
		pdDoc.close();

	PrintWriter pw = new PrintWriter("src/output/pdf.txt");
	pw.print(parsedText);
	pw.close();
}

Source File: PDFIndexerWrapper.java From carbon-apimgt with Apache License 2.0

4 votes

public PDFIndexerWrapper(PDFParser pdfParser, PDFTextStripper stripper) {
    this.pdfParser = pdfParser;
    this.pdfTextStripper = stripper;
}

Source File: PDFIndexerWrapper.java From carbon-apimgt with Apache License 2.0

4 votes

@Override
protected PDFParser getPdfParser(AsyncIndexer.File2Index fileData) throws IOException {
    return pdfParser;
}

Source File: PDDocument.java From gcs with Mozilla Public License 2.0

3 votes

/**
 * Parses a PDF.
 * 
 * @param input byte array that contains the document.
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security 
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering input stream and PDF streams 
 * 
 * @return loaded document
 * 
 * @throws InvalidPasswordException If the password is incorrect.
 * @throws IOException In case of a reading or parsing error.
 */
public static PDDocument load(byte[] input, String password, InputStream keyStore, 
        String alias, MemoryUsageSetting memUsageSetting) throws IOException
{
    ScratchFile scratchFile = new ScratchFile(memUsageSetting);
    RandomAccessRead source = new RandomAccessBuffer(input);
    PDFParser parser = new PDFParser(source, password, keyStore, alias, scratchFile);
    parser.parse();
    return parser.getPDDocument();
}

org.apache.pdfbox.pdfparser.PDFParser Java Examples