org.apache.pdfbox.text.PDFTextStripper Java Examples

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
     * <a href="https://stackoverflow.com/questions/53773479/java-rotated-file-extraction">
     * java- rotated file extraction?
     * </a>
     * <br/>
     * <a href="https://www.dropbox.com/s/g1pe8zb9m5kajif/lol.pdf?dl=0">
     * lol.pdf
     * </a>
     * <p>
     * Indeed, regular text extraction results in many lines, essentially
     * one for each text chunk. One can improve this in two ways, either
     * one activates sorting or one removes the Rotate entries from the
     * page dictionaries.
     * </p>
     */
    @Test
    public void testLol() throws IOException
    {
        // Stream and document are both managed by try-with-resources so the
        // document is released even when text extraction throws.
        try (   InputStream resource = getClass().getResourceAsStream("lol.pdf");
                PDDocument document = Loader.loadPDF(resource)    )
        {
            // Option 1: Remove Rotate entries
            //            for (PDPage page : document.getPages()) {
            //                page.setRotation(0);
            //            }

            PDFTextStripper stripper = new PDFTextStripper();
            // Option 2: Sort by position
            stripper.setSortByPosition(true);
            String text = stripper.getText(document);

            // Echo the extracted text and also persist it to the result folder.
            System.out.printf("\n*\n* lol.pdf\n*\n%s\n", text);
            Files.write(new File(RESULT_FOLDER, "lol.txt").toPath(), Collections.singleton(text));
        }
    }

Source File: ExtractCharacterCodes.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * <a href="https://stackoverflow.com/questions/50664162/some-glyph-ids-missing-while-trying-to-extract-glyph-id-from-pdf">
 * Some glyph ID's missing while trying to extract glyph ID from pdf
 * </a>
 * <br/>
 * <a href="http://1drv.ms/b/s!AmHcFaD-gMGyhkHr4PY6F4krYJ32">
 * pattern3.pdf
 * </a>
 * <p>
 * This test shows how to access the character codes of the extracted text
 * while preventing the {@link PDFTextStripper} from doing any preprocessing
 * steps, in particular from doing any diacritics merges.
 * </p>
 */
@Test
public void testExtractFromPattern3() throws IOException {
    // The document joins the resource list so it is closed when the block exits.
    try (   InputStream resource = getClass().getResourceAsStream("pattern3.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper() {

            // Write each text position straight to the output (its Unicode
            // text followed by its character codes) instead of handing it to
            // the default processing.
            @Override
            protected void processTextPosition(TextPosition textPosition) {
                try {
                    writeString(String.format("%s%s", textPosition.getUnicode(), Arrays.toString(textPosition.getCharacterCodes())));
                } catch (IOException e) {
                    // The overridden signature declares no checked exception;
                    // the failure is reported and extraction continues.
                    e.printStackTrace();
                }
            }
        };
        String text = stripper.getText(document);

        System.out.printf("\n*\n* pattern3.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "pattern3.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractColorText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * <a href="https://stackoverflow.com/questions/59031734/get-text-color-in-pdfbox">
 * Get text color in PDFBox
 * </a>
 * <p>
 * This test has already been executed for the original color text stripper class from my answer to
 * <a href="https://stackoverflow.com/questions/21430341/identifying-the-text-based-on-the-output-in-pdf-using-pdfbox">
 * Identifying the text based on the output in PDF using PDFBOX
 * </a>
 * </p>
 * 
 * @throws IOException
 */
@Test
public void testExtractFromFurzoSample() throws IOException {
    try (   InputStream pdfStream = getClass().getResourceAsStream("furzo Sample.pdf");
            PDDocument pdf = Loader.loadPDF(pdfStream) ) {
        // Extract the text with the color-aware stripper.
        PDFTextStripper colorStripper = new ColorTextStripper();
        String coloredText = colorStripper.getText(pdf);

        // Persist the result as UTF-8 before echoing it to the console.
        File target = new File(RESULT_FOLDER, "furzo Sample.txt");
        Files.write(target.toPath(), coloredText.getBytes("UTF-8"));

        String[] report = {
                "/// furzo Sample.pdf ///",
                "Stripped text with color:",
                ">>>",
                coloredText,
                "<<<"
        };
        for (String reportLine : report) {
            System.out.println(reportLine);
        }
    }
}

Source File: TitleBlockWriterTest.java From eplmp with Eclipse Public License 1.0

6 votes

@Test
public void createTitleBlockForPartIterationTest() throws Exception {
    PartTitleBlockData partTitleBlockData = new PartTitleBlockData(partIteration, new Locale("en"));
    byte[] titleBlock = new TitleBlockWriter(partTitleBlockData).createTitleBlock();

    // Load the generated title block and extract its text; try-with-resources
    // closes the document even if extraction or the assertion fails.
    String text;
    try (PDDocument loadedDocument = PDDocument.load(titleBlock)) {
        Assert.assertNotNull(loadedDocument);
        text = new PDFTextStripper().getText(loadedDocument);
    }

    // The title block text must mention the user, part number and revision description.
    Assert.assertFalse(text.isEmpty());
    Assert.assertTrue(text.contains(user.getLogin()));
    Assert.assertTrue(text.contains(partIteration.getNumber()));
    Assert.assertTrue(text.contains(partIteration.getPartRevision().getDescription()));

}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * <a href="https://stackoverflow.com/questions/45895768/pdfbox-2-0-7-extracttext-not-working-but-1-8-13-does-and-pdfreader-as-well">
 * PDFBox 2.0.7 ExtractText not working but 1.8.13 does and PDFReader as well
 * </a>
 * <br/>
 * <a href="https://wetransfer.com/downloads/214674449c23713ee481c5a8f529418320170827201941/b2bea6">
 * test-2.pdf
 * </a>
 * <p>
 * Due to the broken <b>ToUnicode</b> maps the output of immediate text
 * extraction from this document is unsatisfying, cf. {@link #testTest2()}.
 * It can be improved by removing these <b>ToUnicode</b> maps as this test
 * shows.
 * </p>
 */
@Test
public void testNoToUnicodeTest2() throws IOException
{
    // The document is a managed resource so it is closed on every exit path.
    try (   InputStream resource = getClass().getResourceAsStream("test-2.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        // Apply removeToUnicodeMaps to the resources of every page before extracting.
        for (int pageNr = 0; pageNr < document.getNumberOfPages(); pageNr++)
        {
            PDPage page = document.getPage(pageNr);
            PDResources resources = page.getResources();
            removeToUnicodeMaps(resources);
        }

        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);

        System.out.printf("\n*\n* test-2.pdf without ToUnicode\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "test-2_NoToUnicode.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * <a href="https://stackoverflow.com/a/56580253/1729265">
 * wen li's answer to "PDFBox extracting paragraphs"
 * </a>
 * <br/>
 * <a href="https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf">
 * PDF32000_2008.pdf
 * </a>
 * <p>
 * Here it looks the other way around compared to what the OP claims:
 * there is a space at the end of all but the last paragraph line.
 * </p>
 */
@Test
public void testPDF32000pageii() throws IOException
{
    // The PDF is streamed from its URL; both the stream and the parsed
    // document are closed when the block exits.
    try (   InputStream resource = new URL("https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf").openStream();
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        // Restrict extraction to the second page only.
        stripper.setStartPage(2);
        stripper.setEndPage(2);
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* PDF32000_2008.pdf Page ii\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "PDF32000_2008-page-ii.txt").toPath(), Collections.singleton(text));
    }
}

Source File: PdfExtractTextsBatchController.java From MyBox with Apache License 2.0

6 votes

@Override
public boolean makeMoreParameters() {
    try {
        if (super.makeMoreParameters()) {
            // Keep the entered separator only when the option is checked and
            // the text is non-empty; otherwise no separator is used.
            separator = separatorInput.getText();
            boolean keepSeparator = separatorCheck.isSelected()
                    && separator != null
                    && !separator.isEmpty();
            if (!keepSeparator) {
                separator = null;
            }
            stripper = new PDFTextStripper();
            return true;
        }
        return false;
    } catch (Exception e) {
        // Any failure is logged and leaves the controller without a stripper.
        logger.error(e.toString());
        stripper = null;
        return false;
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * <a href="https://stackoverflow.com/a/56580253/1729265">
 * wen li's answer to "PDFBox extracting paragraphs"
 * </a>
 * <br/>
 * <a href="https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf">
 * PDF32000_2008.pdf
 * </a>
 * <p>
 * Here one sees that there is not always a space at the end of all
 * the non-last paragraph lines, "PDF/X" is split as "PDF/" and "X"
 * between lines, and there is no space in-between.
 * </p>
 */
@Test
public void testPDF32000pagevii() throws IOException
{
    // The PDF is streamed from its URL; both the stream and the parsed
    // document are closed when the block exits.
    try (   InputStream resource = new URL("https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf").openStream();
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        // Restrict extraction to the seventh page only.
        stripper.setStartPage(7);
        stripper.setEndPage(7);
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        // The console label names page vii, matching the output file name.
        System.out.printf("\n*\n* PDF32000_2008.pdf Page vii\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "PDF32000_2008-page-vii.txt").toPath(), Collections.singleton(text));
    }
}

Source File: PdfIntegrationTests.java From java-wkhtmltopdf-wrapper with MIT License

6 votes

@Test
public void testPdfWithXvfb() throws Exception {
    // Xvfb is only configured when the OS name does not contain "windows".
    WrapperConfig wc = null;
    if (!System.getProperty("os.name").toLowerCase().contains("windows")) {
        XvfbConfig xc = new XvfbConfig();
        xc.addParams(new Param("--auto-servernum"), new Param("--server-num=1"));

        wc = new WrapperConfig();
        wc.setXvfbConfig(xc);
    }
    Pdf pdf = wc != null ? new Pdf(wc) : new Pdf();
    pdf.addPageFromUrl("http://www.google.com");

    pdf.saveAs("output.pdf");

    // WHEN
    byte[] pdfBytes = pdf.getPDF();
    String pdfText;
    // Close the parsed document once its text has been extracted.
    try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(pdfBytes))) {
        pdfText = new PDFTextStripper().getText(pdDocument);
    }

    Assert.assertThat("document should be generated", pdfText, containsString("Google"));
}

Source File: GetLinesFromPDF.java From blog-codes with Apache License 2.0

6 votes

/**
 * Runs a {@code GetLinesFromPDF} stripper over every page of a fixed PDF
 * file and then prints each entry of {@code lines}.
 *
 * @param args command line arguments; not used
 * @throws IOException If there is an error parsing the document.
 */
public static void main( String[] args ) throws IOException {
    String fileName = "/home/lili/data/test.pdf";
    // try-with-resources closes the document even when stripping fails.
    try (PDDocument document = PDDocument.load( new File(fileName) )) {
        PDFTextStripper stripper = new GetLinesFromPDF();
        stripper.setSortByPosition( true );
        // Page numbers are 1-based and the end page is inclusive.
        stripper.setStartPage( 1 );
        stripper.setEndPage( document.getNumberOfPages() );
        // The written text goes to an in-memory sink and is discarded;
        // presumably the stripper fills `lines` as a side effect (confirm in the class).
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        stripper.writeText(document, dummy);

        // print lines
        for(String line:lines){
            System.out.println(line);
        }
    }
}

Source File: PrintTextLocations.java From blog-codes with Apache License 2.0

6 votes

public static void main(String[] args) throws IOException {
	// try-with-resources closes the document even when stripping fails.
	try (PDDocument document = PDDocument.load(new File("/home/lili/data/test.pdf"))) {
		PDFTextStripper stripper = new PrintTextLocations();
		stripper.setSortByPosition(true);
		// Page numbers are 1-based and the end page is inclusive.
		stripper.setStartPage(1);
		stripper.setEndPage(document.getNumberOfPages());

		// The written text goes to an in-memory sink and is discarded; any
		// useful output is presumably produced by the stripper subclass itself.
		Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
		stripper.writeText(document, dummy);
	}
}

Source File: PDF.java From pdf-test with MIT License

6 votes

private PDF(String name, byte[] content) {
  this.content = content;

  try (InputStream inputStream = new ByteArrayInputStream(content)) {
    try (PDDocument pdf = PDDocument.load(inputStream)) {
      this.text = new PDFTextStripper().getText(pdf);
      this.numberOfPages = pdf.getNumberOfPages();
      this.author = pdf.getDocumentInformation().getAuthor();
      this.creationDate = pdf.getDocumentInformation().getCreationDate();
      this.creator = pdf.getDocumentInformation().getCreator();
      this.keywords = pdf.getDocumentInformation().getKeywords();
      this.producer = pdf.getDocumentInformation().getProducer();
      this.subject = pdf.getDocumentInformation().getSubject();
      this.title = pdf.getDocumentInformation().getTitle();
      this.encrypted = pdf.isEncrypted();
      
      PDSignature signature = pdf.getLastSignatureDictionary();
      this.signed = signature != null;
      this.signerName = signature == null ? null : signature.getName();
      this.signatureTime = signature == null ? null : signature.getSignDate();
    }
  }
  catch (Exception e) {
    throw new IllegalArgumentException("Invalid PDF file: " + name, e);
  }
}

Source File: ExtractTextExample.java From blog-codes with Apache License 2.0

6 votes

public static void main(String[] args) throws InvalidPasswordException, IOException {
    try (PDDocument document = PDDocument.load(new File("/home/lili/data/test.pdf"))) {
        // Nothing is printed for encrypted documents.
        if (document.isEncrypted()) {
            return;
        }
        PDFTextStripper textStripper = new PDFTextStripper();
        // To extract a single page or a range of pages, limit the range with the setters below.
        // Currently all pages are extracted.
        textStripper.setStartPage(0);
        textStripper.setEndPage(document.getNumberOfPages());
        String extracted = textStripper.getText(document);
        // Print the extracted text line by line.
        for (String line : extracted.split("\\r?\\n")) {
            System.out.println(line);
        }
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="https://stackoverflow.com/questions/47515609/invalid-block-type-while-using-pdfbox-2-0-8">
 * Invalid block type while using pdfbox 2.0.8
 * </a>
 * <br>
 * <a href="https://www.dropbox.com/s/xjeksj0cay4x3vo/NoTemplateInError.pdf?dl=0">
 * NoTemplateInError.pdf
 * </a>
 * <p>
 * The issue cannot be reproduced.
 * </p>
 */
@Test
public void testNoTemplateInError() throws IOException
{
    // The document is a managed resource so it is closed on every exit path.
    try (   InputStream resource = getClass().getResourceAsStream("NoTemplateInError.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        // Echo the extracted text and also persist it to the result folder.
        System.out.printf("\n*\n* NoTemplateInError.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "NoTemplateInError.txt").toPath(), Collections.singleton(text));
    }
}

Source File: PDF2TextExample.java From tutorials with MIT License

5 votes

/**
 * Extracts the text of the given PDF file and writes it to {@code src/output/pdf.txt}.
 *
 * @param filename path of the PDF file to read
 * @throws IOException if the PDF cannot be read or parsed, or the output file cannot be written
 */
private static void generateTxtFromPDF(String filename) throws IOException {
	File f = new File(filename);
	String parsedText;

	// The source file and the parsed document are released even when parsing
	// or text extraction throws.
	try (RandomAccessFile source = new RandomAccessFile(f, "r")) {
		PDFParser parser = new PDFParser(source);
		parser.parse();

		// Closing the PDDocument is expected to close the wrapped COSDocument too,
		// so no separate close of the COS document is issued.
		try (PDDocument pdDoc = new PDDocument(parser.getDocument())) {
			parsedText = new PDFTextStripper().getText(pdDoc);
		}
	}

	// The writer is closed (and flushed) even if printing fails.
	try (PrintWriter pw = new PrintWriter("src/output/pdf.txt")) {
		pw.print(parsedText);
	}
}

Source File: ExtractWordCoordinates.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="https://stackoverflow.com/questions/50330484/could-someone-give-me-an-example-of-how-to-extract-coordinates-for-a-word-usin">
 * Could someone give me an example of how to extract coordinates for a 'word' using PDFBox
 * </a>
 * <br/>
 * <a href="https://www.tutorialkart.com/pdfbox/how-to-get-location-and-size-of-images-in-pdf/attachment/apache-pdf/">
 * apache.pdf
 * </a>
 * <p>
 * This test shows how to extract word coordinates combining the ideas of
 * the two tutorials referenced by the OP.
 * </p>
 */
@Test
public void testExtractWordsForGoodJuJu() throws IOException {
    // The document is a managed resource so it is closed on every exit path.
    try (   InputStream resource = getClass().getResourceAsStream("apache.pdf");
            PDDocument document = Loader.loadPDF(resource)) {
        PDFTextStripper stripper = new GetWordLocationAndSize();
        stripper.setSortByPosition( true );
        // Page numbers are 1-based and the end page is inclusive.
        stripper.setStartPage( 1 );
        stripper.setEndPage( document.getNumberOfPages() );

        // The written text goes to an in-memory sink and is discarded; any
        // useful output is presumably produced by the stripper subclass itself.
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        stripper.writeText(document, dummy);
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="http://stackoverflow.com/questions/37862159/pdf-reading-via-pdfbox-in-java">
 * pdf reading via pdfbox in java 
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B_Ke2amBgdpedUNwVTR3RVlRTFE/view?usp=sharing">
 * PnL_500010_0314.pdf
 * </a>
 * <p>
 * Indeed, the <code>PDFTextStripper</code> is not even informed about those undecipherable
 * text sections. Essentially the underlying method `PDFTextStreamEngine.showGlyph` filters
 * all unmappable glyphs from composite fonts. 
 * </p>
 */
@Test
public void testPnL_500010_0314() throws IOException
{
    // The document is a managed resource so it is closed on every exit path.
    try (   InputStream resource = getClass().getResourceAsStream("PnL_500010_0314.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        // Echo the extracted text and also persist it to the result folder.
        System.out.printf("\n*\n* PnL_500010_0314.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "PnL_500010_0314.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="http://stackoverflow.com/questions/37862159/pdf-reading-via-pdfbox-in-java">
 * pdf reading via pdfbox in java 
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B_Ke2amBgdpebm96U05FcWFsSXM/view?usp=sharing">
 * Bal_532935_0314.pdf
 * </a>
 * <p>
 * The issue here is caused by PDFBox guessing an encoding. The underlying method
 * `PDFTextStreamEngine.showGlyph` does this for all unmappable glyphs from simple
 * fonts.
 * </p>
 */
@Test
public void testBal_532935_0314() throws IOException
{
    // The document is a managed resource so it is closed on every exit path.
    try (   InputStream resource = getClass().getResourceAsStream("Bal_532935_0314.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        // Echo the extracted text and also persist it to the result folder.
        System.out.printf("\n*\n* Bal_532935_0314.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "Bal_532935_0314.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="http://stackoverflow.com/questions/38975091/pdfbox-gettext-not-returning-all-of-the-visible-text">
 * PDFBox getText not returning all of the visible text
 * </a>
 * <br>
 * <a href="https://dl.dropboxusercontent.com/u/14898138/03%20WP%20Enterprise%20BlackBerry%20Compete%20Datasheet_041612%20FINAL%20DRAFT.pdf">
 * 03 WP Enterprise BlackBerry Compete Datasheet_041612 FINAL DRAFT.pdf
 * </a>
 * <p>
 * There is some 'writing' actually done using vector graphics, not text,
 * but aside from that all is accounted for.
 * </p>
 */
@Test
public void test03WpEnterpriseBlackBerryCompeteDatasheet_041612FinalDraft() throws IOException
{
    // The document is a managed resource so it is closed on every exit path.
    try (   InputStream resource = getClass().getResourceAsStream("03 WP Enterprise BlackBerry Compete Datasheet_041612 FINAL DRAFT.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        // Echo the extracted text and also persist it to the result folder.
        System.out.printf("\n*\n* 03 WP Enterprise BlackBerry Compete Datasheet_041612 FINAL DRAFT.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "03 WP Enterprise BlackBerry Compete Datasheet_041612 FINAL DRAFT.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="https://stackoverflow.com/questions/48828500/again-having-invisible-text-coming-from-pdftextstripper">
 * Again having invisible text coming from PdfTextStripper
 * </a>
 * <br/>
 * <a href="https://drive.google.com/open?id=1P1oFu8cpZnzy9LF4wiGWPrk3PfL6dktt">
 * testFailed.pdf
 * </a>
 * <p>
 * The extracted, invisible text is rendered WHITE on WHITE.
 * </p>
 */
@Test
public void testTestFailed() throws IOException
{
    // The document is a managed resource so it is closed on every exit path.
    try (   InputStream resource = getClass().getResourceAsStream("testFailed.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        // Echo the extracted text and also persist it to the result folder.
        System.out.printf("\n*\n* testFailed.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "testFailed.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="https://stackoverflow.com/questions/51672080/pdfdomtree-does-not-detecting-white-spaces-while-converting-a-pdf-file-to-html">
 * PDFDomTree does not detecting white spaces while converting a pdf file to html
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/1SZNFCvGVbQzCxJiRr8HlW99ravC_Cm71/view?usp=sharing">
 * demo.pdf
 * </a>
 * <p>
 * PDFBox shows no issue extracting the text from the given file.
 * </p>
 */
@Test
public void testDemo() throws IOException
{
    // The document is a managed resource so it is closed on every exit path.
    try (   InputStream resource = getClass().getResourceAsStream("demo.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        // Echo the extracted text and also persist it to the result folder.
        System.out.printf("\n*\n* demo.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "demo.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="https://stackoverflow.com/questions/53382793/java-8-pdfbox-cant-gettext-of-pdf-file">
 * Java 8 PDFbox can't getText of pdf file
 * </a>
 * <br/>
 * <a href="http://www.o-cha.net/english/cup/pdf/29.pdf">
 * 29.pdf
 * </a>
 * <p>
 * Cannot reproduce any issue.
 * </p>
 */
@Test
public void test29() throws IOException
{
    // The document is a managed resource so it is closed on every exit path.
    try (   InputStream resource = getClass().getResourceAsStream("29.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        // Echo the extracted text and also persist it to the result folder.
        System.out.printf("\n*\n* 29.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "29.txt").toPath(), Collections.singleton(text));
    }
}

Source File: PdfTest.java From camel-quarkus with Apache License 2.0

5 votes

@Order(1)
@Test
public void createFromTextShouldReturnANewPdfDocument() throws IOException {
    // POST the text to the endpoint and take the response body as PDF bytes.
    byte[] bytes = RestAssured.given().contentType(ContentType.TEXT)
            .body("content to be included in the created pdf document").post("/pdf/createFromText").then().statusCode(201)
            .extract().asByteArray();

    // try-with-resources closes the document even when an assertion fails.
    try (PDDocument doc = PDDocument.load(bytes)) {
        PDFTextStripper pdfTextStripper = new PDFTextStripper();
        String text = pdfTextStripper.getText(doc);
        assertEquals(1, doc.getNumberOfPages());
        assertTrue(text.contains("content to be included in the created pdf document"));
    }
}

Source File: PdfIntegrationTests.java From java-wkhtmltopdf-wrapper with MIT License

5 votes

/**
 * Parses the given bytes as a PDF document and returns its extracted text.
 *
 * @param pdfBytes the raw PDF content
 * @return the text extracted by a default {@code PDFTextStripper}
 * @throws IOException if the bytes cannot be parsed or the text cannot be extracted
 */
private String getPdfTextFromBytes(byte[] pdfBytes) throws IOException {
    // try-with-resources closes the document even when extraction throws.
    try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(pdfBytes))) {
        return new PDFTextStripper().getText(pdDocument);
    }
}

Source File: PDFFormatModule.java From ontopia with Apache License 2.0

5 votes

@Override
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
  try {
    // Extract the full text first; the document is closed before the handler
    // is invoked, and also when extraction fails.
    String s;
    try (PDDocument pdoc = PDDocument.load(new BufferedInputStream(new ByteArrayInputStream(cc.getContent())))) {
      PDFTextStripper stripper = new PDFTextStripper();
      s = stripper.getText(pdoc);
    }
    // Hand the whole text to the handler as a single "document" region.
    char[] c = s.toCharArray();
    handler.startRegion("document");
    handler.text(c, 0, c.length);
    handler.endRegion();
  } catch (Exception e) {
    throw new OntopiaRuntimeException(e);
  }
}

Source File: PdfUtils.java From job with MIT License

5 votes

/**
 * Parses a PDF from the given stream and returns its extracted text.
 * The stream is always closed by this method, including when parsing fails.
 *
 * @param input stream providing the PDF content; closed before returning
 * @return the text extracted by a default {@code PDFTextStripper}
 * @throws Exception if the PDF cannot be parsed or the text cannot be extracted
 */
public static String parsePdf2Text(InputStream input) throws Exception {
  // Resources are closed in reverse order: the document first, then the input.
  try (InputStream in = input;
       PDDocument doc = PDDocument.load(in)) {
    // getText returns the characters directly, so the text no longer passes
    // through platform-default charset bytes on its way to a String.
    return new PDFTextStripper().getText(doc);
  }
}

Source File: PdfContentTypeChecker.java From tika-server with Apache License 2.0

5 votes

private void calculateObjectsInDocument(PDDocument document) throws IOException {
    this.pdfTextStripper = new PDFTextStripper();

    try {
        PDPageTree pageTree = document.getDocumentCatalog().getPages();
        this.pageCount = pageTree.getCount();

        // Visit every page: read its objects, then measure its text using the
        // page's 1-based number.
        int pageIndex = 0;
        while (pageIndex < pageTree.getCount()) {
            readObjectsOnPage(pageTree.get(pageIndex));
            pageIndex++;
            calculateTextLengthOnPage(document, pageIndex);
        }
    } catch (Exception e) {
        // Failures are reported on stderr and do not propagate to the caller.
        e.printStackTrace();
    }
}

Source File: SurvivorSongbookParser.java From Quelea with GNU General Public License v3.0

5 votes

/**
 * Get all the songs in the PDF document.
 * @return a list of all the songs.
 * @throws IOException if something went wrong.
 */
@Override
public List<SongDisplayable> getSongs(File location, StatusPanel statusPanel) throws IOException {
    List<SongDisplayable> pdfSongs = new ArrayList<>();
    // try-with-resources closes the document even when page extraction or
    // song processing throws.
    try (PDDocument document = PDDocument.load(location)) {
        PDFTextStripper stripper = new PDFTextStripper();
        // Page texts collected for the song currently being assembled.
        List<String> songParts = new ArrayList<>();
        int pageCount = document.getNumberOfPages();
        for (int i = 0; i < pageCount; i++) {
            String pageText = getPageText(document, stripper, i);
            if (pageText.trim().isEmpty()) {
                continue;
            }
            songParts.add(pageText);
            boolean twoPart = pageText.contains("(1 of");
            if (i < pageCount - 1) { //This section in case the original (1 of x) is missed out
                String nextPageText = getPageText(document, stripper, i + 1);
                if (nextPageText.contains("(2 of")) {
                    twoPart = true;
                }
            }
            // A song is complete once the current page is not flagged as
            // continuing onto the next one.
            if (!twoPart) {
                SongDisplayable song = processSong(songParts.toArray(new String[0]));
                if (song != null) {
                    pdfSongs.add(song);
                }
                songParts.clear();
            }
        }
    }
    // The list is never null; it is simply empty when no song was found.
    return pdfSongs;
}

Source File: PDFExtractor.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

/**
 * Loads the PDF at {@code getResourcePath()} and prints its extracted text
 * to standard output; an {@link IOException} is printed instead of thrown.
 *
 * @param args command line arguments; not used
 */
public static void main(String args[]){
    // try-with-resources closes the document once the text has been printed
    // or extraction has failed.
    try (PDDocument pd = PDDocument.load(new File(getResourcePath()))) {
        PDFTextStripper stripper = new PDFTextStripper();
        String text= stripper.getText(pd);
        System.out.println(text);
    }
    catch(IOException ex){
        System.out.println(ex);
    }
}

Source File: Pdf.java From webtau with Apache License 2.0

5 votes

public PdfText pageText(int pageIdx) {
    // The stripper is given pageIdx + 1 as both start and end page, so
    // exactly one page is extracted.
    final int pageNumber = pageIdx + 1;
    final String textId = "body.pdf.pageIdx(" + pageIdx + ").text";
    try {
        PDFTextStripper singlePageStripper = new PDFTextStripper();
        singlePageStripper.setStartPage(pageNumber);
        singlePageStripper.setEndPage(pageNumber);

        String pageContent = singlePageStripper.getText(document);
        return new PdfText(textId, pageContent);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

org.apache.pdfbox.text.PDFTextStripper Java Examples