org.apache.pdfbox.text.PDFTextStripper#getText

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * Extracts the text of <code>test-2.pdf</code> after removing the
 * <b>ToUnicode</b> maps from the resources of every page, prints the result
 * and writes it to <code>test-2_NoToUnicode.txt</code> in the result folder.
 * <p>
 * <a href="https://stackoverflow.com/questions/45895768/pdfbox-2-0-7-extracttext-not-working-but-1-8-13-does-and-pdfreader-as-well">
 * PDFBox 2.0.7 ExtractText not working but 1.8.13 does and PDFReader as well
 * </a>
 * <br/>
 * <a href="https://wetransfer.com/downloads/214674449c23713ee481c5a8f529418320170827201941/b2bea6">
 * test-2.pdf
 * </a>
 * </p>
 * <p>
 * Due to the broken <b>ToUnicode</b> maps the output of immediate text
 * extraction from this document is unsatisfying, cf. {@link #testTest2()}.
 * It can be improved by removing these <b>ToUnicode</b> maps as this test
 * shows.
 * </p>
 *
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void testNoToUnicodeTest2() throws IOException
{
    // The document is declared as a resource so that it is closed even if
    // the extraction or the file write throws.
    try (   InputStream resource = getClass().getResourceAsStream("test-2.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        for (int pageNr = 0; pageNr < document.getNumberOfPages(); pageNr++)
        {
            PDPage page = document.getPage(pageNr);
            PDResources resources = page.getResources();
            removeToUnicodeMaps(resources);
        }

        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);

        System.out.printf("\n*\n* test-2.pdf without ToUnicode\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "test-2_NoToUnicode.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractTextExample.java From blog-codes with Apache License 2.0

6 votes

/**
 * Prints the text of the PDF at a hard-coded path to standard output, one
 * line per <code>println</code> call. Nothing is printed when the document
 * reports itself as encrypted.
 *
 * @param args not used
 * @throws InvalidPasswordException declared for the document load
 * @throws IOException if loading the document or extracting its text fails
 */
public static void main(String[] args) throws InvalidPasswordException, IOException {
    try (PDDocument document = PDDocument.load(new File("/home/lili/data/test.pdf"))) {
        if (!document.isEncrypted()) {
            PDFTextStripper tStripper = new PDFTextStripper();
            // To extract a single page or a range of pages, narrow the range
            // below. Page numbers are 1-based; this range covers every page.
            tStripper.setStartPage(1);
            tStripper.setEndPage(document.getNumberOfPages());
            String pdfFileInText = tStripper.getText(document);
            // Split on both Unix and Windows line endings.
            String[] lines = pdfFileInText.split("\\r?\\n");
            for (String line : lines) {
                System.out.println(line);
            }
        }
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * Downloads <code>PDF32000_2008.pdf</code>, extracts the text of page 2
 * only, prints it and writes it to <code>PDF32000_2008-page-ii.txt</code>
 * in the result folder.
 * <p>
 * <a href="https://stackoverflow.com/a/56580253/1729265">
 * wen li's answer to "PDFBox extracting paragraphs"
 * </a>
 * <br/>
 * <a href="https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf">
 * PDF32000_2008.pdf
 * </a>
 * </p>
 * <p>
 * Here it looks the other way around compared to what the OP claims:
 * there is a space at the end of all but the last paragraph line.
 * </p>
 *
 * @throws IOException if the download, the extraction or the file write fails
 */
@Test
public void testPDF32000pageii() throws IOException
{
    // Both the network stream and the document are closed when the block exits.
    try (   InputStream resource = new URL("https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf").openStream();
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(2);
        stripper.setEndPage(2);
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* PDF32000_2008.pdf Page ii\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "PDF32000_2008-page-ii.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractColorText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * Runs a {@link ColorTextStripper} over <code>furzo Sample.pdf</code>,
 * stores the UTF-8 encoded result as <code>furzo Sample.txt</code> in the
 * result folder and echoes it to standard output.
 * <p>
 * <a href="https://stackoverflow.com/questions/59031734/get-text-color-in-pdfbox">
 * Get text color in PDFBox
 * </a>
 * </p>
 * <p>
 * This test has already been executed for the original color text stripper class from my answer to
 * <a href="https://stackoverflow.com/questions/21430341/identifying-the-text-based-on-the-output-in-pdf-using-pdfbox">
 * Identifying the text based on the output in PDF using PDFBOX
 * </a>
 * </p>
 * 
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void testExtractFromFurzoSample() throws IOException {
    try (   InputStream pdfStream = getClass().getResourceAsStream("furzo Sample.pdf");
            PDDocument pdf = Loader.loadPDF(pdfStream) ) {
        PDFTextStripper colorStripper = new ColorTextStripper();
        String coloredText = colorStripper.getText(pdf);

        // Persist the result first, then echo it between the marker lines.
        File target = new File(RESULT_FOLDER, "furzo Sample.txt");
        Files.write(target.toPath(), coloredText.getBytes("UTF-8"));

        String[] header = { "/// furzo Sample.pdf ///", "Stripped text with color:", ">>>" };
        for (String headerLine : header) {
            System.out.println(headerLine);
        }
        System.out.println(coloredText);
        System.out.println("<<<");
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * Downloads <code>PDF32000_2008.pdf</code>, extracts the text of page 7
 * only, prints it and writes it to <code>PDF32000_2008-page-vii.txt</code>
 * in the result folder.
 * <p>
 * <a href="https://stackoverflow.com/a/56580253/1729265">
 * wen li's answer to "PDFBox extracting paragraphs"
 * </a>
 * <br/>
 * <a href="https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf">
 * PDF32000_2008.pdf
 * </a>
 * </p>
 * <p>
 * Here one sees that there is not always a space at the end of all
 * the non-last paragraph lines, "PDF/X" is split as "PDF/" and "X"
 * between lines, and there is no space in-between.
 * </p>
 *
 * @throws IOException if the download, the extraction or the file write fails
 */
@Test
public void testPDF32000pagevii() throws IOException
{
    // Both the network stream and the document are closed when the block exits.
    try (   InputStream resource = new URL("https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf").openStream();
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setStartPage(7);
        stripper.setEndPage(7);
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        // The banner names the page actually extracted (vii), matching the output file name.
        System.out.printf("\n*\n* PDF32000_2008.pdf Page vii\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "PDF32000_2008-page-vii.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractTextHelper.java From o2oa with GNU Affero General Public License v3.0

5 votes

/**
 * Extracts the text of a PDF held in memory.
 *
 * @param bytes the raw content of the PDF file
 * @return the text of pages 1 through the last page, or {@code null} when
 *         any exception occurs (the exception is logged, not rethrown)
 */
public static String pdf(byte[] bytes) {
	try {
		PDFParser pdfParser = new PDFParser(new RandomAccessBuffer(bytes));
		pdfParser.parse();
		try (COSDocument cosDocument = pdfParser.getDocument();
				PDDocument pdDocument = new PDDocument(cosDocument)) {
			PDFTextStripper textStripper = new PDFTextStripper();
			// Explicitly cover the whole document, first page to last.
			textStripper.setStartPage(1);
			textStripper.setEndPage(pdDocument.getNumberOfPages());
			return textStripper.getText(pdDocument);
		}
	} catch (Exception e) {
		// Failures are reported through the log and signalled by a null result.
		logger.error(e);
		return null;
	}
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Extracts the text of the BlackBerry datasheet sample, prints it and
 * writes it to a text file of the same name in the result folder.
 * <p>
 * <a href="http://stackoverflow.com/questions/38975091/pdfbox-gettext-not-returning-all-of-the-visible-text">
 * PDFBox getText not returning all of the visible text
 * </a>
 * <br>
 * <a href="https://dl.dropboxusercontent.com/u/14898138/03%20WP%20Enterprise%20BlackBerry%20Compete%20Datasheet_041612%20FINAL%20DRAFT.pdf">
 * 03 WP Enterprise BlackBerry Compete Datasheet_041612 FINAL DRAFT.pdf
 * </a>
 * </p>
 * <p>
 * There is some 'writing' actually done using vector graphics, not text,
 * but aside from that all is accounted for.
 * </p>
 *
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void test03WpEnterpriseBlackBerryCompeteDatasheet_041612FinalDraft() throws IOException
{
    // The document is declared as a resource so that it is always closed.
    try (   InputStream resource = getClass().getResourceAsStream("03 WP Enterprise BlackBerry Compete Datasheet_041612 FINAL DRAFT.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* 03 WP Enterprise BlackBerry Compete Datasheet_041612 FINAL DRAFT.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "03 WP Enterprise BlackBerry Compete Datasheet_041612 FINAL DRAFT.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Extracts the text of <code>Bal_532935_0314.pdf</code>, prints it and
 * writes it to <code>Bal_532935_0314.txt</code> in the result folder.
 * <p>
 * <a href="http://stackoverflow.com/questions/37862159/pdf-reading-via-pdfbox-in-java">
 * pdf reading via pdfbox in java 
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B_Ke2amBgdpebm96U05FcWFsSXM/view?usp=sharing">
 * Bal_532935_0314.pdf
 * </a>
 * </p>
 * <p>
 * The issue here is caused by PDFBox guessing an encoding. The underlying method
 * `PDFTextStreamEngine.showGlyph` does this for all unmappable glyphs from simple
 * fonts.
 * </p>
 *
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void testBal_532935_0314() throws IOException
{
    // The document is declared as a resource so that it is always closed.
    try (   InputStream resource = getClass().getResourceAsStream("Bal_532935_0314.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* Bal_532935_0314.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "Bal_532935_0314.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Extracts the text of <code>demo.pdf</code>, prints it and writes it to
 * <code>demo.txt</code> in the result folder.
 * <p>
 * <a href="https://stackoverflow.com/questions/51672080/pdfdomtree-does-not-detecting-white-spaces-while-converting-a-pdf-file-to-html">
 * PDFDomTree does not detecting white spaces while converting a pdf file to html
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/1SZNFCvGVbQzCxJiRr8HlW99ravC_Cm71/view?usp=sharing">
 * demo.pdf
 * </a>
 * </p>
 * <p>
 * PDFBox shows no issue extracting the text from the given file.
 * </p>
 *
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void testDemo() throws IOException
{
    // The document is declared as a resource so that it is always closed.
    try (   InputStream resource = getClass().getResourceAsStream("demo.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* demo.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "demo.txt").toPath(), Collections.singleton(text));
    }
}

Source File: SearchSubword.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Collects every occurrence of a search term within the text chunks that
 * the stripper emits for a single page, with position sorting enabled.
 * <p>
 * Each chunk passed to <code>writeString</code> is searched on its own, so
 * a match cannot span two chunks. Overlapping matches inside one chunk are
 * all reported because the search resumes one character after each hit.
 * Every chunk is also echoed to standard output.
 * </p>
 *
 * @param document   the document to search
 * @param page       the page number, used as both start and end page of the stripper
 * @param searchTerm the text to look for; an empty term yields no hits
 * @return the matching sub-sequences, in the order they were found
 * @throws IOException if text extraction fails
 */
List<TextPositionSequence> findSubwords(PDDocument document, int page, String searchTerm) throws IOException
{
    final List<TextPositionSequence> hits = new ArrayList<>();
    PDFTextStripper stripper = new PDFTextStripper()
    {
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException
        {
            System.out.printf("  -- %s\n", text);

            TextPositionSequence word = new TextPositionSequence(textPositions);
            String string = word.toString();

            // indexOf("", from) never returns -1, so an empty term would keep
            // the loop below running forever; skip the search in that case.
            if (!searchTerm.isEmpty())
            {
                int fromIndex = 0;
                int index;
                while ((index = string.indexOf(searchTerm, fromIndex)) > -1)
                {
                    hits.add(word.subSequence(index, index + searchTerm.length()));
                    // Advance by one character only, so overlapping matches are found too.
                    fromIndex = index + 1;
                }
            }
            super.writeString(text, textPositions);
        }
    };
    
    stripper.setSortByPosition(true);
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    // The returned text is not needed; the call only drives writeString.
    stripper.getText(document);
    return hits;
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Extracts the text of <code>NoTemplateInError.pdf</code>, prints it and
 * writes it to <code>NoTemplateInError.txt</code> in the result folder.
 * <p>
 * <a href="https://stackoverflow.com/questions/47515609/invalid-block-type-while-using-pdfbox-2-0-8">
 * Invalid block type while using pdfbox 2.0.8
 * </a>
 * <br>
 * <a href="https://www.dropbox.com/s/xjeksj0cay4x3vo/NoTemplateInError.pdf?dl=0">
 * NoTemplateInError.pdf
 * </a>
 * </p>
 * <p>
 * The issue cannot be reproduced.
 * </p>
 *
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void testNoTemplateInError() throws IOException
{
    // The document is declared as a resource so that it is always closed.
    try (   InputStream resource = getClass().getResourceAsStream("NoTemplateInError.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* NoTemplateInError.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "NoTemplateInError.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractTextTools.java From o2oa with GNU Affero General Public License v3.0

5 votes

/**
 * Parses an in-memory PDF and returns its text.
 *
 * @param bytes the raw content of the PDF file
 * @return the text of pages 1 through the last page, or {@code null} when
 *         any exception occurs (the exception is logged, not rethrown)
 */
public static String pdf(byte[] bytes) {
	try {
		PDFParser pdfParser = new PDFParser(new RandomAccessBuffer(bytes));
		pdfParser.parse();
		try (COSDocument cosDocument = pdfParser.getDocument();
				PDDocument pdDocument = new PDDocument(cosDocument)) {
			PDFTextStripper textStripper = new PDFTextStripper();
			// Explicitly cover the whole document, first page to last.
			textStripper.setStartPage(1);
			textStripper.setEndPage(pdDocument.getNumberOfPages());
			return textStripper.getText(pdDocument);
		}
	} catch (Exception e) {
		// Failures are reported through the log and signalled by a null result.
		logger.error(e);
		return null;
	}
}

Source File: PDFExtractionExample.java From Java-for-Data-Science with MIT License

5 votes

/**
 * Prints the complete text of <code>PDF File.pdf</code> (resolved against
 * the working directory) to standard output. Any failure is reported as a
 * stack trace instead of being propagated.
 *
 * @param args not used
 */
public static void main(String[] args) {
    // try-with-resources closes the document even when extraction fails.
    try (PDDocument document = PDDocument.load(new File("PDF File.pdf"))) {
        PDFTextStripper textStripper = new PDFTextStripper();
        String documentText = textStripper.getText(document);
        System.out.println(documentText);
    } catch (Exception e) {
        e.printStackTrace();
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

3 votes

/**
 * Extracts the text of <code>10-million-password-list-top-1000000.pdf</code>,
 * prints it and writes it to a text file of the same name in the result folder.
 * <p>
 * <a href="https://stackoverflow.com/questions/53551335/java-does-pdfbox-have-an-option-to-open-file-instead-of-loading-it">
 * Java- Does pdfBox have an option to open file instead of loading it?
 * </a>
 * <br/>
 * <a href="https://www.dropbox.com/s/osyk2ieoq6od2p8/10-million-password-list-top-1000000.pdf?dl=0">
 * 10-million-password-list-top-1000000.pdf
 * </a>
 * </p>
 * <p>
 * In contrast to the OP I did not need to fiddle with the memory
 * settings at all for a plain extraction. Furthermore, I got 999999
 * lines with words and 3 empty lines from the file, not 10000000
 * passwords.
 * </p>
 *
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void test10MillionPasswordListTop1000000() throws IOException
{
    // The document is declared as a resource so that it is always closed.
    try (   InputStream resource = getClass().getResourceAsStream("10-million-password-list-top-1000000.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* 10-million-password-list-top-1000000.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "10-million-password-list-top-1000000.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractVisibleText.java From testarea-pdfbox2 with Apache License 2.0

3 votes

/**
 * Extracts the text of <code>testDmitryK.pdf</code> with a
 * {@link PDFVisibleTextStripper} and position sorting enabled, prints it and
 * writes it to <code>testDmitryK.txt</code> in the result folder.
 * <p>
 * <a href="https://stackoverflow.com/questions/47908124/pdfbox-removing-invisible-text-by-clip-filling-paths-issue">
 * PDFBox - Removing invisible text (by clip/filling paths issue)
 * </a>
 * <br/>
 * <a href="https://drive.google.com/open?id=1xcZOusx3cEdZX4AT8QAVDqZe33YWla0H">
 * test.pdf
 * </a> as testDmitryK.pdf
 * </p>
 * <p>
 * Indeed, using the original {@link PDFVisibleTextStripper} implementation
 * a lot of visible characters were dropped. This was due to the incorrect
 * calculation of the <code>end</code> of the character baseline in the methods
 * {@link PDFVisibleTextStripper#processTextPosition(org.apache.pdfbox.text.TextPosition)}
 * and {@link PDFVisibleTextStripper#deleteCharsInPath()}.
 * </p>
 * <p>
 * After patching those {@link PDFVisibleTextStripper} methods to make use of
 * <code>end</code> only optionally, running the test with that option results
 * in a decent extraction of visible text.
 * </p>
 *
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void testTestDmitryK() throws IOException {
    // The document is declared as a resource so that it is always closed.
    try (   InputStream resource = getClass().getResourceAsStream("testDmitryK.pdf");
            PDDocument document = Loader.loadPDF(resource)  ) {
        PDFTextStripper stripper = new PDFVisibleTextStripper();
        stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* testDmitryK.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "testDmitryK.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractVisibleText.java From testarea-pdfbox2 with Apache License 2.0

3 votes

/**
 * Extracts the text of <code>RevTeaser09072016.pdf</code> with a
 * {@link PDFVisibleTextStripper} constructed with <code>true</code>, prints
 * it and writes it to <code>RevTeaser09072016.txt</code> in the result folder.
 * <p>
 * <a href="https://stackoverflow.com/questions/47358127/remove-invisible-text-from-pdf-using-pdfbox">
 * remove invisible text from pdf using pdfbox
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/1F8vrzcABwxVGdN5W-7etQggY5xKtGplU/view">
 * RevTeaser09072016.pdf
 * </a>
 * </p>
 * <p>
 * This class tests the {@link PDFVisibleTextStripper} to ignore text hidden
 * by clipping or by covering with a filled path in the OP's sample document.
 * </p>
 *
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void testExtractFromRevTeaser09072016() throws IOException {
    // The document is declared as a resource so that it is always closed.
    try (   InputStream resource = getClass().getResourceAsStream("RevTeaser09072016.pdf");
            PDDocument document = Loader.loadPDF(resource)  ) {
        PDFTextStripper stripper = new PDFVisibleTextStripper(true);
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* RevTeaser09072016.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "RevTeaser09072016.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

3 votes

/**
 * Extracts the text of <code>testKabirManandhar.pdf</code>, prints it and
 * writes it to <code>testKabirManandhar.txt</code> in the result folder.
 * <p>
 * <a href="https://stackoverflow.com/questions/49746202/read-pdf-file-using-pdfbox-in-utf-8-in-java-scala">
 * Read pdf file using pdfbox in UTF-8 in java/scala
 * </a>
 * <br/>
 * <a href="https://1drv.ms/b/s!AmHcFaD-gMGyhg6eyqSy2gu9sLWl">
 * test.pdf
 * </a> as testKabirManandhar.pdf
 * </p>
 * <p>
 * The issue can be reproduced. The cause is incomplete ToUnicode
 * maps. There is an option, though: The embedded font programs
 * appear to include more complete mappings, so repairing the
 * ToUnicode table seems feasible.
 * </p>
 *
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void testTestKabirManandhar() throws IOException
{
    // The document is declared as a resource so that it is always closed.
    try (   InputStream resource = getClass().getResourceAsStream("testKabirManandhar.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* testKabirManandhar.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "testKabirManandhar.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractVisibleText.java From testarea-pdfbox2 with Apache License 2.0

3 votes

/**
 * Extracts the text of <code>00000000000005fw6q.pdf</code> with a
 * {@link PDFVisibleTextStripper} and position sorting enabled, prints it and
 * writes it to <code>00000000000005fw6q.txt</code> in the result folder.
 * <p>
 * <a href="https://github.com/mkl-public/testarea-pdfbox2/issues/3">
 * One case fails to remove invisible texts or symbols
 * </a>
 * <br/>
 * <a href="https://github.com/mkl-public/testarea-pdfbox2/files/2481423/00000000000005fw6q.pdf">
 * 00000000000005fw6q.pdf
 * </a>
 * </p>
 * <p>
 * The "hidden text" recognized by Adobe here is only "hidden"
 * because it uses a glyph (page 1, Font F9, code 0000) for which
 * the embedded font draws nothing but which ToUnicode maps to
 * U+DBD0, a High Private Use Surrogate which by itself in general
 * makes no sense.
 * </p>
 *
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void test00000000000005fw6q() throws IOException {
    // The document is declared as a resource so that it is always closed.
    try (   InputStream resource = getClass().getResourceAsStream("00000000000005fw6q.pdf");
            PDDocument document = Loader.loadPDF(resource)  ) {
        PDFTextStripper stripper = new PDFVisibleTextStripper();
        stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* 00000000000005fw6q.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "00000000000005fw6q.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

3 votes

/**
 * Extracts the text of <code>test-2.pdf</code> as is, prints it and writes
 * it to <code>test-2.txt</code> in the result folder.
 * <p>
 * <a href="https://stackoverflow.com/questions/45895768/pdfbox-2-0-7-extracttext-not-working-but-1-8-13-does-and-pdfreader-as-well">
 * PDFBox 2.0.7 ExtractText not working but 1.8.13 does and PDFReader as well
 * </a>
 * <br/>
 * <a href="https://wetransfer.com/downloads/214674449c23713ee481c5a8f529418320170827201941/b2bea6">
 * test-2.pdf
 * </a>
 * </p>
 * <p>
 * Due to the broken <b>ToUnicode</b> maps the output of this test is
 * unsatisfying. It can be improved by removing these <b>ToUnicode</b>
 * maps, cf. {@link #testNoToUnicodeTest2()}.
 * </p>
 *
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void testTest2() throws IOException
{
    // The document is declared as a resource so that it is always closed.
    try (   InputStream resource = getClass().getResourceAsStream("test-2.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);

        System.out.printf("\n*\n* test-2.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "test-2.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

3 votes

/**
 * Extracts the text of <code>cannotExtract.pdf</code>, prints it and writes
 * it to <code>cannotExtract.txt</code> in the result folder.
 * <p>
 * <a href="https://stackoverflow.com/questions/54644435/error-when-extracting-text-from-pdf-using-pdfbox">
 * Error when extracting text from pdf using pdfbox
 * </a>
 * <br/>
 * <a href="http://ishouhuo.cn/cannotExtract.pdf">
 * cannotExtract.pdf
 * </a>
 * </p>
 * <p>
 * Indeed, all information required for text extraction is missing from the font
 * PingFangSC in all its variants. Thus, text extraction results automatically are
 * lacking.
 * </p>
 *
 * @throws IOException if loading the document, extracting the text or
 *                     writing the result file fails
 */
@Test
public void testCannotExtract() throws IOException
{
    // The document is declared as a resource so that it is always closed.
    try (   InputStream resource = getClass().getResourceAsStream("cannotExtract.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);

        System.out.printf("\n*\n* cannotExtract.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "cannotExtract.txt").toPath(), Collections.singleton(text));
    }
}

Java Code Examples for org.apache.pdfbox.text.PDFTextStripper#getText()