org.apache.pdfbox.text.PDFTextStripper#setEndPage

Source File: PrintTextLocations.java From blog-codes with Apache License 2.0

6 votes

/**
 * Runs a {@code PrintTextLocations} stripper over every page of the hard-coded test PDF.
 * <p>
 * The stripped text itself is written to an in-memory writer and thrown away; any useful
 * output presumably comes from callbacks overridden in {@code PrintTextLocations}
 * (not visible in this snippet).
 * </p>
 *
 * @param args ignored
 * @throws IOException if the PDF cannot be loaded or processed
 */
public static void main(String[] args) throws IOException {
	// try-with-resources closes the document even if text stripping fails.
	try (PDDocument document = PDDocument.load(new File("/home/lili/data/test.pdf"))) {
		PDFTextStripper stripper = new PrintTextLocations();
		stripper.setSortByPosition(true);
		// PDFBox page numbers are 1-based, so the first page is 1 (0 selected the same range).
		stripper.setStartPage(1);
		stripper.setEndPage(document.getNumberOfPages());

		// The writer only acts as a sink; its content is never read.
		Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
		stripper.writeText(document, dummy);
	}
}

Source File: ExtractTextExample.java From blog-codes with Apache License 2.0

6 votes

/**
 * Prints the text of the hard-coded test PDF to standard output, one text line per
 * output line. Encrypted documents are skipped silently.
 *
 * @param args ignored
 * @throws InvalidPasswordException if the PDF requires a password to be opened
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 */
public static void main(String[] args) throws InvalidPasswordException, IOException {
    try (PDDocument document = PDDocument.load(new File("/home/lili/data/test.pdf"))) {
        if (!document.isEncrypted()) {
            PDFTextStripper tStripper = new PDFTextStripper();
            // To extract a single page or a range of pages, narrow the range below.
            // Currently every page is extracted (PDFBox page numbers are 1-based).
            tStripper.setStartPage(1);
            tStripper.setEndPage(document.getNumberOfPages());
            String pdfFileInText = tStripper.getText(document);
            // Split on both Windows (\r\n) and Unix (\n) line endings.
            String[] lines = pdfFileInText.split("\\r?\\n");
            for (String line : lines) {
                System.out.println(line);
            }
        }
    }
}

Source File: GetLinesFromPDF.java From blog-codes with Apache License 2.0

6 votes

/**
 * Runs a {@code GetLinesFromPDF} stripper over every page of the hard-coded test PDF
 * and then prints each entry of {@code lines} to standard output.
 * <p>
 * {@code lines} is declared outside this snippet; presumably the stripper fills it
 * while the document is processed.
 * </p>
 *
 * @param args ignored
 * @throws IOException If there is an error parsing the document.
 */
public static void main( String[] args ) throws IOException {
    String fileName = "/home/lili/data/test.pdf";
    // try-with-resources closes the document even if text stripping fails.
    try (PDDocument document = PDDocument.load( new File(fileName) )) {
        PDFTextStripper stripper = new GetLinesFromPDF();
        stripper.setSortByPosition( true );
        // PDFBox page numbers are 1-based, so the first page is 1 (0 selected the same range).
        stripper.setStartPage( 1 );
        stripper.setEndPage( document.getNumberOfPages() );
        // The writer only acts as a sink; its content is never read.
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        stripper.writeText(document, dummy);

        // print lines
        for (String line : lines) {
            System.out.println(line);
        }
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * <a href="https://stackoverflow.com/a/56580253/1729265">
 * wen li's answer to "PDFBox extracting paragraphs"
 * </a>
 * <br/>
 * <a href="https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf">
 * PDF32000_2008.pdf
 * </a>
 * <p>
 * Here it looks the other way around compared to what the OP claims:
 * there is a space at the end of all but the last paragraph line.
 * </p>
 * <p>
 * The test downloads the PDF, extracts the text of page 2 (1-based), prints it and
 * writes it to a file in {@code RESULT_FOLDER}.
 * </p>
 *
 * @throws IOException if the download, the text extraction or the file write fails
 */
@Test
public void testPDF32000pageii() throws IOException
{
    // The document is a resource of its own so that it is closed as well as the stream.
    try (   InputStream resource = new URL("https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf").openStream();
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        // Restrict extraction to the single page of interest.
        stripper.setStartPage(2);
        stripper.setEndPage(2);
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        System.out.printf("\n*\n* PDF32000_2008.pdf Page ii\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "PDF32000_2008-page-ii.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * <a href="https://stackoverflow.com/a/56580253/1729265">
 * wen li's answer to "PDFBox extracting paragraphs"
 * </a>
 * <br/>
 * <a href="https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf">
 * PDF32000_2008.pdf
 * </a>
 * <p>
 * Here one sees that there is not always a space at the end of all
 * the non-last paragraph lines, "PDF/X" is split as "PDF/" and "X"
 * between lines, and there is no space in-between.
 * </p>
 * <p>
 * The test downloads the PDF, extracts the text of page 7 (1-based), prints it and
 * writes it to a file in {@code RESULT_FOLDER}.
 * </p>
 *
 * @throws IOException if the download, the text extraction or the file write fails
 */
@Test
public void testPDF32000pagevii() throws IOException
{
    // The document is a resource of its own so that it is closed as well as the stream.
    try (   InputStream resource = new URL("https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf").openStream();
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper();
        // Restrict extraction to the single page of interest.
        stripper.setStartPage(7);
        stripper.setEndPage(7);
        //stripper.setSortByPosition(true);
        String text = stripper.getText(document);

        // Banner names page vii, matching the extracted page and the output file name.
        System.out.printf("\n*\n* PDF32000_2008.pdf Page vii\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "PDF32000_2008-page-vii.txt").toPath(), Collections.singleton(text));
    }
}

Source File: Pdf.java From webtau with Apache License 2.0

5 votes

/**
 * Extracts the text of a single page of {@code document}.
 *
 * @param pageIdx page index; the stripper is given {@code pageIdx + 1} as both its
 *                start and end page, so the index is presumably zero-based
 * @return the page text wrapped in a {@code PdfText} whose id embeds {@code pageIdx}
 * @throws RuntimeException wrapping any {@link IOException} raised during extraction
 */
public PdfText pageText(int pageIdx) {
    final int pageNumber = pageIdx + 1;
    final String textId = "body.pdf.pageIdx(" + pageIdx + ").text";

    try {
        PDFTextStripper stripper = new PDFTextStripper();
        // Identical start and end page limit the extraction to one page.
        stripper.setStartPage(pageNumber);
        stripper.setEndPage(pageNumber);

        String pageContent = stripper.getText(document);
        return new PdfText(textId, pageContent);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

Source File: ExtractTextTools.java From o2oa with GNU Affero General Public License v3.0

5 votes

/**
 * Extracts the text of all pages of a PDF supplied as raw bytes.
 *
 * @param bytes the content of the PDF file
 * @return the extracted text, or {@code null} if any exception occurred (the exception
 *         is passed to {@code logger.error} and not rethrown)
 */
public static String pdf(byte[] bytes) {
	try {
		PDFParser pdfParser = new PDFParser(new RandomAccessBuffer(bytes));
		pdfParser.parse();

		String extracted;
		try (COSDocument cosDocument = pdfParser.getDocument();
				PDDocument pdDocument = new PDDocument(cosDocument)) {
			final int lastPage = pdDocument.getNumberOfPages();
			PDFTextStripper textStripper = new PDFTextStripper();
			textStripper.setStartPage(1);
			textStripper.setEndPage(lastPage);
			extracted = textStripper.getText(pdDocument);
		}
		// Reached only when both documents closed without throwing.
		return extracted;
	} catch (Exception e) {
		logger.error(e);
		return null;
	}
}

Source File: ExtractTextHelper.java From o2oa with GNU Affero General Public License v3.0

5 votes

/**
 * Extracts the text of all pages of a PDF supplied as raw bytes.
 *
 * @param bytes the content of the PDF file
 * @return the extracted text, or {@code null} if any exception occurred (the exception
 *         is passed to {@code logger.error} and not rethrown)
 */
public static String pdf(byte[] bytes) {
	try {
		PDFParser pdfParser = new PDFParser(new RandomAccessBuffer(bytes));
		pdfParser.parse();

		String extracted;
		try (COSDocument cosDocument = pdfParser.getDocument();
				PDDocument pdDocument = new PDDocument(cosDocument)) {
			final int lastPage = pdDocument.getNumberOfPages();
			PDFTextStripper textStripper = new PDFTextStripper();
			textStripper.setStartPage(1);
			textStripper.setEndPage(lastPage);
			extracted = textStripper.getText(pdDocument);
		}
		// Reached only when both documents closed without throwing.
		return extracted;
	} catch (Exception e) {
		logger.error(e);
		return null;
	}
}

Source File: ExtractTextHelper.java From o2oa with GNU Affero General Public License v3.0

5 votes

/**
 * Extracts the text of all pages of a PDF supplied as raw bytes.
 *
 * @param bytes the content of the PDF file
 * @return the extracted text, or {@code null} if any exception occurred (the exception
 *         is passed to {@code logger.error} and not rethrown)
 */
public static String pdf(byte[] bytes) {
	try {
		PDFParser pdfParser = new PDFParser(new RandomAccessBuffer(bytes));
		pdfParser.parse();

		String extracted;
		try (COSDocument cosDocument = pdfParser.getDocument();
				PDDocument pdDocument = new PDDocument(cosDocument)) {
			final int lastPage = pdDocument.getNumberOfPages();
			PDFTextStripper textStripper = new PDFTextStripper();
			textStripper.setStartPage(1);
			textStripper.setEndPage(lastPage);
			extracted = textStripper.getText(pdDocument);
		}
		// Reached only when both documents closed without throwing.
		return extracted;
	} catch (Exception e) {
		logger.error(e);
		return null;
	}
}

Source File: SearchSubword.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Collects every occurrence of {@code searchTerm} inside the text chunks of one page.
 * <p>
 * Occurrences may overlap, because the search resumes one character after the start
 * of the previous hit. Each chunk passed to {@code writeString} is also printed to
 * standard output. An empty {@code searchTerm} yields no hits.
 * </p>
 *
 * @param document   the document to search
 * @param page       the page to search, passed to the stripper as both start and end page
 * @param searchTerm the text to look for
 * @return the matching sub-sequences in the order they were encountered
 * @throws IOException if text extraction fails
 */
List<TextPositionSequence> findSubwords(PDDocument document, int page, String searchTerm) throws IOException
{
    final List<TextPositionSequence> hits = new ArrayList<TextPositionSequence>();
    PDFTextStripper stripper = new PDFTextStripper()
    {
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException
        {
            System.out.printf("  -- %s\n", text);

            TextPositionSequence word = new TextPositionSequence(textPositions);
            String string = word.toString();

            // indexOf("", from) never returns -1, so an empty term would loop forever
            // and grow 'hits' without bound; skip the search in that case.
            if (!searchTerm.isEmpty())
            {
                int fromIndex = 0;
                int index;
                while ((index = string.indexOf(searchTerm, fromIndex)) > -1)
                {
                    hits.add(word.subSequence(index, index + searchTerm.length()));
                    // Advance by one character only, so overlapping matches are found too.
                    fromIndex = index + 1;
                }
            }
            super.writeString(text, textPositions);
        }
    };
    
    stripper.setSortByPosition(true);
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    // The returned text is not needed; the call drives the writeString callback above.
    stripper.getText(document);
    return hits;
}

Source File: ExtractWordCoordinates.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="https://stackoverflow.com/questions/50330484/could-someone-give-me-an-example-of-how-to-extract-coordinates-for-a-word-usin">
 * Could someone give me an example of how to extract coordinates for a 'word' using PDFBox
 * </a>
 * <br/>
 * <a href="https://www.tutorialkart.com/pdfbox/how-to-get-location-and-size-of-images-in-pdf/attachment/apache-pdf/">
 * apache.pdf
 * </a>
 * <p>
 * This test shows how to extract word coordinates combining the ideas of
 * the two tutorials referenced by the OP.
 * </p>
 *
 * @throws IOException if the resource cannot be loaded or processed
 */
@Test
public void testExtractWordsForGoodJuJu() throws IOException {
    // The document is a resource of its own so that it is closed as well as the stream.
    try (   InputStream resource = getClass().getResourceAsStream("apache.pdf");
            PDDocument document = Loader.loadPDF(resource)) {
        PDFTextStripper stripper = new GetWordLocationAndSize();
        stripper.setSortByPosition( true );
        // PDFBox page numbers are 1-based, so the first page is 1 (0 selected the same range).
        stripper.setStartPage( 1 );
        stripper.setEndPage( document.getNumberOfPages() );
 
        // The writer only acts as a sink; its content is never read.
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        stripper.writeText(document, dummy);
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

4 votes

/**
     * <a href="https://stackoverflow.com/questions/54822124/pdftextstripperbyarea-and-pdftextstripper-parsing-different-text-output-for-tabl">
     * PDFTextStripperByArea and PDFTextStripper parsing different Text Output for Table with Merged Cell or Table cell with multi-line text content
     * </a>
     * <br/>
     * <a href="https://www4.esc13.net/uploads/webccat/docs/PDFTables_12142005.pdf">
     * PDFTables_12142005.pdf
     * </a>
     * <p>
     * Cannot reproduce the problem, and the OP does not react to clarification requests.
     * </p>
     * <p>
     * The test extracts the second page once with a plain {@code PDFTextStripper} and once
     * with a {@code PDFTextStripperByArea} using two regions that both cover the media box,
     * prints all results and finally prints whether the two extractions are equal.
     * </p>
     *
     * @throws IOException if the resource cannot be loaded or processed
     */
    @Test
    public void testPDFTables_12142005() throws IOException {
        // The document is a resource of its own so that it is closed as well as the stream.
        try (   InputStream resource = getClass().getResourceAsStream("PDFTables_12142005.pdf");
                PDDocument document =  Loader.loadPDF(resource)    )
        {
            PDFTextStripper textStripper = new PDFTextStripper();
            textStripper.setSortByPosition(true);
            textStripper.setAddMoreFormatting(false);
            // textStripper.setSpacingTolerance(1.5F);
            //textStripper.setAverageCharTolerance(averageCharToleranceValue);

            // Stripper page numbers are 1-based; page 2 here is document.getPage(1) below.
            textStripper.setStartPage(2);
            textStripper.setEndPage(2);

            textStripper.getCurrentPage();
            String text = textStripper.getText(document).trim();
            System.out.println("PDF text is: " + "\n" + text.trim());

            System.out.println("----------------------------------------------------------------");

            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.setSortByPosition(true);
            stripper.setAddMoreFormatting(false);
            // stripper.setSpacingTolerance(1.5F);

            Dimension dimension = new Dimension();
            dimension.setSize(document.getPage(1).getMediaBox().getWidth(),
                    document.getPage(1).getMediaBox().getHeight());
//            Rectangle2D rect = toJavaRect(document.getBleedBox(), dimension);
//            Rectangle2D rect1 = toJavaRect(document.getArtBox(), dimension);
            // Both regions are the page's media box; the "bleedBox"/"artBox" labels in the
            // output below are kept from the OP's code.
            PDRectangle mediaBox = document.getPage(1).getMediaBox();
            Rectangle2D rect = new Rectangle2D.Float(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight());
            Rectangle2D rect1 = rect;

            /*
             * Rectangle2D rect = new
             * Rectangle2D.Float(document.getBleedBox().getLowerLeftX(),
             * document.getBleedBox().getLowerLeftY(), document.getBleedBox().getWidth(),
             * document.getBleedBox().getHeight());
             */

            /*
             * Rectangle2D rect1 = new
             * Rectangle2D.Float(document.getArtBox().getLowerLeftX(),
             * document.getArtBox().getLowerLeftY(), document.getArtBox().getWidth(),
             * document.getArtBox().getHeight());
             */

            /*
             * Rectangle2D rect = new
             * Rectangle2D.Float(document.getBleedBox().getLowerLeftX(),
             * document.getBleedBox().getUpperRightY(), document.getBleedBox().getWidth(),
             * document.getBleedBox().getHeight());
             */

            System.out.println("Rectangle bleedBox Content : " + "\n" + rect);
            System.out.println("----------------------------------------------------------------");
            System.out.println("Rectangle artBox Content : " + "\n" + rect1);
            System.out.println("----------------------------------------------------------------");
            stripper.addRegion("Test1", rect);
            stripper.addRegion("Test2", rect1);
            stripper.extractRegions(document.getPage(1));

            System.out.println("Text in the area-BleedBox : " + "\n" + stripper.getTextForRegion("Test1").trim());
            System.out.println("----------------------------------------------------------------");
            System.out.println("Text in the area1-ArtBox : " + "\n" + stripper.getTextForRegion("Test2").trim());
            System.out.println("----------------------------------------------------------------");

            StringBuilder artPlusBleedBox = new StringBuilder();
            artPlusBleedBox.append(stripper.getTextForRegion("Test2").trim());
            artPlusBleedBox.append("\r\n");
            artPlusBleedBox.append(stripper.getTextForRegion("Test1").trim());

            System.out.println("Whole Page Text : " + artPlusBleedBox);
            System.out.println("----------------------------------------------------------------");
            // Encode and decode with the same charset; encoding with the platform default
            // and decoding as UTF-8 would corrupt non-ASCII text on non-UTF-8 platforms.
            text = new String(text.trim().getBytes("UTF-8"), "UTF-8");
            String text2 = new String(artPlusBleedBox.toString().trim().getBytes("UTF-8"), "UTF-8");
            System.out.println(" Matches equals with Both Content : " + text.equals(artPlusBleedBox.toString()));
            System.out.println(" String Matches equals with Both Content : " + text.equalsIgnoreCase(text2));
        }
    }

Source File: SurvivorSongbookParser.java From Quelea with GNU General Public License v3.0

3 votes

/**
 * Extracts the text of one page and normalises its quote characters: every right
 * single quotation mark and every backtick is replaced with a plain apostrophe.
 * @param document the document to read from.
 * @param stripper the text stripper to use; its start and end page are both set to
 *                 {@code page} by this call.
 * @param page     the number of the page to extract.
 * @return the normalised text of the given page.
 * @throws IOException if the stripper fails to write the page text.
 */
private String getPageText(PDDocument document, PDFTextStripper stripper, int page) throws IOException {
    StringWriter pageBuffer = new StringWriter();

    // Identical start and end page limit the extraction to one page.
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    stripper.writeText(document, pageBuffer);

    String rawText = pageBuffer.toString();
    String withoutCurlyQuotes = rawText.replace("’", "'");
    return withoutCurlyQuotes.replace("`", "'");
}

Java Code Examples for org.apache.pdfbox.text.PDFTextStripper#setEndPage()