Java Code Examples for org.apache.pdfbox.text.PDFTextStripper#writeText()

The following examples show how to use org.apache.pdfbox.text.PDFTextStripper#writeText(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.

Example 1

Source File: PrintTextLocations.java From blog-codes with Apache License 2.0

6 votes

/**
 * Runs a {@code PrintTextLocations} stripper over every page of a fixed
 * sample PDF.
 *
 * <p>The text produced by {@code writeText} goes to an in-memory writer that
 * is never read, so any useful output must come from the stripper subclass
 * itself (presumably its overridden callbacks print the locations -- confirm
 * against the {@code PrintTextLocations} class).
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the PDF cannot be loaded, parsed, or closed
 */
public static void main(String[] args) throws IOException {
	// try-with-resources closes the document even when extraction fails, and
	// keeps a close() failure from masking the original exception.
	try (PDDocument document = PDDocument.load(new File("/home/lili/data/test.pdf"))) {
		PDFTextStripper stripper = new PrintTextLocations();
		stripper.setSortByPosition(true);
		// Page numbers in PDFTextStripper are 1-based, so the first page is 1.
		stripper.setStartPage(1);
		stripper.setEndPage(document.getNumberOfPages());

		// The extracted text itself is discarded; only the side effects of the
		// stripper matter here.
		Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
		stripper.writeText(document, dummy);
	}
}

Example 2

Source File: GetLinesFromPDF.java From blog-codes with Apache License 2.0

6 votes

/**
 * Runs a {@code GetLinesFromPDF} stripper over every page of a fixed sample
 * PDF and then prints each entry of {@code lines} to standard output.
 *
 * <p>The text produced by {@code writeText} goes to an in-memory writer that
 * is never read; {@code lines} is presumably filled by the stripper subclass
 * while the document is processed (confirm against the enclosing class).
 *
 * @param args command-line arguments (unused)
 * @throws IOException If there is an error parsing the document.
 */
public static void main( String[] args ) throws IOException {
    String fileName = "/home/lili/data/test.pdf";
    // try-with-resources closes the document even when extraction fails, and
    // keeps a close() failure from masking the original exception.
    try (PDDocument document = PDDocument.load( new File(fileName) )) {
        PDFTextStripper stripper = new GetLinesFromPDF();
        stripper.setSortByPosition( true );
        // Page numbers in PDFTextStripper are 1-based, so the first page is 1.
        stripper.setStartPage( 1 );
        stripper.setEndPage( document.getNumberOfPages() );

        // The extracted text itself is discarded; only the collected lines are used.
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        stripper.writeText(document, dummy);

        // print lines
        for (String line : lines) {
            System.out.println(line);
        }
    }
}

Example 3

Source File: TextToPdfContentTransformerTest.java From alfresco-repository with GNU Lesser General Public License v3.0

5 votes

/**
 * Transforms the given text to PDF, extracts the text back out of the
 * generated PDF and asserts that it equals the expected text.
 *
 * @param text      the source text to transform
 * @param encoding  name of the charset used to build the content reader
 * @param checkText the text expected after the PDF round trip (compared
 *                  against the extracted text after it has passed through
 *                  {@code clean})
 * @throws IOException if the temp file or the generated PDF cannot be
 *                     written or read
 */
private void transformTextAndCheck(String text, String encoding, String checkText)
        throws IOException
{
    // Get a reader for the text
    ContentReader reader = buildContentReader(text, Charset.forName(encoding));
    
    // And a temp writer
    File out = TempFileProvider.createTempFile("AlfrescoTest_", ".pdf");
    ContentWriter writer = new FileContentWriter(out);
    writer.setMimetype("application/pdf");
    
    // Transform to PDF
    transformer.transform(reader, writer);
    
    // Read back in the PDF and check it. try-with-resources guarantees the
    // document is closed even if text extraction throws.
    StringWriter textWriter = new StringWriter();
    try (PDDocument doc = PDDocument.load(out))
    {
        PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.writeText(doc, textWriter);
    }
    
    String roundTrip = clean(textWriter.toString());
    
    assertEquals(
            "Incorrect text in PDF when starting from text in " + encoding,
            checkText, roundTrip
    );
}

Example 4

Source File: ExtractWordCoordinates.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="https://stackoverflow.com/questions/50330484/could-someone-give-me-an-example-of-how-to-extract-coordinates-for-a-word-usin">
 * Could someone give me an example of how to extract coordinates for a 'word' using PDFBox
 * </a>
 * <br/>
 * <a href="https://www.tutorialkart.com/pdfbox/how-to-get-location-and-size-of-images-in-pdf/attachment/apache-pdf/">
 * apache.pdf
 * </a>
 * <p>
 * This test shows how to extract word coordinates combining the ideas of
 * the two tutorials referenced by the OP.
 * </p>
 * <p>
 * The text written by the stripper goes to an in-memory writer that is never
 * read; the test only runs a {@code GetWordLocationAndSize} stripper over all
 * pages of the document and makes no assertions of its own.
 * </p>
 *
 * @throws IOException if the resource cannot be read or the PDF cannot be parsed
 */
@Test
public void testExtractWordsForGoodJuJu() throws IOException {
    // Both the resource stream and the document are closed on exit; the
    // document is declared second so it is closed first.
    try (   InputStream resource = getClass().getResourceAsStream("apache.pdf");
            PDDocument document = Loader.loadPDF(resource)) {
        PDFTextStripper stripper = new GetWordLocationAndSize();
        stripper.setSortByPosition( true );
        // Page numbers in PDFTextStripper are 1-based, so the first page is 1.
        stripper.setStartPage( 1 );
        stripper.setEndPage( document.getNumberOfPages() );

        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        stripper.writeText(document, dummy);
    }
}

Example 5

Source File: PdfUtils.java From job with MIT License

5 votes

/**
 * Extracts the text of a PDF read from the given stream.
 *
 * <p>The stream is always closed before this method returns, whether or not
 * loading or extraction succeeds.
 *
 * @param input stream positioned at the start of a PDF document; closed by
 *              this method
 * @return the text written by a default-configured {@code PDFTextStripper}
 * @throws Exception if the document cannot be loaded, parsed, or closed
 */
public static String parsePdf2Text(InputStream input) throws Exception {
  ByteArrayOutputStream output = new ByteArrayOutputStream();
  // Resources are closed in reverse order: the writer first (which flushes
  // its buffered characters into 'output'), then the document, then the
  // input stream -- even when loading or extraction throws.
  try (InputStream in = input;
       PDDocument doc = PDDocument.load(in);
       // Encode and decode with the same explicit charset so characters that
       // the platform default charset cannot represent are not lost.
       OutputStreamWriter writer = new OutputStreamWriter(output, "UTF-8")) {
    PDFTextStripper stripper = new PDFTextStripper();
    stripper.writeText(doc, writer);
  }
  return output.toString("UTF-8");
}

Example 6

Source File: FopIntegrationTest.java From wildfly-camel with Apache License 2.0

4 votes

/**
 * Writes the text of the given document into an in-memory buffer using a
 * freshly created {@code PDFTextStripper} and returns it with leading and
 * trailing whitespace trimmed.
 *
 * @param document the PDF document to extract text from; not closed here
 * @return the trimmed text of the document
 * @throws IOException if the stripper fails to create or to write the text
 */
private String extractTextFromDocument(PDDocument document) throws IOException {
    StringWriter extracted = new StringWriter();
    new PDFTextStripper().writeText(document, extracted);
    String text = extracted.toString();
    return text.trim();
}

Example 7

Source File: SurvivorSongbookParser.java From Quelea with GNU General Public License v3.0

3 votes

/**
 * Extracts the text of a single page of a PDF document.
 * <p>
 * The stripper's start and end page are both set to the requested page and
 * are not restored afterwards. In the returned text every right single
 * quotation mark and every backtick is replaced by a plain apostrophe.
 *
 * @param document the document to read from.
 * @param stripper the PDF stripper used to write the text.
 * @param page     the page number, passed unchanged to the stripper.
 * @return the text of the given page with quote characters normalised.
 * @throws IOException if the stripper fails to write the text.
 */
private String getPageText(PDDocument document, PDFTextStripper stripper, int page) throws IOException {
    final StringWriter pageBuffer = new StringWriter();

    // Narrow the stripper to exactly one page before extracting.
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    stripper.writeText(document, pageBuffer);

    String extracted = pageBuffer.toString();
    String withoutCurlyQuotes = extracted.replace("’", "'");
    return withoutCurlyQuotes.replace("`", "'");
}