org.apache.pdfbox.pdmodel.PDDocument#getPages

Source File: PdfTools.java From MyBox with Apache License 2.0

6 votes

/**
 * Collects the image XObjects that are registered directly in the resources
 * of the pages from {@code startPage} up to the last page of the document.
 *
 * <p>Only XObjects named in a page's own resources are inspected; this method
 * does not look inside form XObjects.</p>
 *
 * @param document  the document to scan; {@code null} yields an empty list
 * @param startPage zero-based index of the first page to scan;
 *                  {@code null} is treated as 0
 * @return the images found, in page order; never {@code null}
 * @throws Exception if a page or one of its XObjects cannot be read
 */
public static List<PDImageXObject> getImageListFromPDF(PDDocument document,
        Integer startPage) throws Exception {
    List<PDImageXObject> imageList = new ArrayList<>();
    if (document == null) {
        return imageList;
    }
    PDPageTree pages = document.getPages();
    int firstPage = startPage == null ? 0 : startPage;
    int pageCount = pages.getCount();
    for (int i = firstPage; i < pageCount; ++i) {
        PDPage page = pages.get(i);
        // PDFBox documents getResources() as possibly returning null when the
        // page has no resource dictionary; skip such pages instead of failing.
        if (page.getResources() == null) {
            continue;
        }
        for (COSName imageObjectName : page.getResources().getXObjectNames()) {
            if (page.getResources().isImageXObject(imageObjectName)) {
                imageList.add((PDImageXObject) page.getResources().getXObject(imageObjectName));
            }
        }
    }
    return imageList;
}

Source File: ExtractMarkedContent.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * <a href="https://stackoverflow.com/questions/54956720/how-to-replace-a-space-with-a-word-while-extract-the-data-from-pdf-using-pdfbox">
 * How to replace a space with a word while extract the data from PDF using PDFBox
 * </a>
 * <br/>
 * <a href="https://drive.google.com/open?id=10ZkdPlGWzMJeahwnQPzE6V7s09d1nvwq">
 * test.pdf
 * </a> as "testWPhromma.pdf"
 * <p>
 * This test shows how to, in principle, extract tagged text: the marked
 * content of every page is indexed by its MCID and then handed, together
 * with the structure tree root, to {@code showStructure}.
 * </p>
 *
 * @throws IOException if the test document cannot be loaded or processed
 */
@Test
public void testExtractTestWPhromma() throws IOException {
    System.out.printf("\n\n===\n%s\n===\n", "testWPhromma.pdf");
    // The document is declared as a resource so it is closed even when
    // extraction fails; it is closed before the underlying stream.
    try (   InputStream resource = getClass().getResourceAsStream("testWPhromma.pdf");
            PDDocument document = Loader.loadPDF(resource)) {
        Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();

        for (PDPage page : document.getPages()) {
            PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
            extractor.processPage(page);

            // Per-page lookup table: MCID -> marked content section.
            Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
            markedContents.put(page, theseMarkedContents);
            for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
                theseMarkedContents.put(markedContent.getMCID(), markedContent);
            }
        }

        PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
        showStructure(root, markedContents);
    }
}

Source File: ExtractMarkedContent.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * <a href="https://stackoverflow.com/questions/59192443/get-tags-related-bboxs-even-though-there-is-no-attributes-a-in-document-cata">
 * Get tag's related BBox's even though there is no attributes (/A in document catalog structure) related to Layout in PDFBox?
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/1_-tuWuReaTvrDsqQwldTnPYrMHSpXIWp/view?usp=sharing">
 * res_multipage.pdf
 * </a>
 * <p>
 * This test shows how to, in principle, extract tagged text from this document:
 * the marked content of every page is indexed by its MCID and then handed,
 * together with the structure tree root, to {@code showStructure}.
 * </p>
 *
 * @throws IOException if the test document cannot be loaded or processed
 */
@Test
public void testExtractResMultipage() throws IOException {
    System.out.printf("\n\n===\n%s\n===\n", "res_multipage.pdf");
    // The document is declared as a resource so it is closed even when
    // extraction fails; it is closed before the underlying stream.
    try (   InputStream resource = getClass().getResourceAsStream("res_multipage.pdf");
            PDDocument document = Loader.loadPDF(resource)) {
        Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();

        for (PDPage page : document.getPages()) {
            PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
            extractor.processPage(page);

            // Per-page lookup table: MCID -> marked content section.
            Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
            markedContents.put(page, theseMarkedContents);
            for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
                theseMarkedContents.put(markedContent.getMCID(), markedContent);
            }
        }

        PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
        showStructure(root, markedContents);
    }
}

Source File: PdfScreenshotUtils.java From dss with GNU Lesser General Public License v2.1

6 votes

/**
 * Asserts that two documents have the same number of pages and that each
 * pair of pages, rendered at {@code DPI}, reaches at least the given
 * similarity as computed by {@code checkImageSimilarity}.
 *
 * @param document1     the reference document
 * @param document2     the document compared against the reference
 * @param minSimilarity lowest similarity value accepted for any page
 * @throws IOException if a page cannot be rendered
 */
public static void checkPdfSimilarity(PDDocument document1, PDDocument document2, float minSimilarity) throws IOException {
	final PDPageTree referencePages = document1.getPages();
	final PDPageTree candidatePages = document2.getPages();

	assertEquals(candidatePages.getCount(), referencePages.getCount());

	final PDFRenderer referenceRenderer = new PDFRenderer(document1);
	final PDFRenderer candidateRenderer = new PDFRenderer(document2);

	int pageIndex = 0;
	while (pageIndex < candidatePages.getCount()) {
		final BufferedImage referenceImage = referenceRenderer.renderImageWithDPI(pageIndex, DPI);
		final BufferedImage candidateImage = candidateRenderer.renderImageWithDPI(pageIndex, DPI);

		final float similarity = checkImageSimilarity(referenceImage, candidateImage, CHECK_RESOLUTION);
		final String failureMessage = "The image similarity " + similarity + " is lower the allowed limit " + minSimilarity;
		assertTrue(similarity >= minSimilarity, failureMessage);

		pageIndex++;
	}
}

Source File: DashboardUtil.java From Insights with Apache License 2.0

5 votes

/**
 * Appends a title line to every page of the document and, when variables are
 * given, two extra lines near the bottom of the page listing the variables
 * selected in Grafana by the user.
 * <p>
 * Failures are logged and not propagated; the document is returned in
 * whatever state it reached.
 * 
 * @param doc       document whose pages are written to
 * @param title     text appended after the fixed "OneDevOps Insights" prefix
 * @param variables user selected values; {@code null} or empty skips the
 *                  variables lines
 * @return doc the same document instance that was passed in
 * @throws IOException declared for callers; errors raised while writing are
 *                     caught and logged inside this method
 */
private PDDocument footer(PDDocument doc, String title, String variables) throws IOException {
	try{
		PDPageTree pages = doc.getPages();
		// Null has to be tested before any method is invoked on the value.
		boolean hasVariables = variables != null && !variables.equals("");
		for(PDPage p : pages){
			// try-with-resources closes the stream even if writing a line fails.
			try (PDPageContentStream contentStream = new PDPageContentStream(doc, p, AppendMode.APPEND, false)) {
				contentStream.beginText();
				contentStream.newLineAtOffset(220, 780);
				contentStream.setFont(PDType1Font.HELVETICA, 11);
				contentStream.showText("OneDevOps Insights – "+title);
				contentStream.endText();
				if(hasVariables){
					contentStream.beginText();
					contentStream.newLineAtOffset(2, 17);
					contentStream.setFont(PDType1Font.HELVETICA, 9);
					contentStream.showText("This Report is generated based on the user selected values as below.");
					contentStream.endText();
					contentStream.beginText();
					contentStream.newLineAtOffset(2, 5);
					contentStream.setFont(PDType1Font.HELVETICA, 7);
					contentStream.showText(variables);
					contentStream.endText();
				}
			}
		}
	}catch(Exception e){
		// Pass the exception itself so the stack trace and cause are logged.
		Log.error("Error, Failed in Footer.. ", e);
	}
	return doc;
}

Source File: Overlay.java From gcs with Mozilla Public License 2.0

5 votes

/**
 * Replaces the content of each page that has a layout page with a new
 * content array combining the original content and the overlay, ordered
 * according to {@code position}.
 *
 * @param document the document whose pages are processed
 * @throws IOException if {@code position} is not a handled value or a
 *                     helper fails
 */
private void processPages(PDDocument document) throws IOException
{
    int pageNumber = 0;
    for (PDPage currentPage : document.getPages())
    {
        // Page numbers handed to getLayoutPage start at 1.
        ++pageNumber;
        final COSDictionary pageDict = currentPage.getCOSObject();
        final COSBase existingContent = pageDict.getDictionaryObject(COSName.CONTENTS);
        final COSArray combinedContent = new COSArray();
        final LayoutPage layout = getLayoutPage(pageNumber, document.getNumberOfPages());
        if (layout != null)
        {
            switch (position)
            {
                case BACKGROUND:
                    // the overlay goes first so the page content is painted over it
                    overlayPage(currentPage, layout, combinedContent);
                    addOriginalContent(existingContent, combinedContent);
                    break;
                case FOREGROUND:
                    // wrap the page content in a save/restore pair ...
                    combinedContent.add(createStream("q\n"));
                    addOriginalContent(existingContent, combinedContent);
                    combinedContent.add(createStream("Q\n"));
                    // ... and add the overlay after it
                    overlayPage(currentPage, layout, combinedContent);
                    break;
                default:
                    throw new IOException("Unknown type of position:" + position);
            }
            pageDict.setItem(COSName.CONTENTS, combinedContent);
        }
    }
}

Source File: PdfVeryDenseMergeTool.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Feeds every page of the given document, in iteration order, to
 * {@code merge(PDDocument, PDPage)}.
 *
 * @param input the document whose pages are merged
 * @throws IOException if merging a page fails
 */
void merge(PDDocument input) throws IOException
{
    for (final PDPage currentPage : input.getPages())
    {
        merge(input, currentPage);
    }
}

Source File: PdfDenseMergeTool.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Passes each page of the given document, one after another, to the
 * page-level {@code merge(PDDocument, PDPage)} overload.
 *
 * @param input the document supplying the pages
 * @throws IOException if the page-level merge fails
 */
void merge(PDDocument input) throws IOException
{
    for (final PDPage sourcePage : input.getPages())
    {
        merge(input, sourcePage);
    }
}

Source File: ScalePages.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="https://stackoverflow.com/questions/49733329/java-stretch-pdf-pages-content">
 * Java- stretch pdf pages content
 * </a>
 * <p>
 * This test illustrates how to up-scale a PDF using the <b>UserUnit</b>
 * page property: every page gets a UserUnit of 1.7 and the result is
 * saved to the result folder.
 * </p>
 *
 * @throws IOException if the document cannot be loaded or saved
 */
@Test
public void testUserUnitScaleAFieldTwice() throws IOException {
    // The document is declared as a resource so it is closed even when
    // saving fails; it is closed before the underlying stream.
    try (   InputStream resource = getClass().getResourceAsStream("/mkl/testarea/pdfbox2/form/aFieldTwice.pdf");
            PDDocument document = Loader.loadPDF(resource)) {
        for (PDPage page : document.getPages()) {
            page.getCOSObject().setFloat("UserUnit", 1.7f);
        }

        document.save(new File(RESULT_FOLDER, "aFieldTwice-scaled.pdf"));
    }
}

Source File: DetermineWidgetPage.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Searches the pages in index order for an annotation whose COS dictionary
 * equals that of the given widget.
 *
 * @param document the document whose pages are searched
 * @param widget   the widget annotation to locate
 * @return the zero-based index of the first page holding the widget,
 *         or -1 if no page holds it
 * @throws IOException if the annotations of a page cannot be read
 */
int determineSafe(PDDocument document, PDAnnotationWidget widget) throws IOException
{
    final COSDictionary target = widget.getCOSObject();
    final PDPageTree pageTree = document.getPages();
    int pageIndex = 0;
    while (pageIndex < pageTree.getCount())
    {
        for (PDAnnotation candidate : pageTree.get(pageIndex).getAnnotations())
        {
            if (candidate.getCOSObject().equals(target))
            {
                return pageIndex;
            }
        }
        pageIndex++;
    }
    return -1;
}

Source File: ExtractImages.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
 * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B9izTHWJQ7xlT2ZoQkJfbGRYcFE">
 * 10948.pdf
 * </a>
 * <p>
 * The only special thing about the two images returned for the sample PDF is that
 * one image is merely a mask used for the other image, and the other image is the
 * actual image used on the PDF page. If one only wants the images immediately used
 * in the page content, one also has to scan the page content.
 * </p>
 *
 * @throws IOException if the document cannot be loaded or an image cannot be written
 */
@Test
public void testExtractPageImageResources10948() throws IOException
{
    // The document is declared as a resource so it is closed even when
    // extraction fails; it is closed before the underlying stream.
    try (   InputStream resource = getClass().getResourceAsStream("10948.pdf");
            PDDocument document = Loader.loadPDF(resource))
    {
        int page = 1;
        for (PDPage pdPage : document.getPages())
        {
            PDResources resources = pdPage.getResources();
            // Guard the page resources (not the input stream), which are
            // what is dereferenced below.
            if (resources != null)
            {
                int index = 0;
                for (COSName cosName : resources.getXObjectNames())
                {
                    PDXObject xobject = resources.getXObject(cosName);
                    if (xobject instanceof PDImageXObject)
                    {
                        PDImageXObject image = (PDImageXObject)xobject;
                        // File name pattern: 10948-<page>-<image index>.<suffix>
                        File file = new File(RESULT_FOLDER, String.format("10948-%s-%s.%s", page, index, image.getSuffix()));
                        ImageIO.write(image.getImage(), image.getSuffix(), file);
                        index++;
                    }
                }
            }
            page++;
        }
    }
}

Source File: ExtractImages.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
 * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
 * </a>
 * <br/>
 * <a href="https://drive.google.com/open?id=0B9izTHWJQ7xlYi1XN1BxMmZEUGc">
 * 10948.pdf
 * </a>, renamed "10948-new.pdf" here to prevent a collision
 * <p>
 * Here the code extracts no image at all because the images are not immediate page
 * resources but wrapped in form xobjects.
 * </p>
 *
 * @throws IOException if the document cannot be loaded or an image cannot be written
 */
@Test
public void testExtractPageImageResources10948New() throws IOException
{
    // The document is declared as a resource so it is closed even when
    // extraction fails; it is closed before the underlying stream.
    try (   InputStream resource = getClass().getResourceAsStream("10948-new.pdf");
            PDDocument document = Loader.loadPDF(resource))
    {
        int page = 1;
        for (PDPage pdPage : document.getPages())
        {
            PDResources resources = pdPage.getResources();
            // Guard the page resources (not the input stream), which are
            // what is dereferenced below.
            if (resources != null)
            {
                int index = 0;
                for (COSName cosName : resources.getXObjectNames())
                {
                    PDXObject xobject = resources.getXObject(cosName);
                    if (xobject instanceof PDImageXObject)
                    {
                        PDImageXObject image = (PDImageXObject)xobject;
                        // File name pattern: 10948-new-<page>-<image index>.<suffix>
                        File file = new File(RESULT_FOLDER, String.format("10948-new-%s-%s.%s", page, index, image.getSuffix()));
                        ImageIO.write(image.getImage(), image.getSuffix(), file);
                        index++;
                    }
                }
            }
            page++;
        }
    }
}

Source File: VisualizeMarkedContent.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * This method outputs an XML'ish representation of the structure
 * tree plus text extracted for it and additionally creates a PDF
 * with frames representing the bounding boxes of the text inside
 * the structure elements.
 *
 * @param resourceName name of the class path resource holding the source PDF
 * @param resultName   file name, inside the result folder, the PDF is saved as
 * @throws IOException if the document cannot be loaded, processed or saved
 */
public void visualize(String resourceName, String resultName) throws IOException {
    System.out.printf("\n\n===\n%s\n===\n", resourceName);
    // The document is declared as a resource so it is closed even when
    // processing or saving fails; it is closed before the underlying stream.
    try (   InputStream resource = getClass().getResourceAsStream(resourceName);
            PDDocument document = Loader.loadPDF(resource)) {
        Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();

        for (PDPage page : document.getPages()) {
            PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
            extractor.processPage(page);

            Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
            markedContents.put(page, theseMarkedContents);
            for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
                addToMap(theseMarkedContents, markedContent);
            }
        }

        PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
        Map<PDPage, PDPageContentStream> visualizations = new HashMap<>();
        showStructure(document, root, markedContents, visualizations);
        // The content streams collected in the map have to be closed before saving.
        for (PDPageContentStream canvas : visualizations.values())
            canvas.close();

        document.save(new File(RESULT_FOLDER, resultName));
    }
}

Source File: ShrinkPDF.java From shrink-pdf with MIT License

5 votes

/**
 * Shrink a PDF.
 * <p>
 * Parses the PDF referenced by the {@code input} field, selects an image
 * writer depending on the {@code tiff} field and hands the resources of
 * every page to {@code scanResources} together with that writer. A negative
 * {@code compQual} is replaced by {@code compQualDefault} beforehand; the
 * quality value is applied to the JPEG writer only.
 *
 * @return The {@code PDDocument} that was parsed and processed
 * @throws FileNotFoundException presumably when {@code input} does not exist
 *                               (raised by the stream constructor; confirm)
 * @throws IOException if the document cannot be read or parsed
 */
private PDDocument shrinkMe() 
        throws FileNotFoundException, IOException {
    if(compQual < 0)
        compQual = compQualDefault;
    final RandomAccessBufferedFileInputStream rabfis = 
            new RandomAccessBufferedFileInputStream(input);
    final PDDocument doc;
    try {
        final PDFParser parser = new PDFParser(rabfis);
        parser.parse();
        doc = parser.getPDDocument();
    } catch(IOException e) {
        // No document exists yet that could take care of the stream, so
        // release the file handle here before reporting the failure.
        try {
            rabfis.close();
        } catch(IOException ignored) {
            // The parse failure is the error worth reporting.
        }
        throw e;
    }
    final PDPageTree pages = doc.getPages();
    final ImageWriter imgWriter;
    final ImageWriteParam iwp;
    if(tiff) {
        // NOTE(review): the flag is named "tiff" but a writer for the "png"
        // suffix is requested - confirm this is intended.
        final Iterator<ImageWriter> tiffWriters =
              ImageIO.getImageWritersBySuffix("png");
        imgWriter = tiffWriters.next();
        iwp = imgWriter.getDefaultWriteParam();
        //iwp.setCompressionMode(ImageWriteParam.MODE_DISABLED);
    } else {
        final Iterator<ImageWriter> jpgWriters = 
              ImageIO.getImageWritersByFormatName("jpeg");
        imgWriter = jpgWriters.next();
        iwp = imgWriter.getDefaultWriteParam();
        iwp.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
        iwp.setCompressionQuality(compQual);
    }
    for(PDPage p : pages) {
        scanResources(p.getResources(), doc, imgWriter, iwp);
    }
    return doc;
}

Source File: DetermineBoundingBox.java From testarea-pdfbox2 with Apache License 2.0

4 votes

/**
 * Calls {@code drawBoundingBox} once for every page of the document,
 * in iteration order.
 *
 * @param pdDocument the document whose pages are processed
 * @throws IOException if processing a page fails
 */
void drawBoundingBoxes(PDDocument pdDocument) throws IOException {
    for (final PDPage currentPage : pdDocument.getPages()) {
        drawBoundingBox(pdDocument, currentPage);
    }
}

Source File: DashboardUtil.java From Insights with Apache License 2.0

2 votes

/**
 * Returns the page count of the document reduced by one, which is the
 * zero-based index of its last page (-1 for a document without pages).
 * 
 * @param document the document whose pages are counted
 * @return {pageNum} page count minus one
 */
private static int getPages(PDDocument document) {
	return document.getPages().getCount() - 1;
}

Java Code Examples for org.apache.pdfbox.pdmodel.PDDocument#getPages()