org.apache.pdfbox.pdmodel.PDPageTree Java Examples

Source File: PdfTools.java From MyBox with Apache License 2.0

6 votes

/**
 * Collects the image XObjects referenced directly by the page resources of
 * every page from {@code startPage} to the end of the document.
 * <p>
 * Only the XObject names listed in each page's own resources are inspected.
 * Pages that have no resources are skipped.
 * </p>
 *
 * @param document  the document to scan; {@code null} yields an empty list
 * @param startPage zero-based index of the first page to scan; {@code null}
 *                  or a negative value is treated as 0
 * @return the images found, in page order; never {@code null}
 * @throws Exception if an XObject cannot be loaded
 */
public static List<PDImageXObject> getImageListFromPDF(PDDocument document,
        Integer startPage) throws Exception {
    List<PDImageXObject> imageList = new ArrayList<>();
    if (document == null) {
        return imageList;
    }
    PDPageTree pages = document.getPages();
    // Clamp so that a negative start index cannot reach pages.get(...).
    int firstPage = startPage == null ? 0 : Math.max(0, startPage);
    int len = pages.getCount();
    for (int i = firstPage; i < len; ++i) {
        PDPage page = pages.get(i);
        // getResources() returns null for a page without a resource dictionary.
        if (page.getResources() == null) {
            continue;
        }
        for (COSName imageObjectName : page.getResources().getXObjectNames()) {
            if (page.getResources().isImageXObject(imageObjectName)) {
                imageList.add((PDImageXObject) page.getResources().getXObject(imageObjectName));
            }
        }
    }
    return imageList;
}

Source File: PDPageDestination.java From gcs with Mozilla Public License 2.0

6 votes

/**
 * Resolves the page number of this destination, whether the first array entry is a plain
 * number or a reference to a page dictionary.
 *
 * @since Apache PDFBox 1.0.0
 * @see org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem
 * @return page number, or -1 if the destination type is unknown. The page number is 0-based if
 * it was in the dictionary (for remote destinations), and 1-based if it was computed from a
 * page reference (for local destinations).
 * @deprecated This method has inconsistent behavior (see returns), use {@link #retrievePageNumber()} instead.
 */
@Deprecated
public int findPageNumber()
{
    if (array.size() < 1)
    {
        return -1;
    }

    COSBase target = array.getObject(0);
    if (target instanceof COSNumber)
    {
        // the number stored in the array is returned as-is
        return ((COSNumber) target).intValue();
    }
    if (!(target instanceof COSDictionary))
    {
        return -1;
    }

    // climb the Parent/P chain until a node without a parent is reached
    COSDictionary pageDictionary = (COSDictionary) target;
    COSBase topNode = pageDictionary;
    COSBase ancestor = pageDictionary.getDictionaryObject(COSName.PARENT, COSName.P);
    while (ancestor != null)
    {
        topNode = ancestor;
        ancestor = ((COSDictionary) topNode).getDictionaryObject(COSName.PARENT, COSName.P);
    }

    // topNode is now the pages node; indexOf is 0-based, hence the + 1
    PDPageTree pageTree = new PDPageTree((COSDictionary) topNode);
    return pageTree.indexOf(new PDPage(pageDictionary)) + 1;
}

Source File: PdfScreenshotUtils.java From dss with GNU Lesser General Public License v2.1

6 votes

/**
 * Asserts that two documents have the same number of pages and that every pair of
 * corresponding pages, rendered at {@code DPI}, reaches at least {@code minSimilarity}
 * according to {@code checkImageSimilarity}.
 *
 * @param document1 the sample document
 * @param document2 the document to check against the sample
 * @param minSimilarity the lowest acceptable similarity value for a page pair
 * @throws IOException if a page cannot be rendered
 */
public static void checkPdfSimilarity(PDDocument document1, PDDocument document2, float minSimilarity) throws IOException {
	PDPageTree samplePages = document1.getPages();
	PDPageTree checkPages = document2.getPages();

	assertEquals(checkPages.getCount(), samplePages.getCount());

	PDFRenderer rendererForSample = new PDFRenderer(document1);
	PDFRenderer rendererForCheck = new PDFRenderer(document2);

	final int pageTotal = checkPages.getCount();
	for (int index = 0; index < pageTotal; index++) {
		BufferedImage sampleRendering = rendererForSample.renderImageWithDPI(index, DPI);
		BufferedImage checkRendering = rendererForCheck.renderImageWithDPI(index, DPI);

		// Debugging aid: dump both renderings to disk to inspect a mismatch, e.g.
		// ImageIO.write(sampleRendering, "png", new File("target\\sampleImage.png"));
		// ImageIO.write(checkRendering, "png", new File("target\\checkImage.png"));

		float similarity = checkImageSimilarity(sampleRendering, checkRendering, CHECK_RESOLUTION);
		assertTrue(similarity >= minSimilarity, "The image similarity " + similarity + " is lower the allowed limit " + minSimilarity);
	}
}

Source File: PdfContentImagePreprocessor.java From tika-server with Apache License 2.0

5 votes

/**
 * Passes the resources of every page of {@code document} to
 * {@code processImagesFromResources}.
 * <p>
 * Best effort: any exception stops the traversal and is only printed to
 * standard error; nothing is propagated to the caller.
 * </p>
 */
private void removeImagesAlphaChannelUnsafe() {
    try {
        final PDPageTree pageTree = document.getDocumentCatalog().getPages();
        final int pageTotal = pageTree.getCount();
        int index = 0;
        while (index < pageTotal) {
            processImagesFromResources(pageTree.get(index).getResources());
            index++;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

Source File: PdfContentTypeChecker.java From tika-server with Apache License 2.0

5 votes

/**
 * Creates a fresh text stripper, records the page count and then, for each
 * page in order, calls {@code readObjectsOnPage} followed by
 * {@code calculateTextLengthOnPage} with the 1-based page number.
 * <p>
 * Failures during the page traversal are printed to standard error and not
 * rethrown; only the creation of the text stripper can propagate an exception.
 * </p>
 *
 * @param document the document to analyse
 * @throws IOException if the text stripper cannot be created
 */
private void calculateObjectsInDocument(PDDocument document) throws IOException {
    this.pdfTextStripper = new PDFTextStripper();

    try {
        final PDPageTree pageTree = document.getDocumentCatalog().getPages();
        final int total = pageTree.getCount();
        this.pageCount = total;
        for (int pageNumber = 1; pageNumber <= total; pageNumber++) {
            // the page tree is 0-based, the text length calculation is 1-based
            PDPage current = pageTree.get(pageNumber - 1);
            readObjectsOnPage(current);
            calculateTextLengthOnPage(document, pageNumber);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

Source File: DashboardUtil.java From Insights with Apache License 2.0

5 votes

/**
 * Appends a title line to every page of the document and, when
 * {@code variables} is non-empty, two additional lines near the bottom of the
 * page: a fixed explanatory sentence and the variables text itself (the
 * variables selected in Grafana by the user).
 * <p>
 * Any failure is logged and stops the processing of the remaining pages; the
 * document is returned in whatever state it has reached.
 * </p>
 *
 * @param doc       document whose pages are decorated
 * @param title     text appended to the fixed title prefix
 * @param variables text printed at the bottom of each page; {@code null} or
 *                  empty suppresses the two bottom lines
 * @return the same {@code doc} instance
 * @throws IOException declared for callers; failures inside the page loop are
 *                     caught and logged here
 */
private PDDocument footer(PDDocument doc, String title, String variables) throws IOException {
	try{
		PDPageTree pages = doc.getPages();
		// Null must be tested before the string is dereferenced.
		boolean hasVariables = variables != null && !variables.equals("");
		for(PDPage p : pages){
			// try-with-resources closes the stream even when a drawing call fails.
			try (PDPageContentStream contentStream = new PDPageContentStream(doc, p, AppendMode.APPEND, false)) {
				contentStream.beginText();
				contentStream.newLineAtOffset(220, 780);
				contentStream.setFont(PDType1Font.HELVETICA, 11);
				contentStream.showText("OneDevOps Insights – "+title);
				contentStream.endText();
				if(hasVariables){
					contentStream.beginText();
					contentStream.newLineAtOffset(2, 17);
					contentStream.setFont(PDType1Font.HELVETICA, 9);
					contentStream.showText("This Report is generated based on the user selected values as below.");
					contentStream.endText();
					contentStream.beginText();
					contentStream.newLineAtOffset(2, 5);
					contentStream.setFont(PDType1Font.HELVETICA, 7);
					contentStream.showText(variables);
					contentStream.endText();
				}
			}
		}
	}catch(Exception e){
		Log.error("Error, Failed in Footer.. ", e.getMessage());
	}
	return doc;
}

Source File: DetermineWidgetPage.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Searches every page's annotations for one whose COS dictionary equals the
 * COS dictionary of the given widget.
 *
 * @param document the document whose pages are searched
 * @param widget the widget annotation to locate
 * @return the zero-based index of the first page holding the widget, or -1
 *         if no page does
 * @throws IOException if the annotations of a page cannot be read
 */
int determineSafe(PDDocument document, PDAnnotationWidget widget) throws IOException
{
    final COSDictionary wanted = widget.getCOSObject();
    final PDPageTree pageTree = document.getPages();
    final int pageTotal = pageTree.getCount();
    for (int pageIndex = 0; pageIndex < pageTotal; pageIndex++)
    {
        for (PDAnnotation candidate : pageTree.get(pageIndex).getAnnotations())
        {
            if (candidate.getCOSObject().equals(wanted))
            {
                return pageIndex;
            }
        }
    }
    return -1;
}

Source File: RemoveStrikeoutComment.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="https://stackoverflow.com/questions/45812696/pdfbox-delete-comment-maintain-strikethrough">
 * PDFBox delete comment maintain strikethrough
 * </a>
 * <br/>
 * <a href="https://expirebox.com/files/3d955e6df4ca5874c38dbf92fc43b5af.pdf">
 * only_fields.pdf
 * </a>
 * <a href="https://file.io/DTvqhC">
 * (alternative download)
 * </a>
 * <p>
 * Due to a bug in the <code>COSArrayList</code> usage for page annotations,
 * the indirect reference to the annotation in question is not removed from
 * the actual page annotations array.
 * </p>
 * <p>
 * The test removes every annotation whose contents equal
 * "Sample Strikethrough" from each page's annotation list and saves the
 * result to the result folder.
 * </p>
 */
@Test
public void testRemoveLikeStephan() throws IOException {
    // The document is a resource too: close it together with the input stream.
    try (InputStream resource = getClass().getResourceAsStream("only_fields.pdf");
         PDDocument document = Loader.loadPDF(resource)) {
        PDPageTree allPages = document.getDocumentCatalog().getPages();

        for (int i = 0; i < allPages.getCount(); i++) {
            PDPage page = allPages.get(i);
            List<PDAnnotation> annotations = page.getAnnotations();
            if (annotations.isEmpty()) {
                continue;
            }

            // Collect first, remove afterwards, so the list is not modified while iterating.
            List<PDAnnotation> annotationToRemove = new ArrayList<PDAnnotation>();
            for (PDAnnotation annotation : annotations) {
                if (annotation.getContents() != null
                        && annotation.getContents().equals("Sample Strikethrough")) {
                    annotationToRemove.add(annotation);
                }
            }
            annotations.removeAll(annotationToRemove);
        }

        document.save(new File(RESULT_FOLDER, "only_fields-removeLikeStephan.pdf"));
    }
}

Source File: RemoveStrikeoutComment.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="https://stackoverflow.com/questions/45812696/pdfbox-delete-comment-maintain-strikethrough">
 * PDFBox delete comment maintain strikethrough
 * </a>
 * <br/>
 * <a href="https://expirebox.com/files/3d955e6df4ca5874c38dbf92fc43b5af.pdf">
 * only_fields.pdf
 * </a>
 * <a href="https://file.io/DTvqhC">
 * (alternative download)
 * </a>
 * <p>
 * The OP only wanted the comment removed, not the strike-through. Thus, we must
 * not remove the annotation but merely the comment building attributes.
 * </p>
 * <p>
 * For every StrikeOut annotation the popup reference, the plain and rich text
 * contents and the author entry are removed from the annotation dictionary;
 * the popup objects collected so far are then removed from the page's
 * annotation list.
 * </p>
 */
@Test
public void testRemoveLikeStephanImproved() throws IOException {
    final COSName POPUP = COSName.getPDFName("Popup");
    // The document is a resource too: close it together with the input stream.
    try (InputStream resource = getClass().getResourceAsStream("only_fields.pdf");
         PDDocument document = Loader.loadPDF(resource)) {
        PDPageTree allPages = document.getDocumentCatalog().getPages();

        // Accumulates across pages, exactly as the removal below expects.
        List<COSObjectable> objectsToRemove = new ArrayList<>();

        for (int i = 0; i < allPages.getCount(); i++) {
            PDPage page = allPages.get(i);
            List<PDAnnotation> annotations = page.getAnnotations();

            for (PDAnnotation annotation : annotations) {
                if ("StrikeOut".equals(annotation.getSubtype()))
                {
                    COSDictionary annotationDict = annotation.getCOSObject();
                    // Fetch the popup before its entry is dropped from the dictionary.
                    COSBase popup = annotationDict.getItem(POPUP);
                    annotationDict.removeItem(POPUP);
                    annotationDict.removeItem(COSName.CONTENTS); // plain text comment
                    annotationDict.removeItem(COSName.RC);       // rich text comment
                    annotationDict.removeItem(COSName.T);        // author

                    if (popup != null)
                    {
                        objectsToRemove.add(popup);
                    }
                }
            }

            annotations.removeAll(objectsToRemove);
        }

        document.save(new File(RESULT_FOLDER, "only_fields-removeImproved.pdf"));
    }
}

Source File: ShrinkPDF.java From shrink-pdf with MIT License

5 votes

/**
 * Parses the PDF referenced by the {@code input} field and passes the
 * resources of every page to {@code scanResources} together with an image
 * writer chosen by the {@code tiff} field.
 * <p>
 * The method takes no arguments; it reads the fields {@code input},
 * {@code tiff}, {@code compQual} and {@code compQualDefault}. A negative
 * {@code compQual} is replaced by {@code compQualDefault} before use. In the
 * non-{@code tiff} branch a JPEG writer is configured with explicit
 * compression at quality {@code compQual}.
 * </p>
 *
 * @return The parsed {@code PDDocument} after its page resources have been
 *         scanned; the caller is responsible for closing it
 * @throws FileNotFoundException presumably when {@code input} does not exist
 *         (raised while opening the file) — confirm against the PDFBox version in use
 * @throws IOException if the PDF cannot be parsed or processed
 */
private PDDocument shrinkMe() 
        throws FileNotFoundException, IOException {
    if(compQual < 0) {
        compQual = compQualDefault;
    }
    final RandomAccessBufferedFileInputStream rabfis = 
            new RandomAccessBufferedFileInputStream(input);
    final PDDocument doc;
    try {
        final PDFParser parser = new PDFParser(rabfis);
        parser.parse();
        doc = parser.getPDDocument();
    } catch (IOException e) {
        // Do not leak the file handle when parser setup or parsing fails.
        try {
            rabfis.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
    final PDPageTree pages = doc.getPages();
    final ImageWriter imgWriter;
    final ImageWriteParam iwp;
    if(tiff) {
        // NOTE(review): this branch is selected by the "tiff" flag but looks up
        // writers by the suffix "png" — confirm which format is intended.
        final Iterator<ImageWriter> tiffWriters =
              ImageIO.getImageWritersBySuffix("png");
        imgWriter = tiffWriters.next();
        iwp = imgWriter.getDefaultWriteParam();
        //iwp.setCompressionMode(ImageWriteParam.MODE_DISABLED);
    } else {
        final Iterator<ImageWriter> jpgWriters = 
              ImageIO.getImageWritersByFormatName("jpeg");
        imgWriter = jpgWriters.next();
        iwp = imgWriter.getDefaultWriteParam();
        iwp.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
        iwp.setCompressionQuality(compQual);
    }
    for(PDPage p : pages) {
        scanResources(p.getResources(), doc, imgWriter, iwp);
    }
    return doc;
}

Source File: PDDocumentCatalogBleach.java From DocBleach with MIT License

4 votes

/**
 * Logs a trace message and then hands every page of the tree to
 * {@code sanitizePage}, in iteration order.
 *
 * @param pages the page tree to walk
 * @throws IOException if sanitizing a page fails
 */
private void sanitizePageActions(PDPageTree pages) throws IOException {
  LOGGER.trace("Checking Pages Actions");
  for (final PDPage currentPage : pages) {
    sanitizePage(currentPage);
  }
}

Source File: PDFTextStripper.java From gcs with Mozilla Public License 2.0

4 votes

/**
 * Resolves the start and end bookmark page numbers and then processes, in
 * order, every page of the tree that has contents.
 *
 * @param pages The pages object in the document.
 *
 * @throws IOException If there is an error parsing the text.
 */
protected void processPages(PDPageTree pages) throws IOException
{
    // 1-based page number of the start bookmark, -1 = undefined
    PDPage firstMarked = null;
    if (startBookmark != null)
    {
        firstMarked = startBookmark.findDestinationPage(document);
    }
    startBookmarkPageNumber = firstMarked != null ? pages.indexOf(firstMarked) + 1 : -1;

    // 1-based page number of the end bookmark, -1 = undefined
    PDPage lastMarked = null;
    if (endBookmark != null)
    {
        lastMarked = endBookmark.findDestinationPage(document);
    }
    endBookmarkPageNumber = lastMarked != null ? pages.indexOf(lastMarked) + 1 : -1;

    boolean startUnresolved = startBookmark != null && startBookmarkPageNumber == -1;
    boolean endUnresolved = endBookmark != null && endBookmarkPageNumber == -1;
    if (startUnresolved && endUnresolved
            && startBookmark.getCOSObject() == endBookmark.getCOSObject())
    {
        // special case: start and end bookmark are the same but point to
        // nothing, so no text is to be extracted at all
        startBookmarkPageNumber = 0;
        endBookmarkPageNumber = 0;
    }

    for (PDPage current : pages)
    {
        // the counter advances for every page, including pages without contents
        currentPageNo++;
        if (!current.hasContents())
        {
            continue;
        }
        processPage(current);
    }
}

Source File: DashboardUtil.java From Insights with Apache License 2.0

2 votes

/**
 * Returns the page count of the document minus one, which is the zero-based
 * index of its last page (or -1 when the count is zero).
 * 
 * @param document the document whose page count is read
 * @return page count minus one
 */
private static int getPages(PDDocument document) {
	return document.getPages().getCount() - 1;
}

org.apache.pdfbox.pdmodel.PDPageTree Java Examples