org.apache.pdfbox.pdmodel.PDPage#getResources

Source File: PDFRenderer.java From gcs with Mozilla Public License 2.0

6 votes

/**
 * Tells whether any extended graphics state found in the resources of the
 * given page declares a blend mode other than {@link BlendMode#NORMAL}.
 *
 * @param page the page whose resources are inspected
 * @return {@code true} as soon as one extended graphics state with a
 *         non-normal blend mode is found; {@code false} if the page has no
 *         resources or no such graphics state
 */
private boolean hasBlendMode(PDPage page)
{
    PDResources pageResources = page.getResources();
    if (pageResources != null)
    {
        for (COSName gsName : pageResources.getExtGStateNames())
        {
            PDExtendedGraphicsState graphicsState = pageResources.getExtGState(gsName);
            // a null state can happen if the key exists but has no value,
            // see PDFBOX-3950-23EGDHXSBBYQLKYOKGZUOVYVNE675PRD.pdf
            if (graphicsState != null && graphicsState.getBlendMode() != BlendMode.NORMAL)
            {
                return true;
            }
        }
    }
    return false;
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * <a href="https://stackoverflow.com/questions/45895768/pdfbox-2-0-7-extracttext-not-working-but-1-8-13-does-and-pdfreader-as-well">
 * PDFBox 2.0.7 ExtractText not working but 1.8.13 does and PDFReader as well
 * </a>
 * <br/>
 * <a href="https://wetransfer.com/downloads/214674449c23713ee481c5a8f529418320170827201941/b2bea6">
 * test-2.pdf
 * </a>
 * <p>
 * Due to the broken <b>ToUnicode</b> maps the output of immediate text
 * extraction from this document is unsatisfying, cf. {@link #testTest2()}.
 * It can be improved by removing these <b>ToUnicode</b> maps as this test
 * shows.
 * </p>
 * <p>
 * The extracted text is printed to standard output and written to
 * <code>test-2_NoToUnicode.txt</code> in the result folder.
 * </p>
 *
 * @throws IOException if the document cannot be loaded or the result cannot be written
 */
@Test
public void testNoToUnicodeTest2() throws IOException
{
    // the document is declared as a resource so that it is closed even if extraction fails
    try (   InputStream resource = getClass().getResourceAsStream("test-2.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        for (int pageNr = 0; pageNr < document.getNumberOfPages(); pageNr++)
        {
            PDPage page = document.getPage(pageNr);
            PDResources resources = page.getResources();
            removeToUnicodeMaps(resources);
        }

        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);

        System.out.printf("\n*\n* test-2.pdf without ToUnicode\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "test-2_NoToUnicode.txt").toPath(), Collections.singleton(text));
    }
}

Source File: PdfExtractImagesBatchController.java From MyBox with Apache License 2.0

5 votes

/**
 * Writes every image XObject found in the resources of the current page to
 * its own target file.
 * <p>
 * Processing stops early when the task is cancelled, or after the first
 * written image when running as a preview. Any exception is logged and
 * ends processing of the page.
 * </p>
 *
 * @return the value of the image counter when processing ended; note that
 *         in preview mode the loop is left before the counter is
 *         incremented, so 0 is returned
 */
@Override
public int handleCurrentPage() {
    int index = 0;
    try {
        PDPage page = doc.getPage(currentParameters.currentPage - 1);  // 0-based
        PDResources pdResources = page.getResources();
        if (pdResources == null) {
            // a page without a resources dictionary has no images to extract
            return index;
        }
        Iterable<COSName> xObjectNames = pdResources.getXObjectNames();
        if (xObjectNames == null) {
            return index;
        }
        for (COSName cosName : xObjectNames) {
            if (task.isCancelled()) {
                break;
            }
            if (!pdResources.isImageXObject(cosName)) {
                continue;
            }
            PDImageXObject pdxObject = (PDImageXObject) pdResources.getXObject(cosName);
            String namePrefix = FileTools.getFilePrefix(currentParameters.currentSourceFile.getName())
                    + "_page" + currentParameters.currentPage + "_index" + index;
            String suffix = pdxObject.getSuffix();
            File tFile = makeTargetFile(namePrefix, "." + suffix, currentParameters.currentTargetPath);
            ImageFileWriters.writeImageFile(pdxObject.getImage(), suffix, tFile.getAbsolutePath());
            targetFileGenerated(tFile);
            if (isPreview) {
                break;
            }
            index++;
        }
    } catch (Exception e) {
        logger.error(e.toString());
    }
    return index;
}

Source File: PdfOcrBatchController.java From MyBox with Apache License 2.0

5 votes

/**
 * Passes every image XObject found in the resources of the current page to
 * {@code handleImage}.
 * <p>
 * Each image that {@code handleImage} accepts is remembered as
 * {@code lastImage}. Processing stops early when the task is cancelled, or
 * after the first accepted image when running as a preview. Any exception
 * is logged and ends processing of the page.
 * </p>
 *
 * @return the value of the accepted-image counter when processing ended;
 *         note that in preview mode the loop is left before the counter is
 *         incremented, so 0 is returned
 */
protected int extractPage() {
    int index = 0;
    try {
        PDPage page = doc.getPage(currentParameters.currentPage - 1);  // 0-based
        PDResources pdResources = page.getResources();
        if (pdResources == null) {
            // a page without a resources dictionary has no images to handle
            return index;
        }
        Iterable<COSName> xObjectNames = pdResources.getXObjectNames();
        if (xObjectNames == null) {
            return index;
        }
        for (COSName cosName : xObjectNames) {
            if (task.isCancelled()) {
                break;
            }
            if (!pdResources.isImageXObject(cosName)) {
                continue;
            }
            PDImageXObject pdxObject = (PDImageXObject) pdResources.getXObject(cosName);
            BufferedImage bufferedImage = pdxObject.getImage();
            if (handleImage(bufferedImage)) {
                lastImage = bufferedImage;
                if (isPreview) {
                    break;
                }
                index++;
            }
        }
    } catch (Exception e) {
        logger.error(e.toString());
    }
    return index;
}

Source File: Overlay.java From gcs with Mozilla Public License 2.0

5 votes

/**
 * Wraps the overlay content of the given layout page in a form XObject and
 * registers that form in the resources of the target page.
 * <p>
 * The form's matrix compensates for an overlay rotation of 90, 180 or 270
 * degrees; any other rotation value leaves the matrix as identity.
 * </p>
 *
 * @param page the page that receives the overlay form in its resources
 * @param layoutPage the overlay content stream, resources, media box and rotation
 * @return the resource name under which the form XObject was added
 */
private COSName createOverlayXObject(PDPage page, LayoutPage layoutPage)
{
    PDFormXObject xobjForm = new PDFormXObject(layoutPage.overlayContentStream);
    xobjForm.setResources(new PDResources(layoutPage.overlayResources));
    xobjForm.setFormType(1);
    xobjForm.setBBox(layoutPage.overlayMediaBox.createRetranslatedRectangle());
    AffineTransform at = new AffineTransform();
    switch (layoutPage.overlayRotation)
    {
        case 90:
            at.translate(0, layoutPage.overlayMediaBox.getWidth());
            at.rotate(Math.toRadians(-90));
            break;
        case 180:
            at.translate(layoutPage.overlayMediaBox.getWidth(), layoutPage.overlayMediaBox.getHeight());
            at.rotate(Math.toRadians(-180));
            break;
        case 270:
            at.translate(layoutPage.overlayMediaBox.getHeight(), 0);
            at.rotate(Math.toRadians(-270));
            break;
        default:
            break;
    }
    xobjForm.setMatrix(at);
    PDResources resources = page.getResources();
    if (resources == null)
    {
        // the page has no resources dictionary yet; create one so the overlay can be registered
        resources = new PDResources();
        page.setResources(resources);
    }
    return resources.add(xobjForm, "OL");
}

Source File: ExtractImages.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
 * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B9izTHWJQ7xlT2ZoQkJfbGRYcFE">
 * 10948.pdf
 * </a>
 * <p>
 * The only special thing about the two images returned for the sample PDF is that
 * one image is merely a mask used for the other image, and the other image is the
 * actual image used on the PDF page. If one only wants the images immediately used
 * in the page content, one also has to scan the page content.
 * </p>
 *
 * @throws IOException if the document cannot be loaded or an image cannot be written
 */
@Test
public void testExtractPageImageResources10948() throws IOException
{
    // the document is declared as a resource so that it is closed even if extraction fails
    try (   InputStream resource = getClass().getResourceAsStream("10948.pdf");
            PDDocument document = Loader.loadPDF(resource)  )
    {
        int page = 1;
        for (PDPage pdPage : document.getPages())
        {
            PDResources resources = pdPage.getResources();
            // check the page resources (not the input stream): pages may lack a resources dictionary
            if (resources != null)
            {
                int index = 0;
                for (COSName cosName : resources.getXObjectNames())
                {
                    PDXObject xobject = resources.getXObject(cosName);
                    if (xobject instanceof PDImageXObject)
                    {
                        PDImageXObject image = (PDImageXObject)xobject;
                        File file = new File(RESULT_FOLDER, String.format("10948-%s-%s.%s", page, index, image.getSuffix()));
                        ImageIO.write(image.getImage(), image.getSuffix(), file);
                        index++;
                    }
                }
            }
            page++;
        }
    }
}

Source File: ExtractImages.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
 * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
 * </a>
 * <br/>
 * <a href="https://drive.google.com/open?id=0B9izTHWJQ7xlYi1XN1BxMmZEUGc">
 * 10948.pdf
 * </a>, renamed "10948-new.pdf" here to prevent a collision
 * <p>
 * Here the code extracts no image at all because the images are not immediate page
 * resources but wrapped in form xobjects.
 * </p>
 *
 * @throws IOException if the document cannot be loaded or an image cannot be written
 */
@Test
public void testExtractPageImageResources10948New() throws IOException
{
    // the document is declared as a resource so that it is closed even if extraction fails
    try (   InputStream resource = getClass().getResourceAsStream("10948-new.pdf");
            PDDocument document = Loader.loadPDF(resource)  )
    {
        int page = 1;
        for (PDPage pdPage : document.getPages())
        {
            PDResources resources = pdPage.getResources();
            // check the page resources (not the input stream): pages may lack a resources dictionary
            if (resources != null)
            {
                int index = 0;
                for (COSName cosName : resources.getXObjectNames())
                {
                    PDXObject xobject = resources.getXObject(cosName);
                    if (xobject instanceof PDImageXObject)
                    {
                        PDImageXObject image = (PDImageXObject)xobject;
                        File file = new File(RESULT_FOLDER, String.format("10948-new-%s-%s.%s", page, index, image.getSuffix()));
                        ImageIO.write(image.getImage(), image.getSuffix(), file);
                        index++;
                    }
                }
            }
            page++;
        }
    }
}

Source File: PdfCompressImagesBatchController.java From MyBox with Apache License 2.0

4 votes

/**
 * Re-encodes the image XObjects in the resources of the current source
 * page and stores the result in the target document.
 * <p>
 * Depending on {@code format}, each image is replaced either by a
 * thresholded (optionally dithered) CCITT image or by a JPEG image of the
 * configured quality; for any other format the image is left untouched.
 * Processing stops early when the task is cancelled or after the first
 * image when running as a preview. Afterwards the modified resources are
 * either set on the corresponding target page or the source page is added
 * to the target document. Any exception is logged and ends processing.
 * </p>
 *
 * @return the number of images that were replaced
 */
@Override
public int handleCurrentPage() {
    int count = 0;
    try {
        PDPage sourcePage = doc.getPage(currentParameters.currentPage - 1);  // 0-based
        PDResources pdResources = sourcePage.getResources();
        Iterable<COSName> xObjectNames = pdResources.getXObjectNames();
        if (xObjectNames == null) {
            return 0;
        }
        for (COSName cosName : xObjectNames) {
            if (task.isCancelled()) {
                break;
            }
            if (!pdResources.isImageXObject(cosName)) {
                continue;
            }
            PDImageXObject pdxObject = (PDImageXObject) pdResources.getXObject(cosName);
            BufferedImage sourceImage = pdxObject.getImage();
            PDImageXObject newObject = null;
            if (format == PdfImageFormat.Tiff) {
                ImageBinary imageBinary = new ImageBinary(sourceImage, threshold);
                imageBinary.setIsDithering(ditherCheck.isSelected());
                BufferedImage newImage = imageBinary.operate();
                newImage = ImageBinary.byteBinary(newImage);
                newObject = CCITTFactory.createFromImage(doc, newImage);

            } else if (format == PdfImageFormat.Jpeg) {
                newObject = JPEGFactory.createFromImage(doc, sourceImage, jpegQuality / 100f);
            }
            if (newObject != null) {
                // replaces the value of an existing key, so the set of names being iterated is unchanged
                pdResources.put(cosName, newObject);
                count++;
            }
            if (isPreview) {
                break;
            }
        }
        if (copyAllCheck.isSelected()) {
            targetDoc.getPage(currentParameters.currentPage - 1).setResources(pdResources);
        } else {
            targetDoc.addPage(sourcePage);
        }
    } catch (Exception e) {
        logger.error(e.toString());
    }
    return count;

}

Source File: LayerUtility.java From gcs with Mozilla Public License 2.0

4 votes

/**
 * Imports a page from some PDF file as a Form XObject so it can be placed on another page
 * in the target document.
 * <p>
 * You may want to call {@link #wrapInSaveRestore(PDPage) wrapInSaveRestore(PDPage)} before invoking the Form XObject to
 * make sure that the graphics state is reset.
 * <p>
 * The form's bounding box is taken from the page's crop box, or from its media box if
 * the crop box is {@code null}. The form's matrix is only set when the transform computed
 * from the box offsets and the page rotation is not the identity.
 * 
 * @param sourceDoc the source PDF document that contains the page to be copied
 * @param page the page in the source PDF document to be copied
 * @return a Form XObject containing the original page's content
 * @throws IOException if an I/O error occurs
 */
public PDFormXObject importPageAsForm(PDDocument sourceDoc, PDPage page) throws IOException
{
    // NOTE(review): presumably carries the optional content (layer) properties of the
    // source document over to the target document - confirm against importOcProperties
    importOcProperties(sourceDoc);

    // The page content becomes the form's stream, created in the target document with a Flate filter
    PDStream newStream = new PDStream(targetDoc, page.getContents(), COSName.FLATE_DECODE);
    PDFormXObject form = new PDFormXObject(newStream);

    //Copy resources
    PDResources pageRes = page.getResources();
    PDResources formRes = new PDResources();
    cloner.cloneMerge(pageRes, formRes);
    form.setResources(formRes);

    //Transfer some values from page to form
    transferDict(page.getCOSObject(), form.getCOSObject(), PAGE_TO_FORM_FILTER, true);

    Matrix matrix = form.getMatrix();
    AffineTransform at = matrix.createAffineTransform();
    PDRectangle mediaBox = page.getMediaBox();
    PDRectangle cropBox = page.getCropBox();
    // The crop box takes precedence as the visible area; the media box is the fallback
    PDRectangle viewBox = (cropBox != null ? cropBox : mediaBox);

    //Handle the /Rotation entry on the page dict
    int rotation = page.getRotation();

    //Transform to FOP's user space
    //at.scale(1 / viewBox.getWidth(), 1 / viewBox.getHeight());
    at.translate(mediaBox.getLowerLeftX() - viewBox.getLowerLeftX(),
            mediaBox.getLowerLeftY() - viewBox.getLowerLeftY());
    // The order of the scale/translate/rotate calls below matters: each call is
    // concatenated onto the transform built so far
    switch (rotation)
    {
    case 90:
        // quarter turns additionally scale by the view box's width/height ratios
        at.scale(viewBox.getWidth() / viewBox.getHeight(), viewBox.getHeight() / viewBox.getWidth());
        at.translate(0, viewBox.getWidth());
        at.rotate(-Math.PI / 2.0);
        break;
    case 180:
        at.translate(viewBox.getWidth(), viewBox.getHeight());
        at.rotate(-Math.PI);
        break;
    case 270:
        at.scale(viewBox.getWidth() / viewBox.getHeight(), viewBox.getHeight() / viewBox.getWidth());
        at.translate(viewBox.getHeight(), 0);
        at.rotate(-Math.PI * 1.5);
        break;
    default:
        //no additional transformations necessary
    }
    //Compensate for Crop Boxes not starting at 0,0
    at.translate(-viewBox.getLowerLeftX(), -viewBox.getLowerLeftY());
    // Leave the form's matrix untouched when nothing needs to be transformed
    if (!at.isIdentity())
    {
        form.setMatrix(at);
    }

    // The form's bounding box has the same corners as the view box
    BoundingBox bbox = new BoundingBox();
    bbox.setLowerLeftX(viewBox.getLowerLeftX());
    bbox.setLowerLeftY(viewBox.getLowerLeftY());
    bbox.setUpperRightX(viewBox.getUpperRightX());
    bbox.setUpperRightY(viewBox.getUpperRightY());
    form.setBBox(new PDRectangle(bbox));

    return form;
}

Source File: RenderType3Character.java From testarea-pdfbox2 with Apache License 2.0

4 votes

/**
 * <a href="http://stackoverflow.com/questions/42032729/render-type3-font-character-as-image-using-pdfbox">
 * Render Type3 font character as image using PDFBox
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B0f6X4SAMh2KRDJTbm4tb3E1a1U/view">
 * 4700198773.pdf
 * </a>
 * from
 * <a href="http://stackoverflow.com/questions/37754112/extract-text-with-custom-font-result-non-readble">
 * extract text with custom font result non readble
 * </a>
 * <p>
 * This test shows how one can render individual Type 3 font glyphs as bitmaps.
 * Unfortunately PDFBox out-of-the-box does not provide a class to render contents
 * of arbitrary XObjects, merely for rendering pages; thus, we simply create a page
 * with the glyph in question and render that page.
 * </p>
 * <p>
 * As the OP did not provide a sample PDF, we simply use one from another
 * stackoverflow question. There obviously might remain issues with the
 * OP's files.
 * </p>
 * <p>
 * For every char proc of font <code>F1</code> on the first page that has a character
 * code in the font's encoding, a one-page document showing that glyph is created,
 * rendered to a PNG at 96 DPI and also saved as a PDF in the result folder.
 * </p>
 */
@Test
public void testRender4700198773() throws IOException, NoSuchMethodException, SecurityException, IllegalAccessException, IllegalArgumentException, InvocationTargetException
{
    // the raw write method is not public, so it is accessed reflectively
    Method PDPageContentStreamWrite = PDPageContentStream.class.getSuperclass().getDeclaredMethod("write", String.class);
    PDPageContentStreamWrite.setAccessible(true);

    try (   InputStream resource = getClass().getResourceAsStream("4700198773.pdf");
            PDDocument document = Loader.loadPDF(resource)  )
    {
        PDPage page = document.getPage(0);
        PDResources pageResources = page.getResources();
        COSName f1Name = COSName.getPDFName("F1");
        PDType3Font fontF1 = (PDType3Font) pageResources.getFont(f1Name);
        Map<String, Integer> f1NameToCode = fontF1.getEncoding().getNameToCodeMap();

        COSDictionary charProcsDictionary = fontF1.getCharProcs();
        for (COSName key : charProcsDictionary.keySet())
        {
            COSStream stream = (COSStream) charProcsDictionary.getDictionaryObject(key);
            PDType3CharProc charProc = new PDType3CharProc(fontF1, stream);
            PDRectangle bbox = charProc.getGlyphBBox();
            if (bbox == null)
            {
                bbox = charProc.getBBox();
            }
            Integer code = f1NameToCode.get(key.getName());

            if (code != null)
            {
                try (   PDDocument charDocument = new PDDocument()  )
                {
                    PDPage charPage = new PDPage(bbox);
                    charDocument.addPage(charPage);
                    charPage.setResources(pageResources);
                    try (   PDPageContentStream charContentStream = new PDPageContentStream(charDocument, charPage)  )
                    {
                        charContentStream.beginText();
                        charContentStream.setFont(fontF1, bbox.getHeight());
                        // zero-pad to two hex digits: a space-padded single digit such as "< A>"
                        // would be read as the byte A0 because white space in PDF hex strings
                        // is ignored and a missing final digit is assumed to be 0
                        PDPageContentStreamWrite.invoke(charContentStream, String.format("<%02X> Tj\n", code));
                        charContentStream.endText();
                    }

                    File result = new File(RESULT_FOLDER, String.format("4700198773-%s-%s.png", key.getName(), code));
                    PDFRenderer renderer = new PDFRenderer(charDocument);
                    BufferedImage image = renderer.renderImageWithDPI(0, 96);
                    ImageIO.write(image, "PNG", result);
                    charDocument.save(new File(RESULT_FOLDER, String.format("4700198773-%s-%s.pdf", key.getName(), code)));
                }
            }
        }
    }
}

Source File: RenderType3Character.java From testarea-pdfbox2 with Apache License 2.0

4 votes

/**
 * <a href="http://stackoverflow.com/questions/42032729/render-type3-font-character-as-image-using-pdfbox">
 * Render Type3 font character as image using PDFBox
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B0f6X4SAMh2KRDJTbm4tb3E1a1U/view">
 * 4700198773.pdf
 * </a>
 * from
 * <a href="http://stackoverflow.com/questions/37754112/extract-text-with-custom-font-result-non-readble">
 * extract text with custom font result non readble
 * </a>
 * <p>
 * This test shows how one can render individual Type 3 font glyphs as bitmaps.
 * Unfortunately PDFBox out-of-the-box does not provide a class to render contents
 * of arbitrary XObjects, merely for rendering pages; thus, we simply create a page
 * with the glyph in question and render that page.
 * </p>
 * <p>
 * As the OP did not provide a sample PDF, we simply use one from another
 * stackoverflow question. There obviously might remain issues with the
 * OP's files.
 * </p>
 * <p>
 * This variant processes font <code>R144</code> on the second page of
 * <code>sdnlist.pdf</code>: for every char proc that has a character code in the
 * font's encoding, a one-page document showing that glyph is created, rendered to a
 * PNG at 96 DPI and also saved as a PDF in the result folder.
 * </p>
 */
@Test
public void testRenderSdnList() throws IOException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException
{
    // the raw write method is not public, so it is accessed reflectively
    Method PDPageContentStreamWrite = PDPageContentStream.class.getSuperclass().getDeclaredMethod("write", String.class);
    PDPageContentStreamWrite.setAccessible(true);

    try (   InputStream resource = getClass().getResourceAsStream("sdnlist.pdf");
            PDDocument document = Loader.loadPDF(resource)  )
    {
        PDPage page = document.getPage(1);
        PDResources pageResources = page.getResources();
        COSName f1Name = COSName.getPDFName("R144");
        PDType3Font fontF1 = (PDType3Font) pageResources.getFont(f1Name);
        Map<String, Integer> f1NameToCode = fontF1.getEncoding().getNameToCodeMap();

        COSDictionary charProcsDictionary = fontF1.getCharProcs();
        for (COSName key : charProcsDictionary.keySet())
        {
            COSStream stream = (COSStream) charProcsDictionary.getDictionaryObject(key);
            PDType3CharProc charProc = new PDType3CharProc(fontF1, stream);
            PDRectangle bbox = charProc.getGlyphBBox();
            if (bbox == null)
            {
                bbox = charProc.getBBox();
            }
            Integer code = f1NameToCode.get(key.getName());

            if (code != null)
            {
                try (   PDDocument charDocument = new PDDocument()  )
                {
                    PDPage charPage = new PDPage(bbox);
                    charDocument.addPage(charPage);
                    charPage.setResources(pageResources);
                    try (   PDPageContentStream charContentStream = new PDPageContentStream(charDocument, charPage)  )
                    {
                        charContentStream.beginText();
                        charContentStream.setFont(fontF1, bbox.getHeight());
                        // zero-pad to two hex digits: a space-padded single digit such as "< A>"
                        // would be read as the byte A0 because white space in PDF hex strings
                        // is ignored and a missing final digit is assumed to be 0
                        PDPageContentStreamWrite.invoke(charContentStream, String.format("<%02X> Tj\n", code));
                        charContentStream.endText();
                    }

                    File result = new File(RESULT_FOLDER, String.format("sdnlist-%s-%s.png", key.getName(), code));
                    PDFRenderer renderer = new PDFRenderer(charDocument);
                    BufferedImage image = renderer.renderImageWithDPI(0, 96);
                    ImageIO.write(image, "PNG", result);
                    charDocument.save(new File(RESULT_FOLDER, String.format("sdnlist-%s-%s.pdf", key.getName(), code)));
                }
            }
        }
    }
}

Java Code Examples for org.apache.pdfbox.pdmodel.PDPage#getResources()