org.apache.pdfbox.pdfparser.PDFStreamParser Java Examples

Source File: PDType3CharProc.java From gcs with Mozilla Public License 2.0

6 votes

/**
 * Reads the glyph width from this type3 charproc stream.
 *
 * <p>Tokens are read until the first operator is found; every token before it is collected as
 * an operand, and the operator together with those operands is handed to {@code parseWidth}.
 *
 * @return the glyph width.
 * @throws IOException if the stream could not be read, or did not have d0 or d1 as first
 * operator, or if their first argument was not a number, or if the stream ends before any
 * operator is found.
 */
public float getWidth() throws IOException
{
    final List<COSBase> operands = new ArrayList<COSBase>();
    final PDFStreamParser streamParser = new PDFStreamParser(this);
    for (Object next = streamParser.parseNextToken();
         next != null;
         next = streamParser.parseNextToken())
    {
        if (next instanceof Operator)
        {
            // Only the first operator matters; it decides the result.
            return parseWidth((Operator) next, operands);
        }
        // Indirect objects are unwrapped so that operands are always the referenced value.
        operands.add(next instanceof COSObject
                ? ((COSObject) next).getObject()
                : (COSBase) next);
    }
    throw new IOException("Unexpected end of stream");
}

Source File: PDDefaultAppearanceString.java From gcs with Mozilla Public License 2.0

6 votes

/**
 * Walks the tokens of the given content stream and dispatches each operator.
 *
 * <p>Operands are accumulated until an operator is read; the operator is then passed to
 * {@code processOperator} together with its operands, and a fresh operand list is started.
 *
 * @param content the content to parse.
 * @throws IOException if there is an error reading or parsing the content stream.
 */
private void processAppearanceStringOperators(byte[] content) throws IOException
{
    final PDFStreamParser streamParser = new PDFStreamParser(content);
    List<COSBase> operands = new ArrayList<COSBase>();
    for (Object next = streamParser.parseNextToken();
         next != null;
         next = streamParser.parseNextToken())
    {
        if (next instanceof Operator)
        {
            processOperator((Operator) next, operands);
            // A new list is allocated rather than cleared, in case the callee kept the old one.
            operands = new ArrayList<COSBase>();
        }
        else if (next instanceof COSObject)
        {
            // Unwrap indirect objects to the value they refer to.
            operands.add(((COSObject) next).getObject());
        }
        else
        {
            operands.add((COSBase) next);
        }
    }
}

Source File: PDFStreamEngine.java From gcs with Mozilla Public License 2.0

6 votes

/**
 * Parses the given content stream token by token and processes every operator found.
 *
 * <p>Each operator is passed to {@code processOperator} with the operands that preceded it;
 * the operand list is replaced by a new, empty one after every operator.
 *
 * @param contentStream the content stream to parse.
 * @throws IOException if there is an error reading or parsing the content stream.
 */
private void processStreamOperators(PDContentStream contentStream) throws IOException
{
    List<COSBase> pendingOperands = new ArrayList<COSBase>();
    final PDFStreamParser streamParser = new PDFStreamParser(contentStream);
    Object current;
    while ((current = streamParser.parseNextToken()) != null)
    {
        if (current instanceof COSObject)
        {
            // Indirect object: store the referenced value, not the wrapper.
            pendingOperands.add(((COSObject) current).getObject());
            continue;
        }
        if (current instanceof Operator)
        {
            processOperator((Operator) current, pendingOperands);
            pendingOperands = new ArrayList<COSBase>();
            continue;
        }
        pendingOperands.add((COSBase) current);
    }
}

Source File: PdfContentTypeChecker.java From tika-server with Apache License 2.0

5 votes

/**
 * Counts the "BT" (begin text) operators in the page's content stream and adds the count
 * to {@code textBlocks}.
 *
 * @param page the page whose content stream is parsed.
 * @throws IOException if the content stream cannot be parsed.
 */
private void calculateTextObjectsOnPage(PDPage page) throws IOException {
    PDFStreamParser contentParser = new PDFStreamParser(page);
    contentParser.parse();
    for (Object element : contentParser.getTokens()) {
        if (!(element instanceof Operator)) {
            continue;
        }
        Operator operator = (Operator) element;
        if (operator.getName().equals("BT")) { // Begin Text
            textBlocks++;
        }
    }
}

Source File: AppearanceGeneratorHelper.java From gcs with Mozilla Public License 2.0

5 votes

/**
 * Parses an appearance stream into tokens.
 *
 * @param appearanceStream the appearance stream to parse.
 * @return the tokens produced by parsing the whole stream.
 * @throws IOException if the stream cannot be parsed.
 */
private List<Object> tokenize(PDAppearanceStream appearanceStream) throws IOException
{
    final PDFStreamParser streamParser = new PDFStreamParser(appearanceStream);
    streamParser.parse();
    final List<Object> parsedTokens = streamParser.getTokens();
    return parsedTokens;
}

Source File: NurminenDetectionAlgorithm.java From tabula-java with MIT License

5 votes

/**
 * Builds a new single-page document that is a copy of the given page with its
 * {@code Tj} and {@code TJ} text-showing operators (and their one operand) removed.
 *
 * <p>The returned document is newly created here and is not closed by this method;
 * the caller is responsible for closing it.
 *
 * @param page the page whose content stream is filtered.
 * @return a new document containing the filtered copy of the page.
 * @throws IOException if the content stream cannot be parsed or the new stream cannot be written.
 */
private PDDocument removeText(PDPage page) throws IOException {
    PDFStreamParser parser = new PDFStreamParser(page);
    parser.parse();
    List<Object> tokens = parser.getTokens();
    List<Object> newTokens = new ArrayList<>();
    for (Object token : tokens) {
        if (token instanceof Operator) {
            Operator op = (Operator) token;
            if (op.getName().equals("TJ") || op.getName().equals("Tj")) {
                // Remove the one argument to this operator. A malformed stream may have
                // the operator with nothing before it; guard so that case does not throw.
                if (!newTokens.isEmpty()) {
                    newTokens.remove(newTokens.size() - 1);
                }
                continue;
            }
        }
        newTokens.add(token);
    }

    PDDocument document = new PDDocument();
    PDPage newPage = document.importPage(page);
    newPage.setResources(page.getResources());

    PDStream newContents = new PDStream(document);
    // try-with-resources closes the stream even when writing the tokens fails.
    try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
        ContentStreamWriter writer = new ContentStreamWriter(out);
        writer.writeTokens(newTokens);
    }
    newPage.setContents(newContents);
    return document;
}

Source File: PDType3CharProc.java From gcs with Mozilla Public License 2.0

4 votes

/**
 * Computes the bounding box of this glyph from the operands of the first operator in the
 * stream. A box is produced only when that operator is d1 and it has exactly six numeric
 * operands.
 *
 * @return the bounding box of this glyph, or null if the first operator is not d1, does not
 * have six numeric operands, or the stream contains no operator at all.
 * @throws IOException If an io error occurs while parsing the stream.
 */
public PDRectangle getGlyphBBox() throws IOException
{
    final List<COSBase> operands = new ArrayList<COSBase>();
    final PDFStreamParser streamParser = new PDFStreamParser(this);
    Object next;
    while ((next = streamParser.parseNextToken()) != null)
    {
        if (next instanceof COSObject)
        {
            // Indirect object: keep the referenced value as the operand.
            operands.add(((COSObject) next).getObject());
            continue;
        }
        if (!(next instanceof Operator))
        {
            operands.add((COSBase) next);
            continue;
        }

        // The first operator decides the outcome; nothing after it is read.
        final Operator firstOperator = (Operator) next;
        if (!firstOperator.getName().equals("d1") || operands.size() != 6)
        {
            return null;
        }
        for (COSBase operand : operands)
        {
            if (!(operand instanceof COSNumber))
            {
                return null;
            }
        }
        // Operands 2..5 are the two corners; PDRectangle is given origin, width and height.
        final float lowerLeftX = ((COSNumber) operands.get(2)).floatValue();
        final float lowerLeftY = ((COSNumber) operands.get(3)).floatValue();
        final float upperRightX = ((COSNumber) operands.get(4)).floatValue();
        final float upperRightY = ((COSNumber) operands.get(5)).floatValue();
        return new PDRectangle(
                lowerLeftX,
                lowerLeftY,
                upperRightX - lowerLeftX,
                upperRightY - lowerLeftY);
    }
    return null;
}

Source File: TestGraphicsCounter.java From testarea-pdfbox2 with Apache License 2.0

4 votes

/**
 * <a href="http://stackoverflow.com/questions/28321374/how-to-get-page-content-height-using-pdfbox">
 * How to get page content height using pdfbox
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B65bQnJhC1mvbEVQQ0o0QU9STlU/view?usp=sharing">
 * test.pdf
 * </a>, here as <code>test-rivu.pdf</code>
 * <p>
 * Rivu's code from a comment to count lines etc. It parses the content stream of the
 * page at index 4 and prints how many line, curve, rectangle, XObject and clip path
 * operators it contains.
 * </p>
 * <p>
 * PDF operator names are case-sensitive: the XObject painting operator is <code>Do</code>,
 * so that is the name compared against here.
 * </p>
 *
 * @throws IOException if the test document cannot be loaded or its content stream parsed.
 */
@Test
public void testCountTestLikeRivu() throws IOException
{
    try (InputStream resource = getClass().getResourceAsStream("test-rivu.pdf"))
    {
        System.out.println("test-rivu.pdf");
        // try-with-resources so the document is closed even if parsing throws.
        try (PDDocument document = Loader.loadPDF(resource))
        {
            PDPage page = document.getPage(4);
            PDFStreamParser parser = new PDFStreamParser(page.getContents());
            List<Object> tokens = parser.parse();
            int lines = 0;
            int curves = 0;
            int rectangles = 0;
            int doOps = 0;
            int clipPaths = 0;
            for (Object token : tokens)
            {
                if (!(token instanceof Operator))
                {
                    continue;
                }
                Operator op = (Operator) token;
                String name = op.getName();
                if ("Do".equals(name))
                {
                    doOps += 1;
                }
                else if ("W".equals(name) || "W*".equals(name))
                {
                    clipPaths += 1;
                }
                else if ("l".equals(name) || "h".equals(name))
                {
                    lines += 1;
                }
                else if ("c".equals(name) || "y".equals(name) || "v".equals(name))
                {
                    System.out.println(op);
                    curves += 1;
                }
                else if ("re".equals(name))
                {
                    rectangles += 1;
                }
            }
            System.out.println(lines + " lines, " + curves + " curves, " + rectangles + " rectangles, " + doOps + " xobjects, " + clipPaths + " clip paths");
        }
    }
}

org.apache.pdfbox.pdfparser.PDFStreamParser Java Examples