org.apache.pdfbox.text.TextPosition Java Examples

Source File: TextSection.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * Renders a line of words as plain text.
 * <p>
 * The glyph strings of every word are concatenated, consecutive words are
 * separated by a single blank, and the result is normalized to Unicode
 * form NFKC.
 * </p>
 *
 * @param words the words of the line, each given as its list of text positions
 * @return the NFKC-normalized text of the line
 */
String toString(List<List<TextPosition>> words)
{
    StringBuilder line = new StringBuilder();
    // empty before the first word, a single blank before every later one
    String separator = "";
    for (List<TextPosition> word : words)
    {
        line.append(separator);
        separator = " ";
        for (TextPosition glyph : word)
        {
            line.append(glyph.getUnicode());
        }
    }
    // cf. http://stackoverflow.com/a/7171932/1729265
    return Normalizer.normalize(line, Form.NFKC);
}

Source File: ExtractCharacterCodes.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * <a href="https://stackoverflow.com/questions/50664162/some-glyph-ids-missing-while-trying-to-extract-glyph-id-from-pdf">
 * Some glyph ID's missing while trying to extract glyph ID from pdf
 * </a>
 * <br/>
 * <a href="http://1drv.ms/b/s!AmHcFaD-gMGyhkHr4PY6F4krYJ32">
 * pattern3.pdf
 * </a>
 * <p>
 * This test shows how to access the character codes of the extracted text
 * while preventing the {@link PDFTextStripper} from doing any preprocessing
 * steps, in particular from doing any diacritics merges.
 * </p>
 * <p>
 * The extracted text is printed to standard output and written to
 * <code>pattern3.txt</code> in the result folder. Both the resource stream
 * and the loaded document are closed when the test finishes, whether it
 * succeeds or fails.
 * </p>
 *
 * @throws IOException if the PDF cannot be loaded or the result cannot be written
 */
@Test
public void testExtractFromPattern3() throws IOException {
    // the document is a resource of its own and must be closed as well as the stream
    try (   InputStream resource = getClass().getResourceAsStream("pattern3.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        PDFTextStripper stripper = new PDFTextStripper() {

            // Bypasses the default processing: every text position is written
            // out immediately as its Unicode text followed by its character codes.
            @Override
            protected void processTextPosition(TextPosition textPosition) {
                try {
                    writeString(String.format("%s%s", textPosition.getUnicode(), Arrays.toString(textPosition.getCharacterCodes())));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        };
        String text = stripper.getText(document);

        System.out.printf("\n*\n* pattern3.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "pattern3.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ExtractWordCoordinates.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * Splits the given text positions into words and hands each completed word
 * to {@code printWord}.
 * <p>
 * A text position whose Unicode string equals the word separator ends the
 * current word; positions with a {@code null} or empty Unicode string are
 * ignored. A word still pending at the end of the input is printed as well.
 * </p>
 *
 * @param string        the text of the chunk (not used here)
 * @param textPositions the text positions making up the chunk
 * @throws IOException if printing a word fails
 */
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
    final String separator = getWordSeparator();
    final List<TextPosition> pending = new ArrayList<>();
    for (TextPosition position : textPositions) {
        final String glyph = position.getUnicode();
        if (glyph == null || glyph.isEmpty()) {
            continue;
        }
        if (glyph.equals(separator)) {
            // a separator closes the current word, if there is one
            if (!pending.isEmpty()) {
                printWord(pending);
                pending.clear();
            }
        } else {
            pending.add(position);
        }
    }
    // flush the trailing word that was not followed by a separator
    if (!pending.isEmpty()) {
        printWord(pending);
        pending.clear();
    }
}

Source File: RectanglesOverText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * Collects the written text into {@code lines} before delegating to the
 * superclass implementation.
 * <p>
 * At the start of a line a new {@code TextLine} is created from the chunk
 * and the {@code startOfLine} flag is reset; otherwise the chunk is appended
 * to the most recently added line.
 * </p>
 *
 * @param text          the text of the chunk
 * @param textPositions the text positions making up the chunk
 * @throws IOException if the superclass fails to write the text
 */
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
    if (!startOfLine) {
        // continue the line that is currently being built
        TextLine current = lines.get(lines.size() - 1);
        current.text += text;
        current.textPositions.addAll(textPositions);
    } else {
        // open a new line with this chunk
        TextLine fresh = new TextLine();
        fresh.text = text;
        fresh.textPositions = textPositions;
        lines.add(fresh);
        startOfLine = false;
    }
    super.writeString(text, textPositions);
}

Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0

6 votes

/**
 * Walks through the text positions and groups them into lines.
 * <p>
 * Whenever a position is reported to start after one or more line breaks,
 * the positions collected so far are written and the reported number of
 * empty lines is created before the position is collected. Positions still
 * pending after the last element are written at the end.
 * </p>
 *
 * @param textIterator iterator over the text positions to process
 */
private void iterateThroughTextList(Iterator<TextPosition> textIterator) {
    List<TextPosition> pending = new ArrayList<TextPosition>();

    while ( textIterator.hasNext() ) {
        TextPosition current = textIterator.next();
        int lineBreaks = this.getNumberOfNewLinesFromPreviousTextPosition(current);
        if ( lineBreaks != 0 ) {
            // a new line starts here: emit what was gathered for the previous one
            this.writeTextPositionList(pending);
            this.createNewEmptyNewLines(lineBreaks);
        }
        pending.add(current);
        this.setPreviousTextPosition(current);
    }
    if ( !pending.isEmpty() ) {
        this.writeTextPositionList(pending);
    }
}

Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0

6 votes

/**
 * Determines how many line breaks separate the given text position from the
 * previously processed one.
 * <p>
 * The rounded Y values of both positions are compared. If the Y value of the
 * given position exceeds the previous one by more than 5.5, the gap is
 * divided by the height of the given position to estimate the number of
 * lines; one is subtracted for the current line and the result is at least 1.
 * </p>
 * <p>
 * A height that is not positive cannot be used as a divisor: dividing by a
 * height of 0 yields infinity, which the cast to {@code int} turns into
 * {@link Integer#MAX_VALUE}. In that case a single line break is reported.
 * </p>
 *
 * @param textPosition the text position being processed
 * @return 1 if there is no previous text position, 0 if the position does not
 *         start a new line, otherwise the estimated number of line breaks (at least 1)
 */
private int getNumberOfNewLinesFromPreviousTextPosition(final TextPosition textPosition) {
    TextPosition previousTextPosition = this.getPreviousTextPosition();
    if ( previousTextPosition == null ) {
        return 1;
    }

    float textYPosition = Math.round( textPosition.getY() );
    float previousTextYPosition = Math.round( previousTextPosition.getY() );
    float verticalGap = textYPosition - previousTextYPosition;

    if ( verticalGap > 5.5 ) {
        double height = textPosition.getHeight();
        // fall back to a single line break when the height is unusable as a divisor
        int numberOfLines = 1;
        if ( height > 0 ) {
            int linesInGap = (int) (verticalGap / height);
            numberOfLines = Math.max(1, linesInGap - 1); // exclude current new line
        }
        if (DEBUG) System.out.println(height + " " + numberOfLines);
        return numberOfLines;
    } else {
        return 0;
    }
}

Source File: PDFLayoutTextStripper.java From quarkus-pdf-extract with Apache License 2.0

6 votes

/**
 * Estimates the number of line breaks between the previously processed text
 * position and the given one.
 * <p>
 * Both Y values are rounded. Only if the given position's Y value is more
 * than 5.5 larger than the previous one is a new line assumed; the number of
 * lines is then the gap divided by the height of the given position, minus
 * one for the current line, but never less than 1.
 * </p>
 * <p>
 * If the reported height is not positive the division is skipped and a
 * single line break is returned. Without this guard a height of 0 produces
 * an infinite quotient that is cast to {@link Integer#MAX_VALUE}.
 * </p>
 *
 * @param textPosition the text position being processed
 * @return 1 if no previous text position exists, 0 if no new line starts
 *         here, otherwise the estimated number of line breaks (at least 1)
 */
private int getNumberOfNewLinesFromPreviousTextPosition(final TextPosition textPosition) {
    TextPosition previousTextPosition = this.getPreviousTextPosition();
    if ( previousTextPosition == null ) {
        return 1;
    }

    float textYPosition = Math.round( textPosition.getY() );
    float previousTextYPosition = Math.round( previousTextPosition.getY() );
    float yDistance = textYPosition - previousTextYPosition;

    if ( yDistance > 5.5 ) {
        double height = textPosition.getHeight();
        int numberOfLines;
        if ( height > 0 ) {
            numberOfLines = (int) (yDistance / height);
            numberOfLines = Math.max(1, numberOfLines - 1); // exclude current new line
        } else {
            // a zero (or otherwise non-positive) height cannot be used as a divisor
            numberOfLines = 1;
        }
        if (DEBUG) System.out.println(height + " " + numberOfLines);
        return numberOfLines;
    } else {
        return 0;
    }
}

Source File: PDFLayoutTextStripper.java From quarkus-pdf-extract with Apache License 2.0

6 votes

/**
 * Consumes the iterator and writes the text positions line by line.
 * <p>
 * Positions are buffered until one is found that starts after at least one
 * line break; the buffer is then written, the reported number of empty
 * lines is created, and buffering continues with that position. Whatever
 * remains in the buffer after the last position is written as well.
 * </p>
 *
 * @param textIterator iterator over the text positions to process
 */
private void iterateThroughTextList(Iterator<TextPosition> textIterator) {
    final List<TextPosition> lineBuffer = new ArrayList<TextPosition>();

    while ( textIterator.hasNext() ) {
        final TextPosition position = textIterator.next();
        final int newLines = this.getNumberOfNewLinesFromPreviousTextPosition(position);
        if ( newLines > 0 || newLines < 0 ) {
            // any non-zero result marks the start of a new line
            this.writeTextPositionList(lineBuffer);
            this.createNewEmptyNewLines(newLines);
        }
        lineBuffer.add(position);
        this.setPreviousTextPosition(position);
    }

    if ( lineBuffer.isEmpty() ) {
        return;
    }
    this.writeTextPositionList(lineBuffer);
}

Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0

5 votes

/**
 * Tells whether the given text position is near, but not directly attached
 * to, the previous text position.
 *
 * @param textPosition the text position being examined
 * @return {@code false} if no character of the line has been found yet;
 *         otherwise {@code true} if the gap to the previous text position is
 *         greater than 1 and at most {@code OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT}
 */
private boolean isCharacterCloseToPreviousWord(final TextPosition textPosition) {
    if ( firstCharacterOfLineFound ) {
        double gap = this.numberOfSpacesBetweenTwoCharacters(previousTextPosition, textPosition);
        boolean detached = gap > 1;
        return detached && gap <= PDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT;
    }
    return false;
}

Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0

5 votes

/**
 * Tells whether the given text position continues the word the previous
 * text position belongs to.
 *
 * @param textPosition the text position being examined
 * @return {@code false} if the previous text position is a single blank;
 *         otherwise {@code true} if the gap to the previous text position is at most 1
 */
private boolean isCharacterPartOfPreviousWord(final TextPosition textPosition) {
    final TextPosition previous = this.getPreviousTextPosition();
    final boolean previousIsBlank = previous.getUnicode().equals(" ");
    // the gap is only measured when the previous glyph is not a blank
    return !previousIsBlank
            && this.numberOfSpacesBetweenTwoCharacters(previous, textPosition) <= 1;
}

Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0

5 votes

/**
 * Measures the horizontal gap between two text positions.
 * <p>
 * The gap is the distance between the end of the first position (its X
 * value plus its width) and the X value of the second position, rounded to
 * the nearest whole number and taken as an absolute value.
 * </p>
 *
 * @param textPosition1 the earlier text position
 * @param textPosition2 the later text position
 * @return the absolute, rounded horizontal gap between the two positions
 */
private double numberOfSpacesBetweenTwoCharacters(final TextPosition textPosition1, final TextPosition textPosition2) {
    // widen before adding so the sum is computed in double precision
    final double firstEndX = (double) textPosition1.getX() + (double) textPosition1.getWidth();
    final long roundedGap = Math.round(textPosition2.getX() - firstEndX);
    return Math.abs(roundedGap);
}

Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0

5 votes

/**
 * Writes the given text positions as one new line of output.
 * <p>
 * An empty list only adds an empty line. Otherwise a new line is added and
 * every text position is converted into a character, via a
 * {@code CharacterFactory} that is told whether a character of this line
 * was already processed, and written to that line. The previous text
 * position is updated after each character.
 * </p>
 *
 * @param textPositionList the text positions of the line, possibly empty
 */
private void writeLine(final List<TextPosition> textPositionList) {
    if ( textPositionList.isEmpty() ) {
        this.addNewLine(); // white line
        return;
    }

    TextLine line = this.addNewLine();
    boolean lineStarted = false;
    for (TextPosition position : textPositionList ) {
        Character character = new CharacterFactory(lineStarted)
                .createCharacterFromTextPosition(position, this.getPreviousTextPosition());
        line.writeCharacterAtIndex(character);
        this.setPreviousTextPosition(position);
        lineStarted = true;
    }
}

Source File: TextMetrics.java From Pdf2Dom with GNU Lesser General Public License v3.0

5 votes

/**
 * Creates the metrics for a single text position.
 * <p>
 * Position, extent and font information are copied from the text position;
 * ascent and descent are then obtained from {@code getAscent()} and
 * {@code getDescent()}.
 * </p>
 *
 * @param tp the text position the metrics are initialized from
 */
public TextMetrics(TextPosition tp)
{
    // position and extent as reported by the text position
    this.x = tp.getX();
    this.baseline = tp.getY();
    this.width = tp.getWidth();
    this.height = tp.getHeight();

    // font information
    this.font = tp.getFont();
    this.pointSize = tp.getFontSizeInPt();
    this.fontSize = tp.getYScale();

    // looked up last, after all other fields have been assigned
    this.ascent = getAscent();
    this.descent = getDescent();
}

Source File: TextMetrics.java From Pdf2Dom with GNU Lesser General Public License v3.0

5 votes

/**
 * Extends these metrics so that they also cover the given text position.
 *
 * @param tp the text position to include
 */
public void append(TextPosition tp)
{
    // grows the width by the gap between the current end (x + width) and the
    // start of tp, plus the width of tp itself
    width += tp.getX() - (x + width) + tp.getWidth();
    // keep the largest height seen so far
    height = Math.max(height, tp.getHeight());
    // ascent keeps the maximum, descent the minimum of the values computed
    // for the font and Y scale of each appended position
    ascent = Math.max(ascent, getAscent(tp.getFont(), tp.getYScale()));
    descent = Math.min(descent, getDescent(tp.getFont(), tp.getYScale()));
}

Source File: VisualizeMarkedContent.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Prints the text content of the marked content with the given MCID and
 * computes the bounding box of that content, descending into nested marked
 * content.
 * <p>
 * Text positions contribute their Unicode text and, if they carry exactly
 * one character code, their glyph bounds; text positions with any other
 * number of codes are reported on standard output and do not contribute to
 * the box. Nested marked content is processed recursively. Any other object
 * is rendered as {@code ?} followed by its string form.
 * </p>
 *
 * @param mcid                the marked content ID to show
 * @param theseMarkedContents the marked contents by MCID, may be {@code null}
 * @return the union of all determined bounds, or {@code null} if none were determined
 * @throws IOException if calculating glyph bounds fails
 */
Rectangle2D showContent(int mcid, Map<Integer, PDMarkedContent> theseMarkedContents) throws IOException {
    // resolve the content list, falling back to an empty one
    List<Object> contents = Collections.emptyList();
    if (theseMarkedContents != null) {
        PDMarkedContent markedContent = theseMarkedContents.get(mcid);
        if (markedContent != null) {
            contents = markedContent.getContents();
        }
    }

    Rectangle2D bounds = null;
    StringBuilder text = new StringBuilder();
    for (Object item : contents) {
        if (item instanceof TextPosition) {
            TextPosition position = (TextPosition) item;
            text.append(position.getUnicode());

            int[] codes = position.getCharacterCodes();
            if (codes.length == 1) {
                Rectangle2D glyphBounds = calculateGlyphBounds(position.getTextMatrix(), position.getFont(), codes[0]).getBounds2D();
                bounds = union(bounds, glyphBounds);
            } else {
                System.out.printf("<!-- text position with unexpected number of codes: %d -->", codes.length);
            }
            continue;
        }
        if (item instanceof PDMarkedContent) {
            PDMarkedContent nested = (PDMarkedContent) item;
            bounds = union(bounds, showContent(nested.getMCID(), theseMarkedContents));
            continue;
        }
        text.append("?").append(item);
    }
    System.out.printf("%s\n", text);
    return bounds;
}

Source File: TextStripper.java From tabula-java with MIT License

5 votes

/**
 * Converts every printable text position into a {@code TextElement} and
 * records it.
 * <p>
 * {@code null} entries and text that is not printable are skipped. A
 * non-breaking space is replaced by a regular space. Each created element is
 * added to the spatial index and to the list of text elements, and the
 * minimum character width and height seen so far are updated.
 * </p>
 *
 * @param string        the text of the chunk (not used here)
 * @param textPositions the text positions making up the chunk
 * @throws IOException declared by the overridden method
 */
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException
{
    for (TextPosition position : textPositions)
    {
        if (position == null) {
            continue;
        }

        String rawText = position.getUnicode();

        // text that is not printable is skipped
        if (!isPrintable(rawText)) {
            continue;
        }

        float glyphHeight = position.getHeightDir();

        // replace non-breaking space for space
        String glyphText = rawText.equals(NBSP) ? " " : rawText;

        float widthOfSpace = position.getWidthOfSpace();

        TextElement element = new TextElement(
                Utils.round(position.getYDirAdj() - glyphHeight, 2),
                Utils.round(position.getXDirAdj(), 2),
                Utils.round(position.getWidthDirAdj(), 2),
                Utils.round(glyphHeight, 2),
                position.getFont(),
                position.getFontSize(),
                glyphText,
                // workaround a possible bug in PDFBox:
                // https://issues.apache.org/jira/browse/PDFBOX-1755
                widthOfSpace,
                position.getDir());

        this.minCharWidth = (float) Math.min(this.minCharWidth, element.getWidth());
        this.minCharHeight = (float) Math.min(this.minCharHeight, element.getHeight());

        this.spatialIndex.add(element);
        this.textElements.add(element);
    }
}

Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0

5 votes

/**
 * Tells whether the given text position starts a new word.
 *
 * @param textPosition the text position being examined
 * @return {@code true} if no character of the line has been found yet, if
 *         the gap to the previous text position is greater than 1, or if the
 *         position is at the beginning of a new line
 */
private boolean isFirstCharacterOfAWord(final TextPosition textPosition) {
    if ( firstCharacterOfLineFound ) {
        double gap = this.numberOfSpacesBetweenTwoCharacters(previousTextPosition, textPosition);
        if ( gap > 1 ) {
            return true;
        }
        return this.isCharacterAtTheBeginningOfNewLine(textPosition);
    }
    return true;
}

Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0

5 votes

/**
 * Writes the current page.
 * <p>
 * The text positions of every article are sorted and then processed line by
 * line; finally the collected text lines are written to the output stream.
 * An {@link IllegalArgumentException} thrown while sorting is printed to
 * standard error and the article is processed in its current order.
 * </p>
 *
 * @throws IOException if writing the page fails
 */
@Override
protected void writePage() throws IOException {
    for (List<TextPosition> article : super.getCharactersByArticle()) {
        try {
            this.sortTextPositionList(article);
        } catch ( java.lang.IllegalArgumentException sortFailure) {
            System.err.println(sortFailure);
        }
        this.iterateThroughTextList(article.iterator());
    }
    this.writeToOutputStream(this.getTextLineList());
}

Source File: ExtractMarkedContent.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Prints the text content of the marked content with the given MCID.
 * <p>
 * Text positions contribute their Unicode text; any other object is
 * rendered as {@code ?} followed by its string form. If the map is
 * {@code null} or contains no entry for the MCID, an empty line is printed.
 * </p>
 *
 * @param mcid                the marked content ID to show
 * @param theseMarkedContents the marked contents by MCID, may be {@code null}
 * @see #showStructure(PDStructureNode, Map)
 * @see #testExtractTestWPhromma()
 */
void showContent(int mcid, Map<Integer, PDMarkedContent> theseMarkedContents) {
    List<Object> contents = Collections.emptyList();
    if (theseMarkedContents != null) {
        PDMarkedContent markedContent = theseMarkedContents.get(mcid);
        if (markedContent != null) {
            contents = markedContent.getContents();
        }
    }

    StringBuilder text = new StringBuilder();
    for (Object item : contents) {
        if (!(item instanceof TextPosition)) {
            text.append("?").append(item);
            continue;
        }
        text.append(((TextPosition) item).getUnicode());
    }
    System.out.printf("%s\n", text);
}

Source File: SearchSubword.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Finds all occurrences of a search term on one page of a document.
 * <p>
 * The page is processed with position sorting enabled. Every chunk of text
 * passed to {@code writeString} is printed to standard output and scanned
 * for the search term; overlapping occurrences are reported as well because
 * the scan resumes one character after the start of each hit.
 * </p>
 * <p>
 * An empty search term never produces a hit. Without that guard the scan
 * would not terminate, because {@link String#indexOf(String, int)} keeps
 * returning an index no greater than the string length for an empty pattern.
 * </p>
 *
 * @param document   the document to search
 * @param page       the 1-based number of the page to search
 * @param searchTerm the text to look for
 * @return the hits on the page, in the order they were found; never {@code null}
 * @throws IOException if extracting the text fails
 */
List<TextPositionSequence> findSubwords(PDDocument document, int page, String searchTerm) throws IOException
{
    final List<TextPositionSequence> hits = new ArrayList<TextPositionSequence>();
    PDFTextStripper stripper = new PDFTextStripper()
    {
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException
        {
            System.out.printf("  -- %s\n", text);

            TextPositionSequence word = new TextPositionSequence(textPositions);
            String string = word.toString();

            // an empty term matches at every index and would loop forever
            if (!searchTerm.isEmpty())
            {
                int fromIndex = 0;
                int index;
                while ((index = string.indexOf(searchTerm, fromIndex)) > -1)
                {
                    hits.add(word.subSequence(index, index + searchTerm.length()));
                    // advance by one only, so overlapping occurrences are found too
                    fromIndex = index + 1;
                }
            }
            super.writeString(text, textPositions);
        }
    };

    stripper.setSortByPosition(true);
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    stripper.getText(document);
    return hits;
}

Source File: SearchSubword.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Searches every page of the document for the search term and prints each
 * hit to standard output.
 * <p>
 * For every hit the page, position and width are printed together with the
 * text and adjusted position of its last text position. A hit whose text
 * does not equal the search term is additionally flagged as invalid.
 * </p>
 *
 * @param document   the document to search
 * @param searchTerm the text to look for
 * @throws IOException if extracting the text fails
 */
void printSubwords(PDDocument document, String searchTerm) throws IOException
{
    System.out.printf("* Looking for '%s'\n", searchTerm);
    int page = 1;
    while (page <= document.getNumberOfPages())
    {
        for (TextPositionSequence hit : findSubwords(document, page, searchTerm))
        {
            if (!searchTerm.equals(hit.toString()))
            {
                System.out.printf("  Invalid (%s) ", hit.toString());
            }
            int lastIndex = hit.length() - 1;
            TextPosition lastPosition = hit.textPositionAt(lastIndex);
            System.out.printf("  Page %s at %s, %s with width %s and last letter '%s' at %s, %s\n",
                    page, hit.getX(), hit.getY(), hit.getWidth(),
                    lastPosition.getUnicode(), lastPosition.getXDirAdj(), lastPosition.getYDirAdj());
        }
        page++;
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * <a href="https://stackoverflow.com/questions/51672080/pdfdomtree-does-not-detecting-white-spaces-while-converting-a-pdf-file-to-html">
 * PDFDomTree does not detecting white spaces while converting a pdf file to html
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/1SZNFCvGVbQzCxJiRr8HlW99ravC_Cm71/view?usp=sharing">
 * demo.pdf
 * </a>
 * <p>
 * This improved version does not ignore white space glyphs but
 * instead translates them into gaps. This is a work-around and
 * not a fix, different kinds of white spaces need to be handled
 * differently.
 * </p>
 * <p>
 * The resource stream, the loaded document and the output writer are all
 * closed when the test finishes, whether it succeeds or fails.
 * </p>
 *
 * @throws IOException                  if the PDF cannot be loaded or the result cannot be written
 * @throws ParserConfigurationException if the DOM tree parser cannot be created
 * @see #testDemo()
 */
@Test
public void testDemoImproved() throws IOException, ParserConfigurationException
{
    System.out.printf("\n*\n* demo.pdf improved\n*\n");
    // the document is a resource of its own and must be closed as well as the stream
    try (   InputStream resource = getClass().getResourceAsStream("/mkl/testarea/pdfbox2/extract/demo.pdf");
            PDDocument document = Loader.loadPDF(resource)    ) {

        PDFDomTree parser = new PDFDomTree(PDFDomTreeConfig.createDefaultConfig()) {
            @Override
            protected void processTextPosition(TextPosition text) {
                if (text.getUnicode().trim().isEmpty()) {
                    //finish current box (if any)
                    if (lastText != null)
                    {
                        finishBox();
                    }
                    //start a new box
                    curstyle = new BoxStyle(style);
                    lastText = null;
                } else {
                    super.processTextPosition(text);
                }
            }
        };

        // close the writer even if writing the text fails
        try (   Writer output = new PrintWriter(new File(RESULT_FOLDER, "demo-improved.html"), "utf-8")    ) {
            parser.writeText(document, output);
        }
    }
}

Source File: PDFVisibleTextStripper.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Removes from every article the text positions that lie inside the
 * current {@code linePath}.
 * <p>
 * A text position is removed if its start point, offset by the lower left
 * corner, is contained in the path, or, when {@code checkEndPointToo} is
 * set, if its end point is. The end point is the start point shifted to the
 * right by the width of the text position. The number of removed positions
 * is printed to standard output whenever at least one is removed.
 * </p>
 */
void deleteCharsInPath() {
    for (List<TextPosition> article : charactersByArticle) {
        List<TextPosition> covered = new ArrayList<>();
        for (TextPosition text : article) {
            Vector start = text.getTextMatrix().transform(new Vector(0, 0));
            Vector end = new Vector(start.getX() + text.getWidth(), start.getY());

            boolean startCovered = linePath.contains(lowerLeftX + start.getX(), lowerLeftY + start.getY());
            // the end point is only tested when the start point is not covered
            if (startCovered ||
                    (checkEndPointToo && linePath.contains(lowerLeftX + end.getX(), lowerLeftY + end.getY()))) {
                covered.add(text);
            }
        }
        if (!covered.isEmpty()) {
            System.out.println(covered.size());
            article.removeAll(covered);
        }
    }
}

Source File: PDFVisibleTextStripper.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Forwards a text position to the superclass only if it lies inside the
 * current clipping path.
 * <p>
 * Without a clipping path every text position is forwarded. Otherwise the
 * start point, offset by the lower left corner, must be inside the clipping
 * area, and, when {@code checkEndPointToo} is set, the end point as well.
 * A text position that is not forwarded is reported to {@code dropStream}
 * if one is set.
 * </p>
 *
 * @param text the text position to process
 */
@Override
protected void processTextPosition(TextPosition text) {
    Vector start = text.getTextMatrix().transform(new Vector(0, 0));
    Vector end = new Vector(start.getX() + text.getWidth(), start.getY());
    Area clip = getGraphicsState().getCurrentClippingPath();

    boolean visible = clip == null
            || (contains(clip, lowerLeftX + start.getX(), lowerLeftY + start.getY())
                    && (!checkEndPointToo || contains(clip, lowerLeftX + end.getX(), lowerLeftY + end.getY())));

    if (visible) {
        super.processTextPosition(text);
    } else if (dropStream != null) {
        dropStream.printf("Clipped '%s' at %s,%s\n", text.getUnicode(), lowerLeftX + start.getX(), lowerLeftY + start.getY());
    }
}

Source File: TextSection.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Renders this section as text.
 * <p>
 * The first line consists of the definition name, a colon and a blank,
 * followed by the header text if the header is not empty. Every body line
 * follows on a line of its own, indented by four blanks.
 * </p>
 *
 * @return the textual representation of this section
 */
@Override
public String toString()
{
    StringBuilder text = new StringBuilder();
    text.append(definition.name);
    text.append(": ");
    text.append(header.isEmpty() ? "" : toString(header));
    text.append('\n');
    for (List<List<TextPosition>> bodyLine : body)
    {
        text.append("    ");
        text.append(toString(bodyLine));
        text.append('\n');
    }
    return text.toString();
}

Source File: PDFLayoutTextStripper.java From quarkus-pdf-extract with Apache License 2.0

5 votes

/**
 * Writes the current page article by article.
 * <p>
 * Each article's text positions are sorted first; if sorting throws an
 * {@link IllegalArgumentException}, the exception is printed to standard
 * error and processing continues. The positions are then split into lines,
 * and once all articles are handled the resulting text lines are written to
 * the output stream.
 * </p>
 *
 * @throws IOException if writing the page fails
 */
@Override
protected void writePage() throws IOException {
    final List<List<TextPosition>> articles = super.getCharactersByArticle();
    for (final List<TextPosition> articleText : articles) {
        try {
            this.sortTextPositionList(articleText);
        } catch ( java.lang.IllegalArgumentException e) {
            System.err.println(e);
        }
        this.iterateThroughTextList(articleText.iterator());
    }
    this.writeToOutputStream(this.getTextLineList());
}

Source File: ColorTextStripper.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Writes every text position followed by a description of how it is painted.
 * <p>
 * The description is enclosed in braces and contains, depending on the
 * rendering mode recorded for the text position, a {@code FILL:} entry with
 * the non-stroking color, a {@code STROKE:} entry with the stroking color,
 * and a {@code CLIP;} marker.
 * </p>
 *
 * @param text          the text of the chunk (not used here)
 * @param textPositions the text positions making up the chunk
 * @throws IOException if writing fails
 */
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException
{
    for (TextPosition position : textPositions)
    {
        // values recorded for this text position
        RenderingMode mode = renderingMode.get(position);
        float[] stroke = strokingColor.get(position);
        float[] fill = nonStrokingColor.get(position);

        StringBuilder annotated = new StringBuilder();
        annotated.append(position.getUnicode());
        annotated.append("{");

        if (FILLING_MODES.contains(mode))
        {
            annotated.append("FILL:").append(toString(fill)).append(';');
        }

        if (STROKING_MODES.contains(mode))
        {
            annotated.append("STROKE:").append(toString(stroke)).append(';');
        }

        if (CLIPPING_MODES.contains(mode))
        {
            annotated.append("CLIP;");
        }

        annotated.append("}");
        writeString(annotated.toString());
    }
}

Source File: ColorTextStripper.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Records the rendering mode and the stroking and non-stroking color
 * components of the current graphics state for the given text position,
 * then delegates to the superclass.
 *
 * @param text the text position being processed
 */
@Override
protected void processTextPosition(TextPosition text)
{
    // snapshot the graphics state values, keyed by the text position
    renderingMode.put(text, getGraphicsState().getTextState().getRenderingMode());
    strokingColor.put(text, getGraphicsState().getStrokingColor().getComponents());
    nonStrokingColor.put(text, getGraphicsState().getNonStrokingColor().getComponents());

    super.processTextPosition(text);
}

Source File: PdfToTextInfoConverter.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Logs every text position whose character color coincides with its
 * background color.
 * <p>
 * Two colors coincide if both are {@code null} or if they are equal.
 * Nothing is written to the output.
 * </p>
 *
 * @param string        the text of the chunk (not used here)
 * @param textPositions the text positions making up the chunk
 * @throws IOException declared by the overridden method
 */
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
    for (TextPosition text : textPositions) {
        Integer characterColor = getCharacterColor(text);
        Integer characterBackgroundColor = getCharacterBackgroundColor(text);

        // null-safe equality: two nulls coincide, otherwise compare by value
        boolean coincide = characterColor == null
                ? characterBackgroundColor == null
                : characterColor.equals(characterBackgroundColor);
        if (!coincide) {
            continue;
        }
        logger.info(String.format("Color and background coincide for '%s' at %3.2f, %3.2f : %h", text.getUnicode(), text.getX(), text.getY(), characterColor));
    }
}

Source File: PdfToTextInfoConverter.java From testarea-pdfbox2 with Apache License 2.0

5 votes

/**
 * Forwards a text position to the superclass only if it is not fully
 * transparent and its center lies inside the clipping path.
 * <p>
 * The text position is dropped if both the stroking and the non-stroking
 * alpha constants are below {@code Constants.EPSILON}, or if a clipping
 * path exists that does not contain the center point offset by the lower
 * left corner. For a forwarded text position the current non-stroking color
 * is recorded first.
 * </p>
 *
 * @param text the text position to process
 */
@Override
protected void processTextPosition(TextPosition text) {
    PDGraphicsState gs = getGraphicsState();

    // check opacity for stroke and fill text
    boolean fullyTransparent = gs.getAlphaConstant() < Constants.EPSILON
            && gs.getNonStrokeAlphaConstant() < Constants.EPSILON;
    if (fullyTransparent) {
        return;
    }

    Vector center = getTextPositionCenterPoint(text);
    Area clip = gs.getCurrentClippingPath();
    boolean insideClip = clip == null
            || clip.contains(lowerLeftX + center.getX(), lowerLeftY + center.getY());
    if (!insideClip) {
        return;
    }

    nonStrokingColors.put(text, gs.getNonStrokingColor());
    super.processTextPosition(text);
}

org.apache.pdfbox.text.TextPosition Java Examples