org.apache.pdfbox.text.TextPosition Java Examples
The following examples show how to use
org.apache.pdfbox.text.TextPosition.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TextSection.java From testarea-pdfbox2 with Apache License 2.0 | 6 votes |
String toString(List<List<TextPosition>> words) { StringBuilder stringBuilder = new StringBuilder(); boolean first = true; for (List<TextPosition> word : words) { if (first) first = false; else stringBuilder.append(' '); for (TextPosition textPosition : word) { stringBuilder.append(textPosition.getUnicode()); } } // cf. http://stackoverflow.com/a/7171932/1729265 return Normalizer.normalize(stringBuilder, Form.NFKC); }
Example #2
Source File: ExtractCharacterCodes.java From testarea-pdfbox2 with Apache License 2.0 | 6 votes |
/**
 * <a href="https://stackoverflow.com/questions/50664162/some-glyph-ids-missing-while-trying-to-extract-glyph-id-from-pdf">
 * Some glyph ID's missing while trying to extract glyph ID from pdf
 * </a>
 * <br/>
 * <a href="http://1drv.ms/b/s!AmHcFaD-gMGyhkHr4PY6F4krYJ32">
 * pattern3.pdf
 * </a>
 * <p>
 * This test shows how to access the character codes of the extracted text
 * while preventing the {@link PDFTextStripper} from doing any preprocessing
 * steps, in particular from doing any diacritics merges.
 * </p>
 * <p>
 * Both the resource stream and the loaded document are managed by the
 * try-with-resources statement, so the document is closed even when the
 * extraction or the writing of the result file fails.
 * </p>
 *
 * @throws IOException if the document cannot be loaded, stripped, or the
 *                     result file cannot be written
 */
@Test
public void testExtractFromPattern3() throws IOException {
    try (   InputStream resource = getClass().getResourceAsStream("pattern3.pdf");
            PDDocument document = Loader.loadPDF(resource)  ) {
        PDFTextStripper stripper = new PDFTextStripper() {
            // Writes every text position directly (text followed by its character
            // codes) instead of handing it to the default processing of the superclass.
            @Override
            protected void processTextPosition(TextPosition textPosition) {
                try {
                    writeString(String.format("%s%s", textPosition.getUnicode(),
                            Arrays.toString(textPosition.getCharacterCodes())));
                } catch (IOException e) {
                    // The overridden method does not declare IOException, so the
                    // failure is only reported and extraction continues.
                    e.printStackTrace();
                }
            }
        };

        String text = stripper.getText(document);

        System.out.printf("\n*\n* pattern3.pdf\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "pattern3.txt").toPath(), Collections.singleton(text));
    }
}
Example #3
Source File: ExtractWordCoordinates.java From testarea-pdfbox2 with Apache License 2.0 | 6 votes |
/**
 * Splits the given text positions into words and passes each completed
 * word to {@code printWord}.
 *
 * <p>A text position whose Unicode text equals the word separator ends the
 * current word; positions with a {@code null} or empty text are skipped.
 * A word still pending after the last position is printed as well.</p>
 *
 * @param string        the text of the chunk (not used here)
 * @param textPositions the text positions making up the chunk
 * @throws IOException declared by the overridden method
 */
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
    final String separator = getWordSeparator();
    List<TextPosition> currentWord = new ArrayList<>();

    for (TextPosition position : textPositions) {
        String glyphText = position.getUnicode();
        if (glyphText == null || glyphText.length() < 1) {
            continue;
        }
        if (glyphText.equals(separator)) {
            // A separator closes the word collected so far, if there is one.
            if (!currentWord.isEmpty()) {
                printWord(currentWord);
                currentWord.clear();
            }
        } else {
            currentWord.add(position);
        }
    }

    // Flush the trailing word that was not followed by a separator.
    if (!currentWord.isEmpty()) {
        printWord(currentWord);
        currentWord.clear();
    }
}
Example #4
Source File: RectanglesOverText.java From testarea-pdfbox2 with Apache License 2.0 | 6 votes |
/**
 * Records the written text in the list of lines and then delegates to the
 * superclass.
 *
 * <p>When {@code startOfLine} is set, a new {@code TextLine} holding the
 * given text and text position list is appended to {@code lines} and the
 * flag is cleared. Otherwise the text and the text positions are appended
 * to the most recently added line. Note that the stored list is the very
 * list passed in, so later calls append to that list object.</p>
 *
 * @param text          the text being written
 * @param textPositions the text positions the text is made of
 * @throws IOException if the superclass fails to write the text
 */
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
    if (startOfLine) {
        TextLine freshLine = new TextLine();
        freshLine.text = text;
        freshLine.textPositions = textPositions;
        lines.add(freshLine);
        startOfLine = false;
    } else {
        TextLine currentLine = lines.get(lines.size() - 1);
        currentLine.text += text;
        currentLine.textPositions.addAll(textPositions);
    }
    super.writeString(text, textPositions);
}
Example #5
Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0 | 6 votes |
/**
 * Walks over the text positions and writes them out in groups.
 *
 * <p>Whenever a text position is reported to start after one or more line
 * breaks, the positions collected so far are written and the corresponding
 * number of empty lines is created before the position is collected.
 * Every visited position becomes the new "previous" text position.
 * Positions still collected at the end are written as well.</p>
 *
 * @param textIterator iterator over the text positions to process
 */
private void iterateThroughTextList(Iterator<TextPosition> textIterator) {
    List<TextPosition> pending = new ArrayList<TextPosition>();
    while (textIterator.hasNext()) {
        TextPosition current = textIterator.next();
        int lineBreaks = this.getNumberOfNewLinesFromPreviousTextPosition(current);
        if (lineBreaks != 0) {
            // NOTE(review): presumably writeTextPositionList empties the list - confirm.
            this.writeTextPositionList(pending);
            this.createNewEmptyNewLines(lineBreaks);
        }
        pending.add(current);
        this.setPreviousTextPosition(current);
    }
    if (!pending.isEmpty()) {
        this.writeTextPositionList(pending);
    }
}
Example #6
Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0 | 6 votes |
private int getNumberOfNewLinesFromPreviousTextPosition(final TextPosition textPosition) { TextPosition previousTextPosition = this.getPreviousTextPosition(); if ( previousTextPosition == null ) { return 1; } float textYPosition = Math.round( textPosition.getY() ); float previousTextYPosition = Math.round( previousTextPosition.getY() ); if ( textYPosition > previousTextYPosition && (textYPosition - previousTextYPosition > 5.5) ) { double height = textPosition.getHeight(); int numberOfLines = (int) (Math.floor( textYPosition - previousTextYPosition) / height ); numberOfLines = Math.max(1, numberOfLines - 1); // exclude current new line if (DEBUG) System.out.println(height + " " + numberOfLines); return numberOfLines ; } else { return 0; } }
Example #7
Source File: PDFLayoutTextStripper.java From quarkus-pdf-extract with Apache License 2.0 | 6 votes |
private int getNumberOfNewLinesFromPreviousTextPosition(final TextPosition textPosition) { TextPosition previousTextPosition = this.getPreviousTextPosition(); if ( previousTextPosition == null ) { return 1; } float textYPosition = Math.round( textPosition.getY() ); float previousTextYPosition = Math.round( previousTextPosition.getY() ); if ( textYPosition > previousTextYPosition && (textYPosition - previousTextYPosition > 5.5) ) { double height = textPosition.getHeight(); int numberOfLines = (int) (Math.floor( textYPosition - previousTextYPosition) / height ); numberOfLines = Math.max(1, numberOfLines - 1); // exclude current new line if (DEBUG) System.out.println(height + " " + numberOfLines); return numberOfLines ; } else { return 0; } }
Example #8
Source File: PDFLayoutTextStripper.java From quarkus-pdf-extract with Apache License 2.0 | 6 votes |
/**
 * Processes all text positions supplied by the iterator, writing them out
 * group by group.
 *
 * <p>For each text position the number of line breaks since the previous
 * position is determined. If there is at least one, the positions gathered
 * so far are written and that many empty lines are created. The position
 * is then gathered and remembered as the previous text position. Whatever
 * remains gathered after the last position is written at the end.</p>
 *
 * @param textIterator iterator over the text positions to process
 */
private void iterateThroughTextList(Iterator<TextPosition> textIterator) {
    final List<TextPosition> gathered = new ArrayList<TextPosition>();

    for (; textIterator.hasNext(); ) {
        final TextPosition position = textIterator.next();
        final int newLines = this.getNumberOfNewLinesFromPreviousTextPosition(position);
        final boolean startsNewLine = newLines != 0;
        if (startsNewLine) {
            // NOTE(review): the gathered list is reused afterwards; presumably
            // writeTextPositionList clears it - confirm.
            this.writeTextPositionList(gathered);
            this.createNewEmptyNewLines(newLines);
        }
        gathered.add(position);
        this.setPreviousTextPosition(position);
    }

    if (!gathered.isEmpty()) {
        this.writeTextPositionList(gathered);
    }
}
Example #9
Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0 | 5 votes |
/**
 * Tells whether the given text position follows the previous one at a
 * small but noticeable distance.
 *
 * @param textPosition the text position to check
 * @return {@code false} while no first character of the line has been
 *         found; otherwise {@code true} if the distance to the previous
 *         text position is greater than 1 and at most
 *         {@code OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT}
 */
private boolean isCharacterCloseToPreviousWord(final TextPosition textPosition) {
    if (firstCharacterOfLineFound) {
        double gap = this.numberOfSpacesBetweenTwoCharacters(previousTextPosition, textPosition);
        return gap > 1 && gap <= PDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT;
    }
    return false;
}
Example #10
Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0 | 5 votes |
/**
 * Tells whether the given text position continues the word the previous
 * text position belongs to.
 *
 * @param textPosition the text position to check
 * @return {@code false} if the previous text position is a single space;
 *         otherwise {@code true} if the distance between the previous and
 *         the given text position is at most 1
 */
private boolean isCharacterPartOfPreviousWord(final TextPosition textPosition) {
    final TextPosition previous = this.getPreviousTextPosition();
    final boolean previousIsSpace = previous.getUnicode().equals(" ");
    // The distance is only computed when the previous character is not a space.
    return !previousIsSpace
            && this.numberOfSpacesBetweenTwoCharacters(previous, textPosition) <= 1;
}
Example #11
Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0 | 5 votes |
/**
 * Computes the rounded, absolute horizontal distance between the end of
 * the first text position and the start of the second one.
 *
 * @param textPosition1 the earlier text position; its x plus its width is
 *                      taken as its end
 * @param textPosition2 the later text position; its x is taken as its start
 * @return the absolute value of the distance rounded to a whole number
 */
private double numberOfSpacesBetweenTwoCharacters(final TextPosition textPosition1, final TextPosition textPosition2) {
    // Widen to double before adding so the sum is computed in double precision.
    final double firstStart = textPosition1.getX();
    final double firstWidth = textPosition1.getWidth();
    final double firstEnd = firstStart + firstWidth;

    final long roundedGap = Math.round(textPosition2.getX() - firstEnd);
    return Math.abs(roundedGap);
}
Example #12
Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0 | 5 votes |
private void writeLine(final List<TextPosition> textPositionList) { if ( textPositionList.size() > 0 ) { TextLine textLine = this.addNewLine(); boolean firstCharacterOfLineFound = false; for (TextPosition textPosition : textPositionList ) { CharacterFactory characterFactory = new CharacterFactory(firstCharacterOfLineFound); Character character = characterFactory.createCharacterFromTextPosition(textPosition, this.getPreviousTextPosition()); textLine.writeCharacterAtIndex(character); this.setPreviousTextPosition(textPosition); firstCharacterOfLineFound = true; } } else { this.addNewLine(); // white line } }
Example #13
Source File: TextMetrics.java From Pdf2Dom with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Creates the metrics of a single text position.
 *
 * <p>Position, size and font values are taken from the text position;
 * ascent and descent are obtained afterwards from {@code getAscent()} and
 * {@code getDescent()}.</p>
 *
 * @param tp the text position to take the values from
 */
public TextMetrics(TextPosition tp) {
    // Placement and extent of the text.
    this.x = tp.getX();
    this.baseline = tp.getY();
    this.width = tp.getWidth();
    this.height = tp.getHeight();

    // Font and size information.
    this.font = tp.getFont();
    this.pointSize = tp.getFontSizeInPt();
    this.fontSize = tp.getYScale();

    // Computed last, after the font and size fields have been assigned.
    this.ascent = getAscent();
    this.descent = getDescent();
}
Example #14
Source File: TextMetrics.java From Pdf2Dom with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Extends these metrics so that they also cover the given text position.
 *
 * <p>The width grows up to the end of the given text position, the height
 * and the ascent become the maximum of the current and the new value, and
 * the descent becomes the minimum of the current and the new value.</p>
 *
 * @param tp the text position to include
 */
public void append(TextPosition tp) {
    // Distance from the current end (x + width) to the start of tp, plus the width of tp.
    width += tp.getX() - (x + width) + tp.getWidth();
    height = Math.max(height, tp.getHeight());
    ascent = Math.max(ascent, getAscent(tp.getFont(), tp.getYScale()));
    descent = Math.min(descent, getDescent(tp.getFont(), tp.getYScale()));
}
Example #15
Source File: VisualizeMarkedContent.java From testarea-pdfbox2 with Apache License 2.0 | 5 votes |
/**
 * This method shows the text content for a MCID and determines its
 * bounding box. It also recurses.
 *
 * <p>Text positions contribute their Unicode text and, if they carry
 * exactly one character code, the bounds of the corresponding glyph.
 * Nested marked contents are processed recursively and contribute their
 * box. Any other content object is shown as {@code ?} followed by its
 * string form.</p>
 *
 * @param mcid                the marked content id to show
 * @param theseMarkedContents the marked contents by id, may be {@code null}
 * @return the union of all determined boxes, or {@code null} if none was determined
 * @throws IOException if determining glyph bounds fails
 */
Rectangle2D showContent(int mcid, Map<Integer, PDMarkedContent> theseMarkedContents) throws IOException {
    PDMarkedContent markedContent = null;
    if (theseMarkedContents != null) {
        markedContent = theseMarkedContents.get(mcid);
    }

    List<Object> contents;
    if (markedContent != null) {
        contents = markedContent.getContents();
    } else {
        contents = Collections.emptyList();
    }

    Rectangle2D box = null;
    StringBuilder textContent = new StringBuilder();
    for (Object object : contents) {
        if (object instanceof TextPosition) {
            TextPosition textPosition = (TextPosition) object;
            textContent.append(textPosition.getUnicode());

            int[] codes = textPosition.getCharacterCodes();
            if (codes.length == 1) {
                Rectangle2D glyphBox = calculateGlyphBounds(textPosition.getTextMatrix(), textPosition.getFont(), codes[0]).getBounds2D();
                box = union(box, glyphBox);
            } else {
                System.out.printf("<!-- text position with unexpected number of codes: %d -->", codes.length);
            }
        } else if (object instanceof PDMarkedContent) {
            PDMarkedContent nested = (PDMarkedContent) object;
            box = union(box, showContent(nested.getMCID(), theseMarkedContents));
        } else {
            textContent.append("?").append(object);
        }
    }

    System.out.printf("%s\n", textContent);
    return box;
}
Example #16
Source File: TextStripper.java From tabula-java with MIT License | 5 votes |
@Override protected void writeString(String string, List<TextPosition> textPositions) throws IOException { for (TextPosition textPosition: textPositions) { if (textPosition == null) { continue; } String c = textPosition.getUnicode(); // if c not printable, return if (!isPrintable(c)) { continue; } Float h = textPosition.getHeightDir(); if (c.equals(NBSP)) { // replace non-breaking space for space c = " "; } float wos = textPosition.getWidthOfSpace(); TextElement te = new TextElement(Utils.round(textPosition.getYDirAdj() - h, 2), Utils.round(textPosition.getXDirAdj(), 2), Utils.round(textPosition.getWidthDirAdj(), 2), Utils.round(textPosition.getHeightDir(), 2), textPosition.getFont(), textPosition.getFontSize(), c, // workaround a possible bug in PDFBox: // https://issues.apache.org/jira/browse/PDFBOX-1755 wos, textPosition.getDir()); this.minCharWidth = (float) Math.min(this.minCharWidth, te.getWidth()); this.minCharHeight = (float) Math.min(this.minCharHeight, te.getHeight()); this.spatialIndex.add(te); this.textElements.add(te); } }
Example #17
Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0 | 5 votes |
/**
 * Tells whether the given text position starts a new word.
 *
 * @param textPosition the text position to check
 * @return {@code true} while no first character of the line has been
 *         found; otherwise {@code true} if the distance to the previous
 *         text position is greater than 1 or the text position is at the
 *         beginning of a new line
 */
private boolean isFirstCharacterOfAWord(final TextPosition textPosition) {
    if (firstCharacterOfLineFound) {
        double gap = this.numberOfSpacesBetweenTwoCharacters(previousTextPosition, textPosition);
        return gap > 1 || this.isCharacterAtTheBeginningOfNewLine(textPosition);
    }
    return true;
}
Example #18
Source File: PDFLayoutTextStripper.java From PDFLayoutTextStripper with Apache License 2.0 | 5 votes |
/**
 * Writes the current page: the text positions of every article are sorted
 * and processed, then the collected text lines are written to the output.
 *
 * <p>An {@link IllegalArgumentException} thrown while sorting is only
 * reported on {@code System.err}; the list is processed afterwards
 * regardless.</p>
 *
 * @throws IOException if writing the output fails
 */
@Override
protected void writePage() throws IOException {
    List<List<TextPosition>> articles = super.getCharactersByArticle();
    for (List<TextPosition> articleText : articles) {
        try {
            this.sortTextPositionList(articleText);
        } catch (java.lang.IllegalArgumentException e) {
            System.err.println(e);
        }
        this.iterateThroughTextList(articleText.iterator());
    }
    this.writeToOutputStream(this.getTextLineList());
}
Example #19
Source File: ExtractMarkedContent.java From testarea-pdfbox2 with Apache License 2.0 | 5 votes |
/**
 * Prints the text content of the marked content with the given MCID.
 *
 * <p>Text positions contribute their Unicode text; every other content
 * object is shown as {@code ?} followed by its string form. If the map is
 * {@code null} or has no entry for the id, an empty line is printed.</p>
 *
 * @param mcid                the marked content id to show
 * @param theseMarkedContents the marked contents by id, may be {@code null}
 * @see #showStructure(PDStructureNode, Map)
 * @see #testExtractTestWPhromma()
 */
void showContent(int mcid, Map<Integer, PDMarkedContent> theseMarkedContents) {
    PDMarkedContent markedContent = null;
    if (theseMarkedContents != null) {
        markedContent = theseMarkedContents.get(mcid);
    }

    List<Object> contents;
    if (markedContent == null) {
        contents = Collections.emptyList();
    } else {
        contents = markedContent.getContents();
    }

    StringBuilder textContent = new StringBuilder();
    for (Object object : contents) {
        if (object instanceof TextPosition) {
            TextPosition textPosition = (TextPosition) object;
            textContent.append(textPosition.getUnicode());
        } else {
            textContent.append("?").append(object);
        }
    }
    System.out.printf("%s\n", textContent);
}
Example #20
Source File: SearchSubword.java From testarea-pdfbox2 with Apache License 2.0 | 5 votes |
/**
 * Finds all occurrences of a search term on one page of a document.
 *
 * <p>The page is run through a {@link PDFTextStripper} sorting by position.
 * Every chunk of text the stripper writes is printed to standard output and
 * searched for the term; each occurrence (overlapping ones included) is
 * recorded as the matching sub-sequence of text positions.</p>
 *
 * <p>An empty search term yields no hits. Searching for it would never
 * terminate, because {@code String.indexOf("", fromIndex)} never returns
 * {@code -1}.</p>
 *
 * @param document   the document to search
 * @param page       the number of the page to search, passed as both start and end page
 * @param searchTerm the text to look for
 * @return the occurrences found, in the order they were encountered
 * @throws IOException if text extraction fails
 */
List<TextPositionSequence> findSubwords(PDDocument document, int page, String searchTerm) throws IOException {
    final List<TextPositionSequence> hits = new ArrayList<TextPositionSequence>();
    PDFTextStripper stripper = new PDFTextStripper() {
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
            System.out.printf(" -- %s\n", text);

            // Skip the search for an empty term: it matches at every index and
            // indexOf never reports "not found" for it.
            if (!searchTerm.isEmpty()) {
                TextPositionSequence word = new TextPositionSequence(textPositions);
                String string = word.toString();

                // Continue one character after each hit so overlapping matches are found too.
                for (int index = string.indexOf(searchTerm);
                        index > -1;
                        index = string.indexOf(searchTerm, index + 1)) {
                    hits.add(word.subSequence(index, index + searchTerm.length()));
                }
            }
            super.writeString(text, textPositions);
        }
    };

    stripper.setSortByPosition(true);
    stripper.setStartPage(page);
    stripper.setEndPage(page);
    // The returned text is not needed; the hits are collected as a side effect.
    stripper.getText(document);
    return hits;
}
Example #21
Source File: SearchSubword.java From testarea-pdfbox2 with Apache License 2.0 | 5 votes |
/**
 * Searches every page of the document for the search term and prints each
 * hit.
 *
 * <p>For every hit the page, the position and width of the hit, and the
 * text and position of its last text position are printed. A hit whose
 * text differs from the search term is additionally flagged as invalid.</p>
 *
 * @param document   the document to search
 * @param searchTerm the text to look for
 * @throws IOException if searching a page fails
 */
void printSubwords(PDDocument document, String searchTerm) throws IOException {
    System.out.printf("* Looking for '%s'\n", searchTerm);

    int page = 1;
    while (page <= document.getNumberOfPages()) {
        for (TextPositionSequence hit : findSubwords(document, page, searchTerm)) {
            String hitText = hit.toString();
            if (!searchTerm.equals(hitText)) {
                System.out.printf(" Invalid (%s) ", hitText);
            }
            TextPosition lastPosition = hit.textPositionAt(hit.length() - 1);
            System.out.printf(" Page %s at %s, %s with width %s and last letter '%s' at %s, %s\n",
                    page, hit.getX(), hit.getY(), hit.getWidth(),
                    lastPosition.getUnicode(), lastPosition.getXDirAdj(), lastPosition.getYDirAdj());
        }
        page++;
    }
}
Example #22
Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0 | 5 votes |
/** * <a href="https://stackoverflow.com/questions/51672080/pdfdomtree-does-not-detecting-white-spaces-while-converting-a-pdf-file-to-html"> * PDFDomTree does not detecting white spaces while converting a pdf file to html * </a> * <br/> * <a href="https://drive.google.com/file/d/1SZNFCvGVbQzCxJiRr8HlW99ravC_Cm71/view?usp=sharing"> * demo.pdf * </a> * <p> * This improved version does not ignore white space glyphs but * instead translates them into gaps. This is a work-around and * not a fix, different kinds of white spaces need to be handled * differently. * </p> * @see #testDemo() */ @Test public void testDemoImproved() throws IOException, ParserConfigurationException { System.out.printf("\n*\n* demo.pdf improved\n*\n"); try ( InputStream resource = getClass().getResourceAsStream("/mkl/testarea/pdfbox2/extract/demo.pdf") ) { PDDocument document = Loader.loadPDF(resource); PDFDomTree parser = new PDFDomTree(PDFDomTreeConfig.createDefaultConfig()) { @Override protected void processTextPosition(TextPosition text) { if (text.getUnicode().trim().isEmpty()) { //finish current box (if any) if (lastText != null) { finishBox(); } //start a new box curstyle = new BoxStyle(style); lastText = null; } else { super.processTextPosition(text); } } }; Writer output = new PrintWriter(new File(RESULT_FOLDER, "demo-improved.html"), "utf-8"); parser.writeText(document, output); output.close(); } }
Example #23
Source File: PDFVisibleTextStripper.java From testarea-pdfbox2 with Apache License 2.0 | 5 votes |
/**
 * Removes all collected text positions that lie inside {@code linePath}.
 *
 * <p>A text position is removed if its start point - the origin
 * transformed by its text matrix, offset by {@code lowerLeftX} and
 * {@code lowerLeftY} - is contained in the path, or, when
 * {@code checkEndPointToo} is set, if its end point (start shifted right
 * by the text width) is. For every article list with removals, the number
 * of removed positions is printed.</p>
 */
void deleteCharsInPath() {
    for (List<TextPosition> list : charactersByArticle) {
        List<TextPosition> toRemove = new ArrayList<>();

        for (TextPosition text : list) {
            Vector start = text.getTextMatrix().transform(new Vector(0, 0));

            boolean inPath = linePath.contains(lowerLeftX + start.getX(), lowerLeftY + start.getY());
            if (!inPath && checkEndPointToo) {
                // The end point is only consulted when the start point is outside the path.
                Vector end = new Vector(start.getX() + text.getWidth(), start.getY());
                inPath = linePath.contains(lowerLeftX + end.getX(), lowerLeftY + end.getY());
            }

            if (inPath) {
                toRemove.add(text);
            }
        }

        if (!toRemove.isEmpty()) {
            System.out.println(toRemove.size());
            list.removeAll(toRemove);
        }
    }
}
Example #24
Source File: PDFVisibleTextStripper.java From testarea-pdfbox2 with Apache License 2.0 | 5 votes |
/**
 * Forwards a text position to the superclass only if it is not clipped.
 *
 * <p>The text position counts as visible if there is no current clipping
 * path, or if its start point - and, when {@code checkEndPointToo} is set,
 * also its end point - is contained in the clipping path. A clipped text
 * position is reported to {@code dropStream} if one is set.</p>
 *
 * @param text the text position to process
 */
@Override
protected void processTextPosition(TextPosition text) {
    Vector start = text.getTextMatrix().transform(new Vector(0, 0));
    Vector end = new Vector(start.getX() + text.getWidth(), start.getY());

    Area area = getGraphicsState().getCurrentClippingPath();

    boolean visible = area == null
            || (contains(area, lowerLeftX + start.getX(), lowerLeftY + start.getY())
                && (!checkEndPointToo || contains(area, lowerLeftX + end.getX(), lowerLeftY + end.getY())));

    if (visible) {
        super.processTextPosition(text);
        return;
    }
    if (dropStream != null) {
        dropStream.printf("Clipped '%s' at %s,%s\n", text.getUnicode(), lowerLeftX + start.getX(), lowerLeftY + start.getY());
    }
}
Example #25
Source File: TextSection.java From testarea-pdfbox2 with Apache License 2.0 | 5 votes |
/**
 * Returns a multi-line text representation of this section.
 *
 * <p>The first line consists of the definition name, a colon and - if the
 * header is not empty - the header text. Each body line follows on its own
 * line, prefixed by a space.</p>
 *
 * @return the text representation of this section
 */
@Override
public String toString() {
    StringBuilder result = new StringBuilder();

    // Header line.
    result.append(definition.name);
    result.append(": ");
    if (!header.isEmpty()) {
        result.append(toString(header));
    }
    result.append('\n');

    // One line per body line.
    for (List<List<TextPosition>> bodyLine : body) {
        result.append(" ");
        result.append(toString(bodyLine));
        result.append('\n');
    }
    return result.toString();
}
Example #26
Source File: PDFLayoutTextStripper.java From quarkus-pdf-extract with Apache License 2.0 | 5 votes |
/**
 * Writes the current page.
 *
 * <p>For each article the text positions are sorted and then processed;
 * finally the text lines collected so far are written to the output. If
 * sorting throws an {@link IllegalArgumentException}, it is printed to
 * {@code System.err} and processing of that list continues.</p>
 *
 * @throws IOException if writing the output fails
 */
@Override
protected void writePage() throws IOException {
    for (List<TextPosition> textList : super.getCharactersByArticle()) {
        try {
            this.sortTextPositionList(textList);
        } catch (java.lang.IllegalArgumentException e) {
            System.err.println(e);
        }
        this.iterateThroughTextList(textList.iterator());
    }
    this.writeToOutputStream(this.getTextLineList());
}
Example #27
Source File: ColorTextStripper.java From testarea-pdfbox2 with Apache License 2.0 | 5 votes |
/**
 * Writes every text position followed by a description of its colors.
 *
 * <p>For each text position the output is its Unicode text and, in curly
 * braces, a {@code FILL:} entry with the non-stroking color if its
 * rendering mode is a filling mode, a {@code STROKE:} entry with the
 * stroking color if it is a stroking mode, and {@code CLIP;} if it is a
 * clipping mode. Rendering mode and colors are looked up in the maps
 * keyed by text position.</p>
 *
 * @param text          the text of the chunk (not used here)
 * @param textPositions the text positions making up the chunk
 * @throws IOException if writing fails
 */
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
    for (TextPosition textPosition : textPositions) {
        RenderingMode mode = renderingMode.get(textPosition);
        float[] stroke = strokingColor.get(textPosition);
        float[] fill = nonStrokingColor.get(textPosition);

        StringBuilder description = new StringBuilder();
        description.append(textPosition.getUnicode());
        description.append("{");

        if (FILLING_MODES.contains(mode)) {
            description.append("FILL:");
            description.append(toString(fill));
            description.append(';');
        }

        if (STROKING_MODES.contains(mode)) {
            description.append("STROKE:");
            description.append(toString(stroke));
            description.append(';');
        }

        if (CLIPPING_MODES.contains(mode)) {
            description.append("CLIP;");
        }

        description.append("}");
        writeString(description.toString());
    }
}
Example #28
Source File: ColorTextStripper.java From testarea-pdfbox2 with Apache License 2.0 | 5 votes |
/**
 * Remembers the rendering mode and the stroking and non-stroking color
 * components of the current graphics state for the given text position,
 * then hands the text position to the superclass.
 *
 * @param text the text position to process
 */
@Override
protected void processTextPosition(TextPosition text) {
    // Each value is read and stored before the next one is read.
    RenderingMode currentMode = getGraphicsState().getTextState().getRenderingMode();
    renderingMode.put(text, currentMode);

    float[] strokeComponents = getGraphicsState().getStrokingColor().getComponents();
    strokingColor.put(text, strokeComponents);

    float[] fillComponents = getGraphicsState().getNonStrokingColor().getComponents();
    nonStrokingColor.put(text, fillComponents);

    super.processTextPosition(text);
}
Example #29
Source File: PdfToTextInfoConverter.java From testarea-pdfbox2 with Apache License 2.0 | 5 votes |
/**
 * Logs every text position whose color coincides with its background
 * color.
 *
 * <p>The two colors coincide if both are {@code null} or if they are
 * equal.</p>
 *
 * @param string        the text of the chunk (not used here)
 * @param textPositions the text positions making up the chunk
 * @throws IOException declared by the overridden method
 */
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
    for (TextPosition text : textPositions) {
        Integer characterColor = getCharacterColor(text);
        Integer characterBackgroundColor = getCharacterBackgroundColor(text);

        // Null-safe equality: two nulls coincide, otherwise compare by value.
        boolean coincide = characterColor == null
                ? characterBackgroundColor == null
                : characterColor.equals(characterBackgroundColor);
        if (!coincide) {
            continue;
        }

        logger.info(String.format("Color and background coincide for '%s' at %3.2f, %3.2f : %h",
                text.getUnicode(), text.getX(), text.getY(), characterColor));
    }
}
Example #30
Source File: PdfToTextInfoConverter.java From testarea-pdfbox2 with Apache License 2.0 | 5 votes |
@Override protected void processTextPosition(TextPosition text) { PDGraphicsState gs = getGraphicsState(); // check opacity for stroke and fill text if (gs.getAlphaConstant() < Constants.EPSILON && gs.getNonStrokeAlphaConstant() < Constants.EPSILON) { return; } Vector center = getTextPositionCenterPoint(text); Area area = gs.getCurrentClippingPath(); if (area == null || area.contains(lowerLeftX + center.getX(), lowerLeftY + center.getY())) { nonStrokingColors.put(text, gs.getNonStrokingColor()); super.processTextPosition(text); } }