Python pdfminer.layout.LTTextBox() Examples
The following are 6 code examples of pdfminer.layout.LTTextBox().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pdfminer.layout, or try the search function.
Example #1
Source File: pile.py From pdf-to-markdown with BSD 3-Clause "New" or "Revised" License | 7 votes |
def parse_layout(self, layout):
    """Walk a layout tree and sort its objects into per-kind lists.

    The tree is traversed depth-first, in document order, using an
    explicit stack instead of recursion.

    * Figures, text boxes and generic text lines are expanded into their
      children.
    * Horizontal text lines are appended to ``self.texts``.
    * Rectangles thinner than 1.0 are treated as vertical rules
      (``self.verticals``); otherwise rectangles shorter than 1.0 are
      treated as horizontal rules (``self.horizontals``).  Each one is
      passed to ``self._adjust_to_close`` before being stored.  Other
      rectangles are dropped.
    * Images are appended to ``self.images``.
    * Curves, characters and lines are ignored.

    Args:
        layout: Iterable of layout objects (typically a page layout).

    Raises:
        AssertionError: If an object of an unrecognized type is found.
    """
    # Children are pushed reversed so that pop() yields them in their
    # original order.
    obj_stack = list(reversed(list(layout)))
    while obj_stack:
        obj = obj_stack.pop()
        # NOTE(review): exact type matching (not isinstance) is kept on
        # purpose -- the layout classes appear to inherit from one another,
        # so isinstance could change which branch an object takes.
        kind = type(obj)
        if kind in (LTFigure, LTTextBox, LTTextLine, LTTextBoxHorizontal):
            obj_stack.extend(reversed(list(obj)))
        elif kind == LTTextLineHorizontal:
            self.texts.append(obj)
        elif kind == LTRect:
            if obj.width < 1.0:
                self._adjust_to_close(obj, self.verticals, 'x0')
                self.verticals.append(obj)
            elif obj.height < 1.0:
                self._adjust_to_close(obj, self.horizontals, 'y0')
                self.horizontals.append(obj)
        elif kind == LTImage:
            self.images.append(obj)
        elif kind in (LTCurve, LTChar, LTLine):
            pass
        else:
            # Raised explicitly rather than via ``assert False`` so the
            # check still fires when Python runs with -O.
            raise AssertionError("Unrecognized type: %s" % type(obj))
Example #2
Source File: pdf.py From ChemDataExtractor with MIT License | 5 votes |
def _process_layout(self, layout): """Process an LTPage layout and return a list of elements.""" # Here we just group text into paragraphs elements = [] for lt_obj in layout: if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine): elements.append(Paragraph(lt_obj.get_text().strip())) elif isinstance(lt_obj, LTFigure): # Recursive... elements.extend(self._process_layout(lt_obj)) return elements
Example #3
Source File: parse_am37x_register_tables.py From bootloader_instrumentation_suite with MIT License | 5 votes |
def get_text_obj(cls, obj, index, regexp, text):
    """Find the first object whose entry text equals ``text``.

    ``obj`` itself is checked first.  If it does not match and it is a
    text box, its children are searched depth-first in iteration order.

    Args:
        obj: Layout object to inspect.
        index: Not used here; forwarded unchanged to recursive calls.
        regexp: Not used here; forwarded unchanged to recursive calls.
        text: Exact text to look for, compared against
            ``cls.get_entry_text(obj)``.

    Returns:
        The matching object, or ``None`` when nothing matches.
    """
    if cls.get_entry_text(obj) == text:
        return obj
    if isinstance(obj, layout.LTTextBox):
        for child in obj:
            # Forward every argument: the previous two-argument call did
            # not match this signature and raised TypeError on recursion.
            found = cls.get_text_obj(child, index, regexp, text)
            # Compare against None so that a valid match which happens to
            # be falsy is not discarded.
            if found is not None:
                return found
    return None
Example #4
Source File: parse_am37x_register_tables.py From bootloader_instrumentation_suite with MIT License | 5 votes |
def try_add_field(cls, t, obj, results, nrows, nameoffset=0):
    """Try to add ``obj`` as a field, descending into text boxes on failure.

    A text line is handed straight to ``cls._try_add``.  A text box is
    first tried as a whole; only if ``cls._try_add`` returns a falsy value
    is each of its children tried in turn through this same method.
    Objects of any other type are ignored.

    Args:
        t: Passed through to ``cls._try_add``.
        obj: Layout object to try.
        results: Passed through to ``cls._try_add``.
        nrows: Passed through to ``cls._try_add``.
        nameoffset: Passed through to ``cls._try_add``; defaults to 0.
    """
    if isinstance(obj, layout.LTTextLine):
        cls._try_add(t, obj, results, nrows, nameoffset)
    elif isinstance(obj, layout.LTTextBox):
        # only if add fails recurse
        if not cls._try_add(t, obj, results, nrows, nameoffset):
            for child in obj:
                cls.try_add_field(t, child, results, nrows, nameoffset)
Example #5
Source File: parse_am37x_register_tables.py From bootloader_instrumentation_suite with MIT License | 5 votes |
def count_rows(cls, t, o, offset=0):
    """Count the text lines under ``o`` that look like a name-column row.

    A text box contributes the sum of the counts of its children.  A text
    line counts as 1 when its left edge (``o.bbox[0] + offset``) lies
    within 0.2 of the name column's ``l`` value and its entry text matches
    the name column's regex.  Everything else counts as 0.

    Args:
        t: Table description; ``t.col_info[TITable.NAME]`` supplies the
            column's ``l`` position and ``regex``.
        o: Layout object to examine.
        offset: Value added to the object's left edge before comparing.

    Returns:
        int: Number of matching rows.
    """
    info = t.col_info[TITable.NAME]
    if isinstance(o, layout.LTTextBox):
        return sum(cls.count_rows(t, child, offset) for child in o)
    if isinstance(o, layout.LTTextLine):
        text = cls.get_entry_text(o)
        left_aligned = abs(info.l - (o.bbox[0] + offset)) < 0.2
        if left_aligned and info.regex.match(text):
            return 1
    return 0
Example #6
Source File: pdfConverter.py From Forager with MIT License | 4 votes |
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file as a single string.

    Each page is run through a layout-aggregating device, and the text of
    every top-level text box or text line is concatenated in iteration
    order.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        str: The concatenated text of all pages ('' if none was found).
    """
    chunks = []
    # The context manager closes the file even if parsing raises; the
    # handle was previously left open.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            chunks.extend(
                lt_obj.get_text()
                for lt_obj in layout
                if isinstance(lt_obj, (LTTextBox, LTTextLine))
            )
    # Join once at the end instead of repeated string concatenation.
    return ''.join(chunks)