Python pdfminer.layout.LTTextLine() Examples
The following are 15 code examples of pdfminer.layout.LTTextLine().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pdfminer.layout, or try the search function.
Example #1
Source File: pile.py From pdf-to-markdown with BSD 3-Clause "New" or "Revised" License | 7 votes |
def parse_layout(self, layout):
    """Walk a layout tree depth-first and sort its objects into buckets.

    Container objects are expanded in place (children are visited in their
    original order); horizontal text lines go to ``self.texts``, thin
    rectangles to ``self.verticals`` / ``self.horizontals`` and images to
    ``self.images``.  Curves, characters and lines are ignored.

    Args:
        layout: An iterable of layout objects (typically one page).

    Raises:
        AssertionError: If an object of an unexpected type is encountered.
    """
    # Exact type comparison (not isinstance) is intentional: the
    # LTTextLineHorizontal branch must stay distinct from the LTTextLine
    # container branch.  NOTE(review): presumably the former subclasses the
    # latter in pdfminer -- confirm before switching to isinstance.
    containers = (LTFigure, LTTextBox, LTTextLine, LTTextBoxHorizontal)
    ignored = (LTCurve, LTChar, LTLine)

    # Reverse so that pop() yields objects in their original order.
    obj_stack = list(reversed(list(layout)))
    while obj_stack:
        obj = obj_stack.pop()
        kind = type(obj)
        if kind in containers:
            obj_stack.extend(reversed(list(obj)))
        elif kind is LTTextLineHorizontal:
            self.texts.append(obj)
        elif kind is LTRect:
            # Only rectangles thinner than one unit are kept as rules;
            # anything else is dropped silently.
            if obj.width < 1.0:
                self._adjust_to_close(obj, self.verticals, 'x0')
                self.verticals.append(obj)
            elif obj.height < 1.0:
                self._adjust_to_close(obj, self.horizontals, 'y0')
                self.horizontals.append(obj)
        elif kind is LTImage:
            self.images.append(obj)
        elif kind in ignored:
            pass
        else:
            # Raised explicitly (instead of ``assert False``) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError("Unrecognized type: %s" % kind)
Example #2
Source File: parse_am37x_register_tables.py From bootloader_instrumentation_suite with MIT License | 6 votes |
def split_text(cls, line, text1, text2):
    """Split one text object into two shallow copies carrying different text.

    ``line`` may be a ``layout.LTTextLine`` or some other container whose
    first child (``_objs[0]``) is used as the line.  The line is duplicated,
    and ``cls.strip_text_line`` is applied to the original with ``text1`` and
    to the duplicate with ``text2``.

    Returns:
        A 2-tuple.  If ``line`` was a text line, the two results of
        ``strip_text_line``; otherwise the original container and a shallow
        copy of it, each holding exactly one of those results in ``_objs``.
    """
    def _shallow_twin(src):
        # Allocate without running __init__, then copy the attribute dict.
        twin = object.__new__(src.__class__)
        twin.__dict__ = dict(src.__dict__)
        return twin

    is_line = isinstance(line, layout.LTTextLine)
    container = None
    if not is_line:
        container = line
        line = container._objs[0]

    line_twin = _shallow_twin(line)
    first = cls.strip_text_line(line, text1)
    second = cls.strip_text_line(line_twin, text2)

    if is_line:
        return (first, second)

    container_twin = _shallow_twin(container)
    container._objs = [first]
    container_twin._objs = [second]
    return (container, container_twin)
Example #3
Source File: node.py From pdftotree with MIT License | 5 votes |
def elem_type(elem):
    """Return a short string label for the kind of layout element given.

    The checks run in a fixed order and the first matching class wins.
    Elements matching none of the classes are labelled ``"unkown"``.
    """
    # Order matters: the first isinstance match decides the label.
    ordered_kinds = (
        (LTLine, "line"),
        (LTCurve, "curve"),
        (LTTextLine, "text"),
        (LTFigure, "figure"),
    )
    for klass, label in ordered_kinds:
        if isinstance(elem, klass):
            return label
    # The spelling is kept as-is; callers may compare against this value.
    return "unkown"
Example #4
Source File: node.py From pdftotree with MIT License | 5 votes |
def __str__(self, *args, **kwargs):
    """Return the text of every LTTextLine in ``self.elems``, tab-separated.

    Elements that are not ``LTTextLine`` instances are skipped.
    """
    # The pieces are joined as text: str.join rejects bytes items on
    # Python 3, so the text must not be encoded before joining.
    return "\t".join(
        r.get_text() for r in self.elems if isinstance(r, LTTextLine)
    )


#############################################
# Static utilities
#############################################
Example #5
Source File: node.py From pdftotree with MIT License | 5 votes |
def _split_text_n_lines(elems):
    """Partition ``elems`` into text lines and drawn lines.

    Returns:
        ``(texts, lines)``: the ``LTTextLine`` instances and the ``LTLine``
        instances, each in input order.  Other elements are discarded.
    """
    texts, lines = [], []
    for elem in elems:
        if isinstance(elem, LTTextLine):
            bucket = texts
        elif isinstance(elem, LTLine):
            bucket = lines
        else:
            continue
        bucket.append(elem)
    return texts, lines
Example #6
Source File: node.py From pdftotree with MIT License | 5 votes |
def _left_bar(content, default_val):
    """Yield, for each ``(coord, value)`` pair, the latest non-text value.

    The running value starts as ``default_val`` and is replaced whenever a
    pair's value is not an ``LTTextLine``.  One item is yielded per pair.
    """
    current = default_val
    for _, item in content:
        if isinstance(item, LTTextLine):
            # Text does not change the running value.
            yield current
            continue
        current = item
        yield current
Example #7
Source File: node.py From pdftotree with MIT License | 5 votes |
def _row_str(row_content):
    """Render one row as a tab-separated string.

    Each cell is rendered as follows:
      * ``None``            -> ``"None"``
      * a 2-tuple           -> its second item, rendered by the rules below
      * an ``LTTextLine``   -> its text
      * any number          -> ``"|"``
      * anything else       -> ``str(cell)``
    """
    def strfy(r):
        if r is None:
            return "None"
        if isinstance(r, tuple):
            _c, r = r
        if isinstance(r, LTTextLine):
            # Kept as text: str.join rejects bytes items on Python 3, so
            # the text must not be encoded here.
            return r.get_text()
        if isinstance(r, numbers.Number):
            return "|"
        return str(r)

    return "\t".join(strfy(r) for r in row_content)
Example #8
Source File: pdf_parsers.py From pdftotree with MIT License | 5 votes |
def parse_layout(elems, font_stat, combine=False):
    """
    Annotate the page's mentions and figures with grid features, then
    cluster the mentions via ``cluster_vertically_aligned_boxes``.

    Every mention and figure receives an ``id``, a ``feats`` mapping, its
    centre coordinates (``xc``, ``yc``) and grid-quantised coordinates
    (``x0_grid``, ``x1_grid``, ``xc_grid``, ``yc_grid``).  The grid cell size
    is half of the value returned by ``get_most_common_font_pts``.

    Returns:
        The ``(tbls, tbl_features)`` pair produced by the clustering call.
    """
    segments = elems.segments
    curves = elems.curves
    figures = elems.figures
    page_width = elems.layout.width
    mentions = elems.mentions

    avg_font_pts = get_most_common_font_pts(elems.mentions, font_stat)
    width = get_page_width(mentions + segments + figures + curves)
    char_width = get_char_width(mentions)
    grid_size = avg_font_pts / 2.0

    for idx, box in enumerate(mentions + elems.figures):
        box.id = idx
        box.feats = defaultdict(bool)
        # Text lines with a known font get font-specific feature keys.
        if isinstance(box, LTTextLine) and box.font_name:
            prefix = box.font_name + "-" + str(box.font_size) + "-"
        else:
            prefix = ""
        box.xc = (box.x0 + box.x1) / 2.0
        box.yc = (box.y0 + box.y1) / 2.0
        for attr in ("x0", "x1", "xc", "yc"):
            cell = getattr(box, attr) // grid_size
            box.feats[prefix + attr] = cell
            setattr(box, attr + "_grid", cell)

    tbls, tbl_features = cluster_vertically_aligned_boxes(
        mentions,
        elems.layout.bbox,
        avg_font_pts,
        width,
        char_width,
        segments,
        curves,
        figures,
        page_width,
        combine,
    )
    return tbls, tbl_features
Example #9
Source File: parse_pdf.py From GraphIE with GNU General Public License v3.0 | 5 votes |
def parse_text(layout):
    """Recursively collect the non-empty text lines of a layout tree.

    Args:
        layout: A layout object.  Objects without ``__iter__`` contribute
            nothing.

    Returns:
        A list of ``(bbox, text)`` tuples, one per ``LTTextLine`` whose
        stripped text is non-empty, in traversal order.
    """
    result = []
    if not hasattr(layout, '__iter__'):
        return result
    for lt_obj in layout:
        if isinstance(lt_obj, LTTextLine):
            text = lt_obj.get_text().strip()
            if text:
                result.append((lt_obj.bbox, text))
        else:
            # Anything else may itself contain text lines further down.
            result.extend(parse_text(lt_obj))
    return result
Example #10
Source File: pdf.py From ChemDataExtractor with MIT License | 5 votes |
def _process_layout(self, layout):
    """Turn a layout into a flat list of ``Paragraph`` elements.

    Text boxes and text lines each become one ``Paragraph`` built from their
    stripped text; figures are descended into recursively.  Every other
    object is ignored.
    """
    paragraphs = []
    for item in layout:
        if isinstance(item, (LTTextBox, LTTextLine)):
            stripped = item.get_text().strip()
            paragraphs.append(Paragraph(stripped))
            continue
        if isinstance(item, LTFigure):
            # Figures can nest further layout objects.
            paragraphs.extend(self._process_layout(item))
    return paragraphs
Example #11
Source File: parse_am37x_register_tables.py From bootloader_instrumentation_suite with MIT License | 5 votes |
def try_add_field(cls, t, obj, results, nrows, nameoffset=0):
    """Offer ``obj`` to ``cls._try_add``, descending into text boxes on failure.

    A ``layout.LTTextLine`` is offered once.  A ``layout.LTTextBox`` is
    offered as a whole first; only if that is rejected is each of its
    children offered recursively.  Any other object is ignored.
    """
    if isinstance(obj, layout.LTTextLine):
        cls._try_add(t, obj, results, nrows, nameoffset)
        return
    if not isinstance(obj, layout.LTTextBox):
        return
    if cls._try_add(t, obj, results, nrows, nameoffset):
        return
    # The box as a whole was rejected: try its children one by one.
    for child in obj:
        cls.try_add_field(t, child, results, nrows, nameoffset)
Example #12
Source File: parse_am37x_register_tables.py From bootloader_instrumentation_suite with MIT License | 5 votes |
def count_rows(cls, t, o, offset=0):
    """Count the text lines under ``o`` that match the table's NAME column.

    A ``layout.LTTextLine`` counts as one row when its left edge, shifted by
    ``offset``, is within 0.2 of the NAME column's ``l`` value and its entry
    text matches that column's regex.  A ``layout.LTTextBox`` counts the sum
    over its children.  Anything else counts as zero.
    """
    info = t.col_info[TITable.NAME]
    if isinstance(o, layout.LTTextBox):
        return sum(cls.count_rows(t, child, offset) for child in o)
    if isinstance(o, layout.LTTextLine):
        text = cls.get_entry_text(o)
        aligned = abs(info.l - (o.bbox[0] + offset)) < 0.2
        if aligned and info.regex.match(text):
            return 1
    return 0
Example #13
Source File: pdfConverter.py From Forager with MIT License | 4 votes |
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file as one string.

    Each page is laid out with default ``LAParams``; the text of every
    top-level ``LTTextBox`` / ``LTTextLine`` on every page is concatenated
    in page order.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The concatenated text (empty if no text objects were found).
    """
    pieces = []
    # The file is opened in a ``with`` block so the handle is closed even if
    # parsing raises; all parsing happens while it is still open.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    pieces.append(lt_obj.get_text())
    return ''.join(pieces)
Example #14
Source File: features.py From pdftotree with MIT License | 4 votes |
def get_alignment_features(line_bboxes, elems, font_stat):
    """Compute one alignment-feature entry per candidate bounding box.

    For each entry of ``line_bboxes`` the page elements intersecting it are
    gathered, the intersecting mentions (plus all page figures) are annotated
    with grid features, and the mentions are clustered with
    ``cluster_vertically_aligned_boxes``.  When no mention intersects the
    box, or clustering yields no features, a row of 17 zeros is used.

    Returns:
        A list with one feature entry per item of ``line_bboxes``.
    """
    def _hits(candidates, region):
        # Keep only the candidates whose bbox intersects the region.
        return [c for c in candidates if intersect(region, c.bbox)]

    alignment_features = []
    for line_bbox in line_bboxes:
        # Reorder the bbox fields into the layout ``intersect`` is called with.
        region = (line_bbox[4], line_bbox[3], line_bbox[6], line_bbox[5])
        boxes = _hits(elems.mentions, region)
        boxes_segments = _hits(elems.segments, region)
        boxes_figures = _hits(elems.figures, region)
        boxes_curves = _hits(elems.curves, region)

        page_width = elems.layout.width
        avg_font_pts = get_most_common_font_pts(elems.mentions, font_stat)
        width = get_page_width(
            boxes + boxes_segments + boxes_figures + boxes_curves
        )
        if not boxes:
            alignment_features.append([0] * 17)
            continue

        char_width = get_char_width(boxes)
        grid_size = avg_font_pts / 2.0
        for idx, box in enumerate(boxes + elems.figures):
            box.id = idx
            box.feats = defaultdict(bool)
            # Text lines with a known font get font-specific feature keys.
            if isinstance(box, LTTextLine) and box.font_name:
                prefix = box.font_name + "-" + str(box.font_size) + "-"
            else:
                prefix = ""
            box.xc = (box.x0 + box.x1) / 2.0
            box.yc = (box.y0 + box.y1) / 2.0
            for attr in ("x0", "x1", "xc", "yc"):
                cell = getattr(box, attr) // grid_size
                box.feats[prefix + attr] = cell
                setattr(box, attr + "_grid", cell)

        nodes, nodes_features = cluster_vertically_aligned_boxes(
            boxes,
            elems.layout.bbox,
            avg_font_pts,
            width,
            char_width,
            boxes_segments,
            boxes_curves,
            boxes_figures,
            page_width,
            True,
        )
        if len(nodes_features) == 0:
            alignment_features.append([0] * 17)
        else:
            alignment_features.append(nodes_features)
    return alignment_features
Example #15
Source File: parse_am37x_register_tables.py From bootloader_instrumentation_suite with MIT License | 4 votes |
def _try_add(cls, t, obj, results, nrows, nameoffset):
    """Try to file ``obj`` under the table column whose centre is closest.

    The column closest to ``obj``'s centre is looked up in ``t.col_info``.
    If the object's entry text matches that column's regex, the object is
    appended to ``results`` for that column unless ``nrows`` entries are
    already present.  Otherwise, for a text line closest to the OFFSET or an
    ADDRESS column, the text is split at ``")"`` into an offset part and an
    address part and, if both match their column regexes, the object is
    split in two and each half is filed under its column.

    Args:
        t: Table description exposing ``col_info`` (column name -> info).
        obj: Layout object to place.
        results: Mapping of column name -> list of accepted objects; mutated.
        nrows: Maximum number of entries accepted per column.
        nameoffset: Horizontal shift applied to the NAME column.

    Returns:
        True if ``obj`` (or its two halves) was added to ``results``,
        False otherwise.
    """
    if obj.bbox[0] < ((t.col_info[TITable.NAME].l - nameoffset) - 0.5):
        # don't consider items that are past the left of the table
        return False

    text = cls.get_entry_text(obj)
    added = False
    center = cls.calculate_center(obj)
    closest_field = None
    # float("inf") instead of the Python-2-only sys.maxint.
    min_diff = float("inf")
    field_info = None
    # .items() / .values() work on both Python 2 and 3.
    for (field, info) in t.col_info.items():
        if field == TITable.NAME:
            # NOTE(review): this shift persists for every column visited
            # after NAME, so the result depends on iteration order.
            center -= nameoffset
        diff = abs(center - info.c)
        if diff < min_diff:
            min_diff = diff
            closest_field = field
            field_info = info

    addrfield = [j for j in t.col_info.values() if j.typ == TITable.ADDRESS]

    if isinstance(obj, layout.LTText):
        if field_info.regex.search(text):
            if len(results[closest_field]) >= nrows:
                added = False
            else:
                results[closest_field].append(obj)
                added = True
        # NOTE(review): the flattened source is ambiguous about which ``if``
        # this ``elif`` pairs with.  It is attached to the regex check here
        # because, paired with the LTText check, it would presumably be
        # unreachable (LTTextLine looks like a kind of LTText) -- confirm.
        elif isinstance(obj, layout.LTTextLine) and (
                closest_field == TITable.OFFSET
                or any(closest_field == a.name for a in addrfield)):
            fields = [j for j in text.rsplit(")", 2) if len(j) > 0]
            if len(fields) == 2:
                # rsplit removed the closing parenthesis; put it back.
                fields = [f + ")" for f in fields]
                off = fields[0].strip()
                adr = fields[1].strip()
                if adr[0] == '+':
                    # drop a leading "+" from the address part
                    adr = adr[1:].strip()
                if TITable.OFFSET in t.col_info:
                    col1 = t.col_info[TITable.OFFSET]
                    col2 = addrfield[0]
                elif len(addrfield) == 2:
                    col1 = addrfield[0]
                    col2 = addrfield[1]
                else:
                    return False
                if col1.regex.match(off) and col2.regex.match(adr):
                    (oobj, aobj) = cls.split_text(obj, off, adr)
                    results[col1.name].append(oobj)
                    results[col2.name].append(aobj)
                    added = True
    return added