Python pdfminer.layout.LTTextLine() Examples

The following are 15 code examples of pdfminer.layout.LTTextLine(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pdfminer.layout, or try the search function.
Example #1
Source File: pile.py    From pdf-to-markdown with BSD 3-Clause "New" or "Revised" License 7 votes vote down vote up
def parse_layout(self, layout):
        """Walk a layout tree and sort its objects into this instance's lists.

        Containers (figures, text boxes, text lines) are expanded in the
        order they yield their children.  Horizontal text lines are appended
        to ``self.texts``, images to ``self.images``, and rectangles narrower
        or shorter than 1.0 to ``self.verticals`` / ``self.horizontals`` after
        being passed through ``self._adjust_to_close``.  Curves, characters
        and lines are skipped.  Any other exact type trips an assertion.
        """
        # Keep the stack reversed so that pop() hands objects back in the
        # order the container produced them.
        pending = list(layout)[::-1]
        while pending:
            item = pending.pop()
            # Exact-type comparison: subclasses of these classes do not match.
            kind = type(item)

            if kind in (LTFigure, LTTextBox, LTTextLine, LTTextBoxHorizontal):
                pending.extend(list(item)[::-1])
                continue

            if kind == LTTextLineHorizontal:
                self.texts.append(item)
                continue

            if kind == LTRect:
                if item.width < 1.0:
                    self._adjust_to_close(item, self.verticals, 'x0')
                    self.verticals.append(item)
                elif item.height < 1.0:
                    self._adjust_to_close(item, self.horizontals, 'y0')
                    self.horizontals.append(item)
                continue

            if kind == LTImage:
                self.images.append(item)
                continue

            if kind == LTCurve or kind == LTChar or kind == LTLine:
                # Nothing is collected for these.
                continue

            assert False, "Unrecognized type: %s" % kind
Example #2
Source File: parse_am37x_register_tables.py    From bootloader_instrumentation_suite with MIT License 6 votes vote down vote up
def split_text(cls, line, text1, text2):
        """Split one text object into two, one per given text.

        ``line`` may be a text line or a box; for anything that is not an
        ``LTTextLine`` the first entry of ``line._objs`` is used as the line
        and the box itself is duplicated as well.  The line is shallow-cloned
        (same class, copied ``__dict__``) and both copies are passed through
        ``cls.strip_text_line`` with ``text1`` and ``text2`` respectively.

        Returns a pair of lines, or a pair of boxes each wrapping one line
        when a box was passed in.  The original box is reused (and mutated)
        as the first element of that pair.
        """
        is_box = not isinstance(line, layout.LTTextLine)
        if is_box:
            container = line
            line = container._objs[0]

        # Build the twin without running __init__, then copy the state over.
        twin = object.__new__(line.__class__)
        twin.__dict__ = dict(line.__dict__)

        first = cls.strip_text_line(line, text1)
        second = cls.strip_text_line(twin, text2)

        if not is_box:
            return (first, second)

        container_twin = object.__new__(container.__class__)
        container_twin.__dict__ = dict(container.__dict__)
        container._objs = [first]
        container_twin._objs = [second]
        return (container, container_twin)
Example #3
Source File: node.py    From pdftotree with MIT License 5 votes vote down vote up
def elem_type(elem):
    """Return a short label naming the layout class of ``elem``.

    The classes are tested in a fixed order (line, curve, text, figure) and
    the first match wins; anything else yields the fallback label.
    """
    ordered_labels = (
        (LTLine, "line"),
        (LTCurve, "curve"),
        (LTTextLine, "text"),
        (LTFigure, "figure"),
    )
    for layout_cls, label in ordered_labels:
        if isinstance(elem, layout_cls):
            return label
    # Spelling kept as-is: callers may compare against this exact string.
    return "unkown"
Example #4
Source File: node.py    From pdftotree with MIT License 5 votes vote down vote up
def __str__(self, *args, **kwargs):
        """Return the text of every ``LTTextLine`` in ``self.elems``, tab-separated.

        Elements that are not text lines are ignored.  The result is always a
        native ``str``.
        """
        pieces = []
        for r in self.elems:
            if not isinstance(r, LTTextLine):
                continue
            text = r.get_text()
            # Only encode when get_text() did not already return a native str
            # (e.g. a Python 2 unicode object).  Encoding unconditionally
            # produces bytes on Python 3, which str.join rejects with a
            # TypeError.
            if not isinstance(text, str):
                text = text.encode("utf8", "replace")
            pieces.append(text)
        return "\t".join(pieces)


#############################################
#    Static utilities
############################################# 
Example #5
Source File: node.py    From pdftotree with MIT License 5 votes vote down vote up
def _split_text_n_lines(elems):
    """Partition ``elems`` into text lines and drawn lines.

    Returns a ``(texts, lines)`` pair of lists, each preserving the input
    order.  Elements that are neither ``LTTextLine`` nor ``LTLine`` are
    dropped.
    """
    texts, lines = [], []
    for elem in elems:
        if isinstance(elem, LTTextLine):
            bucket = texts
        elif isinstance(elem, LTLine):
            bucket = lines
        else:
            continue
        bucket.append(elem)
    return texts, lines
Example #6
Source File: node.py    From pdftotree with MIT License 5 votes vote down vote up
def _left_bar(content, default_val):
    """Yield, for each entry of ``content``, the most recent non-text value.

    ``content`` is iterated as ``(coord, value)`` pairs.  Every value that is
    not an ``LTTextLine`` becomes the current bar; text lines leave it
    unchanged.  ``default_val`` is yielded until the first such value appears.
    """
    current = default_val
    for _, candidate in content:
        current = current if isinstance(candidate, LTTextLine) else candidate
        yield current
Example #7
Source File: node.py    From pdftotree with MIT License 5 votes vote down vote up
def _row_str(row_content):
    """Render a row as a single tab-separated string.

    Each cell is converted as follows:

    * ``None`` becomes ``"None"``;
    * a 2-tuple is unpacked and its second item is converted instead;
    * an ``LTTextLine`` contributes its text;
    * any number becomes ``"|"``;
    * everything else goes through ``str()``.
    """

    def strfy(r):
        if r is None:
            return "None"
        if isinstance(r, tuple):
            _c, r = r
        if isinstance(r, LTTextLine):
            text = r.get_text()
            # Only encode when get_text() did not already return a native str
            # (e.g. a Python 2 unicode object).  Encoding unconditionally
            # yields bytes on Python 3 and makes the join below raise
            # TypeError.
            if not isinstance(text, str):
                text = text.encode("utf8", "replace")
            return text
        if isinstance(r, numbers.Number):
            return "|"
        return str(r)

    return "\t".join(strfy(r) for r in row_content)
Example #8
Source File: pdf_parsers.py    From pdftotree with MIT License 5 votes vote down vote up
def parse_layout(elems, font_stat, combine=False):
    """
    Annotate the page's mentions and figures with grid features, then
    cluster the mentions via ``cluster_vertically_aligned_boxes``.

    Every mention and figure receives an ``id``, a fresh ``feats`` mapping,
    centre coordinates (``xc``, ``yc``) and grid-quantised coordinates
    (``x0_grid``, ``x1_grid``, ``xc_grid``, ``yc_grid``).  The grid cell is
    half the most common font size.  Returns the ``(tables, features)`` pair
    produced by the clustering call.
    """
    mentions = elems.mentions
    segments = elems.segments
    curves = elems.curves
    figures = elems.figures
    page_width = elems.layout.width

    common_font_pts = get_most_common_font_pts(elems.mentions, font_stat)
    width = get_page_width(mentions + segments + figures + curves)
    char_width = get_char_width(mentions)
    cell = common_font_pts / 2.0

    for idx, item in enumerate(mentions + elems.figures):
        item.id = idx
        item.feats = defaultdict(bool)

        # Text lines with a known font get font-specific feature names.
        tag = ""
        if isinstance(item, LTTextLine) and item.font_name:
            tag = item.font_name + "-" + str(item.font_size) + "-"

        item.xc = (item.x0 + item.x1) / 2.0
        item.yc = (item.y0 + item.y1) / 2.0
        for axis in ("x0", "x1", "xc", "yc"):
            bucket = getattr(item, axis) // cell
            item.feats[tag + axis] = bucket
            setattr(item, axis + "_grid", bucket)

    tbls, tbl_features = cluster_vertically_aligned_boxes(
        mentions,
        elems.layout.bbox,
        common_font_pts,
        width,
        char_width,
        segments,
        curves,
        figures,
        page_width,
        combine,
    )
    return tbls, tbl_features
Example #9
Source File: parse_pdf.py    From GraphIE with GNU General Public License v3.0 5 votes vote down vote up
def parse_text(layout):
    """Collect ``(bbox, text)`` pairs for every non-blank text line in a layout tree.

    Objects that are not iterable contribute nothing.  Each ``LTTextLine``
    found while iterating contributes its bounding box and stripped text
    unless the text is empty; every other child is descended into
    recursively.
    """
    if not hasattr(layout, '__iter__'):
        return []

    collected = []
    for node in layout:
        if not isinstance(node, LTTextLine):
            collected.extend(parse_text(node))
            continue
        box = node.bbox
        content = node.get_text().strip()
        if content != '':
            collected.append((box, content))
    return collected
Example #10
Source File: pdf.py    From ChemDataExtractor with MIT License 5 votes vote down vote up
def _process_layout(self, layout):
        """Turn the objects of a layout into a flat list of ``Paragraph`` elements.

        Each text box or text line becomes one ``Paragraph`` built from its
        stripped text.  Figures are processed recursively and their elements
        spliced into the result.  Other objects are ignored.
        """
        collected = []
        for item in layout:
            if isinstance(item, (LTTextBox, LTTextLine)):
                stripped = item.get_text().strip()
                collected.append(Paragraph(stripped))
                continue
            if isinstance(item, LTFigure):
                # Figures can nest further layout objects.
                collected.extend(self._process_layout(item))
        return collected
Example #11
Source File: parse_am37x_register_tables.py    From bootloader_instrumentation_suite with MIT License 5 votes vote down vote up
def try_add_field(cls, t, obj, results, nrows, nameoffset=0):
        """Offer ``obj`` to ``cls._try_add``, descending into text boxes if needed.

        A text line is offered once.  A text box is offered as a whole first;
        only if that attempt reports failure is each of its children offered
        in turn through this same method.  Other objects are ignored.
        """
        if isinstance(obj, layout.LTTextLine):
            cls._try_add(t, obj, results, nrows, nameoffset)
            return

        if not isinstance(obj, layout.LTTextBox):
            return

        if cls._try_add(t, obj, results, nrows, nameoffset):
            return

        # The box as a whole was rejected: retry with its individual children.
        for child in obj:
            cls.try_add_field(t, child, results, nrows, nameoffset)
Example #12
Source File: parse_am37x_register_tables.py    From bootloader_instrumentation_suite with MIT License 5 votes vote down vote up
def count_rows(cls, t, o, offset=0):
        """Count the text lines under ``o`` that look like NAME-column entries.

        A text box contributes the sum over its children.  A text line counts
        as 1 when its left edge (``bbox[0] + offset``) lies within 0.2 of the
        NAME column's ``l`` value and its entry text matches that column's
        regex; otherwise, and for any other object, the count is 0.
        """
        name_col = t.col_info[TITable.NAME]

        if isinstance(o, layout.LTTextBox):
            return sum(cls.count_rows(t, child, offset) for child in o)

        if not isinstance(o, layout.LTTextLine):
            return 0

        entry = cls.get_entry_text(o)
        aligned = abs(name_col.l - (o.bbox[0] + offset)) < 0.2
        if aligned and name_col.regex.match(entry):
            return 1
        return 0
Example #13
Source File: pdfConverter.py    From Forager with MIT License 4 votes vote down vote up
def convert_pdf_to_txt(path):
    """Return the concatenated text of every page of the PDF at ``path``.

    Only top-level text boxes and text lines of each page layout contribute;
    other layout objects are ignored.  The file is opened in binary mode and
    is closed again before returning, including when extraction raises.
    """
    chunks = []
    # Keep the handle open for the whole extraction: the parser is handed the
    # live file object, and the context manager guarantees it gets closed.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    # Join once at the end instead of growing a string piece by piece.
    return ''.join(chunks)
Example #14
Source File: features.py    From pdftotree with MIT License 4 votes vote down vote up
def get_alignment_features(line_bboxes, elems, font_stat):
    """Compute one alignment-feature entry per bounding box in ``line_bboxes``.

    For each box, the page elements intersecting it are gathered, the
    intersecting mentions plus all figures are annotated with grid features,
    and ``cluster_vertically_aligned_boxes`` supplies the entry.  A row of 17
    zeros is used when no mention intersects the box or when clustering
    returns no features.
    """
    features = []
    for raw in line_bboxes:
        # Reorder the stored coordinates into the tuple handed to intersect().
        region = (raw[4], raw[3], raw[6], raw[5])

        def overlapping(candidates, region=region):
            return [c for c in candidates if intersect(region, c.bbox)]

        mentions = overlapping(elems.mentions)
        segments = overlapping(elems.segments)
        figures = overlapping(elems.figures)
        curves = overlapping(elems.curves)

        page_width = elems.layout.width
        common_font_pts = get_most_common_font_pts(elems.mentions, font_stat)
        width = get_page_width(mentions + segments + figures + curves)
        if not mentions:
            features.append([0] * 17)
            continue

        char_width = get_char_width(mentions)
        cell = common_font_pts / 2.0
        for idx, item in enumerate(mentions + elems.figures):
            item.id = idx
            item.feats = defaultdict(bool)

            # Text lines with a known font get font-specific feature names.
            tag = ""
            if isinstance(item, LTTextLine) and item.font_name:
                tag = item.font_name + "-" + str(item.font_size) + "-"

            item.xc = (item.x0 + item.x1) / 2.0
            item.yc = (item.y0 + item.y1) / 2.0
            for axis in ("x0", "x1", "xc", "yc"):
                bucket = getattr(item, axis) // cell
                item.feats[tag + axis] = bucket
                setattr(item, axis + "_grid", bucket)

        nodes, nodes_features = cluster_vertically_aligned_boxes(
            mentions,
            elems.layout.bbox,
            common_font_pts,
            width,
            char_width,
            segments,
            curves,
            figures,
            page_width,
            True,
        )
        if len(nodes_features) == 0:
            features.append([0] * 17)
        else:
            features.append(nodes_features)
    return features
Example #15
Source File: parse_am37x_register_tables.py    From bootloader_instrumentation_suite with MIT License 4 votes vote down vote up
def _try_add(cls, t, obj, results, nrows, nameoffset):
        """Try to file ``obj`` under the table column whose centre is nearest to it.

        Steps, as implemented below:

        1. Reject ``obj`` if its left edge lies more than 0.5 to the left of
           the NAME column's ``l`` value (after subtracting ``nameoffset``).
        2. Pick the column in ``t.col_info`` whose ``c`` value is closest to
           the centre reported by ``cls.calculate_center(obj)``.
        3. If ``obj`` is an ``LTText`` and its entry text matches the chosen
           column's regex, append it to ``results[closest_field]`` unless that
           list already holds ``nrows`` entries.
        4. Otherwise, if ``obj`` is an ``LTTextLine`` and the chosen column is
           OFFSET or an ADDRESS-typed column, try to split the text at ``")"``
           into two pieces, each matching one column's regex, and append the
           two halves produced by ``cls.split_text`` to their columns.

        Returns True if anything was appended to ``results``, False otherwise.

        NOTE(review): uses ``sys.maxint``, ``dict.iteritems`` and
        ``dict.itervalues``, which exist only on Python 2.
        """
        if obj.bbox[0] < ((t.col_info[TITable.NAME].l - nameoffset)- 0.5): # ignore items lying to the left of the table
            return False

        text = cls.get_entry_text(obj)
        added = False
        center = cls.calculate_center(obj)
        closest_field = None
        min_diff = sys.maxint
        field_info = None
        # Find the column whose centre ``c`` is nearest to the object's centre.
        for (field, info) in t.col_info.iteritems():
            if field == TITable.NAME:
                # NOTE(review): this shifts ``center`` in place, so columns
                # visited after NAME are compared against the shifted value
                # too -- confirm that this is intended.
                center -= nameoffset
            diff = abs(center - info.c)
            if diff < min_diff:
                min_diff = diff
                closest_field = field
                field_info = info

        #print "%s closest to %s (%s)" % (obj, closest_field, field_info.regex.pattern)

        # All columns declared as ADDRESS-typed.
        addrfield = [j for j in t.col_info.itervalues() if j.typ == TITable.ADDRESS]
        if isinstance(obj, layout.LTText):
            text = cls.get_entry_text(obj)
            if field_info.regex.search(text):
                # Direct match: accept unless the column is already full.
                if len(results[closest_field]) >= nrows:
                    added = False
                else:
                    results[closest_field] += [obj]
                    added = True
            # No direct match: a single line may hold two adjacent cells
            # (OFFSET or ADDRESS columns) that need to be split apart.
            elif isinstance(obj, layout.LTTextLine) and \
                            ((closest_field == TITable.OFFSET) or \
                             (closest_field in [a.name for a in addrfield])):

                # Split at up to the last two ")" and drop empty pieces; only
                # an exact two-way split is handled.
                fields = [j for j in text.rsplit(")", 2) if len(j) > 0]
                if len(fields) == 2:
                    # rsplit removed the ")" separators; put them back.
                    fields = [f+")" for f in fields]
                    off = fields[0].strip()
                    adr = fields[1].strip()
                    # NOTE(review): a leading '+' is dropped from adr here; it
                    # is not appended to off, despite what the original
                    # comment on this line said.
                    if adr[0] == '+':  # strip a leading '+' from adr
                        adr = adr[1:].strip()
                    # Choose the two target columns: OFFSET plus the first
                    # ADDRESS column, or else the two ADDRESS columns.
                    if TITable.OFFSET in t.col_info:
                        col1 = t.col_info[TITable.OFFSET]
                        col2 = addrfield[0]
                    elif len(addrfield) == 2:
                        col1 = addrfield[0]
                        col2 = addrfield[1]
                    else:
                        return False

                    if col1.regex.match(off) \
                       and col2.regex.match(adr):
                        #print "splitting objects"
                        (oobj, aobj) = cls.split_text(obj, off, adr)
                        # TODO: split text into two objects
                        results[col1.name].append(oobj)
                        results[col2.name].append(aobj)
                        added = True
        return added