Python pdfminer.layout.LTFigure() Examples
The following are 5 code examples of pdfminer.layout.LTFigure().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pdfminer.layout, or try the search function.
Example #1
Source File: pile.py From pdf-to-markdown with BSD 3-Clause "New" or "Revised" License | 7 votes |
def parse_layout(self, layout):
    """Walk a layout tree and sort its objects into this object's lists.

    The tree is traversed with an explicit stack; children are pushed in
    reverse so objects are visited in the order the layout yields them.

    * ``LTFigure`` / ``LTTextBox`` / ``LTTextLine`` / ``LTTextBoxHorizontal``
      objects are expanded into their children.
    * ``LTTextLineHorizontal`` objects are appended to ``self.texts``.
    * ``LTRect`` objects narrower than 1.0 are passed to
      ``self._adjust_to_close(..., 'x0')`` and appended to
      ``self.verticals``; otherwise, those shorter than 1.0 are passed to
      ``self._adjust_to_close(..., 'y0')`` and appended to
      ``self.horizontals``. Other rectangles are dropped.
    * ``LTImage`` objects are appended to ``self.images``.
    * ``LTCurve`` / ``LTChar`` / ``LTLine`` objects are ignored.

    Args:
        layout: Iterable of layout objects (e.g. a page layout).

    Raises:
        AssertionError: If an object of any other type is encountered.
    """
    pending = list(reversed(list(layout)))
    while pending:
        obj = pending.pop()
        # Exact type matching is deliberate: each branch keys on the concrete
        # class, not on its ancestors.
        # NOTE(review): several pdfminer layout classes subclass one another,
        # so switching to isinstance() would re-route objects - confirm
        # before changing.
        kind = type(obj)
        if kind in (LTFigure, LTTextBox, LTTextLine, LTTextBoxHorizontal):
            # Container: visit its children next, in their original order.
            pending.extend(reversed(list(obj)))
        elif kind is LTTextLineHorizontal:
            self.texts.append(obj)
        elif kind is LTRect:
            if obj.width < 1.0:
                self._adjust_to_close(obj, self.verticals, 'x0')
                self.verticals.append(obj)
            elif obj.height < 1.0:
                self._adjust_to_close(obj, self.horizontals, 'y0')
                self.horizontals.append(obj)
        elif kind is LTImage:
            self.images.append(obj)
        elif kind in (LTCurve, LTChar, LTLine):
            # Known but irrelevant object types.
            continue
        else:
            # Raised explicitly rather than via ``assert False`` so the check
            # is not stripped when Python runs with -O; the exception type
            # and message are unchanged.
            raise AssertionError("Unrecognized type: %s" % kind)
Example #2
Source File: node.py From pdftotree with MIT License | 5 votes |
def elem_type(elem):
    """Return a short string label for a layout element.

    The element is tested against ``LTLine``, ``LTCurve``, ``LTTextLine`` and
    ``LTFigure`` in that order; the first matching class determines the
    label ("line", "curve", "text" or "figure"). Anything else yields the
    fallback label.
    """
    # Order matters: the first isinstance() match wins.
    labelled_types = (
        (LTLine, "line"),
        (LTCurve, "curve"),
        (LTTextLine, "text"),
        (LTFigure, "figure"),
    )
    for layout_cls, label in labelled_types:
        if isinstance(elem, layout_cls):
            return label
    # The spelling of the fallback label is kept exactly as-is because
    # callers may compare against this literal.
    return "unkown"
Example #3
Source File: pdf.py From PassportEye with MIT License | 5 votes |
def extract_first_jpeg_in_pdf(fstream):
    """
    Reads a given PDF file and scans for the first valid embedded JPEG image.
    Returns either None (if none found) or the bytes of the image data.
    There is no 100% guarantee for this code, yet it seems to work fine with
    most scanner-produced images around. More testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or
    other image types from PDFs, however at the moment I do not have enough
    test data to try this, and the one I have seems to be unsuitable for
    PDFMiner.

    Only images that sit directly inside a top-level ``LTFigure`` of a page
    are considered, and only data starting with the bytes
    ``FF D8 FF E0`` is accepted.

    :param fstream: Readable binary stream of the PDF
    :return: bytes containing the whole contents of the JPEG image, or None
             if extraction failed.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.create_pages(document)
    for page in pages:
        interpreter.process_page(page)
        layout = device.result
        for el in layout:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                # Found an image; try the decoded stream first.
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Failed to decode (seems to happen nearly always -
                    # there's probably a bug in PDFMiner), oh well...
                    # Only Exception is caught so that KeyboardInterrupt and
                    # SystemExit still propagate.
                    imdata = im.stream.get_rawdata()
                # NOTE(review): this prefix only matches JPEGs whose first
                # segment marker is E0; other variants are skipped - confirm
                # whether that is intended.
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata
    return None
Example #4
Source File: pdf.py From ChemDataExtractor with MIT License | 5 votes |
def _process_layout(self, layout): """Process an LTPage layout and return a list of elements.""" # Here we just group text into paragraphs elements = [] for lt_obj in layout: if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine): elements.append(Paragraph(lt_obj.get_text().strip())) elif isinstance(lt_obj, LTFigure): # Recursive... elements.extend(self._process_layout(lt_obj)) return elements
Example #5
Source File: loaders.py From py-pdf-parser with MIT License | 4 votes |
def load(
    pdf_file: IO,
    pdf_file_path: Optional[str] = None,
    la_params: Optional[Dict] = None,
    **kwargs,
) -> PDFDocument:
    """
    Build a PDFDocument from an open PDF file.

    Args:
        pdf_file (io): The PDF file.
        pdf_file_path (str, optional): Passed to `PDFDocument`. See the
            documentation for `PDFDocument`.
        la_params (dict): The layout parameters passed to PDF Miner for
            analysis. See the PDFMiner documentation here:
            https://pdfminersix.readthedocs.io/en/latest/api/composable.html#laparams.
            They are merged over `DEFAULT_LA_PARAMS`. Note that py_pdf_parser
            will re-order the elements it receives from PDFMiner so options
            relating to element ordering will have no effect.
        kwargs: Passed to `PDFDocument`. See the documentation for
            `PDFDocument`.

    Returns:
        PDFDocument: A PDFDocument with the file loaded. Pages on which no
        text elements were found are logged and left out.
    """
    overrides = {} if la_params is None else la_params
    la_params = {**DEFAULT_LA_PARAMS, **overrides}

    pages: Dict[int, Page] = {}
    for page in extract_pages(pdf_file, laparams=LAParams(**la_params)):
        # Top-level text containers always come first.
        elements = [item for item in page if isinstance(item, LTTextContainer)]

        # If all_texts=True then we may get some text from inside figures
        if la_params.get("all_texts"):
            for candidate in page:
                if not isinstance(candidate, LTFigure):
                    continue
                elements.extend(
                    child
                    for child in candidate
                    if isinstance(child, LTTextContainer)
                )

        if elements:
            pages[page.pageid] = Page(
                width=page.width, height=page.height, elements=elements
            )
        else:
            logger.warning(
                f"No elements detected on page {page.pageid}, skipping this page."
            )

    return PDFDocument(pages=pages, pdf_file_path=pdf_file_path, **kwargs)