Python pdfminer.converter.TextConverter() Examples
The following are 27 code examples of pdfminer.converter.TextConverter().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pdfminer.converter, or try the search function.
Example #1
Source File: utils.py From ResumeParser with MIT License | 12 votes |
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files.

    A fresh resource manager, buffer and converter are created for every
    page, and one string is yielded per page.

    :param pdf_path: path to PDF file to be extracted
    :return: iterator of string of extracted text
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle,
                                      codec='utf-8', laparams=LAParams())
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # Close the per-page handles before yielding so they are
                # released even if process_page raises or the caller stops
                # iterating early.
                converter.close()
                fake_file_handle.close()
            yield text
Example #2
Source File: pdf.py From blueflower with GNU General Public License v3.0 | 11 votes |
def pdf_do_pdf(astream, afile):
    """Extract text from the PDF stream and hand it to ``text_do_data``.

    :param astream: open binary stream containing the PDF
    :param afile: name passed through to ``log_error`` / ``text_do_data``
    :return: None; if extraction is not allowed the error is logged and
        ``text_do_data`` is not called
    """
    outstream = io.BytesIO()
    rsrcmgr = PDFResourceManager(caching=True)
    device = TextConverter(rsrcmgr, outstream, codec='utf-8',
                           laparams=LAParams(), imagewriter=None)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            for page in PDFPage.get_pages(astream, set(), maxpages=0, password='',
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        except PDFTextExtractionNotAllowed as e:
            log_error(str(e), afile)
            return
        text_do_data(outstream.getvalue(), afile)
    finally:
        # Release the device and buffer on every path, including the
        # early return above.
        device.close()
        outstream.close()
Example #3
Source File: autosumpdf.py From autosum with MIT License | 7 votes |
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path``.

    The converter writes into an in-memory ``BytesIO`` buffer using the
    'utf-8' codec; the buffer's contents are returned.
    """
    resource_manager = PDFResourceManager()
    buffer = BytesIO()
    converter = TextConverter(resource_manager, buffer, codec='utf-8',
                              laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(path, 'rb') as pdf_file:
        pages = PDFPage.get_pages(pdf_file, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True)
        for page in pages:
            page_interpreter.process_page(page)
    extracted = buffer.getvalue()
    converter.close()
    buffer.close()
    return extracted
Example #4
Source File: resumeparser.py From resume-parser with MIT License | 7 votes |
def convert(fname, pages=None):
    """Extract text from a PDF file.

    :param fname: path of the PDF file to read
    :param pages: optional iterable of page numbers handed to
        ``PDFPage.get_pages``; when falsy an empty set is passed instead
    :return: the text accumulated in the output buffer
    """
    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # ``open`` replaces the Python 2-only ``file`` builtin.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        return output.getvalue()
    finally:
        # The original wrote ``output.close`` without calling it.
        output.close()

#Function to extract names from the string using spacy
Example #5
Source File: pdf_to_txt.py From DLink_Harvester with GNU General Public License v3.0 | 7 votes |
def convert(fp):
    """Return the text extracted from the open PDF file object ``fp``.

    Side effect: sets the root logger's level to ERROR and its
    ``propagate`` flag to False.
    """
    root_logger = logging.getLogger()
    root_logger.propagate = False
    root_logger.setLevel(logging.ERROR)
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    with StringIO() as output:
        # Build the device before entering ``try``: if the constructor
        # fails, ``finally`` must not raise a NameError that hides it.
        device = TextConverter(rsrcmgr, output, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='',
                        caching=caching, check_extractable=True)
            return output.getvalue()
        finally:
            device.close()
Example #6
Source File: Parser.py From ioc_parser with MIT License | 6 votes |
def parse_pdf_pdfminer(self, f, fpath):
    """Extract each page of ``f`` with pdfminer and pass it to ``parse_page``.

    Calls ``self.handler.print_header`` before the first page and
    ``self.handler.print_footer`` after the last. Pages are numbered
    from 1. ``KeyboardInterrupt`` and ``SystemExit`` are re-raised.
    """
    try:
        layout = LAParams()
        layout.all_texts = True
        resources = PDFResourceManager()
        if self.dedup:
            self.dedup_store = set()
        self.handler.print_header(fpath)
        pages = PDFPage.get_pages(f, set(), check_extractable=True)
        for page_num, page in enumerate(pages, 1):
            # A fresh buffer and converter per page keeps page texts separate.
            buf = StringIO()
            converter = TextConverter(resources, buf, codec='utf-8', laparams=layout)
            PDFPageInterpreter(resources, converter).process_page(page)
            data = buf.getvalue()
            buf.close()
            self.parse_page(fpath, data, page_num)
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
Example #7
Source File: metadataPDF.py From EasY_HaCk with Apache License 2.0 | 6 votes |
def getTexts(self):
    """Extract the text of ``self.fname`` into ``self.text``.

    The text is written to the scratch file 'temppdf.txt' in the current
    directory and read back as bytes.

    :return: the string "ok" on success, otherwise the caught exception
        instance (it is returned, not raised)
    """
    tmp_name = 'temppdf.txt'
    try:
        caching = True
        rsrcmgr = PDFResourceManager(caching=caching)
        # ``open`` replaces the Python 2-only ``file`` builtin.
        with open(tmp_name, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())
            try:
                with open(self.fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='',
                                caching=caching, check_extractable=True)
            finally:
                device.close()
        with open(tmp_name, 'rb') as infp:
            self.text = infp.read()
        return "ok"
    except Exception as e:
        return e
    finally:
        # Best-effort cleanup so a failed run does not leave the scratch
        # file behind; a missing file is not an error here.
        try:
            os.remove(tmp_name)
        except OSError:
            pass
Example #8
Source File: BaseTestClasses.py From email2pdf with MIT License | 6 votes |
def getPDFText(self, filename):
    """Return the text of the PDF at ``filename``, or None on ``PSException``."""
    try:
        with io.StringIO() as text_buffer, open(filename, 'rb') as pdf_file:
            resources = PDFResourceManager()
            converter = TextConverter(resources, text_buffer, laparams=LAParams())
            process_pdf(resources, converter, pdf_file, set(), maxpages=0,
                        password="", caching=True, check_extractable=True)
            converter.close()
            return text_buffer.getvalue()
    except PSException:
        return None
Example #9
Source File: pdf_miner.py From ocr-table with MIT License | 6 votes |
def convert(fname):
    """Extract the text of a PDF, print it, and write it to 'output.txt'.

    The raw bytes from the converter are printed as-is. The copy written
    to 'output.txt' is decoded as UTF-8 with each run of two or more
    whitespace characters collapsed to a single space.

    :param fname: path of the PDF file to read
    """
    output = BytesIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                # An empty page-number set is what the original always passed.
                for page in PDFPage.get_pages(infile, set()):
                    interpreter.process_page(page)
        finally:
            converter.close()
        text = output.getvalue()
    finally:
        # The original wrote ``output.close`` without calling it.
        output.close()
    print(text)
    # write to .txt
    # Raw string avoids the invalid "\s" escape; the pattern is unchanged.
    collapsed = re.sub(r"\s\s+", " ", text.decode('utf-8'))
    with open("output.txt", "w") as text_file:
        text_file.write("%s" % collapsed)
Example #10
Source File: pdf_utils.py From keras-english-resume-parser-and-analyzer with MIT License | 6 votes |
def pdf_to_text(fname, pages=None):
    """Return the non-empty, stripped text lines of a PDF.

    :param fname: path of the PDF file
    :param pages: optional iterable of page numbers handed to
        ``PDFPage.get_pages``; an empty set is used when falsy
    :return: list of stripped lines, with blank lines dropped
    """
    pagenums = set(pages) if pages else set()
    buffer = StringIO()
    resources = PDFResourceManager()
    device = TextConverter(resources, buffer, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, device)
    pdf_file = open(fname, 'rb')
    for page in PDFPage.get_pages(pdf_file, pagenums):
        page_interpreter.process_page(page)
    pdf_file.close()
    device.close()
    raw_text = buffer.getvalue()
    buffer.close()
    stripped_lines = (line.strip() for line in raw_text.split('\n'))
    return [line for line in stripped_lines if line != '']
Example #11
Source File: carpe_pdf.py From carpe with Apache License 2.0 | 6 votes |
def parse_content(self):
    """Populate ``self.content`` with the document's extracted text.

    If neither ``self.document`` nor ``self.parse()`` is truthy, the work
    is delegated to ``self.restore_content()`` instead.
    """
    if not (self.document or self.parse()):
        # damaged pdf
        self.restore_content()
        return
    # normal pdf
    use_cache = True
    resources = PDFResourceManager(caching=use_cache)
    text_buffer = io.StringIO()
    converter = TextConverter(resources, text_buffer, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, converter)
    pages = WrapperPDFPage.get_pages(self.pdf, parser=self.parser,
                                     doc=self.document, caching=use_cache)
    for page in pages:
        page_interpreter.process_page(page)
    self.content = text_buffer.getvalue()
    converter.close()
    text_buffer.close()
Example #12
Source File: oa_pdf.py From oadoi with MIT License | 6 votes |
def convert_pdf_to_txt(r, max_pages=3):
    """Extract text from the PDF carried by response object ``r``.

    :param r: object exposing ``status_code``, ``encoding`` and
        ``content_big()`` (presumably an HTTP response wrapper — confirm
        against the caller)
    :param max_pages: forwarded as ``maxpages`` to ``PDFPage.get_pages``
    :return: the converter's output buffer contents, or None when
        ``r.status_code`` is not 200
    """
    # Check the response before allocating pdfminer resources, so the
    # early return cannot leave an unclosed buffer or device behind.
    if r.status_code != 200:
        logger.info(u"error: status code {} in convert_pdf_to_txt".format(r.status_code))
        return None

    if not r.encoding:
        r.encoding = "utf-8"

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        fp = StringIO(r.content_big())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = PDFPage.get_pages(fp, set(), maxpages=max_pages, password="",
                                  caching=True, check_extractable=True)
        for page in pages:
            interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
Example #13
Source File: pdfToText.py From python-automation-scripts with GNU General Public License v3.0 | 6 votes |
def convertPdfToText(path):
    """Convert all pages of the PDF at ``path`` to text and pass the result on.

    The extracted text is handed to ``writeToText`` together with
    ``absolute_path_shortner(path)``. Nothing is returned.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # ``open`` replaces the Python 2-only ``file`` builtin; ``with``
        # closes the input even when a page fails to process.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
            text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    writeToText(text, absolute_path_shortner(path))
Example #14
Source File: main.py From pdf2word with MIT License | 6 votes |
def read_from_pdf(file_path):
    """Return the text that ``process_pdf`` extracts from ``file_path``."""
    with open(file_path, 'rb') as pdf_file:
        manager = PDFResourceManager()
        buffer = StringIO()
        converter = TextConverter(manager, buffer, laparams=LAParams())
        process_pdf(manager, converter, pdf_file)
        converter.close()
        text = buffer.getvalue()
        buffer.close()
    return text
Example #15
Source File: converter.py From cvscan with MIT License | 6 votes |
def pdf_to_txt(file_name):
    """Extract and clean the text of the PDF at ``file_name``.

    Carriage returns are replaced by newlines and matches of
    ``regex.bullet`` by a space. The result is then decoded as ASCII with
    undecodable bytes ignored.

    :return: the cleaned text, or '' if any exception occurs (the error
        is logged, not raised)
    """
    try:
        # Setting up pdf reader
        pdf_resource_manager = PDFResourceManager()
        return_string = StringIO()
        device = TextConverter(pdf_resource_manager, return_string,
                               codec='utf-8', laparams=LAParams())
        try:
            try:
                interpreter = PDFPageInterpreter(pdf_resource_manager, device)
                with open(file_name, 'rb') as file_pointer:
                    for page in PDFPage.get_pages(file_pointer, set(), maxpages=0,
                                                  password="", caching=True,
                                                  check_extractable=True):
                        interpreter.process_page(page)
            finally:
                device.close()
            # Get full string from PDF
            pdf_txt = return_string.getvalue()
        finally:
            return_string.close()

        # Formatting removing and replacing special characters
        pdf_txt = pdf_txt.replace("\r", "\n")
        pdf_txt = re.sub(regex.bullet, " ", pdf_txt)
        # NOTE(review): ``.decode`` assumes ``pdf_txt`` is a byte string
        # (Python 2 ``str``) — confirm before running on Python 3.
        return pdf_txt.decode('ascii', errors='ignore')
    except Exception as exception_instance:
        logging.error('Error converting pdf to txt: '+str(exception_instance))
        return ''
Example #16
Source File: pdf.py From yeti with Apache License 2.0 | 6 votes |
def do_import(self, results, filepath):
    """Extract the text of the PDF at ``filepath`` into the investigation.

    A newline is appended to the buffer after each page, and the whole
    buffer is stored via
    ``results.investigation.update(import_text=...)``.
    """
    laparams = LAParams()
    laparams.all_texts = True
    rsrcmgr = PDFResourceManager()
    buff = StringIO()
    try:
        # The device and interpreter do not depend on the page, so build
        # them once instead of once per page.
        device = TextConverter(rsrcmgr, buff, codec='utf-8', laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(filepath, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), check_extractable=True):
                interpreter.process_page(page)
                buff.write("\n")
        results.investigation.update(import_text=buff.getvalue())
    finally:
        buff.close()
Example #17
Source File: youdao.py From FengTools with MIT License | 6 votes |
def pdf_2_txt(pdf):
    """Write the text of the PDF at ``pdf`` to ``pdf + '.txt'``.

    Also sets the class-level ``debug`` attribute of
    ``PDFResourceManager`` and ``PDFPageInterpreter`` to 0.

    :return: path of the text file that was written
    """
    target_path = pdf + '.txt'
    debug_level = 0
    extra_rotation = 0
    use_cache = True
    layout = LAParams()
    PDFResourceManager.debug = debug_level
    PDFPageInterpreter.debug = debug_level
    resources = PDFResourceManager(caching=use_cache)
    out_handle = open(target_path, 'w', encoding="utf8")
    # 'utf-8' is the output encoding handed to the converter
    converter = TextConverter(resources, out_handle, codec='utf-8',
                              laparams=layout, imagewriter=None)
    pdf_handle = open(pdf, 'rb')
    page_interpreter = PDFPageInterpreter(resources, converter)
    # Process the content of every page in the document object
    for page in PDFPage.get_pages(pdf_handle, set(), maxpages=0, password='',
                                  caching=use_cache, check_extractable=True):
        page.rotate = (page.rotate + extra_rotation) % 360
        page_interpreter.process_page(page)
    pdf_handle.close()
    converter.close()
    out_handle.close()
    return target_path
Example #18
Source File: Converter.py From SimplyEmail with GNU General Public License v3.0 | 6 votes |
def convert_pdf_to_txt(self, path):
    """
    A very simple conversion function
    which returns text for parsing from PDF.

    path = The path to the file

    Returns the extracted text, or "" if any exception occurs (the
    failure is logged through ``self.logger``).
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # ``open`` replaces the Python 2-only ``file`` builtin.
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                              caching=True, check_extractable=True):
                    interpreter.process_page(page)
                text = retstr.getvalue()
        finally:
            device.close()
            retstr.close()
        return text
    except Exception as e:
        # Log before returning: the original returned first, which left
        # the logging call unreachable.
        self.logger.error(
            "Failed to PDF to text: " + str(e))
        return ""
Example #19
Source File: metadataPDF.py From ITWSV with MIT License | 6 votes |
def getTexts(self):
    """Extract the text of ``self.fname`` into ``self.text``.

    The text is written to the scratch file 'temppdf.txt' in the current
    directory and read back as bytes.

    :return: the string "ok" on success, otherwise the caught exception
        instance (it is returned, not raised)
    """
    tmp_name = 'temppdf.txt'
    try:
        caching = True
        rsrcmgr = PDFResourceManager(caching=caching)
        # ``open`` replaces the Python 2-only ``file`` builtin.
        with open(tmp_name, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())
            try:
                with open(self.fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='',
                                caching=caching, check_extractable=True)
            finally:
                device.close()
        with open(tmp_name, 'rb') as infp:
            self.text = infp.read()
        return "ok"
    except Exception as e:
        return e
    finally:
        # Best-effort cleanup so a failed run does not leave the scratch
        # file behind; a missing file is not an error here.
        try:
            os.remove(tmp_name)
        except OSError:
            pass
Example #20
Source File: metadataPDF.py From Yuki-Chan-The-Auto-Pentest with MIT License | 6 votes |
def getTexts(self):
    """Extract the text of ``self.fname`` into ``self.text``.

    The text is written to the scratch file 'temppdf.txt' in the current
    directory and read back as bytes.

    :return: the string "ok" on success, otherwise the caught exception
        instance (it is returned, not raised)
    """
    tmp_name = 'temppdf.txt'
    try:
        caching = True
        rsrcmgr = PDFResourceManager(caching=caching)
        # ``open`` replaces the Python 2-only ``file`` builtin.
        with open(tmp_name, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())
            try:
                with open(self.fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='',
                                caching=caching, check_extractable=True)
            finally:
                device.close()
        with open(tmp_name, 'rb') as infp:
            self.text = infp.read()
        return "ok"
    except Exception as e:
        return e
    finally:
        # Best-effort cleanup so a failed run does not leave the scratch
        # file behind; a missing file is not an error here.
        try:
            os.remove(tmp_name)
        except OSError:
            pass
Example #21
Source File: convert_pdf.py From python-tools with MIT License | 6 votes |
def convert_pdf(input_file, format='text', codec='utf-8'):
    """Convert PDF file to text or html.

    Args:
        input_file (str): Input PDF file.
        format (str): Format text or html.
        codec (str): Codec for encode the text.

    Returns:
        str: Return text or html from PDF file.

    Raises:
        ValueError: If ``format`` is neither 'text' nor 'html'.
    """
    manager = PDFResourceManager()
    output = BytesIO()
    laparams = LAParams()

    if format == 'text':
        converter = TextConverter(manager, output, codec=codec, laparams=laparams)
    elif format == 'html':
        converter = HTMLConverter(manager, output, codec=codec, laparams=laparams)
    else:
        # The original fell through with ``converter`` unbound and failed
        # later with an unrelated UnboundLocalError.
        output.close()
        raise ValueError("Unsupported format: %r (expected 'text' or 'html')" % (format,))

    try:
        try:
            with open(input_file, 'rb') as f1:
                interpreter = PDFPageInterpreter(manager, converter)
                for page in PDFPage.get_pages(f1, caching=True, check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the converter before reading the buffer, as the
            # original did.
            converter.close()
        text = output.getvalue()
    finally:
        output.close()

    # Decode with the codec the converter was told to encode with, not
    # the interpreter's default.
    return text.decode(codec)
Example #22
Source File: iocp.py From connectors with Apache License 2.0 | 5 votes |
def parse_pdf_pdfminer(self, f, fpath):
    """Parse every page of ``f`` and collect the ``parse_page`` results.

    Calls ``self.handler.print_header`` / ``print_footer`` around the
    page loop. Pages are numbered from 1.

    Returns the list of per-page results. If an ordinary exception occurs
    it is reported through ``self.handler.print_error`` and None is
    returned implicitly. ``KeyboardInterrupt`` and ``SystemExit`` are
    re-raised.
    """
    try:
        parsed_pages = []
        layout = LAParams()
        layout.all_texts = True
        resources = PDFResourceManager()
        if self.dedup:
            self.dedup_store = set()
        self.handler.print_header(fpath)
        pages = PDFPage.get_pages(f, set(), check_extractable=True)
        for page_num, page in enumerate(pages, 1):
            # A fresh buffer and converter per page keeps page texts separate.
            buf = StringIO()
            converter = TextConverter(resources, buf, laparams=layout)
            PDFPageInterpreter(resources, converter).process_page(page)
            data = buf.getvalue()
            buf.close()
            parsed_pages.append(self.parse_page(fpath, data, page_num))
        self.handler.print_footer(fpath)
        return parsed_pages
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
Example #23
Source File: helper.py From resilient-community-apps with MIT License | 5 votes |
def extract_text_from_pdf(cls, attachment_input):
    """
    Write the attachment bytes to a temporary file and extract its text
    with pdfminer.

    :param attachment_input: attachment Bytes data from resilient api call
    :return: Text Data
    :raises ValueError: wrapping any exception raised while writing or
        parsing the temporary file
    """
    # Set logs for pdfminer to ERROR as too much noise in logs
    logging.getLogger('pdfminer').setLevel(logging.ERROR)
    manager = PDFResourceManager()
    # To Handle unicode conversion in python 2 and python 3
    text_sink = io.BytesIO() if six.PY2 else io.StringIO()
    converter = TextConverter(manager, text_sink)
    interpreter = PDFPageInterpreter(manager, converter)
    extracted_input = u""
    with tempfile.NamedTemporaryFile(mode="w+b", delete=True) as temp_pdf_file:
        try:
            # Write the attachment into the temp file
            temp_pdf_file.write(attachment_input)
            # Read the pages back from the same temp file handle
            pages = PDFPage.get_pages(temp_pdf_file, caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
            extracted_input = text_sink.getvalue()
        except Exception as error_msg:
            raise ValueError("Failed Convert .pdf files data to string format. Error: {0}".format(error_msg))
        finally:
            # close open handles
            converter.close()
            text_sink.close()
    return extracted_input
Example #24
Source File: web_pdf_reading.py From accel-brain-code with GNU General Public License v2.0 | 4 votes |
def path_to_text(self, path):
    '''
    Read a local PDF file and return its text with newlines removed.

    Args:
        path:   path to PDF file.

    Returns:
        string.
    '''
    resources = PDFResourceManager()
    buffer = StringIO()
    converter = TextConverter(resources, buffer, codec='utf-8', laparams=LAParams())
    pdf_file = open(path, 'rb')
    page_interpreter = PDFPageInterpreter(resources, converter)
    pages = PDFPage.get_pages(pdf_file, set(), maxpages=0, password="",
                              caching=True, check_extractable=True)
    for page in pages:
        page_interpreter.process_page(page)
    flattened = buffer.getvalue().replace("\n", "")
    pdf_file.close()
    converter.close()
    buffer.close()
    return flattened
Example #25
Source File: pdfminer_wrapper.py From invoice2data with MIT License | 4 votes |
def to_text(path):
    """Wrapper around `pdfminer`.

    Parameters
    ----------
    path : str
        path of electronic invoice in PDF

    Returns
    -------
    str : str
        returns extracted text from pdf, encoded as UTF-8

    """
    try:
        # python 2
        from StringIO import StringIO
        import sys

        reload(sys)  # noqa: F821
        sys.setdefaultencoding("utf8")
    except ImportError:
        from io import StringIO

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage

    resources = PDFResourceManager()
    buffer = StringIO()
    layout = LAParams()
    layout.all_texts = True
    converter = TextConverter(resources, buffer, laparams=layout)
    with open(path, "rb") as pdf_file:
        page_interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.get_pages(pdf_file, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            page_interpreter.process_page(page)
    converter.close()
    extracted = buffer.getvalue()
    buffer.close()
    return extracted.encode("utf-8")
Example #26
Source File: pdftitle.py From pdftitle with GNU General Public License v3.0 | 4 votes |
def get_title_from_io(pdf_io):
    # pylint: disable=too-many-locals
    """Guess the title of a PDF from the text blocks of its first page.

    Picks the block with the largest value at index 1 (logged as
    ``max_tfs``). Among those, picks the largest value at index 3 (logged
    as ``max_y``). The strings at index 4 of the first such block are
    joined to form the title. Returns None when ``doc.is_extractable``
    is falsy.
    """
    parser = PDFParser(pdf_io)
    # if pdf is protected with a pwd, 2nd param here is password
    doc = PDFDocument(parser)
    # pdf may not allow extraction
    if not doc.is_extractable:
        return None

    rm = PDFResourceManager()
    dev = TextOnlyDevice(rm)
    interpreter = TextOnlyInterpreter(rm, dev)

    first_page = StringIO()
    converter = TextConverter(rm, first_page, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(rm, converter)

    # Only the first page is processed, by both interpreters.
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        page_interpreter.process_page(page)
        break

    converter.close()
    first_page_text = first_page.getvalue()
    first_page.close()
    dev.recover_last_paragraph()

    verbose('all blocks')
    for candidate in dev.blocks:
        verbose(candidate)

    # find max font size
    max_tfs = max(candidate[1] for candidate in dev.blocks)
    verbose('max_tfs: ', max_tfs)
    # find max blocks with max font size
    max_blocks = [candidate for candidate in dev.blocks if candidate[1] == max_tfs]
    # find the one with the highest y coordinate
    # this is the most close to top
    max_y = max(candidate[3] for candidate in max_blocks)
    verbose('max_y: ', max_y)
    found_blocks = [candidate for candidate in max_blocks if candidate[3] == max_y]

    verbose('found blocks')
    for candidate in found_blocks:
        verbose(candidate)

    title = ''.join(found_blocks[0][4]).strip()

    # Retrieve missing spaces if needed
    if " " not in title:
        title = retrieve_spaces(first_page_text, title)

    # Remove duplicate spaces if any are present
    if " " in title:
        title = " ".join(title.split())

    return title
Example #27
Source File: update_itunes.py From xlinkBook with MIT License | 4 votes |
def itunesPdfParser(self, data):
    """Convert the PDF at ``data`` to text and collect course entries.

    The text is written to ``data + '.txt'``, scanned line by line, and
    the scratch file is removed afterwards.

    :param data: path of the PDF file
    :return: list of accumulated entry strings
    """
    outfile = data + '.txt'
    rsrcmgr = PDFResourceManager()
    # ``open`` replaces the Python 2-only ``file`` builtin. The original
    # never closed the input file; ``with`` closes both handles.
    with open(data, 'rb') as fp, open(outfile, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec="utf-8", laparams=LAParams())
        try:
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        finally:
            device.close()
    # The original also built an unused StringIO and read it back into
    # ``data``; it was never written to, so that dead code is dropped.

    course_list = []
    result = ''
    i = 0
    try:
        with open(outfile, 'r') as f:
            # NOTE(review): the nesting below was reconstructed from a
            # source whose indentation was lost; the ``else`` is paired
            # with the inner ``if`` — confirm against the upstream file.
            for line in f.readlines():
                if len(line.strip()) > 0:
                    i += 1
                    if i < 3 and line.find('…') == -1:
                        result += line.strip() + ' '
                    else:
                        if result.strip() == 'Explore App Store':
                            break
                        if result.find('iTunes U') != -1 or result.find('Featured Featured') != -1:
                            result = ''
                            i = 0
                            continue
                        course_list.append(result)
                        result = ''
                        i = 0
    finally:
        os.remove(outfile)
    return course_list