Python newspaper.Article() Examples
The following are 30 code examples of newspaper.Article(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module newspaper, or try the search function.
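Most of the examples below follow the same core pattern: construct an Article, download it, parse it, and optionally run nlp() before reading its attributes. The snippet below is a minimal sketch of that flow, not taken from any of the listed projects; the URL is a placeholder, and the nlp() step assumes the NLTK punkt data used by newspaper's keyword/summary extraction is installed.

from newspaper import Article

# Placeholder URL -- substitute any article page you want to extract.
url = 'https://example.com/some-news-story'

article = Article(url, language='en', fetch_images=False)
article.download()   # fetch the HTML over HTTP
article.parse()      # extract title, text, authors, publish date, top image
article.nlp()        # keyword/summary extraction; requires NLTK's punkt tokenizer

print(article.title)
print(article.authors)
print(article.publish_date)
print(article.keywords)
print(article.summary)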
Example #1
Source File: core.py From Python-DevOps with MIT License | 7 votes |
def get_article(link, news, date):
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    lang = 'eng'
    if len(article.title) < 5 or len(article.text) < 5:
        print('found BM/ID article')
        article = Article(link, language = 'id')
        article.download()
        article.parse()
        article.nlp()
        lang = 'id'
    return {
        'title': article.title,
        'url': link,
        'authors': article.authors,
        'top-image': article.top_image,
        'text': article.text,
        'keyword': article.keywords,
        'summary': article.summary,
        'news': news,
        'date': date,
        'language': lang,
    }
Example #2
Source File: cnn_dm_downloader.py From TransferRL with MIT License | 6 votes |
def run(param):
    (article_dir, title_dir, html_path) = param
    try:
        raw_html = open(html_path, encoding="ascii", errors="surrogateescape").read().strip()
    except:
        raw_html = open(html_path, encoding=encoding_detector(html_path), errors="surrogateescape").read().strip()
    id = html_path.split('/')[-1].split('.')[0]
    a = Article('http:/www.dummy.com', language='en')
    a.download(input_html=raw_html)
    a.parse()
    title = a.title
    text = a.text
    title = remove_non_ascii(title)
    text = remove_non_ascii(text)
    fw = open('{}/{}'.format(article_dir, id), 'w', encoding='utf-8')
    fw.write(text)
    fw.close()
    fw = open('{}/{}'.format(title_dir, id), 'w', encoding='utf-8')
    fw.write(title)
    fw.close()
Example #3
Source File: scrapers.py From openwebtext with GNU General Public License v3.0 | 6 votes |
def bs4_scraper(url, memoize):
    t1 = time.time()

    try:
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = article.html
        soup = bs4.BeautifulSoup(html, "lxml")
        text, count = find_and_filter_tag("p", soup)
        # DDB: keep text as a single string for consistency with
        # newspaper_scraper
        text = " ".join(text)
    except:
        return None, None

    metadata = {
        "url": url,
        "word_count": count,
        "elapsed": time.time() - t1,
        "scraper": "bs4",
    }
    return text, metadata
Example #4
Source File: scrapers.py From openwebtext with GNU General Public License v3.0 | 6 votes |
def newspaper_scraper(url, memoize):
    t1 = time.time()

    try:
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        article.parse()
        text = article.text
        count = len(text.split())
    except:
        return None, None

    metadata = {
        "url": url,
        "word_count": count,
        "elapsed": time.time() - t1,
        "scraper": "newspaper",
    }
    return text, metadata
Example #5
Source File: watch.py From Stockeye with MIT License | 6 votes |
def summarizeArticles(articles, length, firstlast = False):
    summedArticles = []
    for a in articles:
        try:
            A = Article(a.link)
            A.download()
            A.parse()

            text = ""
            paragraphs = A.text.split('\n')
            for p in paragraphs:
                if len(p) > 100:
                    a.body.append(p)
                    text += p + ' '

            sentences = summarize(text, length, firstlast)
            for s in sentences:
                a.summary.append(s)

            summedArticles.append(a)
        except:
            pass
    return summedArticles
Example #6
Source File: scrapers.py From openwebtext with GNU General Public License v3.0 | 6 votes |
def raw_scraper(url, memoize):
    t1 = time.time()
    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = minify(article.html)
        html = cleaner.clean_html(html)
        article.parse()
    except:
        return None, None

    if article.text == "":
        return None, None

    metadata = {"url": url, "elapsed": time.time() - t1, "scraper": "raw"}
    return html, metadata
Example #7
Source File: process_ccrawl.py From grover with Apache License 2.0 | 5 votes |
def __init__(self, html):
    self.html = html if html is not None else ""
    self.dummy_article = newspaper.Article(url='', fetch_images=False, verbose=True)
    self.dummy_article.set_html(html)
    self.dummy_article.parse()
    self.text = _filter_excessive_newlines(self.dummy_article.text)
    self.authors = self.dummy_article.authors
    self.authors = [x for x in self.authors if len(x.split(' ')) < 10]
    self.title = self.dummy_article.title

    # sometimes the text started with the title... that's bad
    if self.text.startswith(self.title + '\n'):
        self.text = self.text[len(self.title):].lstrip('\n')

    if self.dummy_article.publish_date and not isinstance(self.dummy_article.publish_date, str):
        try:
            self.publish_date = self.dummy_article.publish_date.date().strftime("%m-%d-%Y")
        except AttributeError:
            self.publish_date = None
    else:
        self.publish_date = None
    self._extract_summary()
Example #8
Source File: extract_text.py From openwebtext with GNU General Public License v3.0 | 5 votes |
def parse_file(filename):
    with open(filename, "rt") as f:
        html = f.read()
    url_hash = md5(html.encode("utf-8")).hexdigest()
    article = newspaper.Article(url=url_hash, fetch_images=False)
    article.set_html(html)
    article.parse()
    return filename, article.text
Example #9
Source File: samacharbot2.py From samacharbot2 with GNU General Public License v3.0 | 5 votes |
def summarizeOther(url):
    article = Article(url)
    article.download()
    article.parse()
    text = article.text
    keypoints = summary(text)
    summ = article.title
    keypoints = keypoints.replace("`", "")
    keypoints = keypoints.replace("#", "\#")
    return summ, keypoints, text
Example #10
Source File: core.py From samacharbot2 with GNU General Public License v3.0 | 5 votes |
def newspaper_extractor(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
    except Exception as e:
        print e
        return None, None, None

    return article.title, article.meta_description, article.text
Example #11
Source File: topic_extraction.py From nlp-architect with Apache License 2.0 | 5 votes |
def load_url_content(url_list):
    """
    Load articles content into a list of docs (texts)

    Args:
        url_list (List[String]): A list of urls

    Returns:
        A list of documents (List[String])
    """
    files_content = []
    url = ""
    try:
        for url in url_list:
            try:
                url = str(url)
                logger.info("loading %s", url)
                article = Article(url)
                article.download()
                article.parse()
                files_content.append(article.title + " " + article.text)
            except Exception as e:
                logger.error(str(e))
    except Exception as e:
        logger.error("Error in load_text: %s, for url: %s", str(e), str(url))
    return files_content
Example #12
Source File: newspaper_extractor.py From news-please with Apache License 2.0 | 5 votes |
def extract(self, item):
    """Creates an instance of Article without a Download and returns an ArticleCandidate with the results of
    parsing the HTML-Code.

    :param item: A NewscrawlerItem to parse.
    :return: ArticleCandidate containing the recovered article data.
    """
    article_candidate = ArticleCandidate()
    article_candidate.extractor = self._name()

    article = Article('')
    article.set_html(item['spider_response'].body)
    article.parse()
    article_candidate.title = article.title
    article_candidate.description = article.meta_description
    article_candidate.text = article.text
    article_candidate.topimage = article.top_image
    article_candidate.author = article.authors
    if article.publish_date is not None:
        try:
            article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
        except ValueError as exception:
            self.log.debug('%s: Newspaper failed to extract the date in the supported format,'
                           'Publishing date set to None' % item['url'])
    article_candidate.language = article.meta_lang

    return article_candidate
Example #13
Source File: sentimark.py From python-qutescript with BSD 2-Clause "Simplified" License | 5 votes |
def sentiment_markup(request):
    article = Article(request.url)
    # article.download(request.html, request.title)
    article.download()
    article.parse()
    html = generate_html(article.text.split('\n\n'), article.title).render()
    request.send_html(html)
Example #14
Source File: news.py From vexbot with GNU General Public License v3.0 | 5 votes |
def summarize_article(self, url: str, *args, **kwargs):
    article = newspaper.Article(url)
    article.download()
    article.parse()
    summarization = gensim.summarization.summarize(article.text)
    return summarization
Example #15
Source File: audio_actions.py From pockebot with MIT License | 5 votes |
def make_an_audio(url, filename, lang=None):
    if lang is None:
        lang = 'en'
    article = Article(url)
    article.download()
    article.parse()
    tts = gTTS(text=article.text, lang=lang)
    f = open(join('audio', filename), 'wb')
    tts.write_to_fp(f)
    f.close()
Example #16
Source File: newsdownload.py From File-Maker with GNU Affero General Public License v3.0 | 5 votes |
def newspaper_init(self):
    self.newsdata = newspaper.Article(self.url, language=self.language)
    self.newsdata.download()
    try:
        self.newsdata.parse()
    except newspaper.article.ArticleException:
        # trying again
        self.newsdata.parse()
    except newspaper.article.ArticleException:
        return []
    self.article = self.newsdata.text
    self.picture = self.newsdata.top_image
    self.html = self.newsdata.html
    self.soup = BeautifulSoup(self.html, "lxml")
Example #17
Source File: preprocess.py From PyTLDR with GNU General Public License v3.0 | 5 votes |
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
Example #18
Source File: generic.py From mma-dexter with Apache License 2.0 | 5 votes |
def crawl(self, doc):
    """ Crawl this document. """
    # instantiate and download article
    article = Article(url=doc.url, language='en', fetch_images=False, request_timeout=10)
    article.download()

    # extract content
    self.extract(doc, article)
Example #19
Source File: __init__.py From sneakpeek with MIT License | 5 votes |
def handle(cls, url):
    article = Article(url)
    article.download()
    article.parse()
    title = article.title
    body = article.text

    return Comment(title, body)
Example #20
Source File: __init__.py From sneakpeek with MIT License | 5 votes |
def make_comment(cls, best_candidate):
    url = f"https://www.pressreader.com{best_candidate}"
    article = Article(url, browser_user_agent="Googlebot-News", keep_article_html=True)
    article.download()
    try:
        article.parse()
    except:
        return Comment('', '')
    title = article.title.replace("\xad", "")  # clean the text
    body = article.text.replace("\xad", "")  # clean the text
    print(f"checking the article in this url: {url} with title {title}")
    return Comment(title, body)
Example #21
Source File: __init__.py From sneakpeek with MIT License | 5 votes |
def handle_non_premium(cls):
    """Handle a non-premium article."""
    article = Article(cls.url)
    article.download()
    article.parse()
    title = article.title
    body = article.text

    return Comment(title, body)
Example #22
Source File: extract_text.py From GPT2 with MIT License | 5 votes |
def parse_file(file_entry):
    file_name, html = file_entry
    url_hash = md5(html).hexdigest()
    article = newspaper.Article(url=url_hash, fetch_images=False)
    article.set_html(html)
    article.parse()
    return (file_name, article.text)
Example #23
Source File: text_analysis.py From OctoBot-Tentacles with GNU Lesser General Public License v3.0 | 5 votes |
def analyse_web_page_article(self, url):
    article = Article(url)
    article.download()
    article.parse()
    return article, self.analyse(article.text)

# return a list of high influential value websites
Example #24
Source File: extraction.py From geograpy2 with MIT License | 5 votes |
def download_text(self):
    """Downloads text from self.url and strip HTML tags.
    """
    if not self.text and self.url:
        a = Article(self.url)
        a.download()
        a.parse()
        self.text = a.text
Example #25
Source File: news.py From W.I.L.L with MIT License | 4 votes |
def news_reader(event):
    '''Use the excellent newspaper module to fetch the news from the readers favorite site'''
    response = {"type": "success", "text": None, "data": {}}
    db = event['db']
    event_user = event['username']
    user_table = db['users'].find_one(username=event_user)
    user_news_site = user_table["news_site"]
    news_table = db["news"]
    cached_sites = [list(site.values())[0] for site in db.query("SELECT site from `news`")]
    log.debug("Cached sites are {0}".format(cached_sites))
    if user_news_site in cached_sites:
        site_row = news_table.find_one(site=user_news_site)
        site_time = site_row["time"]
        if time.time() < site_time + 43200:
            log.info("Using cached news for site {0}".format(user_news_site))
            news_str = site_row["news_str"]
            response["text"] = news_str
    log.info("Parsing news site {0} for user {1}".format(user_news_site, event_user))
    site_object = newspaper.build(user_news_site, memoize_articles=False)
    log.debug("Finished building newspaper object")
    top_articles = site_object.articles[0:4]
    log.debug("Top articles are {0}".format(list(top_articles)))
    output_strs = []

    # Use multithreading to build the objects for the articles
    def build_article_object(article_url):
        '''Build a formatted string with the article title, summary, and url'''
        log.debug("Building article object for article {0}".format(article_url))
        article = newspaper.Article(article_url)
        log.debug("Downloading article {0}".format(article_url))
        article.download()
        log.debug("Finished downloading article {0}, parsing".format(article_url))
        try:
            article.parse()
            log.debug("Finished debugging {0}, running nlp".format(article_url))
            article.nlp()
            article_str = "{0} ({1})\n{2}\n".format(
                str(article.title).encode('ascii', 'ignore'),
                article_url,
                str(article.summary))
            output_strs.append(str(article_str))
        except newspaper.article.ArticleException:
            log.info(":{0}:Article exception with url {1}".format(event["session"]["id"], article_url))

    article_threads = []
    for article in top_articles:
        article_thread = threading.Thread(target=build_article_object, args=(article.url, ))
        article_threads.append(article_thread)
    [thread.start() for thread in article_threads]
    log.debug("Started news parsing threads, waiting for parsing to finish")
    [thread.join() for thread in article_threads]
    log.debug("Compiling article output {0} into string".format(output_strs))
    output_str = '\n'.join(output_strs)
    log.debug("Returning output string {0}".format(output_str))
    db["news"].upsert(dict(site=user_news_site, time=time.time(), news_str=output_str), ['site'])
    response["text"] = output_str
    return response
Example #26
Source File: search.py From W.I.L.L with MIT License | 4 votes |
def search_google(query):
    '''Search google and determine if wikipedia is in it'''
    search_object = google.search(query)
    # Determine if a wikipedia url is in the first 5 searches
    urls = []
    for i in range(0, 4):
        url = search_object.__next__()
        urls.append(url)
        if "wikipedia.org/wiki" in url:
            wikipedia_search = wikipedia.search(query)[0]
            url = wikipedia.page(wikipedia_search).url
            response = wikipedia.summary(wikipedia_search) + " ({0})".format(url)
            return response
    # If there were no wikipedia pages
    first_url = urls[0]
    try:
        article = Article(first_url)
        article.download()
        article.parse()
        article.nlp()
        article_summary = article.summary
        article_title = article.title
        return "{0}\n{1} - ({2})".format(
            article_summary, article_title, first_url
        )
    except Exception as article_exception:
        try:
            log.debug("Got error {0}, {1} while using newspaper, switching to bs4".format(
                article_exception.message, article_exception.args
            ))
            html = requests.get(first_url).text
            # Parse the html using bs4
            soup = BeautifulSoup(html, "html.parser")
            [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
            text = soup.getText()
            # break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            # drop blank lines
            soup_text = '\n'.join(chunk for chunk in chunks if " " in chunk)
            response = format(soup_text) + " ({0})".format(first_url)
            return response
        except Exception as search_exception:
            log.info("Error {0},{1} occurred while searching query {2}".format(
                search_exception.message, search_exception.args, query
            ))
            return "Error encountered on query {0}".format(query)
Example #27
Source File: sentiment.py From stocksight with Apache License 2.0 | 4 votes |
def tweeklink_sentiment_analysis(url):
    # get text summary of tweek link web page and run sentiment analysis on it
    try:
        logger.info('Following tweet link %s to get sentiment..' % url)
        article = Article(url)
        article.download()
        article.parse()
        # check if twitter web page
        if "Tweet with a location" in article.text:
            logger.info('Link to Twitter web page, skipping')
            return None
        article.nlp()
        tokens = article.keywords
        print("Tweet link nltk tokens:", tokens)

        # check for min token length
        if len(tokens) < 5:
            logger.info("Tweet link does not contain min. number of tokens, not adding")
            return None
        # check ignored tokens from config
        for t in nltk_tokens_ignored:
            if t in tokens:
                logger.info("Tweet link contains token from ignore list, not adding")
                return None
        # check required tokens from config
        tokenspass = False
        tokensfound = 0
        for t in nltk_tokens_required:
            if t in tokens:
                tokensfound += 1
                if tokensfound == nltk_min_tokens:
                    tokenspass = True
                    break
        if not tokenspass:
            logger.info("Tweet link does not contain token from required list or min required, not adding")
            return None

        summary = article.summary
        if summary == '':
            logger.info('No text found in tweet link url web page')
            return None
        summary_clean = clean_text(summary)
        summary_clean = clean_text_sentiment(summary_clean)
        print("Tweet link Clean Summary (sentiment): " + summary_clean)
        polarity, subjectivity, sentiment = sentiment_analysis(summary_clean)
        return polarity, subjectivity, sentiment

    except ArticleException as e:
        logger.warning('Exception: error getting text on Twitter link caused by: %s' % e)
        return None
Example #28
Source File: summarizer.py From ns with MIT License | 4 votes |
def generate_summary(topic, words):
    """Return summary of the topic subjected to word limit."""
    print("Generate Summary %s" % topic)

    def query_links(topic):
        query = urllib.parse.urlencode({
            "q": "'" + topic + "'",
            "count": 4
        })
        headers = {
            "Ocp-Apim-Subscription-Key": API_KEY
        }
        url = API_ROOT + "?%s" % query
        r = requests.get(url, headers=headers)
        return r

    query_job = gevent.spawn(query_links, topic)
    gevent.joinall([query_job], 5000)
    result = query_job.value.json()
    links = [x["url"] for x in result["value"]]
    names = [x["name"] for x in result["value"]]
    lines = []

    def download_and_clean(url):
        try:
            print("Download " + url)
            article = Article(url)
            article.download()
            print("Parse " + url)
            article.parse()
            text = article.text
            top_image = article.top_image
        except:
            print("Failed to get " + url)
            text = ""
            top_image = ""
        return text, top_image

    jobs = [gevent.spawn(download_and_clean, url) for url in links[:4]]
    gevent.joinall(jobs, timeout=10)
    lines = [job.value[0] for job in jobs if job.value and job.value[0] and len(job.value[0]) > 100]
    top_images = [job.value[1] for job in jobs if job.value and job.value[1]]
    gc.collect()
    try:
        summary = sumbasic.orig(lines, words)
    except ValueError:
        print("Generate Summary failed for " + str(links))
        traceback.print_exc()
        summary = "Generating summary failed"
    print("Generate Summary complete for " + str(links))
    return summary, top_images, links, names, topic, words
Example #29
Source File: analyse.py From Stock-Analysis with MIT License | 4 votes |
def SaSentimentRSS(symbol):
    url = "http://seekingalpha.com/symbol/" + symbol + ".xml"
    url2 = "http://feeds.finance.yahoo.com/rss/2.0/headline?s=" + symbol + "&region=US&lang=en-US"
    url3 = "http://www.google.ca/finance/company_news?q=" + symbol + "&output=rss"

    # gets list of links from above RSS feed
    NewsURLs = getSaURL(url)
    NewsURLs += RSS_URL.getURLs2(url2)
    NewsURLs += RSS_URL.getURLs2(url3)

    # String to be written to file
    toBeWrittenToFile = ''

    for link in NewsURLs:
        try:
            # gets article portion of the htmltext
            a = Article(link)
            a.download()
            a.parse()

            # not working if it's RSS title link or has no title or cannot be accessed
            if symbol in a.title and not 'Earnings Call Webcast' in a.title and not 'Stock Market Insights' in a.title and not '400 Bad Request' in a.title and not '403 Forbidden' in a.title and a.title != '':
                UnicodeArticle = a.text
                StringArticle = UnicodeArticle.encode('ascii', 'ignore')
                StrippedArticle = StringArticle.replace('\n', '')

                # not working with articles less than 300 words
                if len(StrippedArticle) > 200:
                    # remove ascii symbols
                    ArticleTitle = a.title.encode('ascii', 'ignore').replace(',', '')

                    # filters out irrelevant articles
                    if 'Transcript' not in ArticleTitle and 'Summary' not in ArticleTitle:
                        # writes sentiment from sentiment API to file
                        # locks this block so that only one thread can write to file at a time

                        # vader sentiment dictionary
                        s = vaderSentiment.sentiment(StrippedArticle)

                        # not writing articles with zero sentiments
                        # collect a string to be written to file
                        if s['compound'] != 0:
                            # print(ArticleTitle)
                            toBeWrittenToFile += (
                                str(symbol) + ',' + str(s['neg']) + ',' + str(s['neu']) + ',' +
                                str(s['pos']) + ',' + str(s['compound']) + ',' +
                                ArticleTitle + ',' + str(link) + '\n')
        except Exception as ex:
            template = "An exception of type {0} occured. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print(message)

    # write variable to file
    lock.acquire()
    try:
        myfile.write(toBeWrittenToFile)
    finally:
        lock.release()
Example #30
Source File: samacharbot2.py From samacharbot2 with GNU General Public License v3.0 | 4 votes |
def start():
    r, oauth_helper, subreddit = init()

    while True:
        try:
            submissions = subreddit.get_new(limit=50)
        except praw.errors.HTTPException:
            print "HTTP Exception"
            sleep(300)
            continue

        nothing = True
        for submission in submissions:
            print "Working on - " + str(submission.id),
            fo, ids = postTracker()
            oauth_helper.refresh()
            if int(submission.score) >= 0:
                if checkConditions(submission, ids):
                    nothing = False
                    try:
                        processSummarization(fo, submission, 'smrzr')
                    except smrzr.ArticleExtractionFail:
                        print "Article Extraction Failed"
                        continue
                    # in case Smrzr fails to get summary, try another method
                    except AssertionError:
                        print "Assertion Error"
                        processSummarization(fo, submission, 'other')
                    except Exception as e:
                        print "Unknown ERROR"
                        print type(e)
                        print e.args
                        print e
                        print submission.id
                        print "\n"
                        continue
                    fo.close()
                else:
                    print "Nothing to do, checking messages"
                    # if no more posts to summarize, go through unread messages and see if any posts to delete.
                    checkDelete(r)
                    continue
        if nothing:
            print "Nothing to do, sleeping for 1 minute"
            sleep(60)