Python newspaper.Article() Examples
The following are 30 code examples of newspaper.Article(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module newspaper, or try the search function.
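Most of the examples below follow the same core pattern: construct an Article, download it, parse it, and optionally run nlp() before reading its attributes. The snippet below is a minimal sketch of that flow, not taken from any of the listed projects; the URL is a placeholder, and the nlp() step assumes the NLTK punkt data used by newspaper's keyword/summary extraction is installed.

from newspaper import Article

# Placeholder URL -- substitute any article page you want to extract.
url = 'https://example.com/some-news-story'

article = Article(url, language='en', fetch_images=False)
article.download()   # fetch the HTML over HTTP
article.parse()      # extract title, text, authors, publish date, top image
article.nlp()        # keyword/summary extraction; requires NLTK's punkt tokenizer

print(article.title)
print(article.authors)
print(article.publish_date)
print(article.keywords)
print(article.summary)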
Example #1
Source File: core.py From Python-DevOps with MIT License | 7 votes |
def get_article(link, news, date):
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    lang = 'eng'
    if len(article.title) < 5 or len(article.text) < 5:
        print('found BM/ID article')
        article = Article(link, language = 'id')
        article.download()
        article.parse()
        article.nlp()
        lang = 'id'
    return {
        'title': article.title,
        'url': link,
        'authors': article.authors,
        'top-image': article.top_image,
        'text': article.text,
        'keyword': article.keywords,
        'summary': article.summary,
        'news': news,
        'date': date,
        'language': lang,
    }
Example #2
Source File: cnn_dm_downloader.py From TransferRL with MIT License | 6 votes |
def run(param):
    (article_dir, title_dir, html_path) = param
    try:
        raw_html = open(html_path, encoding="ascii", errors="surrogateescape").read().strip()
    except:
        raw_html = open(html_path, encoding=encoding_detector(html_path), errors="surrogateescape").read().strip()
    id = html_path.split('/')[-1].split('.')[0]
    a = Article('http:/www.dummy.com', language='en')
    a.download(input_html=raw_html)
    a.parse()
    title = a.title
    text = a.text
    title = remove_non_ascii(title)
    text = remove_non_ascii(text)
    fw = open('{}/{}'.format(article_dir, id), 'w', encoding='utf-8')
    fw.write(text)
    fw.close()
    fw = open('{}/{}'.format(title_dir, id), 'w', encoding='utf-8')
    fw.write(title)
    fw.close()
Example #3
Source File: scrapers.py From openwebtext with GNU General Public License v3.0 | 6 votes |
def bs4_scraper(url, memoize):
    t1 = time.time()

    try:
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = article.html
        soup = bs4.BeautifulSoup(html, "lxml")
        text, count = find_and_filter_tag("p", soup)
        # DDB: keep text as a single string for consistency with
        # newspaper_scraper
        text = " ".join(text)
    except:
        return None, None

    metadata = {
        "url": url,
        "word_count": count,
        "elapsed": time.time() - t1,
        "scraper": "bs4",
    }
    return text, metadata
Example #4
Source File: scrapers.py From openwebtext with GNU General Public License v3.0 | 6 votes |
def newspaper_scraper(url, memoize):
    t1 = time.time()

    try:
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        article.parse()
        text = article.text
        count = len(text.split())
    except:
        return None, None

    metadata = {
        "url": url,
        "word_count": count,
        "elapsed": time.time() - t1,
        "scraper": "newspaper",
    }
    return text, metadata
Example #5
Source File: watch.py From Stockeye with MIT License | 6 votes |
def summarizeArticles(articles, length, firstlast = False):
    summedArticles = []
    for a in articles:
        try:
            A = Article(a.link)
            A.download()
            A.parse()

            text = ""
            paragraphs = A.text.split('\n')
            for p in paragraphs:
                if len(p) > 100:
                    a.body.append(p)
                    text += p + ' '

            sentences = summarize(text, length, firstlast)
            for s in sentences:
                a.summary.append(s)

            summedArticles.append(a)
        except:
            pass
    return summedArticles
Example #6
Source File: scrapers.py From openwebtext with GNU General Public License v3.0 | 6 votes |
def raw_scraper(url, memoize):
    t1 = time.time()
    try:
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = minify(article.html)
        html = cleaner.clean_html(html)
        article.parse()
    except:
        return None, None

    if article.text == "":
        return None, None

    metadata = {"url": url, "elapsed": time.time() - t1, "scraper": "raw"}
    return html, metadata
Example #7
Source File: process_ccrawl.py From grover with Apache License 2.0 | 5 votes |
def __init__(self, html):
    self.html = html if html is not None else ""
    self.dummy_article = newspaper.Article(url='', fetch_images=False, verbose=True)
    self.dummy_article.set_html(html)
    self.dummy_article.parse()
    self.text = _filter_excessive_newlines(self.dummy_article.text)
    self.authors = self.dummy_article.authors
    self.authors = [x for x in self.authors if len(x.split(' ')) < 10]
    self.title = self.dummy_article.title

    # sometimes the text started with the title... that's bad
    if self.text.startswith(self.title + '\n'):
        self.text = self.text[len(self.title):].lstrip('\n')

    if self.dummy_article.publish_date and not isinstance(self.dummy_article.publish_date, str):
        try:
            self.publish_date = self.dummy_article.publish_date.date().strftime("%m-%d-%Y")
        except AttributeError:
            self.publish_date = None
    else:
        self.publish_date = None
    self._extract_summary()
Example #8
Source File: extract_text.py From openwebtext with GNU General Public License v3.0 | 5 votes |
def parse_file(filename):
    with open(filename, "rt") as f:
        html = f.read()
    url_hash = md5(html.encode("utf-8")).hexdigest()
    article = newspaper.Article(url=url_hash, fetch_images=False)
    article.set_html(html)
    article.parse()
    return filename, article.text
Example #9
Source File: samacharbot2.py From samacharbot2 with GNU General Public License v3.0 | 5 votes |
def summarizeOther(url):
    article = Article(url)
    article.download()
    article.parse()
    text = article.text
    keypoints = summary(text)
    summ = article.title
    keypoints = keypoints.replace("`", "")
    keypoints = keypoints.replace("#", "\#")
    return summ, keypoints, text
Example #10
Source File: core.py From samacharbot2 with GNU General Public License v3.0 | 5 votes |
def newspaper_extractor(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
    except Exception as e:
        print e
        return None, None, None

    return article.title, article.meta_description, article.text
Example #11
Source File: topic_extraction.py From nlp-architect with Apache License 2.0 | 5 votes |
def load_url_content(url_list):
    """
    Load articles content into a list of docs (texts)

    Args:
        url_list (List[String]): A list of urls

    Returns:
        A list of documents (List[String])
    """
    files_content = []
    url = ""
    try:
        for url in url_list:
            try:
                url = str(url)
                logger.info("loading %s", url)
                article = Article(url)
                article.download()
                article.parse()
                files_content.append(article.title + " " + article.text)
            except Exception as e:
                logger.error(str(e))
    except Exception as e:
        logger.error("Error in load_text: %s, for url: %s", str(e), str(url))
    return files_content
Example #12
Source File: newspaper_extractor.py From news-please with Apache License 2.0 | 5 votes |
def extract(self, item):
    """Creates an instance of Article without a Download and returns an ArticleCandidate with the results of
    parsing the HTML-Code.

    :param item: A NewscrawlerItem to parse.
    :return: ArticleCandidate containing the recovered article data.
    """
    article_candidate = ArticleCandidate()
    article_candidate.extractor = self._name()

    article = Article('')
    article.set_html(item['spider_response'].body)
    article.parse()
    article_candidate.title = article.title
    article_candidate.description = article.meta_description
    article_candidate.text = article.text
    article_candidate.topimage = article.top_image
    article_candidate.author = article.authors
    if article.publish_date is not None:
        try:
            article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
        except ValueError as exception:
            self.log.debug('%s: Newspaper failed to extract the date in the supported format,'
                           'Publishing date set to None' % item['url'])
    article_candidate.language = article.meta_lang

    return article_candidate
Example #13
Source File: sentimark.py From python-qutescript with BSD 2-Clause "Simplified" License | 5 votes |
def sentiment_markup(request):
    article = Article(request.url)
    # article.download(request.html, request.title)
    article.download()
    article.parse()
    html = generate_html(article.text.split('\n\n'), article.title).render()
    request.send_html(html)
Example #14
Source File: news.py From vexbot with GNU General Public License v3.0 | 5 votes |
def summarize_article(self, url: str, *args, **kwargs):
    article = newspaper.Article(url)
    article.download()
    article.parse()
    summarization = gensim.summarization.summarize(article.text)
    return summarization
Example #15
Source File: audio_actions.py From pockebot with MIT License | 5 votes |
def make_an_audio(url, filename, lang=None):
    if lang is None:
        lang = 'en'
    article = Article(url)
    article.download()
    article.parse()
    tts = gTTS(text=article.text, lang=lang)
    f = open(join('audio', filename), 'wb')
    tts.write_to_fp(f)
    f.close()
Example #16
Source File: newsdownload.py From File-Maker with GNU Affero General Public License v3.0 | 5 votes |
def newspaper_init(self):
    self.newsdata = newspaper.Article(self.url, language=self.language)
    self.newsdata.download()
    try:
        self.newsdata.parse()
    except newspaper.article.ArticleException:
        # trying again
        self.newsdata.parse()
    except newspaper.article.ArticleException:
        return []
    self.article = self.newsdata.text
    self.picture = self.newsdata.top_image
    self.html = self.newsdata.html
    self.soup = BeautifulSoup(self.html, "lxml")
Example #17
Source File: preprocess.py From PyTLDR with GNU General Public License v3.0 | 5 votes |
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
Example #18
Source File: generic.py From mma-dexter with Apache License 2.0 | 5 votes |
def crawl(self, doc):
    """ Crawl this document. """
    # instantiate and download article
    article = Article(url=doc.url, language='en', fetch_images=False, request_timeout=10)
    article.download()

    # extract content
    self.extract(doc, article)
Example #19
Source File: __init__.py From sneakpeek with MIT License | 5 votes |
def handle(cls, url):
    article = Article(url)
    article.download()
    article.parse()
    title = article.title
    body = article.text

    return Comment(title, body)
Example #20
Source File: __init__.py From sneakpeek with MIT License | 5 votes |
def make_comment(cls, best_candidate):
    url = f"https://www.pressreader.com{best_candidate}"
    article = Article(url, browser_user_agent="Googlebot-News", keep_article_html=True)
    article.download()
    try:
        article.parse()
    except:
        return Comment('', '')
    title = article.title.replace("\xad", "")  # clean the text
    body = article.text.replace("\xad", "")  # clean the text
    print(f"checking the article in this url: {url} with title {title}")
    return Comment(title, body)
Example #21
Source File: __init__.py From sneakpeek with MIT License | 5 votes |
def handle_non_premium(cls):
    """Handle a non-premium article."""
    article = Article(cls.url)
    article.download()
    article.parse()
    title = article.title
    body = article.text

    return Comment(title, body)
Example #22
Source File: extract_text.py From GPT2 with MIT License | 5 votes |
def parse_file(file_entry):
    file_name, html = file_entry
    url_hash = md5(html).hexdigest()
    article = newspaper.Article(url=url_hash, fetch_images=False)
    article.set_html(html)
    article.parse()
    return (file_name, article.text)
Example #23
Source File: text_analysis.py From OctoBot-Tentacles with GNU Lesser General Public License v3.0 | 5 votes |
def analyse_web_page_article(self, url):
    article = Article(url)
    article.download()
    article.parse()
    return article, self.analyse(article.text)

# return a list of high influential value websites
Example #24
Source File: extraction.py From geograpy2 with MIT License | 5 votes |
def download_text(self):
    """Downloads text from self.url and strip HTML tags.
    """
    if not self.text and self.url:
        a = Article(self.url)
        a.download()
        a.parse()
        self.text = a.text
Example #25
Source File: news.py From W.I.L.L with MIT License | 4 votes |
def news_reader(event):
    '''Use the excellent newspaper module to fetch the news from the readers favorite site'''
    response = {"type": "success", "text": None, "data": {}}
    db = event['db']
    event_user = event['username']
    user_table = db['users'].find_one(username=event_user)
    user_news_site = user_table["news_site"]
    news_table = db["news"]
    cached_sites = [list(site.values())[0] for site in db.query("SELECT site from `news`")]
    log.debug("Cached sites are {0}".format(cached_sites))
    if user_news_site in cached_sites:
        site_row = news_table.find_one(site=user_news_site)
        site_time = site_row["time"]
        if time.time() < site_time + 43200:
            log.info("Using cached news for site {0}".format(user_news_site))
            news_str = site_row["news_str"]
            response["text"] = news_str
    log.info("Parsing news site {0} for user {1}".format(user_news_site, event_user))
    site_object = newspaper.build(user_news_site, memoize_articles=False)
    log.debug("Finished building newspaper object")
    top_articles = site_object.articles[0:4]
    log.debug("Top articles are {0}".format(list(top_articles)))
    output_strs = []

    # Use multithreading to build the objects for the articles
    def build_article_object(article_url):
        '''Build a formatted string with the article title, summary, and url'''
        log.debug("Building article object for article {0}".format(article_url))
        article = newspaper.Article(article_url)
        log.debug("Downloading article {0}".format(article_url))
        article.download()
        log.debug("Finished downloading article {0}, parsing".format(article_url))
        try:
            article.parse()
            log.debug("Finished debugging {0}, running nlp".format(article_url))
            article.nlp()
            article_str = "{0} ({1})\n{2}\n".format(
                str(article.title).encode('ascii', 'ignore'),
                article_url,
                str(article.summary))
            output_strs.append(str(article_str))
        except newspaper.article.ArticleException:
            log.info(":{0}:Article exception with url {1}".format(event["session"]["id"], article_url))

    article_threads = []
    for article in top_articles:
        article_thread = threading.Thread(target=build_article_object, args=(article.url, ))
        article_threads.append(article_thread)
    [thread.start() for thread in article_threads]
    log.debug("Started news parsing threads, waiting for parsing to finish")
    [thread.join() for thread in article_threads]
    log.debug("Compiling article output {0} into string".format(output_strs))
    output_str = '\n'.join(output_strs)
    log.debug("Returning output string {0}".format(output_str))
    db["news"].upsert(dict(site=user_news_site, time=time.time(), news_str=output_str), ['site'])
    response["text"] = output_str
    return response
Example #26
Source File: search.py From W.I.L.L with MIT License | 4 votes |
def search_google(query):
    '''Search google and determine if wikipedia is in it'''
    search_object = google.search(query)
    # Determine if a wikipedia url is in the first 5 searches
    urls = []
    for i in range(0, 4):
        url = search_object.__next__()
        urls.append(url)
        if "wikipedia.org/wiki" in url:
            wikipedia_search = wikipedia.search(query)[0]
            url = wikipedia.page(wikipedia_search).url
            response = wikipedia.summary(wikipedia_search) + " ({0})".format(url)
            return response
    # If there were no wikipedia pages
    first_url = urls[0]
    try:
        article = Article(first_url)
        article.download()
        article.parse()
        article.nlp()
        article_summary = article.summary
        article_title = article.title
        return "{0}\n{1} - ({2})".format(
            article_summary, article_title, first_url
        )
    except Exception as article_exception:
        try:
            log.debug("Got error {0}, {1} while using newspaper, switching to bs4".format(
                article_exception.message, article_exception.args
            ))
            html = requests.get(first_url).text
            # Parse the html using bs4
            soup = BeautifulSoup(html, "html.parser")
            [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
            text = soup.getText()
            # break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
            # drop blank lines
            soup_text = '\n'.join(chunk for chunk in chunks if " " in chunk)
            response = format(soup_text) + " ({0})".format(first_url)
            return response
        except Exception as search_exception:
            log.info("Error {0},{1} occurred while searching query {2}".format(
                search_exception.message, search_exception.args, query
            ))
            return "Error encountered on query {0}".format(query)
Example #27
Source File: sentiment.py From stocksight with Apache License 2.0 | 4 votes |
def tweeklink_sentiment_analysis(url):
    # get text summary of tweek link web page and run sentiment analysis on it
    try:
        logger.info('Following tweet link %s to get sentiment..' % url)
        article = Article(url)
        article.download()
        article.parse()
        # check if twitter web page
        if "Tweet with a location" in article.text:
            logger.info('Link to Twitter web page, skipping')
            return None
        article.nlp()
        tokens = article.keywords
        print("Tweet link nltk tokens:", tokens)

        # check for min token length
        if len(tokens) < 5:
            logger.info("Tweet link does not contain min. number of tokens, not adding")
            return None
        # check ignored tokens from config
        for t in nltk_tokens_ignored:
            if t in tokens:
                logger.info("Tweet link contains token from ignore list, not adding")
                return None
        # check required tokens from config
        tokenspass = False
        tokensfound = 0
        for t in nltk_tokens_required:
            if t in tokens:
                tokensfound += 1
                if tokensfound == nltk_min_tokens:
                    tokenspass = True
                    break
        if not tokenspass:
            logger.info("Tweet link does not contain token from required list or min required, not adding")
            return None

        summary = article.summary
        if summary == '':
            logger.info('No text found in tweet link url web page')
            return None
        summary_clean = clean_text(summary)
        summary_clean = clean_text_sentiment(summary_clean)
        print("Tweet link Clean Summary (sentiment): " + summary_clean)
        polarity, subjectivity, sentiment = sentiment_analysis(summary_clean)
        return polarity, subjectivity, sentiment

    except ArticleException as e:
        logger.warning('Exception: error getting text on Twitter link caused by: %s' % e)
        return None
Example #28
Source File: summarizer.py From ns with MIT License | 4 votes |
def generate_summary(topic, words):
    """Return summary of the topic subjected to word limit."""
    print("Generate Summary %s" % topic)

    def query_links(topic):
        query = urllib.parse.urlencode({
            "q": "'" + topic + "'",
            "count": 4
        })
        headers = {
            "Ocp-Apim-Subscription-Key": API_KEY
        }
        url = API_ROOT + "?%s" % query
        r = requests.get(url, headers=headers)
        return r

    query_job = gevent.spawn(query_links, topic)
    gevent.joinall([query_job], 5000)
    result = query_job.value.json()
    links = [x["url"] for x in result["value"]]
    names = [x["name"] for x in result["value"]]
    lines = []

    def download_and_clean(url):
        try:
            print("Download " + url)
            article = Article(url)
            article.download()
            print("Parse " + url)
            article.parse()
            text = article.text
            top_image = article.top_image
        except:
            print("Failed to get " + url)
            text = ""
            top_image = ""
        return text, top_image

    jobs = [gevent.spawn(download_and_clean, url) for url in links[:4]]
    gevent.joinall(jobs, timeout=10)
    lines = [job.value[0] for job in jobs if job.value and job.value[0] and len(job.value[0]) > 100]
    top_images = [job.value[1] for job in jobs if job.value and job.value[1]]
    gc.collect()
    try:
        summary = sumbasic.orig(lines, words)
    except ValueError:
        print("Generate Summary failed for " + str(links))
        traceback.print_exc()
        summary = "Generating summary failed"
    print("Generate Summary complete for " + str(links))
    return summary, top_images, links, names, topic, words
Example #29
Source File: analyse.py From Stock-Analysis with MIT License | 4 votes |
def SaSentimentRSS(symbol):
    url = "http://seekingalpha.com/symbol/" + symbol + ".xml"
    url2 = "http://feeds.finance.yahoo.com/rss/2.0/headline?s=" + symbol + "&region=US&lang=en-US"
    url3 = "http://www.google.ca/finance/company_news?q=" + symbol + "&output=rss"

    # gets list of links from above RSS feed
    NewsURLs = getSaURL(url)
    NewsURLs += RSS_URL.getURLs2(url2)
    NewsURLs += RSS_URL.getURLs2(url3)

    # String to be written to file
    toBeWrittenToFile = ''

    for link in NewsURLs:
        try:
            # gets article portion of the htmltext
            a = Article(link)
            a.download()
            a.parse()

            # not working if it's RSS title link or has no title or cannot be accessed
            if symbol in a.title and not 'Earnings Call Webcast' in a.title and not 'Stock Market Insights' in a.title and not '400 Bad Request' in a.title and not '403 Forbidden' in a.title and a.title != '':
                UnicodeArticle = a.text
                StringArticle = UnicodeArticle.encode('ascii', 'ignore')
                StrippedArticle = StringArticle.replace('\n', '')

                # not working with articles less than 300 words
                if len(StrippedArticle) > 200:
                    # remove ascii symbols
                    ArticleTitle = a.title.encode('ascii', 'ignore').replace(',', '')

                    # filters out irrelevant articles
                    if 'Transcript' not in ArticleTitle and 'Summary' not in ArticleTitle:
                        # writes sentiment from sentiment API to file
                        # locks this block so that only one thread can write to file at a time

                        # vader sentiment dictionary
                        s = vaderSentiment.sentiment(StrippedArticle)

                        # not writing articles with zero sentiments
                        # collect a string to be written to file
                        if s['compound'] != 0:
                            # print(ArticleTitle)
                            toBeWrittenToFile += (
                                str(symbol) + ',' + str(s['neg']) + ',' + str(s['neu']) + ',' +
                                str(s['pos']) + ',' + str(s['compound']) + ',' +
                                ArticleTitle + ',' + str(link) + '\n')
        except Exception as ex:
            template = "An exception of type {0} occured. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print(message)

    # write variable to file
    lock.acquire()
    try:
        myfile.write(toBeWrittenToFile)
    finally:
        lock.release()
Example #30
Source File: samacharbot2.py From samacharbot2 with GNU General Public License v3.0 | 4 votes |
def start():
    r, oauth_helper, subreddit = init()

    while True:
        try:
            submissions = subreddit.get_new(limit=50)
        except praw.errors.HTTPException:
            print "HTTP Exception"
            sleep(300)
            continue

        nothing = True
        for submission in submissions:
            print "Working on - " + str(submission.id),
            fo, ids = postTracker()
            oauth_helper.refresh()
            if int(submission.score) >= 0:
                if checkConditions(submission, ids):
                    nothing = False
                    try:
                        processSummarization(fo, submission, 'smrzr')
                    except smrzr.ArticleExtractionFail:
                        print "Article Extraction Failed"
                        continue
                    # in case Smrzr fails to get summary, try another method
                    except AssertionError:
                        print "Assertion Error"
                        processSummarization(fo, submission, 'other')
                    except Exception as e:
                        print "Unknown ERROR"
                        print type(e)
                        print e.args
                        print e
                        print submission.id
                        print "\n"
                        continue
                    fo.close()
                else:
                    print "Nothing to do, checking messages"
                    # if no more posts to summarize, go through unread messages and see if any posts to delete.
                    checkDelete(r)
                    continue
        if nothing:
            print "Nothing to do, sleeping for 1 minute"
            sleep(60)