Python feedparser.parse() Examples

The following are 30 code examples of feedparser.parse(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module feedparser, or try the search function.
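For orientation, here is a minimal usage sketch (the feed URL below is only a placeholder, not a real feed): feedparser.parse() accepts a URL, a local file path, or a string of feed XML, and returns a dictionary-like result whose feed attribute holds feed-level metadata and whose entries attribute is the list of items.

import feedparser

# Placeholder URL for illustration only; a file path or a raw XML string
# can be passed to parse() just as well.
d = feedparser.parse("https://example.com/feed.xml")

print(d.feed.get("title"))            # feed-level metadata
for entry in d.entries[:3]:           # entries behave like dictionaries
    print(entry.get("title"), entry.get("link"))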
Example #1
Source File: Feed.py    From python-in-practice with GNU General Public License v3.0    8 votes
def _parse(data, limit):
        output = []
        feed = feedparser.parse(data) # Atom + RSS
        for entry in feed["entries"]:
            title = entry.get("title")
            link = entry.get("link")
            if title:
                if link:
                    output.append('<li><a href="{}">{}</a></li>'.format(
                            link, escape(title)))
                else:
                    output.append('<li>{}</li>'.format(escape(title)))
            if limit and len(output) == limit:
                break
        if output:
            return ["<ul>"] + output + ["</ul>"] 
Example #2
Source File: twitterbot.py    From twitterbot with GNU General Public License v2.0    7 votes
def read_rss_and_tweet(url: str):
    """Read RSS and post feed items as a tweet.

    Parameters
    ----------
    url: str
        URL to RSS feed.
    """
    feed = feedparser.parse(url)
    if feed:
        for item in feed["items"]:
            link = item["link"]
            if is_in_logfile(link, Settings.posted_urls_output_file):
                print("Already posted:", link)
            else:
                post_tweet(message=compose_message(item))
                write_to_logfile(link, Settings.posted_urls_output_file)
                print("Posted:", link)
    else:
        print("Nothing found in feed", url) 
Example #3
Source File: redirects.py    From janeway with GNU Affero General Public License v3.0    7 votes
def handle(self, *args, **options):
        """Delete duplicate named settings in the core.Setting model.

        :param args: None
        :param options: None
        :return: None
        """
        paged = 1

        while True:
            d = feedparser.parse('https://about.openlibhums.org/feed/?paged=%s' % paged)

            for item in d['entries']:
                try:
                    news_item = comms_models.NewsItem.objects.get(title=item['title'])
                    if news_item:
                        reversal = reverse('core_news_item', kwargs={'news_pk': news_item.pk})
                        url_path = urlparse(item['link']).path

                        print("Redirect 301 {0} {1}".format(url_path, reversal))
                except BaseException as e:
                    print(e)

            # NOTE: this break exits after the first page, so the increment
            # below is never reached as written.
            break
            paged = paged + 1
Example #4
Source File: ukweather.py    From scroll-phat with MIT License    7 votes
def get_wet():
    # Get the weather data
    print("Updating weather for", postcode)
    d = feedparser.parse(url)
    entries = len(d['entries'])
    val = "        " + d['entries'][0]['title']
    val += "        " + d['entries'][1]['title']
    val += "        " + d['entries'][2]['title']
    # Tidy & shorten the message for the scroll display
    val = val.replace("Maximum", "Max")
    val = val.replace("Minimum", "Min")
    val = val.replace("Temperature: ", "")
    val = val.replace(u"\u00B0", "")
    val = val.replace(",", "")
    val = val.replace("(", "")
    val = val.replace(")", "")
    return val 
Example #5
Source File: papersbot.py    From PapersBot with MIT License    7 votes
def findImage(entry):
    if "description" not in entry:
        return

    soup = bs4.BeautifulSoup(entry.description, "html.parser")
    img = soup.find("img")
    if img:
        img = img["src"]
        if len(img) == 0:
            return
        # If address is relative, append root URL
        if img[0] == "/":
            p = urllib.parse.urlparse(entry.id)
            img = f"{p.scheme}://{p.netloc}" + img

    return img


# Convert string from HTML to plain text 
Example #6
Source File: test_dataset_frontend.py    From udata with GNU Affero General Public License v3.0    6 votes
def test_recent_feed_org(self):
        owner = UserFactory()
        org = OrganizationFactory()
        DatasetFactory(
            owner=owner, organization=org, resources=[ResourceFactory()])

        response = self.get(url_for('datasets.recent_feed'))

        self.assert200(response)

        feed = feedparser.parse(response.data)

        self.assertEqual(len(feed.entries), 1)
        entry = feed.entries[0]
        self.assertEqual(len(entry.authors), 1)
        author = entry.authors[0]
        self.assertEqual(author.name, org.name)
        self.assertEqual(author.href,
                         self.full_url('organizations.show', org=org.id)) 
Example #7
Source File: runescape.py    From Squid-Plugins with MIT License    6 votes
def alog(self, *, username):
        """Gets a users recent adventure log"""
        username = username.replace(" ", "_")
        if feedparser is None:
            await self.bot.say("You'll need to run `pip3 install feedparser` "
                               "before you can get a user's adventure log.")
            return
        url = self.alog_url + username
        try:
            page = await aiohttp.get(url)
            text = await page.text()
            text = text.replace("\r", "")
        except Exception:
            await self.bot.say("No user found.")
            return

        feed = feedparser.parse(text)
        titles = [post.title for post in feed.entries]

        await self.bot.say(self._fmt_alog(username, titles)) 
Example #8
Source File: core.py    From SecRss with GNU General Public License v3.0    6 votes
def get_rss_title_and_url(self):
        """
        Fetch RSS titles and URLs according to the configured rules.
        :return:
        """
        try:
            rss_data = json.load(open(BASE_PATH + "/core/data.json", "r", encoding="utf-8"))
            for item in rss_data:
                rss = feedparser.parse(requests.get(item['rss']).content)['entries']
                push_new_list = {"source": item, "target": []}

                for it in rss:
                    datetime_struct = parser.parse(it['published'])
                    published = datetime_struct.strftime("%Y-%m-%d")

                    today = time.strftime("%Y-%m-%d")

                    if today == published:
                        if item['has_content'] in it['title']:
                            push_new_list["target"].append(it)
                self.news_list.append(push_new_list)
        except Exception as e:
            logger.warn("获取RSS标题和URL异常:" + str(e)) 
Example #9
Source File: feeds.py    From CloudBot with GNU General Public License v3.0    6 votes
def rss(text):
    """<feed> - Gets the first three items from the RSS/ATOM feed <feed>."""
    t = text.lower().strip()
    if t in ALIASES:
        alias = ALIASES[t]
        addr = alias.url
        limit = alias.limit
    else:
        addr = text
        limit = 3

    feed = feedparser.parse(addr)
    if not feed.entries:
        return "Feed not found."

    out = []
    for item in feed.entries[:limit]:
        out.append(format_item(item))

    if 'title' in feed.feed:
        start = "\x02{}\x02: ".format(feed.feed.title)
    else:
        start = ""

    return start + ", ".join(out) 
Example #10
Source File: rss.py    From omnibus with MIT License    6 votes
def run(self):
        try:
            feed = feedparser.parse(self.url)

            for idx, item in enumerate(feed['entries']):
                if idx == 19:
                    break

                else:
                    self.results.append({
                        'url': item['link'],  # feedparser entries expose the item URL under 'link'
                        'title': item['title']
                    })

        except Exception as err:
            warning('Caught exception in module (%s)' % str(err)) 
Example #11
Source File: feedhandler.py    From telegram-robot-rss with Mozilla Public License 2.0    6 votes
def is_parsable(url):
        """
        Checks whether the given URL provides a news feed. Returns True if news entries are available, else False.
        """

        url_pattern = re.compile(r"((http(s?))):\/\/.*")
        if not url_pattern.match(url):
            return False

        feed = feedparser.parse(url)

        # Check if result is empty
        if not feed.entries:
            return False
        # Check if entries provide updated attribute
        for post in feed.entries:
            if not hasattr(post, "updated"):
                return False
        return True 
Example #12
Source File: lambda_function.py    From amazon-polly-sample with Apache License 2.0    6 votes
def get_entries(feed):
    NEW_POST = u"""New post, author {author}, title {title} {content}"""
    for entry in feed.entries:
        if "http" in entry.id:
            # NOTE: on Python 3 this requires bytes, e.g. hashlib.md5(str(entry.id).encode())
            nid = hashlib.md5(str(entry.id))
            entry.id = nid.hexdigest()
        entry_content = entry.content[0].value
        soup = BeautifulSoup(entry_content, 'html.parser')
        chunks = split_content_by_dot(soup, REQUEST_LIMIT-len(NEW_POST))
        chunks = list(chunks)
        published = dateutil.parser.parse(entry.published)
        for i, chunk in enumerate(chunks):
            if i == 0:
                chunk = NEW_POST.format(
                        author=entry.author,
                        title=entry.title,
                        content=chunk)
            yield dict(
                content=chunk,
                id="%s_%d" % (entry.id, i),
                title=entry.title,
                published=published - datetime.timedelta(0, i),
            )
            remaining = chunk 
Example #13
Source File: papersbot.py    From PapersBot with MIT License    6 votes
def run(self):
        for feed in self.feeds:
            parsed_feed = feedparser.parse(feed)
            for entry in parsed_feed.entries:
                if entryMatches(entry):
                    self.n_seen += 1
                    # If no ID provided, use the link as ID
                    if "id" not in entry:
                        entry.id = entry.link
                    if entry.id not in self.posted:
                        self.sendTweet(entry)
                        # Bail out if we have reached max number of tweets
                        if self.throttle > 0 and self.n_tweeted >= self.throttle:
                            print(f"Max number of papers met ({self.throttle}), stopping now")
                            return

    # Print statistics of a given run 
Example #14
Source File: smartmirror.py    From Smart-Mirror with MIT License    6 votes
def get_headlines(self):
        try:
            # remove all children
            for widget in self.headlinesContainer.winfo_children():
                widget.destroy()
            if news_country_code is None:
                headlines_url = "https://news.google.com/news?ned=us&output=rss"
            else:
                headlines_url = "https://news.google.com/news?ned=%s&output=rss" % news_country_code

            feed = feedparser.parse(headlines_url)

            for post in feed.entries[0:5]:
                headline = NewsHeadline(self.headlinesContainer, post.title)
                headline.pack(side=TOP, anchor=W)
        except Exception as e:
            traceback.print_exc()
            print "Error: %s. Cannot get news." % e

        self.after(600000, self.get_headlines) 
Example #15
Source File: parsers.py    From critics with BSD 3-Clause "New" or "Revised" License    6 votes
def get_ios_reviews(app_id, language, limit=100):
    url = 'https://itunes.apple.com/%(language)srss/customerreviews/id=%(app_id)s/sortBy=mostRecent/xml' % {
        'language': '%s/' % language if language else '', 'app_id': app_id}
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'},
                            timeout=timeout)
    response.encoding = 'utf-8'  # avoid chardet not guessing correctly
    feed = feedparser.parse(response.text)
    reviews = [Review(
        id=entry.id,
        platform='ios',
        title=entry.title,
        rating=int(entry.im_rating),
        summary=entry.summary,
        url=entry.href,
        author=entry.author,  # author url: entry.href
        date=datetime.datetime.fromtimestamp(mktime(entry.updated_parsed)),
        language=language,
        version=entry.im_version
    ) for entry in feed['entries'][1:1 + limit]]
    return reviews 
Example #16
Source File: __init__.py    From chirp with MIT License    6 votes
def process_all_rss(reprocess=False):
    """Gather all RSS feeds and articles, then process."""
    sources = list()
    logger.debug("Collecting sources")
    monitors = mongo.db[app.config['MONITORS_COLLECTION']]
    for item in monitors.find({'active': True}):
        sources.append(item['metadata'].get('rss_link'))

    contents = [feedparser.parse(x) for x in sources]
    logger.debug("Processing sources")
    for source in contents:
        for idx, item in enumerate(source.get('entries')):
            response = get_article(item, source['href'], reprocess)
            if response['from_store'] or reprocess:
                continue
            clean_link = response['article']['feed_source']
            monitors.update({'metadata.rss_link': clean_link},
                            {'$set': {'checked': now_time()}})
    correct_counts() 
Example #17
Source File: rss.py    From SkittBot with GNU General Public License v3.0    6 votes
def remove_url(bot, update, args):
    if len(args) >= 1:
        tg_chat_id = str(update.effective_chat.id)

        tg_feed_link = args[0]

        link_processed = parse(tg_feed_link)

        if link_processed.bozo == 0:
            user_data = sql.check_url_availability(tg_chat_id, tg_feed_link)

            if user_data:
                sql.remove_url(tg_chat_id, tg_feed_link)

                update.effective_message.reply_text("Removed URL from subscription")
            else:
                update.effective_message.reply_text("You haven't subscribed to this URL yet")
        else:
            update.effective_message.reply_text("This link is not an RSS Feed link")
    else:
        update.effective_message.reply_text("URL missing") 
Example #18
Source File: atom.py    From oh-my-rss with MIT License    6 votes
def parse_self_atom(feed_url):
    """
    Parse an RSS feed served by this site.
    :param feed_url:
    :return: a dict on success; None on failure
    """

    feed_path = urllib.parse.urlparse(feed_url).path

    try:
        name = resolve(feed_path).kwargs.get('name')
    except:
        name = None

    if name:
        try:
            Site.objects.get(name=name, status='active')
            return {"name": name}
        except:
            logger.warning(f'Feed source does not exist: `{feed_url}')

    return None 
Example #19
Source File: search_api.py    From CodeScraper with MIT License    6 votes
def searchGithub(word, day, level):
  searchlevel = {
    1: ['in:name,description', 'created'],
    2: ['in:name,description,readme', 'created'],
    3: ['in:name,description', 'pushed'],
    4: ['in:name,description,readme', 'pushed']}
  github_url = 'https://api.github.com/search/repositories?q='
  try:
    if word.find(' ') > 0:
      word.replace(' ', '\" \"')
    word = urllib.parse.quote('\"' + word + '\"')
    url = github_url + word + '+' + searchlevel[level][0] + '+' + searchlevel[level][1] + ':>' + day + '&s=updated&o=asc'
    headers = {"Accept": "application/vnd.github.mercy-preview+json"}
    result = requests.get(url, timeout=10, headers=headers)
    statuscode = result.status_code
    resultdata = result.json()
    codes = []
    for a in resultdata['items']:
      name = a['full_name']
      if a['size'] > 0:
        codes.append(name)
    return codes, statuscode
  except:
    return [], -1 
Example #20
Source File: search_api.py    From CodeScraper with MIT License    6 votes
def searchGist(word, day):
  if word.find(' ') > 0:
    word.replace(' ', '\" \"')
  word = urllib.parse.quote('\"' + word + '\"')
  url = 'https://gist.github.com/search?utf8=%E2%9C%93&q=' + word + '+created%3A>' + day + '&ref=searchresults'
  try:
    result = requests.get(url, timeout=10)
    statuscode = result.status_code
    root = lxml.html.fromstring(result.text)
    codes = []
    for a in root.xpath('//div/a[@class="link-overlay"]'):
#      name = a.text_content()
      link = a.get('href')
      codes.append(link)
    return codes, statuscode
  except:
    return [], -1 
Example #21
Source File: search_api.py    From CodeScraper with MIT License    6 votes
def searchGitlab(word):
  try:
    if word.find(' ') > 0:
      word.replace(' ', '\" \"')
    word = urllib.parse.quote('\"' + word + '\"')
    url = 'https://gitlab.com/explore/projects?utf8=%E2%9C%93&name=' + word + '&sort=latest_activity_desc'
    result = requests.get(url, timeout=10)
    statuscode = result.status_code
    root = lxml.html.fromstring(result.text)
    codes = []
    for a in root.xpath('//div/a[@class="project"]'):
  #    name = a.text_content()
      link = a.get('href')
      codes.append(link)
    return codes, statuscode
  except:
    return [], -1 
Example #22
Source File: search_api.py    From CodeScraper with MIT License    6 votes
def googleCustomSearch(word, engine_id, api_key):
  try:
    if word.find(' ') > 0:
      word.replace(' ', '\" \"')
    word = urllib.parse.quote('\"' + word + '\"')
    headers = {"content-type": "application/json"}
    url = 'https://www.googleapis.com/customsearch/v1?key=' + api_key + '&rsz=filtered_cse&num=10&hl=en&prettyPrint=false&cx=' + engine_id + '&q=' + word + '&sort=date'
    result = requests.get(url, timeout=10, headers=headers)
    statuscode = result.status_code
    codes = {}
    if statuscode == 200:
      jsondata = result.json()
      if 'items' in jsondata.keys():
        for item in jsondata['items']:
          name = item['title']
          sub = item['snippet']
          link = item['link']
          codes[link] = [name, sub]
    return codes, statuscode
  except:
    return {}, -1 
Example #23
Source File: search_api.py    From CodeScraper with MIT License    6 votes
def getRSSFeeds(url, lastpost):
  try:
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    response = requests.get(url, timeout=10, headers=headers)
    updateditems = []
    statuscode = response.status_code
    if statuscode == 200:
      rss = feedparser.parse(response.text)
      result = parseRSS(rss['entries'])
      for entry in result:
        if entry['link'] == lastpost['link']:
          break
        else:
          if entry['timestamp'] != None and lastpost['timestamp'] != None:
            if datetime.datetime.strptime(entry['timestamp'], '%Y-%m-%d %H:%M:%S') < datetime.datetime.strptime(lastpost['timestamp'], '%Y-%m-%d %H:%M:%S'):
              break
          updateditems.append(entry)
    return updateditems, statuscode
  except:
    return [], -1 
Example #24
Source File: getCommand.py    From CodeScraper with MIT License    6 votes
def checkRSSUrl(url):
  try:
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    response = requests.get(url, timeout=4, headers=headers)
    rss = feedparser.parse(response.text)
    rssurl = None
    if rss['version'] == 'rss10' or rss['version'] == 'rss20' or rss['version'] == 'atom10':
      rssurl = url
    else:
      root = lxml.html.fromstring(response.text)
      for link in root.xpath('//link[@type="application/rss+xml"]'):
        url = link.get('href')
      rss = feedparser.parse(url)
      if rss['version'] == 'rss10' or rss['version'] == 'rss20' or rss['version'] == 'atom10':
        rssurl = url
    return rssurl
  except:
    return None 
Example #25
Source File: test_reuse_frontend.py    From udata with GNU Affero General Public License v3.0    6 votes
def test_recent_feed_owner(self):
        owner = UserFactory()
        ReuseFactory(owner=owner, datasets=[DatasetFactory()])

        response = self.get(url_for('reuses.recent_feed'))

        self.assert200(response)

        feed = feedparser.parse(response.data)

        self.assertEqual(len(feed.entries), 1)
        entry = feed.entries[0]
        self.assertEqual(len(entry.authors), 1)
        author = entry.authors[0]
        self.assertEqual(author.name, owner.fullname)
        self.assertEqual(author.href,
                         self.full_url('users.show', user=owner.id)) 
Example #26
Source File: test_dataset_frontend.py    From udata with GNU Affero General Public License v3.0    6 votes
def test_recent_feed_owner(self):
        owner = UserFactory()
        DatasetFactory(owner=owner, resources=[ResourceFactory()])

        response = self.get(url_for('datasets.recent_feed'))

        self.assert200(response)

        feed = feedparser.parse(response.data)

        self.assertEqual(len(feed.entries), 1)
        entry = feed.entries[0]
        self.assertEqual(len(entry.authors), 1)
        author = entry.authors[0]
        self.assertEqual(author.name, owner.fullname)
        self.assertEqual(author.href,
                         self.full_url('users.show', user=owner.id)) 
Example #27
Source File: rss.py    From EmiliaHikari with GNU General Public License v3.0    6 votes
def remove_url(update, context):
    args = context.args
    if len(args) >= 1:
        tg_chat_id = str(update.effective_chat.id)

        tg_feed_link = args[0]

        link_processed = parse(tg_feed_link)

        if link_processed.bozo == 0:
            user_data = sql.check_url_availability(tg_chat_id, tg_feed_link)

            if user_data:
                sql.remove_url(tg_chat_id, tg_feed_link)

                send_message(update.effective_message, tl(update.effective_message, "URL dihapus dari langganan"))  # "URL removed from subscription"
            else:
                send_message(update.effective_message, tl(update.effective_message, "Anda belum berlangganan ke URL ini"))  # "You haven't subscribed to this URL yet"
        else:
            send_message(update.effective_message, tl(update.effective_message, "Tautan ini bukan tautan Umpan RSS"))  # "This link is not an RSS feed link"
    else:
        send_message(update.effective_message, tl(update.effective_message, "URL hilang"))  # "URL missing"
Example #28
Source File: raw_parser.py    From rssant with BSD 3-Clause "New" or "Revised" License    5 votes
def _load_json(self, response: FeedResponse) -> dict:
        try:
            text = response.content.decode(response.encoding)
        except UnicodeDecodeError as ex:
            raise FeedParserError("Unicode decode error: {}".format(ex)) from ex
        try:
            data = json.loads(text)
        except json.JSONDecodeError as ex:
            raise FeedParserError("JSON parse error: {}".format(ex)) from ex
        return data 
Example #29
Source File: IdentifyingTopic.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License    5 votes
def getDocuments(self):
        url = 'https://sports.yahoo.com/mlb/rss.xml'
        feed = feedparser.parse(url)
        self.documents = []
        for entry in feed['entries'][:5]:
            text = entry['summary']
            if 'ex' in text:
                continue
            self.documents.append(text)
            print("-- {}".format(text))
        print("INFO: Fetching documents from {} completed".format(url)) 
Example #30
Source File: deals.py    From CloudBot with GNU General Public License v3.0    5 votes
def slickdeals():
    """- List the top 3 frontpage slickdeals.net deals."""
    url = "https://slickdeals.net/newsearch.php?mode=frontpage&searcharea=deals&searchin=first&rss=1"

    feed = feedparser.parse(url)
    items = (
        "{} ({})".format(item.title, web.try_shorten(item.link))
        for item in feed.entries[:3]
    )

    out = "slickdeals.net: " + ' \u2022 '.join(items)

    return out