Python feedparser.parse() Examples

The following are 30 code examples of feedparser.parse(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module feedparser, or try the search function.
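For orientation, here is a minimal usage sketch (the feed URL below is only a placeholder, not a real feed): feedparser.parse() accepts a URL, a local file path, or a string of feed XML, and returns a dictionary-like result whose feed attribute holds feed-level metadata and whose entries attribute is the list of items.

import feedparser

# Placeholder URL for illustration only; a file path or a raw XML string
# can be passed to parse() just as well.
d = feedparser.parse("https://example.com/feed.xml")

print(d.feed.get("title"))            # feed-level metadata
for entry in d.entries[:3]:           # entries behave like dictionaries
    print(entry.get("title"), entry.get("link"))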
Example #1
Source File: Feed.py    From python-in-practice with GNU General Public License v3.0    8 votes
def _parse(data, limit):
        output = []
        feed = feedparser.parse(data) # Atom + RSS
        for entry in feed["entries"]:
            title = entry.get("title")
            link = entry.get("link")
            if title:
                if link:
                    output.append('<li><a href="{}">{}</a></li>'.format(
                            link, escape(title)))
                else:
                    output.append('<li>{}</li>'.format(escape(title)))
            if limit and len(output) == limit:
                break
        if output:
            return ["<ul>"] + output + ["</ul>"] 
Example #2
Source File: twitterbot.py    From twitterbot with GNU General Public License v2.0    7 votes
def read_rss_and_tweet(url: str):
    """Read RSS and post feed items as a tweet.

    Parameters
    ----------
    url: str
        URL to RSS feed.
    """
    feed = feedparser.parse(url)
    if feed:
        for item in feed["items"]:
            link = item["link"]
            if is_in_logfile(link, Settings.posted_urls_output_file):
                print("Already posted:", link)
            else:
                post_tweet(message=compose_message(item))
                write_to_logfile(link, Settings.posted_urls_output_file)
                print("Posted:", link)
    else:
        print("Nothing found in feed", url) 
Example #3
Source File: redirects.py    From janeway with GNU Affero General Public License v3.0    7 votes
def handle(self, *args, **options):
        """Delete duplicate named settings in the core.Setting model.

        :param args: None
        :param options: None
        :return: None
        """
        paged = 1

        while True:
            d = feedparser.parse('https://about.openlibhums.org/feed/?paged=%s' % paged)

            for item in d['entries']:
                try:
                    news_item = comms_models.NewsItem.objects.get(title=item['title'])
                    if news_item:
                        reversal = reverse('core_news_item', kwargs={'news_pk': news_item.pk})
                        url_path = urlparse(item['link']).path

                        print("Redirect 301 {0} {1}".format(url_path, reversal))
                except BaseException as e:
                    print(e)

            # NOTE: this break exits after the first page, so the increment
            # below is never reached as written.
            break
            paged = paged + 1
Example #4
Source File: ukweather.py    From scroll-phat with MIT License    7 votes
def get_wet():
    # Get the weather data
    print("Updating weather for", postcode)
    d = feedparser.parse(url)
    entries = len(d['entries'])
    val = "        " + d['entries'][0]['title']
    val += "        " + d['entries'][1]['title']
    val += "        " + d['entries'][2]['title']
    # Tidy & shorten the message for the scroll display
    val = val.replace("Maximum", "Max")
    val = val.replace("Minimum", "Min")
    val = val.replace("Temperature: ", "")
    val = val.replace(u"\u00B0", "")
    val = val.replace(",", "")
    val = val.replace("(", "")
    val = val.replace(")", "")
    return val 
Example #5
Source File: papersbot.py    From PapersBot with MIT License    7 votes
def findImage(entry):
    if "description" not in entry:
        return

    soup = bs4.BeautifulSoup(entry.description, "html.parser")
    img = soup.find("img")
    if img:
        img = img["src"]
        if len(img) == 0:
            return
        # If address is relative, append root URL
        if img[0] == "/":
            p = urllib.parse.urlparse(entry.id)
            img = f"{p.scheme}://{p.netloc}" + img

    return img


# Convert string from HTML to plain text 
Example #6
Source File: test_dataset_frontend.py    From udata with GNU Affero General Public License v3.0    6 votes
def test_recent_feed_org(self):
        owner = UserFactory()
        org = OrganizationFactory()
        DatasetFactory(
            owner=owner, organization=org, resources=[ResourceFactory()])

        response = self.get(url_for('datasets.recent_feed'))

        self.assert200(response)

        feed = feedparser.parse(response.data)

        self.assertEqual(len(feed.entries), 1)
        entry = feed.entries[0]
        self.assertEqual(len(entry.authors), 1)
        author = entry.authors[0]
        self.assertEqual(author.name, org.name)
        self.assertEqual(author.href,
                         self.full_url('organizations.show', org=org.id)) 
Example #7
Source File: runescape.py    From Squid-Plugins with MIT License    6 votes
def alog(self, *, username):
        """Gets a users recent adventure log"""
        username = username.replace(" ", "_")
        if feedparser is None:
            await self.bot.say("You'll need to run `pip3 install feedparser` "
                               "before you can get a user's adventure log.")
            return
        url = self.alog_url + username
        try:
            page = await aiohttp.get(url)
            text = await page.text()
            text = text.replace("\r", "")
        except Exception:
            await self.bot.say("No user found.")
            return

        feed = feedparser.parse(text)
        titles = [post.title for post in feed.entries]

        await self.bot.say(self._fmt_alog(username, titles)) 
Example #8
Source File: core.py    From SecRss with GNU General Public License v3.0    6 votes
def get_rss_title_and_url(self):
        """
        Fetch RSS titles and URLs according to the configured rules.
        :return:
        """
        try:
            rss_data = json.load(open(BASE_PATH + "/core/data.json", "r", encoding="utf-8"))
            for item in rss_data:
                rss = feedparser.parse(requests.get(item['rss']).content)['entries']
                push_new_list = {"source": item, "target": []}

                for it in rss:
                    datetime_struct = parser.parse(it['published'])
                    published = datetime_struct.strftime("%Y-%m-%d")

                    today = time.strftime("%Y-%m-%d")

                    if today == published:
                        if item['has_content'] in it['title']:
                            push_new_list["target"].append(it)
                self.news_list.append(push_new_list)
        except Exception as e:
            logger.warn("获取RSS标题和URL异常:" + str(e)) 
Example #9
Source File: feeds.py    From CloudBot with GNU General Public License v3.0    6 votes
def rss(text):
    """<feed> - Gets the first three items from the RSS/ATOM feed <feed>."""
    t = text.lower().strip()
    if t in ALIASES:
        alias = ALIASES[t]
        addr = alias.url
        limit = alias.limit
    else:
        addr = text
        limit = 3

    feed = feedparser.parse(addr)
    if not feed.entries:
        return "Feed not found."

    out = []
    for item in feed.entries[:limit]:
        out.append(format_item(item))

    if 'title' in feed.feed:
        start = "\x02{}\x02: ".format(feed.feed.title)
    else:
        start = ""

    return start + ", ".join(out) 
Example #10
Source File: rss.py    From omnibus with MIT License    6 votes
def run(self):
        try:
            feed = feedparser.parse(self.url)

            for idx, item in enumerate(feed['entries']):
                if idx == 19:
                    break

                else:
                    self.results.append({
                        'url': item['link'],  # feedparser entries expose the item URL under 'link'
                        'title': item['title']
                    })

        except Exception as err:
            warning('Caught exception in module (%s)' % str(err)) 
Example #11
Source File: feedhandler.py    From telegram-robot-rss with Mozilla Public License 2.0    6 votes
def is_parsable(url):
        """
        Checks whether the given URL provides a news feed. Returns True if news entries are available, else False.
        """

        url_pattern = re.compile(r"((http(s?))):\/\/.*")
        if not url_pattern.match(url):
            return False

        feed = feedparser.parse(url)

        # Check if result is empty
        if not feed.entries:
            return False
        # Check if entries provide updated attribute
        for post in feed.entries:
            if not hasattr(post, "updated"):
                return False
        return True 
Example #12
Source File: lambda_function.py    From amazon-polly-sample with Apache License 2.0    6 votes
def get_entries(feed):
    NEW_POST = u"""New post, author {author}, title {title} {content}"""
    for entry in feed.entries:
        if "http" in entry.id:
            # NOTE: on Python 3 this requires bytes, e.g. hashlib.md5(str(entry.id).encode())
            nid = hashlib.md5(str(entry.id))
            entry.id = nid.hexdigest()
        entry_content = entry.content[0].value
        soup = BeautifulSoup(entry_content, 'html.parser')
        chunks = split_content_by_dot(soup, REQUEST_LIMIT-len(NEW_POST))
        chunks = list(chunks)
        published = dateutil.parser.parse(entry.published)
        for i, chunk in enumerate(chunks):
            if i == 0:
                chunk = NEW_POST.format(
                        author=entry.author,
                        title=entry.title,
                        content=chunk)
            yield dict(
                content=chunk,
                id="%s_%d" % (entry.id, i),
                title=entry.title,
                published=published - datetime.timedelta(0, i),
            )
            remaining = chunk 
Example #13
Source File: papersbot.py    From PapersBot with MIT License    6 votes
def run(self):
        for feed in self.feeds:
            parsed_feed = feedparser.parse(feed)
            for entry in parsed_feed.entries:
                if entryMatches(entry):
                    self.n_seen += 1
                    # If no ID provided, use the link as ID
                    if "id" not in entry:
                        entry.id = entry.link
                    if entry.id not in self.posted:
                        self.sendTweet(entry)
                        # Bail out if we have reached max number of tweets
                        if self.throttle > 0 and self.n_tweeted >= self.throttle:
                            print(f"Max number of papers met ({self.throttle}), stopping now")
                            return

    # Print statistics of a given run 
Example #14
Source File: smartmirror.py    From Smart-Mirror with MIT License    6 votes
def get_headlines(self):
        try:
            # remove all children
            for widget in self.headlinesContainer.winfo_children():
                widget.destroy()
            if news_country_code is None:
                headlines_url = "https://news.google.com/news?ned=us&output=rss"
            else:
                headlines_url = "https://news.google.com/news?ned=%s&output=rss" % news_country_code

            feed = feedparser.parse(headlines_url)

            for post in feed.entries[0:5]:
                headline = NewsHeadline(self.headlinesContainer, post.title)
                headline.pack(side=TOP, anchor=W)
        except Exception as e:
            traceback.print_exc()
            print "Error: %s. Cannot get news." % e

        self.after(600000, self.get_headlines) 
Example #15
Source File: parsers.py    From critics with BSD 3-Clause "New" or "Revised" License    6 votes
def get_ios_reviews(app_id, language, limit=100):
    url = 'https://itunes.apple.com/%(language)srss/customerreviews/id=%(app_id)s/sortBy=mostRecent/xml' % {
        'language': '%s/' % language if language else '', 'app_id': app_id}
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'},
                            timeout=timeout)
    response.encoding = 'utf-8'  # avoid chardet not guessing correctly
    feed = feedparser.parse(response.text)
    reviews = [Review(
        id=entry.id,
        platform='ios',
        title=entry.title,
        rating=int(entry.im_rating),
        summary=entry.summary,
        url=entry.href,
        author=entry.author,  # author url: entry.href
        date=datetime.datetime.fromtimestamp(mktime(entry.updated_parsed)),
        language=language,
        version=entry.im_version
    ) for entry in feed['entries'][1:1 + limit]]
    return reviews 
Example #16
Source File: __init__.py    From chirp with MIT License    6 votes
def process_all_rss(reprocess=False):
    """Gather all RSS feeds and articles, then process."""
    sources = list()
    logger.debug("Collecting sources")
    monitors = mongo.db[app.config['MONITORS_COLLECTION']]
    for item in monitors.find({'active': True}):
        sources.append(item['metadata'].get('rss_link'))

    contents = [feedparser.parse(x) for x in sources]
    logger.debug("Processing sources")
    for source in contents:
        for idx, item in enumerate(source.get('entries')):
            response = get_article(item, source['href'], reprocess)
            if response['from_store'] or reprocess:
                continue
            clean_link = response['article']['feed_source']
            monitors.update({'metadata.rss_link': clean_link},
                            {'$set': {'checked': now_time()}})
    correct_counts() 
Example #17
Source File: rss.py    From SkittBot with GNU General Public License v3.0    6 votes
def remove_url(bot, update, args):
    if len(args) >= 1:
        tg_chat_id = str(update.effective_chat.id)

        tg_feed_link = args[0]

        link_processed = parse(tg_feed_link)

        if link_processed.bozo == 0:
            user_data = sql.check_url_availability(tg_chat_id, tg_feed_link)

            if user_data:
                sql.remove_url(tg_chat_id, tg_feed_link)

                update.effective_message.reply_text("Removed URL from subscription")
            else:
                update.effective_message.reply_text("You haven't subscribed to this URL yet")
        else:
            update.effective_message.reply_text("This link is not an RSS Feed link")
    else:
        update.effective_message.reply_text("URL missing") 
Example #18
Source File: atom.py    From oh-my-rss with MIT License    6 votes
def parse_self_atom(feed_url):
    """
    Parse an RSS feed served by this site.
    :param feed_url:
    :return: a dict on success; None on failure
    """

    feed_path = urllib.parse.urlparse(feed_url).path

    try:
        name = resolve(feed_path).kwargs.get('name')
    except:
        name = None

    if name:
        try:
            Site.objects.get(name=name, status='active')
            return {"name": name}
        except:
            logger.warning(f'Feed source does not exist: `{feed_url}')

    return None 
Example #19
Source File: search_api.py    From CodeScraper with MIT License    6 votes
def searchGithub(word, day, level):
  searchlevel = {
    1: ['in:name,description', 'created'],
    2: ['in:name,description,readme', 'created'],
    3: ['in:name,description', 'pushed'],
    4: ['in:name,description,readme', 'pushed']}
  github_url = 'https://api.github.com/search/repositories?q='
  try:
    if word.find(' ') > 0:
      word.replace(' ', '\" \"')
    word = urllib.parse.quote('\"' + word + '\"')
    url = github_url + word + '+' + searchlevel[level][0] + '+' + searchlevel[level][1] + ':>' + day + '&s=updated&o=asc'
    headers = {"Accept": "application/vnd.github.mercy-preview+json"}
    result = requests.get(url, timeout=10, headers=headers)
    statuscode = result.status_code
    resultdata = result.json()
    codes = []
    for a in resultdata['items']:
      name = a['full_name']
      if a['size'] > 0:
        codes.append(name)
    return codes, statuscode
  except:
    return [], -1 
Example #20
Source File: search_api.py    From CodeScraper with MIT License    6 votes
def searchGist(word, day):
  if word.find(' ') > 0:
    word.replace(' ', '\" \"')
  word = urllib.parse.quote('\"' + word + '\"')
  url = 'https://gist.github.com/search?utf8=%E2%9C%93&q=' + word + '+created%3A>' + day + '&ref=searchresults'
  try:
    result = requests.get(url, timeout=10)
    statuscode = result.status_code
    root = lxml.html.fromstring(result.text)
    codes = []
    for a in root.xpath('//div/a[@class="link-overlay"]'):
#      name = a.text_content()
      link = a.get('href')
      codes.append(link)
    return codes, statuscode
  except:
    return [], -1 
Example #21
Source File: search_api.py    From CodeScraper with MIT License    6 votes
def searchGitlab(word):
  try:
    if word.find(' ') > 0:
      word.replace(' ', '\" \"')
    word = urllib.parse.quote('\"' + word + '\"')
    url = 'https://gitlab.com/explore/projects?utf8=%E2%9C%93&name=' + word + '&sort=latest_activity_desc'
    result = requests.get(url, timeout=10)
    statuscode = result.status_code
    root = lxml.html.fromstring(result.text)
    codes = []
    for a in root.xpath('//div/a[@class="project"]'):
  #    name = a.text_content()
      link = a.get('href')
      codes.append(link)
    return codes, statuscode
  except:
    return [], -1 
Example #22
Source File: search_api.py    From CodeScraper with MIT License    6 votes
def googleCustomSearch(word, engine_id, api_key):
  try:
    if word.find(' ') > 0:
      word.replace(' ', '\" \"')
    word = urllib.parse.quote('\"' + word + '\"')
    headers = {"content-type": "application/json"}
    url = 'https://www.googleapis.com/customsearch/v1?key=' + api_key + '&rsz=filtered_cse&num=10&hl=en&prettyPrint=false&cx=' + engine_id + '&q=' + word + '&sort=date'
    result = requests.get(url, timeout=10, headers=headers)
    statuscode = result.status_code
    codes = {}
    if statuscode == 200:
      jsondata = result.json()
      if 'items' in jsondata.keys():
        for item in jsondata['items']:
          name = item['title']
          sub = item['snippet']
          link = item['link']
          codes[link] = [name, sub]
    return codes, statuscode
  except:
    return {}, -1 
Example #23
Source File: search_api.py    From CodeScraper with MIT License    6 votes
def getRSSFeeds(url, lastpost):
  try:
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    response = requests.get(url, timeout=10, headers=headers)
    updateditems = []
    statuscode = response.status_code
    if statuscode == 200:
      rss = feedparser.parse(response.text)
      result = parseRSS(rss['entries'])
      for entry in result:
        if entry['link'] == lastpost['link']:
          break
        else:
          if entry['timestamp'] != None and lastpost['timestamp'] != None:
            if datetime.datetime.strptime(entry['timestamp'], '%Y-%m-%d %H:%M:%S') < datetime.datetime.strptime(lastpost['timestamp'], '%Y-%m-%d %H:%M:%S'):
              break
          updateditems.append(entry)
    return updateditems, statuscode
  except:
    return [], -1 
Example #24
Source File: getCommand.py    From CodeScraper with MIT License    6 votes
def checkRSSUrl(url):
  try:
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    response = requests.get(url, timeout=4, headers=headers)
    rss = feedparser.parse(response.text)
    rssurl = None
    if rss['version'] == 'rss10' or rss['version'] == 'rss20' or rss['version'] == 'atom10':
      rssurl = url
    else:
      root = lxml.html.fromstring(response.text)
      for link in root.xpath('//link[@type="application/rss+xml"]'):
        url = link.get('href')
      rss = feedparser.parse(url)
      if rss['version'] == 'rss10' or rss['version'] == 'rss20' or rss['version'] == 'atom10':
        rssurl = url
    return rssurl
  except:
    return None 
Example #25
Source File: test_reuse_frontend.py    From udata with GNU Affero General Public License v3.0    6 votes
def test_recent_feed_owner(self):
        owner = UserFactory()
        ReuseFactory(owner=owner, datasets=[DatasetFactory()])

        response = self.get(url_for('reuses.recent_feed'))

        self.assert200(response)

        feed = feedparser.parse(response.data)

        self.assertEqual(len(feed.entries), 1)
        entry = feed.entries[0]
        self.assertEqual(len(entry.authors), 1)
        author = entry.authors[0]
        self.assertEqual(author.name, owner.fullname)
        self.assertEqual(author.href,
                         self.full_url('users.show', user=owner.id)) 
Example #26
Source File: test_dataset_frontend.py    From udata with GNU Affero General Public License v3.0    6 votes
def test_recent_feed_owner(self):
        owner = UserFactory()
        DatasetFactory(owner=owner, resources=[ResourceFactory()])

        response = self.get(url_for('datasets.recent_feed'))

        self.assert200(response)

        feed = feedparser.parse(response.data)

        self.assertEqual(len(feed.entries), 1)
        entry = feed.entries[0]
        self.assertEqual(len(entry.authors), 1)
        author = entry.authors[0]
        self.assertEqual(author.name, owner.fullname)
        self.assertEqual(author.href,
                         self.full_url('users.show', user=owner.id)) 
Example #27
Source File: rss.py    From EmiliaHikari with GNU General Public License v3.0    6 votes
def remove_url(update, context):
    args = context.args
    if len(args) >= 1:
        tg_chat_id = str(update.effective_chat.id)

        tg_feed_link = args[0]

        link_processed = parse(tg_feed_link)

        if link_processed.bozo == 0:
            user_data = sql.check_url_availability(tg_chat_id, tg_feed_link)

            if user_data:
                sql.remove_url(tg_chat_id, tg_feed_link)

                send_message(update.effective_message, tl(update.effective_message, "URL dihapus dari langganan"))  # "URL removed from subscription"
            else:
                send_message(update.effective_message, tl(update.effective_message, "Anda belum berlangganan ke URL ini"))  # "You haven't subscribed to this URL yet"
        else:
            send_message(update.effective_message, tl(update.effective_message, "Tautan ini bukan tautan Umpan RSS"))  # "This link is not an RSS feed link"
    else:
        send_message(update.effective_message, tl(update.effective_message, "URL hilang"))  # "URL missing"
Example #28
Source File: raw_parser.py    From rssant with BSD 3-Clause "New" or "Revised" License    5 votes
def _load_json(self, response: FeedResponse) -> dict:
        try:
            text = response.content.decode(response.encoding)
        except UnicodeDecodeError as ex:
            raise FeedParserError("Unicode decode error: {}".format(ex)) from ex
        try:
            data = json.loads(text)
        except json.JSONDecodeError as ex:
            raise FeedParserError("JSON parse error: {}".format(ex)) from ex
        return data 
Example #29
Source File: IdentifyingTopic.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License    5 votes
def getDocuments(self):
        url = 'https://sports.yahoo.com/mlb/rss.xml'
        feed = feedparser.parse(url)
        self.documents = []
        for entry in feed['entries'][:5]:
            text = entry['summary']
            if 'ex' in text:
                continue
            self.documents.append(text)
            print("-- {}".format(text))
        print("INFO: Fetching documents from {} completed".format(url)) 
Example #30
Source File: deals.py    From CloudBot with GNU General Public License v3.0    5 votes
def slickdeals():
    """- List the top 3 frontpage slickdeals.net deals."""
    url = "https://slickdeals.net/newsearch.php?mode=frontpage&searcharea=deals&searchin=first&rss=1"

    feed = feedparser.parse(url)
    items = (
        "{} ({})".format(item.title, web.try_shorten(item.link))
        for item in feed.entries[:3]
    )

    out = "slickdeals.net: " + ' \u2022 '.join(items)

    return out