Python feedparser.parse() Examples
The following are 30 code examples of feedparser.parse().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the feedparser module, or try the search function.
Example #1
Source File: Feed.py From python-in-practice with GNU General Public License v3.0 | 8 votes |
def _parse(data, limit):
    """Render feed entry titles (and links, when present) as an HTML list.

    Returns ["<ul>", <li fragments...>, "</ul>"] or None when the feed
    yields no usable entries. `limit` of 0/None means unlimited.
    """
    parsed = feedparser.parse(data)  # handles both Atom and RSS
    items = []
    for entry in parsed["entries"]:
        title = entry.get("title")
        link = entry.get("link")
        if title:
            if link:
                fragment = '<li><a href="{}">{}</a></li>'.format(
                    link, escape(title))
            else:
                fragment = '<li>{}</li>'.format(escape(title))
            items.append(fragment)
        if limit and len(items) == limit:
            break
    return ["<ul>"] + items + ["</ul>"] if items else None
Example #2
Source File: twitterbot.py From twitterbot with GNU General Public License v2.0 | 7 votes |
def read_rss_and_tweet(url: str):
    """Read RSS and post feed items as a tweet.

    Parameters
    ----------
    url: str
        URL to RSS feed.
    """
    feed = feedparser.parse(url)
    # feedparser.parse() always returns a (truthy) FeedParserDict, so the
    # old `if feed:` made the "Nothing found" branch unreachable; test the
    # item list itself instead.
    if feed and feed["items"]:
        for item in feed["items"]:
            link = item["link"]
            if is_in_logfile(link, Settings.posted_urls_output_file):
                print("Already posted:", link)
            else:
                post_tweet(message=compose_message(item))
                write_to_logfile(link, Settings.posted_urls_output_file)
                print("Posted:", link)
    else:
        print("Nothing found in feed", url)
Example #3
Source File: redirects.py From janeway with GNU Affero General Public License v3.0 | 7 votes |
def handle(self, *args, **options):
    """Print Apache "Redirect 301" rules mapping upstream blog post URLs to
    local news item URLs, walking the paginated upstream feed.

    NOTE(review): the original docstring ("Delete duplicate named settings
    in the core.Setting model") appears to be a copy/paste artifact from
    another management command; the body only prints redirect rules.

    :param args: None
    :param options: None
    :return: None
    """
    paged = 1
    while True:
        # Fetch one page of the upstream WordPress feed at a time.
        d = feedparser.parse('https://about.openlibhums.org/feed/?paged=%s' % paged)
        for item in d['entries']:
            try:
                news_item = comms_models.NewsItem.objects.get(title=item['title'])
                if news_item:
                    reversal = reverse('core_news_item', kwargs={'news_pk': news_item.pk})
                    url_path = urlparse(item['link']).path
                    print("Redirect 301 {0} {1}".format(url_path, reversal))
            # NOTE(review): BaseException also swallows KeyboardInterrupt /
            # SystemExit — consider narrowing to Exception. The break only
            # exits the inner for loop; the while has no exit condition,
            # so termination presumably relies on an empty/failed page —
            # confirm against the original project.
            except BaseException as e:
                print(e)
                break
        paged = paged + 1
Example #4
Source File: ukweather.py From scroll-phat with MIT License | 7 votes |
def get_wet():
    """Fetch the weather feed and build a short message for the scroll display.

    Reads module-level `postcode` and `url`; returns the first three entry
    titles joined and abbreviated to fit the display.
    """
    # Get the weather data
    print("Updating weather for", postcode)
    d = feedparser.parse(url)
    # (removed unused local `entries = int(len(d['entries']))`)
    val = " " + d['entries'][0]['title']
    val += " " + d['entries'][1]['title']
    val += " " + d['entries'][2]['title']
    # Tidy & shorten the message for the scroll display
    val = val.replace("Maximum", "Max")
    val = val.replace("Minimum", "Min")
    val = val.replace("Temperature: ", "")
    val = val.replace(u"\u00B0", "")  # strip degree signs
    val = val.replace(",", "")
    val = val.replace("(", "")
    val = val.replace(")", "")
    return val
Example #5
Source File: papersbot.py From PapersBot with MIT License | 7 votes |
def findImage(entry):
    """Return the URL of the first <img> in the entry's description, or None.

    Relative image paths are made absolute using the scheme and host of
    the entry id.
    """
    if "description" not in entry:
        return
    markup = bs4.BeautifulSoup(entry.description, "html.parser")
    tag = markup.find("img")
    if not tag:
        return
    src = tag["src"]
    if len(src) == 0:
        return
    # If address is relative, append root URL
    if src[0] == "/":
        parts = urllib.parse.urlparse(entry.id)
        src = f"{parts.scheme}://{parts.netloc}" + src
    return src
Example #6
Source File: test_dataset_frontend.py From udata with GNU Affero General Public License v3.0 | 6 votes |
def test_recent_feed_org(self):
    """The recent-datasets feed credits the organization as the entry author."""
    owner = UserFactory()
    org = OrganizationFactory()
    DatasetFactory(owner=owner, organization=org,
                   resources=[ResourceFactory()])

    response = self.get(url_for('datasets.recent_feed'))
    self.assert200(response)

    feed = feedparser.parse(response.data)
    self.assertEqual(len(feed.entries), 1)

    entry = feed.entries[0]
    self.assertEqual(len(entry.authors), 1)

    author = entry.authors[0]
    self.assertEqual(author.name, org.name)
    self.assertEqual(author.href,
                     self.full_url('organizations.show', org=org.id))
Example #7
Source File: runescape.py From Squid-Plugins with MIT License | 6 votes |
async def alog(self, *, username):
    """Gets a users recent adventure log"""
    # `async def` is required: the body awaits coroutines (the flattened
    # original showed a plain `def`, which cannot contain `await`).
    username = username.replace(" ", "_")
    if feedparser is None:
        await self.bot.say("You'll need to run `pip3 install feedparser` "
                           "before you can get a user's adventure log.")
        return
    url = self.alog_url + username
    try:
        page = await aiohttp.get(url)
        text = await page.text()
        text = text.replace("\r", "")
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed SystemExit).
        # `return` added: without it, `text` is undefined below and the
        # user would get a NameError instead of the friendly message.
        await self.bot.say("No user found.")
        return
    feed = feedparser.parse(text)
    titles = [post.title for post in feed.entries]
    await self.bot.say(self._fmt_alog(username, titles))
Example #8
Source File: core.py From SecRss with GNU General Public License v3.0 | 6 votes |
def get_rss_title_and_url(self):
    """Collect today's matching entries from each configured RSS feed.

    Reads feed definitions from core/data.json, keeps entries published
    today whose title contains the feed's `has_content` marker, and
    appends a {"source": ..., "target": [...]} dict to self.news_list.
    """
    try:
        rss_data = json.load(open(BASE_PATH + "/core/data.json", "r", encoding="utf-8"))
        for item in rss_data:
            rss = feedparser.parse(requests.get(item['rss']).content)['entries']
            push_new_list = {"source": item, "target": []}
            for it in rss:
                datetime_struct = parser.parse(it['published'])
                published = datetime_struct.strftime("%Y-%m-%d")
                today = time.strftime("%Y-%m-%d")
                # Keep only entries published today that match the filter.
                if today == published:
                    if item['has_content'] in it['title']:
                        push_new_list["target"].append(it)
            self.news_list.append(push_new_list)
    except Exception as e:
        # logger.warn() is deprecated since Python 3.3; use warning().
        logger.warning("获取RSS标题和URL异常:" + str(e))
Example #9
Source File: feeds.py From CloudBot with GNU General Public License v3.0 | 6 votes |
def rss(text):
    """<feed> - Gets the first three items from the RSS/ATOM feed <feed>."""
    key = text.lower().strip()
    if key in ALIASES:
        alias = ALIASES[key]
        addr, limit = alias.url, alias.limit
    else:
        addr, limit = text, 3

    feed = feedparser.parse(addr)
    if not feed.entries:
        return "Feed not found."

    # Format at most `limit` entries for IRC output.
    out = [format_item(item) for item in feed.entries[:limit]]

    if 'title' in feed.feed:
        start = "\x02{}\x02: ".format(feed.feed.title)
    else:
        start = ""
    return start + ", ".join(out)
Example #10
Source File: rss.py From omnibus with MIT License | 6 votes |
def run(self):
    """Parse self.url as a feed and collect up to 19 entries into self.results."""
    try:
        feed = feedparser.parse(self.url)
        for idx, item in enumerate(feed['entries']):
            if idx == 19:
                # Cap output at 19 entries (indices 0-18).
                break
            else:
                self.results.append({
                    # NOTE(review): feedparser entries normally expose
                    # 'link', not 'url' — confirm 'url' exists here,
                    # otherwise this raises KeyError on the first entry
                    # (silently reported via the except below).
                    'url': item['url'],
                    'title': item['title']
                })
    except Exception as err:
        warning('Caught exception in module (%s)' % str(err))
Example #11
Source File: feedhandler.py From telegram-robot-rss with Mozilla Public License 2.0 | 6 votes |
def is_parsable(url):
    """Check whether the given url provides a news feed.

    Returns True only when the URL looks like http(s), the parsed feed
    has entries, and every entry carries an `updated` attribute.
    """
    url_pattern = re.compile("((http(s?))):\/\/.*")
    if not url_pattern.match(url):
        return False

    parsed = feedparser.parse(url)
    # An empty entry list means no usable news feed at this address.
    if not parsed.entries:
        return False
    # Every post must provide the `updated` attribute.
    return all(hasattr(post, "updated") for post in parsed.entries)
Example #12
Source File: lambda_function.py From amazon-polly-sample with Apache License 2.0 | 6 votes |
def get_entries(feed):
    """Yield speakable chunks for each entry in *feed*.

    Each yielded dict carries the chunk text, a stable per-chunk id, the
    entry title, and a published timestamp offset by the chunk index so
    chunk order is preserved when consumers sort by date.
    """
    NEW_POST = u"""New post, author {author}, title {title} {content}"""
    for entry in feed.entries:
        if "http" in entry.id:
            # URL-shaped ids are hashed into a short stable id.
            # md5 requires bytes on Python 3 — str(...) alone raised
            # TypeError; encode explicitly.
            nid = hashlib.md5(str(entry.id).encode("utf-8"))
            entry.id = nid.hexdigest()
        entry_content = entry.content[0].value
        soup = BeautifulSoup(entry_content, 'html.parser')
        # Leave headroom for the NEW_POST preamble on the first chunk.
        chunks = split_content_by_dot(soup, REQUEST_LIMIT - len(NEW_POST))
        chunks = list(chunks)
        published = dateutil.parser.parse(entry.published)
        for i, chunk in enumerate(chunks):
            if i == 0:
                chunk = NEW_POST.format(
                    author=entry.author, title=entry.title, content=chunk)
            yield dict(
                content=chunk,
                id="%s_%d" % (entry.id, i),
                title=entry.title,
                published=published - datetime.timedelta(0, i),
            )
        # (removed dead assignment `remaining = chunk` — value was never read)
Example #13
Source File: papersbot.py From PapersBot with MIT License | 6 votes |
def run(self):
    """Process every feed, tweeting unseen matching entries until the
    throttle limit is reached."""
    for feed_url in self.feeds:
        parsed = feedparser.parse(feed_url)
        for entry in parsed.entries:
            if not entryMatches(entry):
                continue
            self.n_seen += 1

            # If no ID provided, use the link as ID
            if "id" not in entry:
                entry.id = entry.link
            if entry.id in self.posted:
                continue

            self.sendTweet(entry)
            # Bail out if we have reached max number of tweets
            if self.throttle > 0 and self.n_tweeted >= self.throttle:
                print(f"Max number of papers met ({self.throttle}), stopping now")
                return
Example #14
Source File: smartmirror.py From Smart-Mirror with MIT License | 6 votes |
def get_headlines(self): try: # remove all children for widget in self.headlinesContainer.winfo_children(): widget.destroy() if news_country_code == None: headlines_url = "https://news.google.com/news?ned=us&output=rss" else: headlines_url = "https://news.google.com/news?ned=%s&output=rss" % news_country_code feed = feedparser.parse(headlines_url) for post in feed.entries[0:5]: headline = NewsHeadline(self.headlinesContainer, post.title) headline.pack(side=TOP, anchor=W) except Exception as e: traceback.print_exc() print "Error: %s. Cannot get news." % e self.after(600000, self.get_headlines)
Example #15
Source File: parsers.py From critics with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_ios_reviews(app_id, language, limit=100):
    """Fetch up to *limit* recent App Store reviews for *app_id*.

    The first feed entry describes the app itself, so consumption starts
    at index 1.
    """
    url = 'https://itunes.apple.com/%(language)srss/customerreviews/id=%(app_id)s/sortBy=mostRecent/xml' % {
        'language': '%s/' % language if language else '',
        'app_id': app_id}
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'},
        timeout=timeout)
    response.encoding = 'utf-8'  # avoid chardet not guessing correctly
    feed = feedparser.parse(response.text)

    reviews = []
    for entry in feed['entries'][1:1 + limit]:
        reviews.append(Review(
            id=entry.id,
            platform='ios',
            title=entry.title,
            rating=int(entry.im_rating),
            summary=entry.summary,
            url=entry.href,
            author=entry.author,  # author url: entry.href
            date=datetime.datetime.fromtimestamp(mktime(entry.updated_parsed)),
            language=language,
            version=entry.im_version,
        ))
    return reviews
Example #16
Source File: __init__.py From chirp with MIT License | 6 votes |
def process_all_rss(reprocess=False):
    """Gather all RSS feeds and articles, then process.

    Collects the rss_link of every active monitor, parses each feed, and
    stores/refreshes each article via get_article(). When an article came
    from the store (or reprocess is set), the monitor's `checked`
    timestamp is left untouched for that item.
    """
    sources = list()
    logger.debug("Collecting sources")
    monitors = mongo.db[app.config['MONITORS_COLLECTION']]
    for item in monitors.find({'active': True}):
        sources.append(item['metadata'].get('rss_link'))
    contents = [feedparser.parse(x) for x in sources]
    logger.debug("Processing sources")
    for source in contents:
        for idx, item in enumerate(source.get('entries')):
            response = get_article(item, source['href'], reprocess)
            if response['from_store'] or reprocess:
                continue
            clean_link = response['article']['feed_source']
            # NOTE(review): Collection.update() is deprecated in PyMongo 3+
            # in favor of update_one() — confirm the installed driver version.
            monitors.update({'metadata.rss_link': clean_link},
                            {'$set': {'checked': now_time()}})
    correct_counts()
Example #17
Source File: rss.py From SkittBot with GNU General Public License v3.0 | 6 votes |
def remove_url(bot, update, args):
    """Unsubscribe the current chat from the RSS feed URL given in args[0]."""
    if not args:
        update.effective_message.reply_text("URL missing")
        return

    tg_chat_id = str(update.effective_chat.id)
    tg_feed_link = args[0]

    # bozo != 0 means feedparser could not treat the link as a feed.
    link_processed = parse(tg_feed_link)
    if link_processed.bozo != 0:
        update.effective_message.reply_text("This link is not an RSS Feed link")
        return

    user_data = sql.check_url_availability(tg_chat_id, tg_feed_link)
    if not user_data:
        update.effective_message.reply_text("You haven't subscribed to this URL yet")
        return

    sql.remove_url(tg_chat_id, tg_feed_link)
    update.effective_message.reply_text("Removed URL from subscription")
Example #18
Source File: atom.py From oh-my-rss with MIT License | 6 votes |
def parse_self_atom(feed_url):
    """Parse an RSS feed URL served by this site.

    :param feed_url: full feed URL
    :return: dict with the site name on success; None on failure
    """
    feed_path = urllib.parse.urlparse(feed_url).path
    try:
        name = resolve(feed_path).kwargs.get('name')
    except Exception:
        # Narrowed from a bare `except:` (which also caught SystemExit etc.).
        name = None
    if name:
        try:
            Site.objects.get(name=name, status='active')
            return {"name": name}
        except Exception:
            # Site missing or inactive — log and fall through to None.
            logger.warning(f'订阅源不存在:`{feed_url}')
    return None
Example #19
Source File: search_api.py From CodeScraper with MIT License | 6 votes |
def searchGithub(word, day, level):
    """Search GitHub repositories for *word* created/pushed after *day*.

    :param level: 1-4, selects search fields and the date qualifier.
    :return: (list of non-empty repo full names, HTTP status); ([], -1) on error.
    """
    searchlevel = {
        1: ['in:name,description', 'created'],
        2: ['in:name,description,readme', 'created'],
        3: ['in:name,description', 'pushed'],
        4: ['in:name,description,readme', 'pushed']}
    github_url = 'https://api.github.com/search/repositories?q='
    try:
        if word.find(' ') > 0:
            # str.replace returns a new string; the original discarded the
            # result, so multi-word queries were never quoted per word.
            word = word.replace(' ', '\" \"')
        word = urllib.parse.quote('\"' + word + '\"')
        url = github_url + word + '+' + searchlevel[level][0] + '+' + searchlevel[level][1] + ':>' + day + '&s=updated&o=asc'
        headers = {"Accept": "application/vnd.github.mercy-preview+json"}
        result = requests.get(url, timeout=10, headers=headers)
        statuscode = result.status_code
        resultdata = result.json()
        codes = []
        for a in resultdata['items']:
            name = a['full_name']
            # Skip empty repositories.
            if a['size'] > 0:
                codes.append(name)
        return codes, statuscode
    except Exception:
        # Best-effort API: any failure reports ([], -1).
        # Narrowed from a bare except.
        return [], -1
Example #20
Source File: search_api.py From CodeScraper with MIT License | 6 votes |
def searchGist(word, day):
    """Search public gists created after *day* for *word*.

    :return: (list of gist links, HTTP status); ([], -1) on error.
    """
    if word.find(' ') > 0:
        # str.replace returns a new string; the original discarded the
        # result, so multi-word queries were never quoted per word.
        word = word.replace(' ', '\" \"')
    word = urllib.parse.quote('\"' + word + '\"')
    url = 'https://gist.github.com/search?utf8=%E2%9C%93&q=' + word + '+created%3A>' + day + '&ref=searchresults'
    try:
        result = requests.get(url, timeout=10)
        statuscode = result.status_code
        root = lxml.html.fromstring(result.text)
        codes = []
        for a in root.xpath('//div/a[@class="link-overlay"]'):
            # name = a.text_content()
            link = a.get('href')
            codes.append(link)
        return codes, statuscode
    except Exception:
        # Best-effort scrape: any failure reports ([], -1).
        # Narrowed from a bare except.
        return [], -1
Example #21
Source File: search_api.py From CodeScraper with MIT License | 6 votes |
def searchGitlab(word):
    """Search GitLab's explore page for projects matching *word*.

    :return: (list of project links, HTTP status); ([], -1) on error.
    """
    try:
        if word.find(' ') > 0:
            # str.replace returns a new string; the original discarded the
            # result, so multi-word queries were never quoted per word.
            word = word.replace(' ', '\" \"')
        word = urllib.parse.quote('\"' + word + '\"')
        url = 'https://gitlab.com/explore/projects?utf8=%E2%9C%93&name=' + word + '&sort=latest_activity_desc'
        result = requests.get(url, timeout=10)
        statuscode = result.status_code
        root = lxml.html.fromstring(result.text)
        codes = []
        for a in root.xpath('//div/a[@class="project"]'):
            # name = a.text_content()
            link = a.get('href')
            codes.append(link)
        return codes, statuscode
    except Exception:
        # Best-effort scrape: any failure reports ([], -1).
        # Narrowed from a bare except.
        return [], -1
Example #22
Source File: search_api.py From CodeScraper with MIT License | 6 votes |
def googleCustomSearch(word, engine_id, api_key):
    """Query the Google Custom Search API for *word*.

    :return: ({link: [title, snippet]}, HTTP status); ({}, -1) on error.
    """
    try:
        if word.find(' ') > 0:
            # str.replace returns a new string; the original discarded the
            # result, so multi-word queries were never quoted per word.
            word = word.replace(' ', '\" \"')
        word = urllib.parse.quote('\"' + word + '\"')
        headers = {"content-type": "application/json"}
        url = 'https://www.googleapis.com/customsearch/v1?key=' + api_key + '&rsz=filtered_cse&num=10&hl=en&prettyPrint=false&cx=' + engine_id + '&q=' + word + '&sort=date'
        result = requests.get(url, timeout=10, headers=headers)
        statuscode = result.status_code
        codes = {}
        if statuscode == 200:
            jsondata = result.json()
            if 'items' in jsondata.keys():
                for item in jsondata['items']:
                    name = item['title']
                    sub = item['snippet']
                    link = item['link']
                    codes[link] = [name, sub]
        return codes, statuscode
    except Exception:
        # Best-effort API: any failure reports ({}, -1).
        # Narrowed from a bare except.
        return {}, -1
Example #23
Source File: search_api.py From CodeScraper with MIT License | 6 votes |
def getRSSFeeds(url, lastpost):
    """Return feed entries newer than *lastpost*.

    Iterates the parsed entries in feed order and stops at the first
    entry whose link matches lastpost's, or whose timestamp is older.

    :return: (list of new entries, HTTP status); ([], -1) on error.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0'}
        response = requests.get(url, timeout=10, headers=headers)
        updateditems = []
        statuscode = response.status_code
        if statuscode == 200:
            rss = feedparser.parse(response.text)
            result = parseRSS(rss['entries'])
            for entry in result:
                if entry['link'] == lastpost['link']:
                    break
                # `is not None` instead of `!= None` (PEP 8).
                if entry['timestamp'] is not None and lastpost['timestamp'] is not None:
                    if datetime.datetime.strptime(entry['timestamp'], '%Y-%m-%d %H:%M:%S') < datetime.datetime.strptime(lastpost['timestamp'], '%Y-%m-%d %H:%M:%S'):
                        break
                updateditems.append(entry)
        return updateditems, statuscode
    except Exception:
        # Best-effort fetch: any failure reports ([], -1).
        # Narrowed from a bare except.
        return [], -1
Example #24
Source File: getCommand.py From CodeScraper with MIT License | 6 votes |
def checkRSSUrl(url):
    """Return a working RSS/Atom URL for *url*, or None.

    Accepts the URL itself when it parses as a feed; otherwise scans the
    page's <link type="application/rss+xml"> alternates and returns the
    last one that parses.
    """
    _FEED_VERSIONS = ('rss10', 'rss20', 'atom10')
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0'}
        response = requests.get(url, timeout=4, headers=headers)
        rss = feedparser.parse(response.text)
        rssurl = None
        # Tuple membership replaces the repeated == chains.
        if rss['version'] in _FEED_VERSIONS:
            rssurl = url
        else:
            root = lxml.html.fromstring(response.text)
            for link in root.xpath('//link[@type="application/rss+xml"]'):
                url = link.get('href')
                rss = feedparser.parse(url)
                if rss['version'] in _FEED_VERSIONS:
                    rssurl = url
        return rssurl
    except Exception:
        # Best-effort probe: any failure reports None.
        # Narrowed from a bare except.
        return None
Example #25
Source File: test_reuse_frontend.py From udata with GNU Affero General Public License v3.0 | 6 votes |
def test_recent_feed_owner(self):
    """The recent-reuses feed credits the owning user as the entry author."""
    owner = UserFactory()
    ReuseFactory(owner=owner, datasets=[DatasetFactory()])

    response = self.get(url_for('reuses.recent_feed'))
    self.assert200(response)

    feed = feedparser.parse(response.data)
    self.assertEqual(len(feed.entries), 1)

    entry = feed.entries[0]
    self.assertEqual(len(entry.authors), 1)

    author = entry.authors[0]
    self.assertEqual(author.name, owner.fullname)
    self.assertEqual(author.href,
                     self.full_url('users.show', user=owner.id))
Example #26
Source File: test_dataset_frontend.py From udata with GNU Affero General Public License v3.0 | 6 votes |
def test_recent_feed_owner(self):
    """The recent-datasets feed credits the owning user as the entry author."""
    owner = UserFactory()
    DatasetFactory(owner=owner, resources=[ResourceFactory()])

    response = self.get(url_for('datasets.recent_feed'))
    self.assert200(response)

    feed = feedparser.parse(response.data)
    self.assertEqual(len(feed.entries), 1)

    entry = feed.entries[0]
    self.assertEqual(len(entry.authors), 1)

    author = entry.authors[0]
    self.assertEqual(author.name, owner.fullname)
    self.assertEqual(author.href,
                     self.full_url('users.show', user=owner.id))
Example #27
Source File: rss.py From EmiliaHikari with GNU General Public License v3.0 | 6 votes |
def remove_url(update, context):
    """Unsubscribe the current chat from the RSS feed URL given as the
    first command argument.

    The original signature was (bot, update) yet the body read
    `context.args`, which raised NameError on every call; python-telegram-bot
    v12 handlers receive (update, context), which provides `.args`.
    """
    args = context.args
    if len(args) >= 1:
        tg_chat_id = str(update.effective_chat.id)
        tg_feed_link = args[0]
        link_processed = parse(tg_feed_link)
        if link_processed.bozo == 0:
            user_data = sql.check_url_availability(tg_chat_id, tg_feed_link)
            if user_data:
                sql.remove_url(tg_chat_id, tg_feed_link)
                send_message(update.effective_message, tl(update.effective_message, "URL dihapus dari langganan"))
            else:
                send_message(update.effective_message, tl(update.effective_message, "Anda belum berlangganan ke URL ini"))
        else:
            send_message(update.effective_message, tl(update.effective_message, "Tautan ini bukan tautan Umpan RSS"))
    else:
        send_message(update.effective_message, tl(update.effective_message, "URL hilang"))
Example #28
Source File: raw_parser.py From rssant with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _load_json(self, response: FeedResponse) -> dict:
    """Decode the response body and parse it as JSON.

    Raises FeedParserError (chained to the underlying exception) when
    either the byte decoding or the JSON parsing fails.
    """
    try:
        text = response.content.decode(response.encoding)
    except UnicodeDecodeError as ex:
        raise FeedParserError("Unicode decode error: {}".format(ex)) from ex
    try:
        return json.loads(text)
    except json.JSONDecodeError as ex:
        raise FeedParserError("JSON parse error: {}".format(ex)) from ex
Example #29
Source File: IdentifyingTopic.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def getDocuments(self):
    """Populate self.documents with up to five summaries from the MLB feed."""
    url = 'https://sports.yahoo.com/mlb/rss.xml'
    parsed = feedparser.parse(url)
    self.documents = []
    for entry in parsed['entries'][:5]:
        summary = entry['summary']
        # Summaries containing 'ex' are filtered out.
        if 'ex' in summary:
            continue
        self.documents.append(summary)
        print("-- {}".format(summary))
    print("INFO: Fetching documents from {} completed".format(url))
Example #30
Source File: deals.py From CloudBot with GNU General Public License v3.0 | 5 votes |
def slickdeals():
    """- List the top 3 frontpage slickdeals.net deals."""
    url = "https://slickdeals.net/newsearch.php?mode=frontpage&searcharea=deals&searchin=first&rss=1"
    feed = feedparser.parse(url)

    # Format each of the first three deals as "title (short link)".
    deals = []
    for item in feed.entries[:3]:
        deals.append("{} ({})".format(item.title, web.try_shorten(item.link)))

    return "slickdeals.net: " + ' \u2022 '.join(deals)