diff --git a/README.md b/README.md index d7b81daa2e..982f494a89 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,21 @@ -# NYTdiff +# NYTdiff+ -Code for the twitter bot [@nyt_diff](https://twitter.com/nyt_diff). +Based on @j-e-d's code for the twitter bot [@nyt_diff](https://twitter.com/nyt_diff). +RSS feed fetching added for @xuv's twitter bot [@lesoir_diff](https://twitter.com/lesoir_diff) -The [phantomjs](http://phantomjs.org/) binary needs to be installed and the path updated in the run_diff.sh file. +[Twitter keys](https://dev.twitter.com/) are needed. +A [NYT API](http://developers.nytimes.com/) key for the "Top Stories V2" service is needed for The New York Times. +An RSS URL is needed for [Le Soir](http://lesoir.be) or any other news website. The values of these keys and the RSS URL need to be entered in the run_diff.sh file. -[Twitter keys](https://dev.twitter.com/) and the [NYT API](http://developers.nytimes.com/) key for the "Top Stories V2" service are needed, values of this keys need to be entered in the run_diff.sh file. +Installation +------------ ++ The [phantomjs](http://phantomjs.org/) binary needs to be installed and the path updated in the run_diff.sh file. ++ `pip install -r requirements.txt` -Font: [Merriweather](https://fonts.google.com/specimen/Merriweather). Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/). +Credits +------- ++ Original script and idea: @j-e-d Juan E.D. http://unahormiga.com/ ++ RSS fetching: @xuv Julien Deswaef http://xuv.be ++ Font: [Merriweather](https://fonts.google.com/specimen/Merriweather) ++ Background pattern: [Paper Fibers](http://subtlepatterns.com/paper-fibers/). 
diff --git a/css/styles.css b/css/styles.css index ec24f9e2e6..ed5ec3e8ab 100755 --- a/css/styles.css +++ b/css/styles.css @@ -1,12 +1,12 @@ -@font-face { - font-family: Merriweather; +@font-face { + font-family: Merriweather; font-style: normal; font-weight: normal; - src: url('../fonts/Merriweather-Regular.ttf') format("truetype"); -} + src: url('../fonts/Merriweather-Regular.ttf') format("truetype"); +} -body { - background: lightgray url('../img/paper_fibers.png') repeat; +body { + background: lightgray url('../img/paper_fibers.png') repeat; font-family: Merriweather; font-size: 16px; } @@ -17,6 +17,7 @@ p { margin-top: 1em; margin-bottom: 1em; font-weight: normal; + word-wrap: break-word; } del { diff --git a/nytdiff.py b/nytdiff.py index e5fd615735..c6c8b224af 100644 --- a/nytdiff.py +++ b/nytdiff.py @@ -18,7 +18,9 @@ from simplediff import html_diff from selenium import webdriver -TIMEZONE = 'America/Buenos_Aires' +import feedparser + +TIMEZONE = 'Europe/Brussels' LOCAL_TZ = timezone(TIMEZONE) MAX_RETRIES = 10 RETRY_DELAY = 3 @@ -124,6 +126,7 @@ def tweet_with_media(self, text, images, reply_to=None): def tweet_text(self, text): if TESTING: print (text) + return True try: tweet_id = self.api.update_status(status=text) except: @@ -143,11 +146,17 @@ def tweet(self, text, article_id, url, column='id'): if reply_to is None: logging.info('Tweeting url: %s', url) tweet = self.tweet_text(url) - reply_to = tweet.id + # if TESTING, give a random id based on time + reply_to = tweet.id if not TESTING else time.time() logging.info('Replying to: %s', reply_to) tweet = self.tweet_with_media(text, images, reply_to) - logging.info('Id to store: %s', tweet.id) - self.update_tweet_db(article_id, tweet.id, column) + if TESTING : + # if TESTING, give a random id based on time + tweet_id = time.time() + else: + tweet_id = tweet.id + logging.info('Id to store: %s', tweet_id) + self.update_tweet_db(article_id, tweet_id, column) return def get_page(self, url, header=None, 
payload=None): @@ -302,7 +311,7 @@ def store_data(self, data): ORDER BY version DESC \ LIMIT 1' % (data['article_id'])) for row in result: - data['version'] = row['version'] + 1 + data['version'] = row['version'] self.versions_table.insert(data) url = data['url'] if row['url'] != data['url']: @@ -363,6 +372,115 @@ def parse_pages(self): if loop: self.remove_old('article_id') +class RSSParser(BaseParser): + def __init__(self, api, rss_url): + BaseParser.__init__(self, api) + self.urls = [rss_url] + self.articles_table = self.db['rss_ids'] + self.versions_table = self.db['rss_versions'] + + def entry_to_dict(self, article): + article_dict = dict() + article_dict['article_id'] = article.id.split(' ')[0] + article_dict['url'] = article.link + article_dict['title'] = article.title + article_dict['abstract'] = self.strip_html(article.description) + article_dict['author'] = article.author + # article_dict['illustration'] = article.media_content[0]['url'] + # article_dict['illustartion_size'] = article.media_content[0]['filesize'] + od = collections.OrderedDict(sorted(article_dict.items())) + article_dict['hash'] = hashlib.sha224( + repr(od.items()).encode('utf-8')).hexdigest() + article_dict['date_time'] = datetime.now(LOCAL_TZ) + return article_dict + + def store_data(self, data): + if self.articles_table.find_one( + article_id=data['article_id']) is None: # New + article = { + 'article_id': data['article_id'], + 'add_dt': data['date_time'], + 'status': 'home', + 'tweet_id': None + } + self.articles_table.insert(article) + logging.info('New article tracked: %s', data['url']) + data['version'] = 1 + self.versions_table.insert(data) + else: + # re insert + if self.articles_table.find_one(article_id=data['article_id'], + status='removed') is not None: + article = { + 'article_id': data['article_id'], + 'add_dt': data['date_time'], + } + + count = self.versions_table.count( + self.versions_table.table.columns.article_id == data[ + 'article_id'], + hash=data['hash']) + if 
count == 1: # Existing + pass + else: # Changed + result = self.db.query('SELECT * \ + FROM rss_versions\ + WHERE article_id = "%s" \ + ORDER BY version DESC \ + LIMIT 1' % (data['article_id'])) + for row in result: + data['version'] = row['version'] +1 + self.versions_table.insert(data) + url = data['url'] + if row['title'] != data['title']: + if self.show_diff(row['title'], data['title']): + tweet_text = "Modification du Titre" + self.tweet(tweet_text, data['article_id'], url, + 'article_id') + if row['abstract'] != data['abstract']: + if self.show_diff(row['abstract'], data['abstract']): + tweet_text = "Modification de la Description" + self.tweet(tweet_text, data['article_id'], url, + 'article_id') + if row['author'] != data['author']: + if self.show_diff(row['author'], data['author']): + tweet_text = "Modification de l'auteur" + self.tweet(tweet_text, data['article_id'], url, + 'article_id') + if row['url'] != data['url']: + if self.show_diff(row['url'], data['url']): + tweet_text = "Modification d'URL" + self.tweet(tweet_text, data['article_id'], url, + 'article_id') + + def loop_entries(self, entries): + if len(entries) == 0: + return False + for article in entries: + try: + article_dict = self.entry_to_dict(article) + if article_dict is not None: + self.store_data(article_dict) + self.current_ids.add(article_dict['article_id']) + except BaseException as e: + logging.exception('Problem looping RSS: %s', article) + print ('Exception: {}'.format(str(e))) + print('***************') + print(article) + print('***************') + return False + return True + + def parse_rss(self): + r = feedparser.parse(self.urls[0]) + if r is None: + logging.warning('Empty response RSS') + return + else: + logging.info('Parsing %s', r.feed.title) + loop = self.loop_entries(r.entries) + if loop: + self.remove_old('article_id') def main(): # logging @@ -380,17 +498,19 @@ def main(): auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.secure = True 
auth.set_access_token(access_token, access_token_secret) - nyt_api = tweepy.API(auth) - logging.debug('NYT Twitter API configured') + twitter_api = tweepy.API(auth) + logging.debug('Twitter API configured') try: - logging.debug('Starting NYT') - nyt_api_key = os.environ['NYT_API_KEY'] - nyt = NYTParser(nyt_api, nyt_api_key) - nyt.parse_pages() - logging.debug('Finished NYT') + logging.debug('Starting RSS') + #nyt_api_key = os.environ['NYT_API_KEY'] + #nyt = NYTParser(nyt_api, nyt_api_key) + rss_url = os.environ['RSS_URL'] + rss = RSSParser(twitter_api, rss_url) + rss.parse_rss() + logging.debug('Finished RSS') except: - logging.exception('NYT') + logging.exception('RSS') logging.info('Finished script') diff --git a/requirements.txt b/requirements.txt index 667343ad95..cf889d3892 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ alembic==0.8.7 bleach==1.4.3 dataset==0.6.4 +feedparser==5.2.1 html5lib==0.9999999 Mako==1.0.4 MarkupSafe==0.23 diff --git a/run_diff.sh b/run_diff.sh index 1ab5bf0082..a3fa18aac7 100755 --- a/run_diff.sh +++ b/run_diff.sh @@ -7,6 +7,7 @@ export NYT_TWITTER_ACCESS_TOKEN="" export NYT_TWITTER_ACCESS_TOKEN_SECRET="" export NYT_API_KEY="" +export RSS_URL="" export PHANTOMJS_PATH="./"