From b6f0d8ef185a11489feab64c50b3981586d174ad Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Thu, 21 Nov 2024 20:26:19 +0530
Subject: [PATCH 1/6] rss-bot: Rename feed_file to feed_hashes_file.

rss-bot had two different feed_file variables:

1. The user-provided file with the list of feed URLs.
2. The per-feed file that stores the hashes of that feed's entries.

To clearly differentiate between them, the latter has been renamed to
feed_hashes_file.
---
 zulip/integrations/rss/rss-bot | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index 49c82fb62..faca40b51 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -209,10 +209,12 @@ client: zulip.Client = zulip.Client(
 
 first_message = True
 for feed_url in feed_urls:
-    feed_file = os.path.join(opts.data_dir, urllib.parse.urlparse(feed_url).netloc)  # Type: str
+    feed_hashes_file = os.path.join(
+        opts.data_dir, urllib.parse.urlparse(feed_url).netloc
+    )  # Type: str
 
     try:
-        with open(feed_file) as f:
+        with open(feed_hashes_file) as f:
             old_feed_hashes = {line.strip(): True for line in f.readlines()}
     except OSError:
         old_feed_hashes = {}
@@ -256,7 +258,7 @@ for feed_url in feed_urls:
             new_hashes.append(entry_hash)
             first_message = False
 
-    with open(feed_file, "a") as f:
+    with open(feed_hashes_file, "a") as f:
         for hash in new_hashes:
             f.write(hash + "\n")
 

From bb6eaf3e71607769a5c74f73753ef902c44d4d25 Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Thu, 21 Nov 2024 20:30:41 +0530
Subject: [PATCH 2/6] rss-bot: Assign feed_name only once per feed URL.

Previously, it was being set for every entry.
---
 zulip/integrations/rss/rss-bot | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index faca40b51..c58b57595 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -221,6 +221,7 @@ for feed_url in feed_urls:
 
     new_hashes: List[str] = []
     data = feedparser.parse(feed_url)
+    feed_name: str = data.feed.title or feed_url
 
     for entry in data.entries:
         entry_hash = compute_entry_hash(entry)
@@ -243,8 +244,6 @@ for feed_url in feed_urls:
             # entries in reverse chronological order.
             break
 
-        feed_name: str = data.feed.title or feed_url
-
         response: Dict[str, Any] = send_zulip(entry, feed_name)
         if response["result"] != "success":
             logger.error("Error processing %s", feed_url)

From cd48ec2993b9c6da19620b1aad3bcf30876452c6 Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Thu, 21 Nov 2024 20:52:57 +0530
Subject: [PATCH 3/6] rss-bot: Introduce a max_batch_size option to prevent
 spamming messages.

---
 zulip/integrations/rss/rss-bot | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index c58b57595..29cce4b5e 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -24,6 +24,7 @@ import zulip
 VERSION = "0.9"
 RSS_DATA_DIR = os.path.expanduser(os.path.join("~", ".cache", "zulip-rss"))
 OLDNESS_THRESHOLD = 30
+MAX_BATCH_SIZE = 100
 
 usage = """Usage: Send summaries of RSS entries for your favorite feeds to Zulip.
 
@@ -92,6 +93,14 @@ parser.add_argument(
     help="Convert $ to $$ (for KaTeX processing)",
     default=False,
 )
+parser.add_argument(
+    "--max-batch-size",
+    dest="max_batch_size",
+    type=int,
+    help="The maximum number of messages to send at once",
+    default=MAX_BATCH_SIZE,
+    action="store",
+)
 
 opts = parser.parse_args()
 
@@ -239,9 +248,9 @@ for feed_url in feed_urls:
         if entry_hash in old_feed_hashes:
             # We've already seen this. No need to process any older entries.
             break
-        if not old_feed_hashes and len(new_hashes) >= 3:
-            # On a first run, pick up the 3 most recent entries. An RSS feed has
-            # entries in reverse chronological order.
+        if not old_feed_hashes and len(new_hashes) >= opts.max_batch_size:
+            # On a first run, pick up the n (= opts.max_batch_size) most recent entries.
+            # An RSS feed has entries in reverse chronological order.
             break
 
         response: Dict[str, Any] = send_zulip(entry, feed_name)

From 7ddc07cf95b8bbf62a0aa492123be149c1424459 Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Fri, 22 Nov 2024 15:03:37 +0530
Subject: [PATCH 4/6] rss-bot: Introduce an earliest_entry_age option to
 establish a cutoff.

Renamed the OLDNESS_THRESHOLD constant, which was used for the same
purpose, to match the name of the newly added option.
---
 zulip/integrations/rss/rss-bot | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index 29cce4b5e..ec72efcc9 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -23,7 +23,7 @@ import zulip
 
 VERSION = "0.9"
 RSS_DATA_DIR = os.path.expanduser(os.path.join("~", ".cache", "zulip-rss"))
-OLDNESS_THRESHOLD = 30
+EARLIEST_ENTRY_AGE = 30
 MAX_BATCH_SIZE = 100
 
 usage = """Usage: Send summaries of RSS entries for your favorite feeds to Zulip.
@@ -101,6 +101,14 @@ parser.add_argument(
     default=MAX_BATCH_SIZE,
     action="store",
 )
+parser.add_argument(
+    "--earliest-entry-age",
+    dest="earliest_entry_age",
+    type=int,
+    help="The earliest date (relative to today) you want to process entries from (in days)",
+    default=EARLIEST_ENTRY_AGE,
+    action="store",
+)
 
 opts = parser.parse_args()
 
@@ -240,7 +248,7 @@ for feed_url in feed_urls:
         )
         if (
             entry_time is not None
-            and time.time() - calendar.timegm(entry_time) > OLDNESS_THRESHOLD * 60 * 60 * 24
+            and time.time() - calendar.timegm(entry_time) > opts.earliest_entry_age * 60 * 60 * 24
         ):
             # As a safeguard against misbehaving feeds, don't try to process
             # entries older than some threshold.

From dd147b18aae7bd918a62c5f8e5b9396c2ee0a119 Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Thu, 18 Dec 2025 10:43:11 +0530
Subject: [PATCH 5/6] rss-bot: Split out get_entry_time() and entry_threshold.

We will use the entry time to sort entries in the following commit.
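
For illustration, a minimal sketch of the helper's return convention,
using plain dicts as hypothetical stand-ins for feedparser entries:

    import calendar
    import time

    def get_entry_time(entry):
        entry_time = entry.get("published_parsed", entry.get("updated_parsed"))
        return (calendar.timegm(entry_time), True) if entry_time else (float("-inf"), False)

    # A time-tagged entry yields its UTC timestamp; an untagged entry
    # yields negative infinity, so it sorts before everything else.
    tagged = {"published_parsed": time.gmtime(0)}  # the epoch, 1970-01-01
    untagged = {}
    assert get_entry_time(tagged) == (0, True)
    assert get_entry_time(untagged) == (float("-inf"), False)

The boolean lets callers distinguish a genuinely old entry from one
that simply carries no timestamp.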
---
 zulip/integrations/rss/rss-bot | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index ec72efcc9..3e42c933f 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -14,7 +14,7 @@ import sys
 import time
 import urllib.parse
 from html.parser import HTMLParser
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List
 
 import feedparser
 from typing_extensions import override
@@ -189,6 +189,11 @@ def elide_subject(subject: str) -> str:
     return subject
 
 
+def get_entry_time(entry: Any) -> tuple[float, bool]:
+    entry_time = entry.get("published_parsed", entry.get("updated_parsed"))
+    return (calendar.timegm(entry_time), True) if entry_time else (float("-inf"), False)
+
+
 def send_zulip(entry: Any, feed_name: str) -> Dict[str, Any]:
     body: str = entry.summary
     if opts.unwrap:
@@ -239,17 +244,13 @@ for feed_url in feed_urls:
     new_hashes: List[str] = []
     data = feedparser.parse(feed_url)
     feed_name: str = data.feed.title or feed_url
+    # Safeguard to not process older entries in unordered feeds
+    entry_threshold = time.time() - opts.earliest_entry_age * 60 * 60 * 24
 
     for entry in data.entries:
         entry_hash = compute_entry_hash(entry)
-        # An entry has either been published or updated.
-        entry_time: Optional[Tuple[int, int]] = entry.get(
-            "published_parsed", entry.get("updated_parsed")
-        )
-        if (
-            entry_time is not None
-            and time.time() - calendar.timegm(entry_time) > opts.earliest_entry_age * 60 * 60 * 24
-        ):
+        entry_time, is_time_tagged = get_entry_time(entry)
+        if (is_time_tagged and entry_time < entry_threshold) or entry_hash in old_feed_hashes:
             # As a safeguard against misbehaving feeds, don't try to process
             # entries older than some threshold.
             continue

From e19ce6a291800250fb6f228b2e02e92636f97a85 Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Thu, 18 Dec 2025 10:45:04 +0530
Subject: [PATCH 6/6] rss-bot: Support unordered RSS feeds.

This is done by splitting the logic into two loops: one that collects
all the entries in the feed, and another that posts only the latest
ones in chronological order.

Instead of tracking new_hashes in memory while processing the feed, we
now track unhashed_entries, since we no longer record a hash for every
new entry, only for the ones that we post.

Fixes #831.
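
As a rough sketch of the new sort-and-slice step, with hypothetical
(entry, hash, timestamp) triples standing in for real feed data:

    # Deliberately out of chronological order, like an unordered feed.
    unhashed_entries = [("b", "h2", 200.0), ("c", "h3", 300.0), ("a", "h1", 100.0)]
    max_batch_size = 2

    # Sort ascending by timestamp, then keep only the newest
    # max_batch_size entries; the slice preserves oldest-first order.
    sorted_entries = sorted(unhashed_entries, key=lambda x: x[2])[-max_batch_size:]
    assert sorted_entries == [("b", "h2", 200.0), ("c", "h3", 300.0)]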
---
 zulip/integrations/rss/rss-bot | 56 ++++++++++++++++------------------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index 3e42c933f..faea7b780 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -228,8 +228,6 @@ client: zulip.Client = zulip.Client(
     client="ZulipRSS/" + VERSION,
 )
 
-first_message = True
-
 for feed_url in feed_urls:
     feed_hashes_file = os.path.join(
         opts.data_dir, urllib.parse.urlparse(feed_url).netloc
@@ -241,7 +239,7 @@ for feed_url in feed_urls:
     except OSError:
         old_feed_hashes = {}
 
-    new_hashes: List[str] = []
+    unhashed_entries: List[tuple[Any, str, float]] = []
     data = feedparser.parse(feed_url)
     feed_name: str = data.feed.title or feed_url
     # Safeguard to not process older entries in unordered feeds
@@ -251,32 +249,32 @@ for feed_url in feed_urls:
         entry_hash = compute_entry_hash(entry)
         entry_time, is_time_tagged = get_entry_time(entry)
         if (is_time_tagged and entry_time < entry_threshold) or entry_hash in old_feed_hashes:
-            # As a safeguard against misbehaving feeds, don't try to process
-            # entries older than some threshold.
             continue
-        if entry_hash in old_feed_hashes:
-            # We've already seen this. No need to process any older entries.
-            break
-        if not old_feed_hashes and len(new_hashes) >= opts.max_batch_size:
-            # On a first run, pick up the n (= opts.max_batch_size) most recent entries.
-            # An RSS feed has entries in reverse chronological order.
-            break
-
-        response: Dict[str, Any] = send_zulip(entry, feed_name)
-        if response["result"] != "success":
-            logger.error("Error processing %s", feed_url)
-            logger.error("%s", response)
-            if first_message:
-                # This is probably some fundamental problem like the stream not
-                # existing or something being misconfigured, so bail instead of
-                # getting the same error for every RSS entry.
-                log_error_and_exit("Failed to process first message")
-        # Go ahead and move on -- perhaps this entry is corrupt.
-        new_hashes.append(entry_hash)
-        first_message = False
+        unhashed_entries.append((entry, entry_hash, entry_time))
 
-    with open(feed_hashes_file, "a") as f:
-        for hash in new_hashes:
-            f.write(hash + "\n")
+    # We process all entries to support unordered feeds,
+    # but post only the latest ones in chronological order.
+    sorted_entries = sorted(unhashed_entries, key=lambda x: x[2])[-opts.max_batch_size :]
 
-    logger.info("Sent zulips for %d %s entries", len(new_hashes), feed_url)
+    with open(feed_hashes_file, "a") as f:
+        for entry_tuple in sorted_entries:
+            entry, entry_hash, _ = entry_tuple
+
+            response: Dict[str, Any] = send_zulip(entry, feed_name)
+            if response["result"] != "success":
+                logger.error("Error processing %s", feed_url)
+                logger.error("%s", response)
+                if not old_feed_hashes and entry_tuple == sorted_entries[0]:
+                    # This is probably some fundamental problem like the stream not
+                    # existing or something being misconfigured, so bail instead of
+                    # getting the same error for every RSS entry.
+                    log_error_and_exit("Failed to process first message")
+                # Go ahead and move on -- perhaps this entry is corrupt.
+            f.write(entry_hash + "\n")
+
+    logger.info(
+        "Processed %d entries from %s and sent %d zulips",
+        len(unhashed_entries),
+        feed_url,
+        len(sorted_entries),
+    )
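
For completeness, a minimal sketch of the per-entry filter that feeds
the batching above; the hashes, timestamps, and the seen-hash table
here are hypothetical:

    import time

    earliest_entry_age = 30  # days; the option's default
    entry_threshold = time.time() - earliest_entry_age * 60 * 60 * 24
    old_feed_hashes = {"seen": True}
    now = time.time()

    # (hash, timestamp, is_time_tagged) stand-ins for real feed entries.
    candidates = [
        ("fresh", now - 60, True),           # recent and unseen: kept
        ("stale", now - 40 * 86400, True),   # older than the cutoff: skipped
        ("seen", now - 60, True),            # hash already recorded: skipped
        ("untagged", float("-inf"), False),  # no timestamp: kept
    ]
    kept = [
        h
        for (h, t, tagged) in candidates
        if not ((tagged and t < entry_threshold) or h in old_feed_hashes)
    ]
    assert kept == ["fresh", "untagged"]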