From 6072a9422891ff397b890e5904797e1d948d4cbb Mon Sep 17 00:00:00 2001
From: Nathan
Date: Mon, 9 Feb 2026 21:29:31 +0000
Subject: [PATCH 1/4] Add attachment archival to MongoDB GridFS to fix expired Discord CDN URLs

Discord's signed CDN URLs expire after ~24h, breaking images in logs. This
adds an optional background archiver that downloads attachments and avatars
before they expire, compresses images to JPEG, and stores them in MongoDB
GridFS. Archived files are served via a new /attachments/ route. Retention
period, image quality, max resolution, and scan interval are configurable.

Fixes #80
---
 .env.example     |  18 ++-
 README.md        |  45 ++++++
 app.json         |  25 +++
 app.py           |  77 ++++++++-
 core/archiver.py | 411 +++++++++++++++++++++++++++++++++++++++++++++++
 core/models.py   |  96 +++++++++--
 requirements.txt |   2 +
 7 files changed, 656 insertions(+), 18 deletions(-)
 create mode 100644 core/archiver.py

diff --git a/.env.example b/.env.example
index 2797714..bb63ba4 100644
--- a/.env.example
+++ b/.env.example
@@ -5,9 +5,23 @@ LOG_URL_PREFIX=/logs
 # Listen address and port. Don't change them if you don't know what they do.
 HOST=0.0.0.0
 PORT=8000
-# Whether if the logviewer should use a proxy to view attachments.
-# If set to "no" (default), attachments will expire after 1 day and the logviewer won't be able to show the attachment.
+# Attachments will expire after 1 day and the logviewer won't be able to show the attachment.
 # Please be aware that this may violate Discord TOS and the proxy will have full access to your attachments.
+# Your options are to either use an attachment proxy or archive them to Mongo GridFS.
 # Modmail/Logviewer is not affiliated with the proxy in any way. USE AT YOUR OWN RISK.
 USE_ATTACHMENT_PROXY=no
 ATTACHMENT_PROXY_URL=https://cdn.discordapp.xyz
+# Archive attachments to MongoDB GridFS so they persist beyond Discord's 24h CDN expiry.
+SAVE_ATTACHMENTS=no
+# How often (in seconds) the archiver scans for new unarchived attachments. Default: 600 (10 minutes)
+ARCHIVE_INTERVAL=600
+# Maximum file size in bytes to archive. Files larger than this are skipped. Default: 26214400 (25 MB)
+ARCHIVE_MAX_FILE_SIZE=26214400
+# How long to keep archived attachments. Options: 1w, 1month, 1y, forever. Default: forever
+ARCHIVE_RETENTION=forever
+# Compress images to JPEG before storing. Greatly reduces storage usage. Default: yes
+ARCHIVE_COMPRESS_IMAGES=yes
+# JPEG quality (1-100). Lower = smaller files. 65 is a good balance of quality and size. Default: 65
+ARCHIVE_IMAGE_QUALITY=65
+# Max image resolution (longest edge in pixels). Images larger than this are downscaled. Default: 1920
+ARCHIVE_IMAGE_MAX_RESOLUTION=1920
diff --git a/README.md b/README.md
index fd7d3bf..af9f555 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,51 @@ We recommend setting up a reverse proxy (e.g. Nginx) to port forward external po

 To accept requests from a domain instead of your server IP, simply set an `A`/`AAAA` record from your DNS provider to forward your domain to your server IP.

+## Preserving Attachments
+
+Discord's CDN URLs for attachments expire after ~24 hours, which means images in your logs will break. There are two ways to fix this:
+
+### Option 1: Attachment Archival (Recommended)
+
+Downloads attachments and stores them directly in your MongoDB using GridFS. No third parties involved.
+ +Add to your `.env` file: +``` +SAVE_ATTACHMENTS=yes +``` + +| Variable | Default | Description | +|---|---|---| +| `SAVE_ATTACHMENTS` | `no` | Enable attachment archival (`yes` / `no`) | +| `ARCHIVE_INTERVAL` | `600` | Seconds between scans for new attachments | +| `ARCHIVE_MAX_FILE_SIZE` | `26214400` | Max file size in bytes to archive (default 25 MB) | +| `ARCHIVE_RETENTION` | `forever` | How long to keep files: `1w`, `1month`, `1y`, `forever` | +| `ARCHIVE_COMPRESS_IMAGES` | `yes` | Compress images to JPEG before storing (`yes` / `no`) | +| `ARCHIVE_IMAGE_QUALITY` | `65` | JPEG quality 1-100 (lower = smaller files) | +| `ARCHIVE_IMAGE_MAX_RESOLUTION` | `1920` | Downscale images larger than this (longest edge in px) | + +A background task scans your logs collection periodically and downloads any Discord CDN attachment/avatar URLs it finds. Images are compressed to JPEG (configurable) and stored in MongoDB GridFS. When a log page is viewed, archived images are served from `/attachments/` instead of the expired Discord URLs. + +> [!NOTE] +> Attachments that already expired before enabling this feature cannot be recovered. Enable it as soon as possible to archive existing logs while their URLs are still valid. + +> [!NOTE] +> The free MongoDB Atlas tier (M0) has a 512 MB storage limit. With compression enabled, this can hold roughly 2,000-10,000 images depending on size. Consider adjusting `ARCHIVE_IMAGE_QUALITY` and `ARCHIVE_IMAGE_MAX_RESOLUTION` to reduce storage usage, or use a paid tier for more space. + +### Option 2: Attachment Proxy + +Routes image requests through a third-party proxy service instead of storing them yourself. Simpler setup but relies on an external service. + +| Variable | Default | Description | +|---|---|---| +| `USE_ATTACHMENT_PROXY` | `no` | Enable the attachment proxy (`yes` / `no`) | +| `ATTACHMENT_PROXY_URL` | `https://cdn.discordapp.xyz` | The proxy service URL (not your site URL) | + +> [!WARNING] +> The proxy service is not affiliated with Modmail. It will have full access to your attachments. Use at your own risk. + +You can enable both options together - archived images take priority, and the proxy is used as a fallback for images not yet archived. + ## Discord OAuth2 Protecting your logs with a login (Discord Oauth2 support) is a premium feature, only available to [Premium members](https://buymeacoffee.com/modmaildev). diff --git a/app.json b/app.json index 4187572..0c92707 100644 --- a/app.json +++ b/app.json @@ -16,6 +16,31 @@ "description": "Proxy URL for viewing attachments.", "required": false, "value": "https://cdn.discordapp.xyz" + }, + "SAVE_ATTACHMENTS": { + "description": "Whether to archive attachments to MongoDB GridFS. Set to 'yes' to enable.", + "required": false, + "value": "no" + }, + "ARCHIVE_INTERVAL": { + "description": "How often (in seconds) to scan for unarchived attachments. Default: 600", + "required": false, + "value": "600" + }, + "ARCHIVE_RETENTION": { + "description": "How long to keep archived attachments. Options: 1w, 1month, 1y, forever. Default: forever", + "required": false, + "value": "forever" + }, + "ARCHIVE_COMPRESS_IMAGES": { + "description": "Compress images to JPEG before storing to save space. Default: yes", + "required": false, + "value": "yes" + }, + "ARCHIVE_IMAGE_QUALITY": { + "description": "JPEG quality (1-100). Lower = smaller files. 
Default: 65", + "required": false, + "value": "65" } } } diff --git a/app.py b/app.py index 277aa81..a2bd7cb 100644 --- a/app.py +++ b/app.py @@ -3,13 +3,14 @@ import html import os +from bson import ObjectId from dotenv import load_dotenv -from motor.motor_asyncio import AsyncIOMotorClient +from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorGridFSBucket from sanic import Sanic, response from sanic.exceptions import NotFound from jinja2 import Environment, FileSystemLoader -from core.models import LogEntry +from core.models import LogEntry, build_archive_lookup load_dotenv() @@ -61,6 +62,15 @@ def strtobool(val): raise ValueError("invalid truth value %r" % (val,)) +SAVE_ATTACHMENTS = strtobool(os.getenv("SAVE_ATTACHMENTS", "no")) +ARCHIVE_INTERVAL = int(os.getenv("ARCHIVE_INTERVAL", "600")) +ARCHIVE_MAX_FILE_SIZE = int(os.getenv("ARCHIVE_MAX_FILE_SIZE", str(25 * 1024 * 1024))) +ARCHIVE_RETENTION = os.getenv("ARCHIVE_RETENTION", "forever").strip().lower() +ARCHIVE_COMPRESS_IMAGES = strtobool(os.getenv("ARCHIVE_COMPRESS_IMAGES", "yes")) +ARCHIVE_IMAGE_QUALITY = int(os.getenv("ARCHIVE_IMAGE_QUALITY", "65")) +ARCHIVE_IMAGE_MAX_RESOLUTION = int(os.getenv("ARCHIVE_IMAGE_MAX_RESOLUTION", "1920")) + + @app.listener("before_server_start") async def init(app, loop): app.ctx.db = AsyncIOMotorClient(MONGO_URI).modmail_bot @@ -71,6 +81,32 @@ async def init(app, loop): else: app.ctx.attachment_proxy_url = None + # Attachment archival setup + app.ctx.save_attachments = bool(SAVE_ATTACHMENTS) + if app.ctx.save_attachments: + app.ctx.fs = AsyncIOMotorGridFSBucket(app.ctx.db, bucket_name="attachments") + await app.ctx.db.archived_attachments.create_index("original_url", unique=True) + await app.ctx.db.archived_attachments.create_index("status") + await app.ctx.db.archived_attachments.create_index("archived_at") + else: + app.ctx.fs = None + + +@app.listener("after_server_start") +async def start_archiver(app, loop): + if app.ctx.save_attachments: + from core.archiver import run_archiver_loop + archiver_config = { + "interval": ARCHIVE_INTERVAL, + "max_file_size": ARCHIVE_MAX_FILE_SIZE, + "retention": ARCHIVE_RETENTION, + "compress_images": bool(ARCHIVE_COMPRESS_IMAGES), + "image_quality": ARCHIVE_IMAGE_QUALITY, + "image_max_resolution": ARCHIVE_IMAGE_MAX_RESOLUTION, + } + app.add_task(run_archiver_loop(app, archiver_config)) + + @app.exception(NotFound) async def not_found(request, exc): return render_template("not_found") @@ -89,7 +125,8 @@ async def get_raw_logs_file(request, key): if document is None: raise NotFound - log_entry = LogEntry(app, document) + archive_lookup = await build_archive_lookup(app, document) + log_entry = LogEntry(app, document, archive_lookup=archive_lookup) return log_entry.render_plain_text() @@ -102,11 +139,43 @@ async def get_logs_file(request, key): if document is None: raise NotFound - log_entry = LogEntry(app, document) + archive_lookup = await build_archive_lookup(app, document) + log_entry = LogEntry(app, document, archive_lookup=archive_lookup) return log_entry.render_html() +@app.get("/attachments//") +async def serve_attachment(request, file_id, filename): + """Serve an archived attachment from GridFS.""" + if not app.ctx.save_attachments or app.ctx.fs is None: + raise NotFound + + try: + oid = ObjectId(file_id) + except Exception: + raise NotFound + + try: + grid_out = await app.ctx.fs.open_download_stream(oid) + except Exception: + raise NotFound + + content_type = "application/octet-stream" + if grid_out.metadata: + content_type = 
grid_out.metadata.get("content_type", content_type) + + data = await grid_out.read() + return response.raw( + data, + content_type=content_type, + headers={ + "Content-Disposition": f'inline; filename="{filename}"', + "Cache-Control": "public, max-age=31536000, immutable", + }, + ) + + if __name__ == "__main__": app.run( host=os.getenv("HOST", "0.0.0.0"), diff --git a/core/archiver.py b/core/archiver.py new file mode 100644 index 0000000..ae5f220 --- /dev/null +++ b/core/archiver.py @@ -0,0 +1,411 @@ +""" +Background task that archives Discord CDN attachments and avatars to MongoDB GridFS. + +When enabled via SAVE_ATTACHMENTS=yes, this module periodically scans the logs collection +for Discord CDN URLs, downloads them before they expire (~24h), and stores them in GridFS. + +Features: +- Image compression: Converts images to optimized JPEG to minimize storage +- Retention policy: Auto-deletes archived files after a configurable period +""" + +import asyncio +import io +import logging +import re +from datetime import datetime, timedelta, timezone + +import aiohttp +from pymongo.errors import DuplicateKeyError + +logger = logging.getLogger("logviewer.archiver") + +DISCORD_CDN_PATTERN = re.compile( + r"https?://(?:cdn\.discordapp\.com|media\.discordapp\.net)/" +) + +COMPRESSIBLE_TYPES = {"image/png", "image/jpeg", "image/webp", "image/bmp", "image/tiff"} + +CONTENT_TYPE_MAP = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".webp": "image/webp", + ".mp4": "video/mp4", + ".webm": "video/webm", + ".mp3": "audio/mpeg", + ".ogg": "audio/ogg", + ".wav": "audio/wav", + ".pdf": "application/pdf", + ".txt": "text/plain", +} + +RETENTION_MAP = { + "1w": timedelta(weeks=1), + "1week": timedelta(weeks=1), + "1m": timedelta(days=30), + "1month": timedelta(days=30), + "1y": timedelta(days=365), + "1year": timedelta(days=365), + "forever": None, +} + + +def parse_retention(value): + """Parse retention string to timedelta. Returns None for 'forever'.""" + td = RETENTION_MAP.get(value) + if td is None and value != "forever": + logger.warning("Unknown ARCHIVE_RETENTION value '%s', defaulting to 'forever'", value) + return td + + +def guess_content_type(filename, response_content_type=None): + if response_content_type and response_content_type != "application/octet-stream": + return response_content_type + for ext, ctype in CONTENT_TYPE_MAP.items(): + if filename.lower().endswith(ext): + return ctype + return "application/octet-stream" + + +def strip_query_params(url): + """Return URL without query parameters (Discord's signed params change but the path is stable).""" + return url.split("?")[0] + + +def compress_image(image_data, content_type, quality, max_resolution): + """ + Compress an image to JPEG with optimized settings for minimal file size. + + Strategy: + - Convert to RGB (JPEG doesn't support alpha) + - Downscale if either dimension exceeds max_resolution + - Save as progressive JPEG at the configured quality + - Strip all EXIF/metadata + + Returns (compressed_bytes, "image/jpeg") or (original_data, original_type) if compression fails or is larger. 
+ """ + try: + from PIL import Image + + img = Image.open(io.BytesIO(image_data)) + + # Skip animated images (GIFs with multiple frames) + if getattr(img, "n_frames", 1) > 1: + return image_data, content_type + + # Convert to RGB (drop alpha channel for JPEG) + if img.mode in ("RGBA", "P", "LA"): + background = Image.new("RGB", img.size, (54, 57, 63)) # Discord dark bg color + if img.mode == "P": + img = img.convert("RGBA") + if img.mode in ("RGBA", "LA"): + background.paste(img, mask=img.split()[-1]) + img = background + else: + img = img.convert("RGB") + elif img.mode != "RGB": + img = img.convert("RGB") + + # Downscale if too large (preserve aspect ratio) + w, h = img.size + if max(w, h) > max_resolution: + if w > h: + new_w = max_resolution + new_h = int(h * (max_resolution / w)) + else: + new_h = max_resolution + new_w = int(w * (max_resolution / h)) + img = img.resize((new_w, new_h), Image.LANCZOS) + + # Save as optimized progressive JPEG + buf = io.BytesIO() + img.save( + buf, + format="JPEG", + quality=quality, + optimize=True, + progressive=True, + subsampling="4:2:0", # Maximum chroma subsampling for smallest size + ) + compressed = buf.getvalue() + + # Only use compressed version if it's actually smaller + if len(compressed) < len(image_data): + logger.debug( + "Compressed image: %d -> %d bytes (%.0f%% reduction)", + len(image_data), len(compressed), + (1 - len(compressed) / len(image_data)) * 100 + ) + return compressed, "image/jpeg" + + return image_data, content_type + + except Exception as e: + logger.warning("Image compression failed, storing original: %s", e) + return image_data, content_type + + +async def download_and_store(app, session, url, filename, config): + """Download a URL and store it in GridFS. Returns GridFS ObjectId on success, None on failure.""" + max_size = config["max_file_size"] + try: + async with session.get(url, timeout=aiohttp.ClientTimeout(total=120)) as resp: + if resp.status == 404: + logger.warning("Attachment 404: %s", strip_query_params(url)) + return "404" + if resp.status != 200: + logger.warning("Download failed (HTTP %d): %s", resp.status, strip_query_params(url)) + return None + + content_length = resp.content_length + if content_length and content_length > max_size: + logger.info("Skipping oversized attachment (%d bytes): %s", content_length, strip_query_params(url)) + return "oversized" + + content_type = guess_content_type(filename, resp.content_type) + + # Read entire file for potential compression + data = await resp.read() + if len(data) > max_size: + logger.info("Skipping oversized attachment (%d bytes): %s", len(data), strip_query_params(url)) + return "oversized" + + # Compress images if enabled + stored_filename = filename + if config["compress_images"] and content_type in COMPRESSIBLE_TYPES: + data, content_type = compress_image( + data, content_type, + quality=config["image_quality"], + max_resolution=config["image_max_resolution"], + ) + # Update filename extension if we converted to JPEG + if content_type == "image/jpeg" and not filename.lower().endswith((".jpg", ".jpeg")): + stored_filename = filename.rsplit(".", 1)[0] + ".jpg" if "." 
in filename else filename + ".jpg" + + grid_in = app.ctx.fs.open_upload_stream( + stored_filename, + metadata={ + "content_type": content_type, + "original_url": strip_query_params(url), + "archived_at": datetime.now(timezone.utc), + "original_size": len(data), + }, + ) + + try: + await grid_in.write(data) + await grid_in.close() + except Exception: + await grid_in.abort() + raise + + logger.info("Archived: %s -> GridFS %s (%d bytes)", strip_query_params(url), grid_in._id, len(data)) + return grid_in._id + + except asyncio.TimeoutError: + logger.warning("Timeout downloading: %s", strip_query_params(url)) + return None + except aiohttp.ClientError as e: + logger.warning("Client error downloading %s: %s", strip_query_params(url), e) + return None + except Exception as e: + logger.error("Unexpected error archiving %s: %s", strip_query_params(url), e, exc_info=True) + return None + + +async def _record_result(db, canonical_url, filename, result): + """Record the archival result in the archived_attachments collection.""" + try: + if result == "404": + await db.archived_attachments.insert_one({ + "original_url": canonical_url, + "filename": filename, + "status": "failed_permanent", + "reason": "404_not_found", + "failed_at": datetime.now(timezone.utc), + }) + elif result == "oversized": + await db.archived_attachments.insert_one({ + "original_url": canonical_url, + "filename": filename, + "status": "failed_permanent", + "reason": "oversized", + "failed_at": datetime.now(timezone.utc), + }) + elif result is not None: + await db.archived_attachments.insert_one({ + "original_url": canonical_url, + "gridfs_id": result, + "filename": filename, + "status": "archived", + "archived_at": datetime.now(timezone.utc), + }) + # If result is None (transient failure), don't record - will retry next cycle + except DuplicateKeyError: + pass # Another instance archived it first + + +async def archive_attachments_batch(app, session, config): + """Scan logs for unarchived attachment URLs and archive them.""" + db = app.ctx.db + + cursor = db.logs.find( + {"messages.attachments": {"$exists": True, "$ne": []}}, + {"messages.attachments": 1, "key": 1}, + ).batch_size(50) + + count_archived = 0 + + async for doc in cursor: + for message in doc.get("messages", []): + for att in message.get("attachments", []): + if isinstance(att, str): + url = att + filename = "attachment" + elif isinstance(att, dict): + url = att.get("url", "") + filename = att.get("filename", "attachment") + else: + continue + + if not url or not DISCORD_CDN_PATTERN.match(url): + continue + + canonical_url = strip_query_params(url) + + existing = await db.archived_attachments.find_one({"original_url": canonical_url}) + if existing: + continue + + result = await download_and_store(app, session, url, filename, config) + await _record_result(db, canonical_url, filename, result) + + if result is not None and result not in ("404", "oversized"): + count_archived += 1 + + await asyncio.sleep(0.5) + + if count_archived > 0: + logger.info("Attachments archived this cycle: %d", count_archived) + + +async def archive_avatars_batch(app, session, config): + """Scan logs for unarchived avatar URLs and archive them.""" + db = app.ctx.db + + cursor = db.logs.find( + {}, + { + "creator.avatar_url": 1, + "recipient.avatar_url": 1, + "closer.avatar_url": 1, + "messages.author.avatar_url": 1, + "key": 1, + }, + ).batch_size(50) + + seen_urls = set() + count_archived = 0 + + async for doc in cursor: + avatar_urls = [] + + for field in ("creator", "recipient", "closer"): + 
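+            # Each log document stores creator/recipient/closer user objects; collect their avatar URLs for archival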
user_data = doc.get(field) + if user_data and isinstance(user_data, dict): + avatar_url = user_data.get("avatar_url", "") + if avatar_url: + avatar_urls.append(avatar_url) + + for message in doc.get("messages", []): + author = message.get("author") + if author and isinstance(author, dict): + avatar_url = author.get("avatar_url", "") + if avatar_url: + avatar_urls.append(avatar_url) + + for url in avatar_urls: + if not DISCORD_CDN_PATTERN.match(url): + continue + + canonical_url = strip_query_params(url) + if canonical_url in seen_urls: + continue + seen_urls.add(canonical_url) + + existing = await db.archived_attachments.find_one({"original_url": canonical_url}) + if existing: + continue + + url_path = canonical_url.rsplit("/", 1)[-1] if "/" in canonical_url else "avatar.png" + result = await download_and_store(app, session, url, url_path, config) + await _record_result(db, canonical_url, url_path, result) + + if result is not None and result not in ("404", "oversized"): + count_archived += 1 + + await asyncio.sleep(0.5) + + if count_archived > 0: + logger.info("Avatars archived this cycle: %d", count_archived) + + +async def cleanup_expired(app, retention_delta): + """Delete archived attachments older than the retention period.""" + if retention_delta is None: + return # "forever" - no cleanup + + db = app.ctx.db + cutoff = datetime.now(timezone.utc) - retention_delta + + cursor = db.archived_attachments.find( + {"status": "archived", "archived_at": {"$lt": cutoff}}, + {"gridfs_id": 1, "original_url": 1}, + ) + + count_deleted = 0 + async for record in cursor: + gridfs_id = record.get("gridfs_id") + if gridfs_id: + try: + await app.ctx.fs.delete(gridfs_id) + except Exception as e: + logger.warning("Failed to delete GridFS file %s: %s", gridfs_id, e) + + await db.archived_attachments.delete_one({"_id": record["_id"]}) + count_deleted += 1 + + if count_deleted > 0: + logger.info("Retention cleanup: deleted %d expired archives (cutoff: %s)", count_deleted, cutoff.isoformat()) + + +async def run_archiver_loop(app, config): + """Main archiver loop. 
Runs indefinitely, sleeping between scans.""" + interval = config["interval"] + retention_delta = parse_retention(config["retention"]) + + logger.info( + "Attachment archiver started (interval=%ds, max_size=%d bytes, retention=%s, compress=%s, quality=%d, max_res=%d)", + interval, config["max_file_size"], config["retention"], + config["compress_images"], config["image_quality"], config["image_max_resolution"], + ) + + await asyncio.sleep(5) # Let the server fully start + + while True: + try: + async with aiohttp.ClientSession( + headers={"User-Agent": "ModmailLogviewer/1.0 (attachment archiver)"} + ) as session: + await archive_attachments_batch(app, session, config) + await archive_avatars_batch(app, session, config) + await cleanup_expired(app, retention_delta) + except asyncio.CancelledError: + logger.info("Archiver task cancelled, shutting down") + return + except Exception as e: + logger.error("Archiver loop error: %s", e, exc_info=True) + + await asyncio.sleep(interval) diff --git a/core/models.py b/core/models.py index 7c0b850..88ec000 100644 --- a/core/models.py +++ b/core/models.py @@ -1,15 +1,74 @@ +import re from datetime import datetime, timezone -import dateutil.parser +import dateutil.parser from sanic import response from natural.date import duration from .formatter import format_content_html +_DISCORD_CDN_PATTERN = re.compile( + r"https?://(?:cdn\.discordapp\.com|media\.discordapp\.net)/" +) + + +async def build_archive_lookup(app, document): + """ + Pre-fetch all archived attachment mappings for URLs found in this log document. + Returns a dict mapping canonical_url -> "/attachments//" + """ + if not getattr(app.ctx, "save_attachments", False): + return {} + + urls = set() + + for message in document.get("messages", []): + for att in message.get("attachments", []): + if isinstance(att, str): + url = att + elif isinstance(att, dict): + url = att.get("url", "") + else: + continue + if url and _DISCORD_CDN_PATTERN.match(url): + urls.add(url.split("?")[0]) + + for field in ("creator", "recipient", "closer"): + user_data = document.get(field) + if user_data and isinstance(user_data, dict): + avatar_url = user_data.get("avatar_url", "") + if avatar_url and _DISCORD_CDN_PATTERN.match(avatar_url): + urls.add(avatar_url.split("?")[0]) + + for message in document.get("messages", []): + author = message.get("author") + if author and isinstance(author, dict): + avatar_url = author.get("avatar_url", "") + if avatar_url and _DISCORD_CDN_PATTERN.match(avatar_url): + urls.add(avatar_url.split("?")[0]) + + if not urls: + return {} + + cursor = app.ctx.db.archived_attachments.find( + {"original_url": {"$in": list(urls)}, "status": "archived"}, + {"original_url": 1, "gridfs_id": 1, "filename": 1}, + ) + + lookup = {} + async for record in cursor: + original_url = record["original_url"] + gridfs_id = str(record["gridfs_id"]) + filename = record.get("filename", "attachment") + lookup[original_url] = f"/attachments/{gridfs_id}/{filename}" + + return lookup + class LogEntry: - def __init__(self, app, data): + def __init__(self, app, data, archive_lookup=None): self.app = app + self.archive_lookup = archive_lookup or {} self.key = data["key"] self.open = data["open"] self.created_at = dateutil.parser.parse(data["created_at"]).astimezone(timezone.utc) @@ -19,11 +78,11 @@ def __init__(self, app, data): ) self.channel_id = int(data["channel_id"]) self.guild_id = int(data["guild_id"]) - self.creator = User(app, data["creator"]) - self.recipient = User(app, data["recipient"]) - self.closer = User(app, 
data["closer"]) if not self.open else None + self.creator = User(app, data["creator"], archive_lookup=self.archive_lookup) + self.recipient = User(app, data["recipient"], archive_lookup=self.archive_lookup) + self.closer = User(app, data["closer"], archive_lookup=self.archive_lookup) if not self.open else None self.close_message = format_content_html(data.get("close_message") or "") - self.messages = [Message(app, m) for m in data["messages"]] + self.messages = [Message(app, m, archive_lookup=self.archive_lookup) for m in data["messages"]] self.internal_messages = [m for m in self.messages if m.type == "internal"] self.thread_messages = [ m for m in self.messages if m.type not in ("internal", "system") @@ -112,7 +171,7 @@ def render_plain_text(self): class User: - def __init__(self, app, data): + def __init__(self, app, data, archive_lookup=None): self.app = app self.id = int(data.get("id")) self.name = data["name"] @@ -120,6 +179,11 @@ def __init__(self, app, data): self.avatar_url = data["avatar_url"] self.mod = data["mod"] + if archive_lookup: + canonical = self.avatar_url.split("?")[0] + if canonical in archive_lookup: + self.avatar_url = archive_lookup[canonical] + @property def default_avatar_url(self): return "https://cdn.discordapp.com/embed/avatars/{}.png".format( @@ -148,7 +212,7 @@ def type(self): class Attachment: - def __init__(self, app, data): + def __init__(self, app, data, archive_lookup=None): self.app = app if isinstance(data, str): # Backwards compatibility self.id = 0 @@ -162,22 +226,30 @@ def __init__(self, app, data): self.url = data["url"] self.is_image = data["is_image"] self.size = data["size"] + + # Check archive first (takes priority over proxy) + if archive_lookup: + canonical = self.url.split("?")[0] + if canonical in archive_lookup: + self.url = archive_lookup[canonical] + return + + # Fall back to attachment proxy if configured if self.app.ctx.attachment_proxy_url is not None: self.url = self.url.replace("https://cdn.discordapp.com", self.app.ctx.attachment_proxy_url) self.url = self.url.replace("https://media.discordapp.net", self.app.ctx.attachment_proxy_url) - print(self.url) class Message: - def __init__(self, app, data): + def __init__(self, app, data, archive_lookup=None): self.app = app self.id = int(data["message_id"]) self.created_at = dateutil.parser.parse(data["timestamp"]).astimezone(timezone.utc) self.human_created_at = duration(self.created_at, now=datetime.now(timezone.utc)) self.raw_content = data["content"] self.content = self.format_html_content(self.raw_content) - self.attachments = [Attachment(app, a) for a in data["attachments"]] - self.author = User(app, data["author"]) + self.attachments = [Attachment(app, a, archive_lookup=archive_lookup) for a in data["attachments"]] + self.author = User(app, data["author"], archive_lookup=archive_lookup) self.type = data.get("type", "thread_message") self.edited = data.get("edited", False) diff --git a/requirements.txt b/requirements.txt index 4777c9a..f6d1529 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ +aiohttp == 3.11.18 jinja2 == 3.1.6 +Pillow == 11.1.0 motor == 3.7.1 natural == 0.2.0 pymongo == 4.15.5 From 4a5c08f7e23f0cb3ff59795600dee98409f1aa80 Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 12 Feb 2026 17:57:53 +0000 Subject: [PATCH 2/4] Fixed archiver query not matching any attachments Changed the mongo query from `$ne: []` as it was matching 0 logs, now checking messages.attachments.0 instead which picks up the attachments and loads them correctly in the log 
viewer. --- core/archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/archiver.py b/core/archiver.py index ae5f220..36c625e 100644 --- a/core/archiver.py +++ b/core/archiver.py @@ -252,7 +252,7 @@ async def archive_attachments_batch(app, session, config): db = app.ctx.db cursor = db.logs.find( - {"messages.attachments": {"$exists": True, "$ne": []}}, + {"messages.attachments.0": {"$exists": True}}, {"messages.attachments": 1, "key": 1}, ).batch_size(50) From 9bf65bfe990d3ae8325cf5aac41da76fe6b339a7 Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 12 Feb 2026 18:05:10 +0000 Subject: [PATCH 3/4] Added better logging. Added more detailed logging so you can understand what's happening in the console. --- core/archiver.py | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/core/archiver.py b/core/archiver.py index 36c625e..b64cffa 100644 --- a/core/archiver.py +++ b/core/archiver.py @@ -251,14 +251,21 @@ async def archive_attachments_batch(app, session, config): """Scan logs for unarchived attachment URLs and archive them.""" db = app.ctx.db + logger.info("Attachment archiver: scanning for unarchived attachments...") + cursor = db.logs.find( {"messages.attachments.0": {"$exists": True}}, {"messages.attachments": 1, "key": 1}, ).batch_size(50) count_archived = 0 + count_skipped = 0 + count_failed = 0 + count_404 = 0 + logs_scanned = 0 async for doc in cursor: + logs_scanned += 1 for message in doc.get("messages", []): for att in message.get("attachments", []): if isinstance(att, str): @@ -277,24 +284,34 @@ async def archive_attachments_batch(app, session, config): existing = await db.archived_attachments.find_one({"original_url": canonical_url}) if existing: + count_skipped += 1 continue + logger.info("Attachment archiver: archiving %s from log %s", filename, doc.get("key", "?")) result = await download_and_store(app, session, url, filename, config) await _record_result(db, canonical_url, filename, result) if result is not None and result not in ("404", "oversized"): count_archived += 1 + elif result == "404": + count_404 += 1 + else: + count_failed += 1 await asyncio.sleep(0.5) - if count_archived > 0: - logger.info("Attachments archived this cycle: %d", count_archived) + logger.info( + "Attachment archiver: scan complete - %d logs scanned, %d archived, %d already archived, %d expired (404), %d failed", + logs_scanned, count_archived, count_skipped, count_404, count_failed, + ) async def archive_avatars_batch(app, session, config): """Scan logs for unarchived avatar URLs and archive them.""" db = app.ctx.db + logger.info("Attachment archiver: scanning for unarchived avatars...") + cursor = db.logs.find( {}, { @@ -308,8 +325,13 @@ async def archive_avatars_batch(app, session, config): seen_urls = set() count_archived = 0 + count_skipped = 0 + count_failed = 0 + count_404 = 0 + logs_scanned = 0 async for doc in cursor: + logs_scanned += 1 avatar_urls = [] for field in ("creator", "recipient", "closer"): @@ -337,19 +359,27 @@ async def archive_avatars_batch(app, session, config): existing = await db.archived_attachments.find_one({"original_url": canonical_url}) if existing: + count_skipped += 1 continue url_path = canonical_url.rsplit("/", 1)[-1] if "/" in canonical_url else "avatar.png" + logger.info("Attachment archiver: archiving avatar %s from log %s", url_path, doc.get("key", "?")) result = await download_and_store(app, session, url, url_path, config) await _record_result(db, canonical_url, url_path, 
result) if result is not None and result not in ("404", "oversized"): count_archived += 1 + elif result == "404": + count_404 += 1 + else: + count_failed += 1 await asyncio.sleep(0.5) - if count_archived > 0: - logger.info("Avatars archived this cycle: %d", count_archived) + logger.info( + "Attachment archiver: avatar scan complete - %d logs scanned, %d archived, %d already archived, %d expired (404), %d failed", + logs_scanned, count_archived, count_skipped, count_404, count_failed, + ) async def cleanup_expired(app, retention_delta): @@ -396,16 +426,18 @@ async def run_archiver_loop(app, config): while True: try: + logger.info("Attachment archiver: starting scan cycle") async with aiohttp.ClientSession( headers={"User-Agent": "ModmailLogviewer/1.0 (attachment archiver)"} ) as session: await archive_attachments_batch(app, session, config) await archive_avatars_batch(app, session, config) await cleanup_expired(app, retention_delta) + logger.info("Attachment archiver: cycle complete, next scan in %ds", interval) except asyncio.CancelledError: - logger.info("Archiver task cancelled, shutting down") + logger.info("Attachment archiver: task cancelled, shutting down") return except Exception as e: - logger.error("Archiver loop error: %s", e, exc_info=True) + logger.error("Attachment archiver: loop error: %s", e, exc_info=True) await asyncio.sleep(interval) From 9fe8418036d3f30ab425a0babcf66a317ebe5e8a Mon Sep 17 00:00:00 2001 From: Nathan Date: Fri, 13 Feb 2026 12:54:20 +0000 Subject: [PATCH 4/4] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index dbd7def..fc93aeb 100644 --- a/.gitignore +++ b/.gitignore @@ -406,3 +406,4 @@ dist .yarn/build-state.yml .yarn/install-state.gz .pnp.* +