diff --git a/.env.example b/.env.example
index 2797714..bb63ba4 100644
--- a/.env.example
+++ b/.env.example
@@ -5,9 +5,23 @@ LOG_URL_PREFIX=/logs
 # Listen address and port. Don't change them if you don't know what they do.
 HOST=0.0.0.0
 PORT=8000
-# Whether if the logviewer should use a proxy to view attachments.
-# If set to "no" (default), attachments will expire after 1 day and the logviewer won't be able to show the attachment.
+# Attachments will expire after 1 day and the logviewer won't be able to show them.
+# Your options are to either use an attachment proxy or archive them to MongoDB GridFS.
 # Please be aware that this may violate Discord TOS and the proxy will have full access to your attachments.
 # Modmail/Logviewer is not affiliated with the proxy in any way. USE AT YOUR OWN RISK.
 USE_ATTACHMENT_PROXY=no
 ATTACHMENT_PROXY_URL=https://cdn.discordapp.xyz
+# Archive attachments to MongoDB GridFS so they persist beyond Discord's 24h CDN expiry.
+SAVE_ATTACHMENTS=no
+# How often (in seconds) the archiver scans for new unarchived attachments. Default: 600 (10 minutes)
+ARCHIVE_INTERVAL=600
+# Maximum file size in bytes to archive. Files larger than this are skipped. Default: 26214400 (25 MB)
+ARCHIVE_MAX_FILE_SIZE=26214400
+# How long to keep archived attachments. Options: 1w, 1month, 1y, forever. Default: forever
+ARCHIVE_RETENTION=forever
+# Compress images to JPEG before storing. Greatly reduces storage usage. Default: yes
+ARCHIVE_COMPRESS_IMAGES=yes
+# JPEG quality (1-100). Lower = smaller files. 65 is a good balance of quality and size. Default: 65
+ARCHIVE_IMAGE_QUALITY=65
+# Max image resolution (longest edge in pixels). Images larger than this are downscaled. Default: 1920
+ARCHIVE_IMAGE_MAX_RESOLUTION=1920
diff --git a/.gitignore b/.gitignore
index dbd7def..fc93aeb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -406,3 +406,4 @@ dist
 .yarn/build-state.yml
 .yarn/install-state.gz
 .pnp.*
+
diff --git a/README.md b/README.md
index fd7d3bf..af9f555 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,51 @@ We recommend setting up a reverse proxy (e.g. Nginx) to port forward external po
 To accept requests from a domain instead of your server IP, simply set an `A`/`AAAA` record from your DNS provider to forward your domain to your server IP.
 
+## Preserving Attachments
+
+Discord's CDN URLs for attachments expire after ~24 hours, which means images in your logs will break. There are two ways to fix this:
+
+### Option 1: Attachment Archival (Recommended)
+
+Downloads attachments and stores them directly in your MongoDB database using GridFS. No third parties involved.
+
+Add to your `.env` file:
+```
+SAVE_ATTACHMENTS=yes
+```
+
+| Variable | Default | Description |
+|---|---|---|
+| `SAVE_ATTACHMENTS` | `no` | Enable attachment archival (`yes` / `no`) |
+| `ARCHIVE_INTERVAL` | `600` | Seconds between scans for new attachments |
+| `ARCHIVE_MAX_FILE_SIZE` | `26214400` | Max file size in bytes to archive (default 25 MB) |
+| `ARCHIVE_RETENTION` | `forever` | How long to keep files: `1w`, `1month`, `1y`, `forever` |
+| `ARCHIVE_COMPRESS_IMAGES` | `yes` | Compress images to JPEG before storing (`yes` / `no`) |
+| `ARCHIVE_IMAGE_QUALITY` | `65` | JPEG quality 1-100 (lower = smaller files) |
+| `ARCHIVE_IMAGE_MAX_RESOLUTION` | `1920` | Downscale images larger than this (longest edge in px) |
+
+A background task scans your logs collection periodically and downloads any Discord CDN attachment/avatar URLs it finds. Images are compressed to JPEG (configurable) and stored in MongoDB GridFS.
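+For example, a storage-conscious setup on a small or free-tier database might lower the quality and resolution caps and keep files for a year rather than forever (the values below are illustrative, not recommendations):
+
+```
+SAVE_ATTACHMENTS=yes
+ARCHIVE_INTERVAL=900
+ARCHIVE_RETENTION=1y
+ARCHIVE_COMPRESS_IMAGES=yes
+ARCHIVE_IMAGE_QUALITY=50
+ARCHIVE_IMAGE_MAX_RESOLUTION=1280
+```
+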
+When a log page is viewed, archived images are served from `/attachments/<file_id>/<filename>` instead of the expired Discord URLs.
+
+> [!NOTE]
+> Attachments that already expired before enabling this feature cannot be recovered. Enable it as soon as possible to archive existing logs while their URLs are still valid.
+
+> [!NOTE]
+> The free MongoDB Atlas tier (M0) has a 512 MB storage limit. With compression enabled, this can hold roughly 2,000-10,000 images depending on size. Consider adjusting `ARCHIVE_IMAGE_QUALITY` and `ARCHIVE_IMAGE_MAX_RESOLUTION` to reduce storage usage, or use a paid tier for more space.
+
+### Option 2: Attachment Proxy
+
+Routes image requests through a third-party proxy service instead of storing them yourself. Simpler to set up, but it relies on an external service.
+
+| Variable | Default | Description |
+|---|---|---|
+| `USE_ATTACHMENT_PROXY` | `no` | Enable the attachment proxy (`yes` / `no`) |
+| `ATTACHMENT_PROXY_URL` | `https://cdn.discordapp.xyz` | The proxy service URL (not your site URL) |
+
+> [!WARNING]
+> The proxy service is not affiliated with Modmail. It will have full access to your attachments. Use at your own risk.
+
+You can enable both options together: archived images take priority, and the proxy is used as a fallback for images not yet archived.
+
 ## Discord OAuth2
 
 Protecting your logs with a login (Discord Oauth2 support) is a premium feature, only available to [Premium members](https://buymeacoffee.com/modmaildev).
diff --git a/app.json b/app.json
index 4187572..0c92707 100644
--- a/app.json
+++ b/app.json
@@ -16,6 +16,31 @@
       "description": "Proxy URL for viewing attachments.",
       "required": false,
       "value": "https://cdn.discordapp.xyz"
+    },
+    "SAVE_ATTACHMENTS": {
+      "description": "Whether to archive attachments to MongoDB GridFS. Set to 'yes' to enable.",
+      "required": false,
+      "value": "no"
+    },
+    "ARCHIVE_INTERVAL": {
+      "description": "How often (in seconds) to scan for unarchived attachments. Default: 600",
+      "required": false,
+      "value": "600"
+    },
+    "ARCHIVE_RETENTION": {
+      "description": "How long to keep archived attachments. Options: 1w, 1month, 1y, forever. Default: forever",
+      "required": false,
+      "value": "forever"
+    },
+    "ARCHIVE_COMPRESS_IMAGES": {
+      "description": "Compress images to JPEG before storing to save space. Default: yes",
+      "required": false,
+      "value": "yes"
+    },
+    "ARCHIVE_IMAGE_QUALITY": {
+      "description": "JPEG quality (1-100). Lower = smaller files.
Default: 65", + "required": false, + "value": "65" } } } diff --git a/app.py b/app.py index 277aa81..a2bd7cb 100644 --- a/app.py +++ b/app.py @@ -3,13 +3,14 @@ import html import os +from bson import ObjectId from dotenv import load_dotenv -from motor.motor_asyncio import AsyncIOMotorClient +from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorGridFSBucket from sanic import Sanic, response from sanic.exceptions import NotFound from jinja2 import Environment, FileSystemLoader -from core.models import LogEntry +from core.models import LogEntry, build_archive_lookup load_dotenv() @@ -61,6 +62,15 @@ def strtobool(val): raise ValueError("invalid truth value %r" % (val,)) +SAVE_ATTACHMENTS = strtobool(os.getenv("SAVE_ATTACHMENTS", "no")) +ARCHIVE_INTERVAL = int(os.getenv("ARCHIVE_INTERVAL", "600")) +ARCHIVE_MAX_FILE_SIZE = int(os.getenv("ARCHIVE_MAX_FILE_SIZE", str(25 * 1024 * 1024))) +ARCHIVE_RETENTION = os.getenv("ARCHIVE_RETENTION", "forever").strip().lower() +ARCHIVE_COMPRESS_IMAGES = strtobool(os.getenv("ARCHIVE_COMPRESS_IMAGES", "yes")) +ARCHIVE_IMAGE_QUALITY = int(os.getenv("ARCHIVE_IMAGE_QUALITY", "65")) +ARCHIVE_IMAGE_MAX_RESOLUTION = int(os.getenv("ARCHIVE_IMAGE_MAX_RESOLUTION", "1920")) + + @app.listener("before_server_start") async def init(app, loop): app.ctx.db = AsyncIOMotorClient(MONGO_URI).modmail_bot @@ -71,6 +81,32 @@ async def init(app, loop): else: app.ctx.attachment_proxy_url = None + # Attachment archival setup + app.ctx.save_attachments = bool(SAVE_ATTACHMENTS) + if app.ctx.save_attachments: + app.ctx.fs = AsyncIOMotorGridFSBucket(app.ctx.db, bucket_name="attachments") + await app.ctx.db.archived_attachments.create_index("original_url", unique=True) + await app.ctx.db.archived_attachments.create_index("status") + await app.ctx.db.archived_attachments.create_index("archived_at") + else: + app.ctx.fs = None + + +@app.listener("after_server_start") +async def start_archiver(app, loop): + if app.ctx.save_attachments: + from core.archiver import run_archiver_loop + archiver_config = { + "interval": ARCHIVE_INTERVAL, + "max_file_size": ARCHIVE_MAX_FILE_SIZE, + "retention": ARCHIVE_RETENTION, + "compress_images": bool(ARCHIVE_COMPRESS_IMAGES), + "image_quality": ARCHIVE_IMAGE_QUALITY, + "image_max_resolution": ARCHIVE_IMAGE_MAX_RESOLUTION, + } + app.add_task(run_archiver_loop(app, archiver_config)) + + @app.exception(NotFound) async def not_found(request, exc): return render_template("not_found") @@ -89,7 +125,8 @@ async def get_raw_logs_file(request, key): if document is None: raise NotFound - log_entry = LogEntry(app, document) + archive_lookup = await build_archive_lookup(app, document) + log_entry = LogEntry(app, document, archive_lookup=archive_lookup) return log_entry.render_plain_text() @@ -102,11 +139,43 @@ async def get_logs_file(request, key): if document is None: raise NotFound - log_entry = LogEntry(app, document) + archive_lookup = await build_archive_lookup(app, document) + log_entry = LogEntry(app, document, archive_lookup=archive_lookup) return log_entry.render_html() +@app.get("/attachments//") +async def serve_attachment(request, file_id, filename): + """Serve an archived attachment from GridFS.""" + if not app.ctx.save_attachments or app.ctx.fs is None: + raise NotFound + + try: + oid = ObjectId(file_id) + except Exception: + raise NotFound + + try: + grid_out = await app.ctx.fs.open_download_stream(oid) + except Exception: + raise NotFound + + content_type = "application/octet-stream" + if grid_out.metadata: + content_type = 
grid_out.metadata.get("content_type", content_type) + + data = await grid_out.read() + return response.raw( + data, + content_type=content_type, + headers={ + "Content-Disposition": f'inline; filename="{filename}"', + "Cache-Control": "public, max-age=31536000, immutable", + }, + ) + + if __name__ == "__main__": app.run( host=os.getenv("HOST", "0.0.0.0"), diff --git a/core/archiver.py b/core/archiver.py new file mode 100644 index 0000000..b64cffa --- /dev/null +++ b/core/archiver.py @@ -0,0 +1,443 @@ +""" +Background task that archives Discord CDN attachments and avatars to MongoDB GridFS. + +When enabled via SAVE_ATTACHMENTS=yes, this module periodically scans the logs collection +for Discord CDN URLs, downloads them before they expire (~24h), and stores them in GridFS. + +Features: +- Image compression: Converts images to optimized JPEG to minimize storage +- Retention policy: Auto-deletes archived files after a configurable period +""" + +import asyncio +import io +import logging +import re +from datetime import datetime, timedelta, timezone + +import aiohttp +from pymongo.errors import DuplicateKeyError + +logger = logging.getLogger("logviewer.archiver") + +DISCORD_CDN_PATTERN = re.compile( + r"https?://(?:cdn\.discordapp\.com|media\.discordapp\.net)/" +) + +COMPRESSIBLE_TYPES = {"image/png", "image/jpeg", "image/webp", "image/bmp", "image/tiff"} + +CONTENT_TYPE_MAP = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".webp": "image/webp", + ".mp4": "video/mp4", + ".webm": "video/webm", + ".mp3": "audio/mpeg", + ".ogg": "audio/ogg", + ".wav": "audio/wav", + ".pdf": "application/pdf", + ".txt": "text/plain", +} + +RETENTION_MAP = { + "1w": timedelta(weeks=1), + "1week": timedelta(weeks=1), + "1m": timedelta(days=30), + "1month": timedelta(days=30), + "1y": timedelta(days=365), + "1year": timedelta(days=365), + "forever": None, +} + + +def parse_retention(value): + """Parse retention string to timedelta. Returns None for 'forever'.""" + td = RETENTION_MAP.get(value) + if td is None and value != "forever": + logger.warning("Unknown ARCHIVE_RETENTION value '%s', defaulting to 'forever'", value) + return td + + +def guess_content_type(filename, response_content_type=None): + if response_content_type and response_content_type != "application/octet-stream": + return response_content_type + for ext, ctype in CONTENT_TYPE_MAP.items(): + if filename.lower().endswith(ext): + return ctype + return "application/octet-stream" + + +def strip_query_params(url): + """Return URL without query parameters (Discord's signed params change but the path is stable).""" + return url.split("?")[0] + + +def compress_image(image_data, content_type, quality, max_resolution): + """ + Compress an image to JPEG with optimized settings for minimal file size. + + Strategy: + - Convert to RGB (JPEG doesn't support alpha) + - Downscale if either dimension exceeds max_resolution + - Save as progressive JPEG at the configured quality + - Strip all EXIF/metadata + + Returns (compressed_bytes, "image/jpeg") or (original_data, original_type) if compression fails or is larger. 
+ """ + try: + from PIL import Image + + img = Image.open(io.BytesIO(image_data)) + + # Skip animated images (GIFs with multiple frames) + if getattr(img, "n_frames", 1) > 1: + return image_data, content_type + + # Convert to RGB (drop alpha channel for JPEG) + if img.mode in ("RGBA", "P", "LA"): + background = Image.new("RGB", img.size, (54, 57, 63)) # Discord dark bg color + if img.mode == "P": + img = img.convert("RGBA") + if img.mode in ("RGBA", "LA"): + background.paste(img, mask=img.split()[-1]) + img = background + else: + img = img.convert("RGB") + elif img.mode != "RGB": + img = img.convert("RGB") + + # Downscale if too large (preserve aspect ratio) + w, h = img.size + if max(w, h) > max_resolution: + if w > h: + new_w = max_resolution + new_h = int(h * (max_resolution / w)) + else: + new_h = max_resolution + new_w = int(w * (max_resolution / h)) + img = img.resize((new_w, new_h), Image.LANCZOS) + + # Save as optimized progressive JPEG + buf = io.BytesIO() + img.save( + buf, + format="JPEG", + quality=quality, + optimize=True, + progressive=True, + subsampling="4:2:0", # Maximum chroma subsampling for smallest size + ) + compressed = buf.getvalue() + + # Only use compressed version if it's actually smaller + if len(compressed) < len(image_data): + logger.debug( + "Compressed image: %d -> %d bytes (%.0f%% reduction)", + len(image_data), len(compressed), + (1 - len(compressed) / len(image_data)) * 100 + ) + return compressed, "image/jpeg" + + return image_data, content_type + + except Exception as e: + logger.warning("Image compression failed, storing original: %s", e) + return image_data, content_type + + +async def download_and_store(app, session, url, filename, config): + """Download a URL and store it in GridFS. Returns GridFS ObjectId on success, None on failure.""" + max_size = config["max_file_size"] + try: + async with session.get(url, timeout=aiohttp.ClientTimeout(total=120)) as resp: + if resp.status == 404: + logger.warning("Attachment 404: %s", strip_query_params(url)) + return "404" + if resp.status != 200: + logger.warning("Download failed (HTTP %d): %s", resp.status, strip_query_params(url)) + return None + + content_length = resp.content_length + if content_length and content_length > max_size: + logger.info("Skipping oversized attachment (%d bytes): %s", content_length, strip_query_params(url)) + return "oversized" + + content_type = guess_content_type(filename, resp.content_type) + + # Read entire file for potential compression + data = await resp.read() + if len(data) > max_size: + logger.info("Skipping oversized attachment (%d bytes): %s", len(data), strip_query_params(url)) + return "oversized" + + # Compress images if enabled + stored_filename = filename + if config["compress_images"] and content_type in COMPRESSIBLE_TYPES: + data, content_type = compress_image( + data, content_type, + quality=config["image_quality"], + max_resolution=config["image_max_resolution"], + ) + # Update filename extension if we converted to JPEG + if content_type == "image/jpeg" and not filename.lower().endswith((".jpg", ".jpeg")): + stored_filename = filename.rsplit(".", 1)[0] + ".jpg" if "." 
in filename else filename + ".jpg" + + grid_in = app.ctx.fs.open_upload_stream( + stored_filename, + metadata={ + "content_type": content_type, + "original_url": strip_query_params(url), + "archived_at": datetime.now(timezone.utc), + "original_size": len(data), + }, + ) + + try: + await grid_in.write(data) + await grid_in.close() + except Exception: + await grid_in.abort() + raise + + logger.info("Archived: %s -> GridFS %s (%d bytes)", strip_query_params(url), grid_in._id, len(data)) + return grid_in._id + + except asyncio.TimeoutError: + logger.warning("Timeout downloading: %s", strip_query_params(url)) + return None + except aiohttp.ClientError as e: + logger.warning("Client error downloading %s: %s", strip_query_params(url), e) + return None + except Exception as e: + logger.error("Unexpected error archiving %s: %s", strip_query_params(url), e, exc_info=True) + return None + + +async def _record_result(db, canonical_url, filename, result): + """Record the archival result in the archived_attachments collection.""" + try: + if result == "404": + await db.archived_attachments.insert_one({ + "original_url": canonical_url, + "filename": filename, + "status": "failed_permanent", + "reason": "404_not_found", + "failed_at": datetime.now(timezone.utc), + }) + elif result == "oversized": + await db.archived_attachments.insert_one({ + "original_url": canonical_url, + "filename": filename, + "status": "failed_permanent", + "reason": "oversized", + "failed_at": datetime.now(timezone.utc), + }) + elif result is not None: + await db.archived_attachments.insert_one({ + "original_url": canonical_url, + "gridfs_id": result, + "filename": filename, + "status": "archived", + "archived_at": datetime.now(timezone.utc), + }) + # If result is None (transient failure), don't record - will retry next cycle + except DuplicateKeyError: + pass # Another instance archived it first + + +async def archive_attachments_batch(app, session, config): + """Scan logs for unarchived attachment URLs and archive them.""" + db = app.ctx.db + + logger.info("Attachment archiver: scanning for unarchived attachments...") + + cursor = db.logs.find( + {"messages.attachments.0": {"$exists": True}}, + {"messages.attachments": 1, "key": 1}, + ).batch_size(50) + + count_archived = 0 + count_skipped = 0 + count_failed = 0 + count_404 = 0 + logs_scanned = 0 + + async for doc in cursor: + logs_scanned += 1 + for message in doc.get("messages", []): + for att in message.get("attachments", []): + if isinstance(att, str): + url = att + filename = "attachment" + elif isinstance(att, dict): + url = att.get("url", "") + filename = att.get("filename", "attachment") + else: + continue + + if not url or not DISCORD_CDN_PATTERN.match(url): + continue + + canonical_url = strip_query_params(url) + + existing = await db.archived_attachments.find_one({"original_url": canonical_url}) + if existing: + count_skipped += 1 + continue + + logger.info("Attachment archiver: archiving %s from log %s", filename, doc.get("key", "?")) + result = await download_and_store(app, session, url, filename, config) + await _record_result(db, canonical_url, filename, result) + + if result is not None and result not in ("404", "oversized"): + count_archived += 1 + elif result == "404": + count_404 += 1 + else: + count_failed += 1 + + await asyncio.sleep(0.5) + + logger.info( + "Attachment archiver: scan complete - %d logs scanned, %d archived, %d already archived, %d expired (404), %d failed", + logs_scanned, count_archived, count_skipped, count_404, count_failed, + ) + + +async 
def archive_avatars_batch(app, session, config): + """Scan logs for unarchived avatar URLs and archive them.""" + db = app.ctx.db + + logger.info("Attachment archiver: scanning for unarchived avatars...") + + cursor = db.logs.find( + {}, + { + "creator.avatar_url": 1, + "recipient.avatar_url": 1, + "closer.avatar_url": 1, + "messages.author.avatar_url": 1, + "key": 1, + }, + ).batch_size(50) + + seen_urls = set() + count_archived = 0 + count_skipped = 0 + count_failed = 0 + count_404 = 0 + logs_scanned = 0 + + async for doc in cursor: + logs_scanned += 1 + avatar_urls = [] + + for field in ("creator", "recipient", "closer"): + user_data = doc.get(field) + if user_data and isinstance(user_data, dict): + avatar_url = user_data.get("avatar_url", "") + if avatar_url: + avatar_urls.append(avatar_url) + + for message in doc.get("messages", []): + author = message.get("author") + if author and isinstance(author, dict): + avatar_url = author.get("avatar_url", "") + if avatar_url: + avatar_urls.append(avatar_url) + + for url in avatar_urls: + if not DISCORD_CDN_PATTERN.match(url): + continue + + canonical_url = strip_query_params(url) + if canonical_url in seen_urls: + continue + seen_urls.add(canonical_url) + + existing = await db.archived_attachments.find_one({"original_url": canonical_url}) + if existing: + count_skipped += 1 + continue + + url_path = canonical_url.rsplit("/", 1)[-1] if "/" in canonical_url else "avatar.png" + logger.info("Attachment archiver: archiving avatar %s from log %s", url_path, doc.get("key", "?")) + result = await download_and_store(app, session, url, url_path, config) + await _record_result(db, canonical_url, url_path, result) + + if result is not None and result not in ("404", "oversized"): + count_archived += 1 + elif result == "404": + count_404 += 1 + else: + count_failed += 1 + + await asyncio.sleep(0.5) + + logger.info( + "Attachment archiver: avatar scan complete - %d logs scanned, %d archived, %d already archived, %d expired (404), %d failed", + logs_scanned, count_archived, count_skipped, count_404, count_failed, + ) + + +async def cleanup_expired(app, retention_delta): + """Delete archived attachments older than the retention period.""" + if retention_delta is None: + return # "forever" - no cleanup + + db = app.ctx.db + cutoff = datetime.now(timezone.utc) - retention_delta + + cursor = db.archived_attachments.find( + {"status": "archived", "archived_at": {"$lt": cutoff}}, + {"gridfs_id": 1, "original_url": 1}, + ) + + count_deleted = 0 + async for record in cursor: + gridfs_id = record.get("gridfs_id") + if gridfs_id: + try: + await app.ctx.fs.delete(gridfs_id) + except Exception as e: + logger.warning("Failed to delete GridFS file %s: %s", gridfs_id, e) + + await db.archived_attachments.delete_one({"_id": record["_id"]}) + count_deleted += 1 + + if count_deleted > 0: + logger.info("Retention cleanup: deleted %d expired archives (cutoff: %s)", count_deleted, cutoff.isoformat()) + + +async def run_archiver_loop(app, config): + """Main archiver loop. 
Runs indefinitely, sleeping between scans.""" + interval = config["interval"] + retention_delta = parse_retention(config["retention"]) + + logger.info( + "Attachment archiver started (interval=%ds, max_size=%d bytes, retention=%s, compress=%s, quality=%d, max_res=%d)", + interval, config["max_file_size"], config["retention"], + config["compress_images"], config["image_quality"], config["image_max_resolution"], + ) + + await asyncio.sleep(5) # Let the server fully start + + while True: + try: + logger.info("Attachment archiver: starting scan cycle") + async with aiohttp.ClientSession( + headers={"User-Agent": "ModmailLogviewer/1.0 (attachment archiver)"} + ) as session: + await archive_attachments_batch(app, session, config) + await archive_avatars_batch(app, session, config) + await cleanup_expired(app, retention_delta) + logger.info("Attachment archiver: cycle complete, next scan in %ds", interval) + except asyncio.CancelledError: + logger.info("Attachment archiver: task cancelled, shutting down") + return + except Exception as e: + logger.error("Attachment archiver: loop error: %s", e, exc_info=True) + + await asyncio.sleep(interval) diff --git a/core/models.py b/core/models.py index 7c0b850..88ec000 100644 --- a/core/models.py +++ b/core/models.py @@ -1,15 +1,74 @@ +import re from datetime import datetime, timezone -import dateutil.parser +import dateutil.parser from sanic import response from natural.date import duration from .formatter import format_content_html +_DISCORD_CDN_PATTERN = re.compile( + r"https?://(?:cdn\.discordapp\.com|media\.discordapp\.net)/" +) + + +async def build_archive_lookup(app, document): + """ + Pre-fetch all archived attachment mappings for URLs found in this log document. + Returns a dict mapping canonical_url -> "/attachments//" + """ + if not getattr(app.ctx, "save_attachments", False): + return {} + + urls = set() + + for message in document.get("messages", []): + for att in message.get("attachments", []): + if isinstance(att, str): + url = att + elif isinstance(att, dict): + url = att.get("url", "") + else: + continue + if url and _DISCORD_CDN_PATTERN.match(url): + urls.add(url.split("?")[0]) + + for field in ("creator", "recipient", "closer"): + user_data = document.get(field) + if user_data and isinstance(user_data, dict): + avatar_url = user_data.get("avatar_url", "") + if avatar_url and _DISCORD_CDN_PATTERN.match(avatar_url): + urls.add(avatar_url.split("?")[0]) + + for message in document.get("messages", []): + author = message.get("author") + if author and isinstance(author, dict): + avatar_url = author.get("avatar_url", "") + if avatar_url and _DISCORD_CDN_PATTERN.match(avatar_url): + urls.add(avatar_url.split("?")[0]) + + if not urls: + return {} + + cursor = app.ctx.db.archived_attachments.find( + {"original_url": {"$in": list(urls)}, "status": "archived"}, + {"original_url": 1, "gridfs_id": 1, "filename": 1}, + ) + + lookup = {} + async for record in cursor: + original_url = record["original_url"] + gridfs_id = str(record["gridfs_id"]) + filename = record.get("filename", "attachment") + lookup[original_url] = f"/attachments/{gridfs_id}/{filename}" + + return lookup + class LogEntry: - def __init__(self, app, data): + def __init__(self, app, data, archive_lookup=None): self.app = app + self.archive_lookup = archive_lookup or {} self.key = data["key"] self.open = data["open"] self.created_at = dateutil.parser.parse(data["created_at"]).astimezone(timezone.utc) @@ -19,11 +78,11 @@ def __init__(self, app, data): ) self.channel_id = 
int(data["channel_id"]) self.guild_id = int(data["guild_id"]) - self.creator = User(app, data["creator"]) - self.recipient = User(app, data["recipient"]) - self.closer = User(app, data["closer"]) if not self.open else None + self.creator = User(app, data["creator"], archive_lookup=self.archive_lookup) + self.recipient = User(app, data["recipient"], archive_lookup=self.archive_lookup) + self.closer = User(app, data["closer"], archive_lookup=self.archive_lookup) if not self.open else None self.close_message = format_content_html(data.get("close_message") or "") - self.messages = [Message(app, m) for m in data["messages"]] + self.messages = [Message(app, m, archive_lookup=self.archive_lookup) for m in data["messages"]] self.internal_messages = [m for m in self.messages if m.type == "internal"] self.thread_messages = [ m for m in self.messages if m.type not in ("internal", "system") @@ -112,7 +171,7 @@ def render_plain_text(self): class User: - def __init__(self, app, data): + def __init__(self, app, data, archive_lookup=None): self.app = app self.id = int(data.get("id")) self.name = data["name"] @@ -120,6 +179,11 @@ def __init__(self, app, data): self.avatar_url = data["avatar_url"] self.mod = data["mod"] + if archive_lookup: + canonical = self.avatar_url.split("?")[0] + if canonical in archive_lookup: + self.avatar_url = archive_lookup[canonical] + @property def default_avatar_url(self): return "https://cdn.discordapp.com/embed/avatars/{}.png".format( @@ -148,7 +212,7 @@ def type(self): class Attachment: - def __init__(self, app, data): + def __init__(self, app, data, archive_lookup=None): self.app = app if isinstance(data, str): # Backwards compatibility self.id = 0 @@ -162,22 +226,30 @@ def __init__(self, app, data): self.url = data["url"] self.is_image = data["is_image"] self.size = data["size"] + + # Check archive first (takes priority over proxy) + if archive_lookup: + canonical = self.url.split("?")[0] + if canonical in archive_lookup: + self.url = archive_lookup[canonical] + return + + # Fall back to attachment proxy if configured if self.app.ctx.attachment_proxy_url is not None: self.url = self.url.replace("https://cdn.discordapp.com", self.app.ctx.attachment_proxy_url) self.url = self.url.replace("https://media.discordapp.net", self.app.ctx.attachment_proxy_url) - print(self.url) class Message: - def __init__(self, app, data): + def __init__(self, app, data, archive_lookup=None): self.app = app self.id = int(data["message_id"]) self.created_at = dateutil.parser.parse(data["timestamp"]).astimezone(timezone.utc) self.human_created_at = duration(self.created_at, now=datetime.now(timezone.utc)) self.raw_content = data["content"] self.content = self.format_html_content(self.raw_content) - self.attachments = [Attachment(app, a) for a in data["attachments"]] - self.author = User(app, data["author"]) + self.attachments = [Attachment(app, a, archive_lookup=archive_lookup) for a in data["attachments"]] + self.author = User(app, data["author"], archive_lookup=archive_lookup) self.type = data.get("type", "thread_message") self.edited = data.get("edited", False) diff --git a/requirements.txt b/requirements.txt index 4777c9a..f6d1529 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ +aiohttp == 3.11.18 jinja2 == 3.1.6 +Pillow == 11.1.0 motor == 3.7.1 natural == 0.2.0 pymongo == 4.15.5
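
As a quick sanity check after enabling `SAVE_ATTACHMENTS`, a minimal script along these lines (assuming the same `MONGO_URI` and `modmail_bot` database the logviewer itself connects to, plus the `archived_attachments` collection and `attachments` GridFS bucket created above) reports how much has been archived so far:

```python
import asyncio
import os

from motor.motor_asyncio import AsyncIOMotorClient


async def main():
    # Same connection and database the logviewer uses in app.py
    db = AsyncIOMotorClient(os.environ["MONGO_URI"]).modmail_bot

    archived = await db.archived_attachments.count_documents({"status": "archived"})
    failed = await db.archived_attachments.count_documents({"status": "failed_permanent"})

    # GridFS keeps per-file metadata in "<bucket>.files"; sum the stored sizes.
    stored_bytes = 0
    async for f in db["attachments.files"].find({}, {"length": 1}):
        stored_bytes += f.get("length", 0)

    print(f"archived={archived}, permanently failed={failed}, "
          f"stored={stored_bytes / (1024 * 1024):.1f} MiB")


asyncio.run(main())
```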