Add initial version of feeds app
All checks were successful
Deploy to Server / deploy (push) Successful in 11s
All checks were successful
Deploy to Server / deploy (push) Successful in 11s
This commit is contained in:
parent
e889b58aec
commit
a02b5d5f66
17 changed files with 993 additions and 15 deletions
191
feeds/services.py
Normal file
191
feeds/services.py
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
from typing import TYPE_CHECKING
|
||||
from typing import Any
|
||||
from xml.parsers.expat import ExpatError
|
||||
|
||||
import dateparser
|
||||
import niquests
|
||||
import xmltodict
|
||||
import xxhash
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
from feeds.models import Entry
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
|
||||
from feeds.models import Feed
|
||||
|
||||
# HTTP status codes used by the conditional-GET fetch logic below.
HTTP_OK = 200
HTTP_NOT_MODIFIED = 304
|
||||
|
||||
|
||||
def extract_id(val: str | dict | None) -> str | None:
|
||||
"""Extracts a string ID from a guid or id field, handling both string and dict formats.
|
||||
|
||||
Args:
|
||||
val (str | dict | None): The value to extract the ID from, which can be a string, a dict (with possible '#text' or '@id' keys), or None
|
||||
|
||||
Returns:
|
||||
str | None: The extracted ID as a string, or None if it cannot be extracted
|
||||
"""
|
||||
if isinstance(val, dict):
|
||||
# RSS guid or Atom id as dict: prefer '#text', fallback to str(val)
|
||||
return val.get("#text") or val.get("@id") or str(val)
|
||||
return val
|
||||
|
||||
|
||||
def fetch_and_archive_feed(feed: Feed) -> int:
    """Fetches the feed, parses entries, deduplicates, and archives new entries.

    Performs a conditional GET (ETag / Last-Modified) against ``feed.url``,
    parses the response with xmltodict, and creates one ``Entry`` row per
    entry not already stored for this feed. Network failures are recorded as
    a sentinel ``Entry`` with ``entry_id="__error__"`` rather than raised.

    Args:
        feed (Feed): The feed to fetch. Its ``etag``, ``last_modified`` and
            ``last_fetched_at`` fields are updated as a side effect.

    Returns:
        The number of new entries archived (0 on HTTP 304 or request failure).
    """
    request_headers: dict[str, str] = get_request_headers()
    # Conditional-GET headers: let the server reply 304 when nothing changed.
    if feed.etag:
        request_headers["If-None-Match"] = feed.etag
    if feed.last_modified:
        request_headers["If-Modified-Since"] = feed.last_modified

    try:
        response: niquests.Response = niquests.get(
            feed.url,
            headers=request_headers,
            timeout=10,
        )

        # 304 Not Modified: nothing new — just stamp the fetch time and stop.
        if response.status_code == HTTP_NOT_MODIFIED:
            feed.last_fetched_at = timezone.now()
            feed.save(update_fields=["last_fetched_at"])
            return 0

        raw_xml: bytes = response.content or b""
        error_msg: str = ""
        parsed_data: dict[str, Any] | None = None
        # Only attempt to parse a 200 body; any other status leaves
        # parsed_data as None, so no entries are extracted below.
        if response.status_code == HTTP_OK:
            try:
                parsed_data = xmltodict.parse(
                    # errors="replace" so malformed UTF-8 can't abort the parse.
                    raw_xml.decode("utf-8", errors="replace"),
                    process_namespaces=False,
                )
            except ExpatError as e:
                # Parse failure is recorded per-entry via error_msg, not raised.
                error_msg = f"XML Parsing Error: {e!s}"

        # Extract entries from parsed_data
        entries: list[dict[str, Any]] = extract_feed_entries(parsed_data)

        new_count = 0
        for entry in entries:
            content_hash: int = calculate_content_hash(entry)

            # Identifier preference: RSS guid, Atom id, link, then the
            # content hash as a last-resort synthetic ID.
            entry_id: str = (
                extract_id(entry.get("guid"))
                or extract_id(entry.get("id"))
                or entry.get("link")
                or str(content_hash)
            )
            # Atom <link> may parse to a dict of attributes — coerce to str.
            if not isinstance(entry_id, str):
                entry_id = str(entry_id)

            # First parseable date wins, checked in rough order of preference
            # across RSS and Atom field names.
            published_at: datetime.datetime | None = None
            for date_field in ("published", "pubDate", "updated", "created"):
                if entry.get(date_field):
                    published_at = dateparser.parse(entry[date_field])
                    if published_at:
                        break

            # Deduplicate: skip if entry with same feed+entry_id+content_hash exists
            exists: bool = Entry.objects.filter(
                feed=feed,
                entry_id=entry_id,
                content_hash=content_hash,
            ).exists()
            if not exists:
                Entry.objects.create(
                    feed=feed,
                    entry_id=entry_id,
                    fetched_at=timezone.now(),
                    published_at=published_at,
                    content_hash=content_hash,
                    data=entry,
                    error_message=error_msg,
                )
                new_count += 1

        # NOTE(review): caching headers and last_fetched_at are updated even
        # for non-200 responses — presumably intentional, but confirm that a
        # 5xx should clear a previously stored ETag (headers default to "").
        feed.etag = response.headers.get("ETag", "")
        feed.last_modified = response.headers.get("Last-Modified", "")
        feed.last_fetched_at = timezone.now()
        feed.save()

    except niquests.exceptions.RequestException as e:
        # Network-level failure: archive a sentinel error entry instead of
        # raising. NOTE(review): this is not deduplicated, so every failed
        # fetch adds a new "__error__" row — verify that is intended.
        Entry.objects.create(
            feed=feed,
            entry_id="__error__",
            fetched_at=timezone.now(),
            published_at=None,
            content_hash=0,
            data=None,
            error_message=str(e),
        )
        return 0

    else:
        return new_count
|
||||
|
||||
|
||||
def calculate_content_hash(entry: dict[str, Any]) -> int:
    """Calculates a content hash for the entry using xxhash64.

    Args:
        entry (dict[str, Any]): The entry data as a dictionary.

    Returns:
        int: A 64-bit integer hash of the entry content, suitable for
            deduplication.
    """
    # Hash the dict's string repr; xxh64 is fast and stable across runs.
    digest: int = xxhash.xxh64_intdigest(str(entry).encode("utf-8"))

    # Mask off the top bit so the value fits in a signed 64-bit DB column.
    return digest & 0x7FFFFFFFFFFFFFFF
|
||||
|
||||
|
||||
def extract_feed_entries(parsed_data: dict[str, Any] | None) -> list[dict[str, Any]]:
|
||||
"""Extracts a list of entries from the parsed feed data, handling both RSS and Atom formats.
|
||||
|
||||
Args:
|
||||
parsed_data (dict[str, Any] | None): The parsed feed data as a dictionary, or None if parsing failed
|
||||
|
||||
Returns:
|
||||
list[dict[str, Any]]: A list of entries extracted from the feed, where each entry is represented as a dictionary. If no entries are found or if parsed_data is None, an empty list is returned.
|
||||
"""
|
||||
entries: list[dict[str, Any]] = []
|
||||
if parsed_data:
|
||||
# RSS: channel > item; Atom: feed > entry
|
||||
items: list[dict[str, Any]] | dict[str, Any] = []
|
||||
if "rss" in parsed_data:
|
||||
items = parsed_data["rss"].get("channel", {}).get("item", [])
|
||||
elif "feed" in parsed_data:
|
||||
items = parsed_data["feed"].get("entry", [])
|
||||
if isinstance(items, dict):
|
||||
items = [items]
|
||||
entries = items
|
||||
return entries
|
||||
|
||||
|
||||
def get_request_headers() -> dict[str, str]:
    """Return the standard HTTP headers sent with every feed fetch.

    Returns:
        dict[str, str]: A dictionary of HTTP headers to include in feed fetch requests.
    """
    # https://blog.cloudflare.com/verified-bots-with-cryptography/
    # https://www.cloudflare.com/lp/verified-bots/
    # TODO(TheLovinator): We have to sign our requests # noqa: TD003

    # Identify the bot and give server operators a contact address.
    return {
        "User-Agent": settings.USER_AGENT,
        "From": settings.BOT_CONTACT_EMAIL,
    }
|
||||
Loading…
Add table
Add a link
Reference in a new issue