All checks were successful
Deploy to Server / deploy (push) Successful in 11s
191 lines
6.2 KiB
Python
191 lines
6.2 KiB
Python
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import Any
from xml.parsers.expat import ExpatError

import dateparser
import niquests
import xmltodict
import xxhash
from django.conf import settings
from django.utils import timezone

from feeds.models import Entry

if TYPE_CHECKING:
    import datetime

    from feeds.models import Feed
|
|
|
|
HTTP_OK = 200
|
|
HTTP_NOT_MODIFIED = 304
|
|
|
|
|
|
def extract_id(val: str | dict | None) -> str | None:
|
|
"""Extracts a string ID from a guid or id field, handling both string and dict formats.
|
|
|
|
Args:
|
|
val (str | dict | None): The value to extract the ID from, which can be a string, a dict (with possible '#text' or '@id' keys), or None
|
|
|
|
Returns:
|
|
str | None: The extracted ID as a string, or None if it cannot be extracted
|
|
"""
|
|
if isinstance(val, dict):
|
|
# RSS guid or Atom id as dict: prefer '#text', fallback to str(val)
|
|
return val.get("#text") or val.get("@id") or str(val)
|
|
return val
|
|
|
|
|
|
def fetch_and_archive_feed(feed: Feed) -> int:
    """Fetches the feed, parses entries, deduplicates, and archives new entries.

    Sends conditional headers (If-None-Match / If-Modified-Since) when the
    feed has cached validators; a 304 response short-circuits with no work.
    Network errors, non-200 responses, and XML parse failures are archived
    as a sentinel ``__error__`` Entry so failures stay visible.

    Args:
        feed (Feed): The feed to fetch and archive entries for.

    Returns:
        The number of new entries archived (0 on error or 304).

    """
    request_headers: dict[str, str] = get_request_headers()
    if feed.etag:
        request_headers["If-None-Match"] = feed.etag
    if feed.last_modified:
        request_headers["If-Modified-Since"] = feed.last_modified

    # Keep the try body minimal: only the network call raises
    # RequestException; everything else handles the response.
    try:
        response: niquests.Response = niquests.get(
            feed.url,
            headers=request_headers,
            timeout=10,
        )
    except niquests.exceptions.RequestException as e:
        _archive_error(feed, str(e))
        return 0

    if response.status_code == HTTP_NOT_MODIFIED:
        # Nothing changed upstream; only record that we checked.
        feed.last_fetched_at = timezone.now()
        feed.save(update_fields=["last_fetched_at"])
        return 0

    error_msg: str = ""
    parsed_data: dict[str, Any] | None = None
    if response.status_code == HTTP_OK:
        raw_xml: bytes = response.content or b""
        try:
            parsed_data = xmltodict.parse(
                raw_xml.decode("utf-8", errors="replace"),
                process_namespaces=False,
            )
        except ExpatError as e:
            error_msg = f"XML Parsing Error: {e!s}"
    else:
        # BUG FIX: non-200/304 responses previously produced no entries and
        # no error record, and clobbered the cached ETag/Last-Modified with
        # headers taken from the error response.
        error_msg = f"HTTP Error: status code {response.status_code}"

    if error_msg:
        # BUG FIX: a parse error used to set error_msg but, with no entries
        # to attach it to, the message was never persisted. Archive it the
        # same way the network-error path does, and leave the existing
        # cache validators intact so the next fetch stays conditional.
        _archive_error(feed, error_msg)
        feed.last_fetched_at = timezone.now()
        feed.save(update_fields=["last_fetched_at"])
        return 0

    new_count = 0
    for entry in extract_feed_entries(parsed_data):
        content_hash: int = calculate_content_hash(entry)

        entry_id: str = (
            extract_id(entry.get("guid"))
            or extract_id(entry.get("id"))
            or entry.get("link")
            or str(content_hash)
        )
        if not isinstance(entry_id, str):
            entry_id = str(entry_id)

        # Deduplicate on feed+entry_id+content_hash. get_or_create replaces
        # the previous exists()/create() pair: one lookup instead of two,
        # and no check-then-act race between concurrent workers.
        _, created = Entry.objects.get_or_create(
            feed=feed,
            entry_id=entry_id,
            content_hash=content_hash,
            defaults={
                "fetched_at": timezone.now(),
                "published_at": _parse_published_at(entry),
                "data": entry,
                "error_message": "",
            },
        )
        if created:
            new_count += 1

    # Store the fresh validators for the next conditional request.
    feed.etag = response.headers.get("ETag", "")
    feed.last_modified = response.headers.get("Last-Modified", "")
    feed.last_fetched_at = timezone.now()
    feed.save()
    return new_count


def _archive_error(feed: Feed, message: str) -> None:
    """Archive a sentinel ``__error__`` Entry recording a failed fetch or parse."""
    Entry.objects.create(
        feed=feed,
        entry_id="__error__",
        fetched_at=timezone.now(),
        published_at=None,
        content_hash=0,
        data=None,
        error_message=message,
    )


def _parse_published_at(entry: dict[str, Any]) -> datetime.datetime | None:
    """Return the first parseable date among common RSS/Atom date fields, or None."""
    for date_field in ("published", "pubDate", "updated", "created"):
        raw = entry.get(date_field)
        if raw:
            parsed = dateparser.parse(raw)
            if parsed:
                return parsed
    return None
|
|
|
|
|
|
def calculate_content_hash(entry: dict[str, Any]) -> int:
    """Calculates a content hash for the entry using xxhash64.

    Args:
        entry (dict[str, Any]): The entry data as a dictionary.

    Returns:
        int: A 64-bit integer hash of the entry content, suitable for deduplication.

    """
    # Hash the dict's repr; key insertion order comes from the XML parser,
    # so identical payloads produce identical strings and identical hashes.
    digest: int = xxhash.xxh64_intdigest(str(entry).encode("utf-8"))
    # Mask off the sign bit so the value fits a signed 64-bit DB column.
    return digest & 0x7FFFFFFFFFFFFFFF
|
|
|
|
|
|
def extract_feed_entries(parsed_data: dict[str, Any] | None) -> list[dict[str, Any]]:
|
|
"""Extracts a list of entries from the parsed feed data, handling both RSS and Atom formats.
|
|
|
|
Args:
|
|
parsed_data (dict[str, Any] | None): The parsed feed data as a dictionary, or None if parsing failed
|
|
|
|
Returns:
|
|
list[dict[str, Any]]: A list of entries extracted from the feed, where each entry is represented as a dictionary. If no entries are found or if parsed_data is None, an empty list is returned.
|
|
"""
|
|
entries: list[dict[str, Any]] = []
|
|
if parsed_data:
|
|
# RSS: channel > item; Atom: feed > entry
|
|
items: list[dict[str, Any]] | dict[str, Any] = []
|
|
if "rss" in parsed_data:
|
|
items = parsed_data["rss"].get("channel", {}).get("item", [])
|
|
elif "feed" in parsed_data:
|
|
items = parsed_data["feed"].get("entry", [])
|
|
if isinstance(items, dict):
|
|
items = [items]
|
|
entries = items
|
|
return entries
|
|
|
|
|
|
def get_request_headers() -> dict[str, str]:
    """Helper function to get standard request headers for fetching feeds.

    Returns:
        dict[str, str]: A dictionary of HTTP headers to include in feed fetch requests.

    """
    # https://blog.cloudflare.com/verified-bots-with-cryptography/
    # https://www.cloudflare.com/lp/verified-bots/
    # TODO(TheLovinator): We have to sign our requests # noqa: TD003
    return {
        "User-Agent": settings.USER_AGENT,
        "From": settings.BOT_CONTACT_EMAIL,
    }
|