All checks were successful
Deploy to Server / deploy (push) Successful in 11s
191 lines
6.2 KiB
Python
191 lines
6.2 KiB
Python
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import Any
from xml.parsers.expat import ExpatError

import dateparser
import niquests
import xmltodict
import xxhash
from django.conf import settings
from django.utils import timezone

from feeds.models import Entry

if TYPE_CHECKING:
    import datetime

    from feeds.models import Feed
|
|
|
|
HTTP_OK = 200
|
|
HTTP_NOT_MODIFIED = 304
|
|
|
|
|
|
def extract_id(val: str | dict | None) -> str | None:
|
|
"""Extracts a string ID from a guid or id field, handling both string and dict formats.
|
|
|
|
Args:
|
|
val (str | dict | None): The value to extract the ID from, which can be a string, a dict (with possible '#text' or '@id' keys), or None
|
|
|
|
Returns:
|
|
str | None: The extracted ID as a string, or None if it cannot be extracted
|
|
"""
|
|
if isinstance(val, dict):
|
|
# RSS guid or Atom id as dict: prefer '#text', fallback to str(val)
|
|
return val.get("#text") or val.get("@id") or str(val)
|
|
return val
|
|
|
|
|
|
def fetch_and_archive_feed(feed: Feed) -> int:
    """Fetches the feed, parses entries, deduplicates, and archives new entries.

    Sends conditional headers (If-None-Match / If-Modified-Since) when the
    feed has cached validators; a 304 response short-circuits with no work.
    Network errors, non-200 responses, and XML parse failures are archived
    as a sentinel ``__error__`` Entry so failures stay visible.

    Args:
        feed (Feed): The feed to fetch and archive entries for.

    Returns:
        The number of new entries archived (0 on error or 304).

    """
    request_headers: dict[str, str] = get_request_headers()
    if feed.etag:
        request_headers["If-None-Match"] = feed.etag
    if feed.last_modified:
        request_headers["If-Modified-Since"] = feed.last_modified

    # Keep the try body minimal: only the network call raises
    # RequestException; everything else handles the response.
    try:
        response: niquests.Response = niquests.get(
            feed.url,
            headers=request_headers,
            timeout=10,
        )
    except niquests.exceptions.RequestException as e:
        _archive_error(feed, str(e))
        return 0

    if response.status_code == HTTP_NOT_MODIFIED:
        # Nothing changed upstream; only record that we checked.
        feed.last_fetched_at = timezone.now()
        feed.save(update_fields=["last_fetched_at"])
        return 0

    error_msg: str = ""
    parsed_data: dict[str, Any] | None = None
    if response.status_code == HTTP_OK:
        raw_xml: bytes = response.content or b""
        try:
            parsed_data = xmltodict.parse(
                raw_xml.decode("utf-8", errors="replace"),
                process_namespaces=False,
            )
        except ExpatError as e:
            error_msg = f"XML Parsing Error: {e!s}"
    else:
        # BUG FIX: non-200/304 responses previously produced no entries and
        # no error record, and clobbered the cached ETag/Last-Modified with
        # headers taken from the error response.
        error_msg = f"HTTP Error: status code {response.status_code}"

    if error_msg:
        # BUG FIX: a parse error used to set error_msg but, with no entries
        # to attach it to, the message was never persisted. Archive it the
        # same way the network-error path does, and leave the existing
        # cache validators intact so the next fetch stays conditional.
        _archive_error(feed, error_msg)
        feed.last_fetched_at = timezone.now()
        feed.save(update_fields=["last_fetched_at"])
        return 0

    new_count = 0
    for entry in extract_feed_entries(parsed_data):
        content_hash: int = calculate_content_hash(entry)

        entry_id: str = (
            extract_id(entry.get("guid"))
            or extract_id(entry.get("id"))
            or entry.get("link")
            or str(content_hash)
        )
        if not isinstance(entry_id, str):
            entry_id = str(entry_id)

        # Deduplicate on feed+entry_id+content_hash. get_or_create replaces
        # the previous exists()/create() pair: one lookup instead of two,
        # and no check-then-act race between concurrent workers.
        _, created = Entry.objects.get_or_create(
            feed=feed,
            entry_id=entry_id,
            content_hash=content_hash,
            defaults={
                "fetched_at": timezone.now(),
                "published_at": _parse_published_at(entry),
                "data": entry,
                "error_message": "",
            },
        )
        if created:
            new_count += 1

    # Store the fresh validators for the next conditional request.
    feed.etag = response.headers.get("ETag", "")
    feed.last_modified = response.headers.get("Last-Modified", "")
    feed.last_fetched_at = timezone.now()
    feed.save()
    return new_count


def _archive_error(feed: Feed, message: str) -> None:
    """Archive a sentinel ``__error__`` Entry recording a failed fetch or parse."""
    Entry.objects.create(
        feed=feed,
        entry_id="__error__",
        fetched_at=timezone.now(),
        published_at=None,
        content_hash=0,
        data=None,
        error_message=message,
    )


def _parse_published_at(entry: dict[str, Any]) -> datetime.datetime | None:
    """Return the first parseable date among common RSS/Atom date fields, or None."""
    for date_field in ("published", "pubDate", "updated", "created"):
        raw = entry.get(date_field)
        if raw:
            parsed = dateparser.parse(raw)
            if parsed:
                return parsed
    return None
|
|
|
|
|
|
def calculate_content_hash(entry: dict[str, Any]) -> int:
    """Calculates a content hash for the entry using xxhash64.

    Args:
        entry (dict[str, Any]): The entry data as a dictionary.

    Returns:
        int: A 64-bit integer hash of the entry content, suitable for deduplication.

    """
    # Hash the dict's repr; key insertion order comes from the XML parser,
    # so identical payloads produce identical strings and identical hashes.
    digest: int = xxhash.xxh64_intdigest(str(entry).encode("utf-8"))
    # Mask off the sign bit so the value fits a signed 64-bit DB column.
    return digest & 0x7FFFFFFFFFFFFFFF
|
|
|
|
|
|
def extract_feed_entries(parsed_data: dict[str, Any] | None) -> list[dict[str, Any]]:
|
|
"""Extracts a list of entries from the parsed feed data, handling both RSS and Atom formats.
|
|
|
|
Args:
|
|
parsed_data (dict[str, Any] | None): The parsed feed data as a dictionary, or None if parsing failed
|
|
|
|
Returns:
|
|
list[dict[str, Any]]: A list of entries extracted from the feed, where each entry is represented as a dictionary. If no entries are found or if parsed_data is None, an empty list is returned.
|
|
"""
|
|
entries: list[dict[str, Any]] = []
|
|
if parsed_data:
|
|
# RSS: channel > item; Atom: feed > entry
|
|
items: list[dict[str, Any]] | dict[str, Any] = []
|
|
if "rss" in parsed_data:
|
|
items = parsed_data["rss"].get("channel", {}).get("item", [])
|
|
elif "feed" in parsed_data:
|
|
items = parsed_data["feed"].get("entry", [])
|
|
if isinstance(items, dict):
|
|
items = [items]
|
|
entries = items
|
|
return entries
|
|
|
|
|
|
def get_request_headers() -> dict[str, str]:
    """Helper function to get standard request headers for fetching feeds.

    Returns:
        dict[str, str]: A dictionary of HTTP headers to include in feed fetch requests.

    """
    # https://blog.cloudflare.com/verified-bots-with-cryptography/
    # https://www.cloudflare.com/lp/verified-bots/
    # TODO(TheLovinator): We have to sign our requests # noqa: TD003
    return {
        "User-Agent": settings.USER_AGENT,
        "From": settings.BOT_CONTACT_EMAIL,
    }
|