Add initial version of feeds app
All checks were successful
Deploy to Server / deploy (push) Successful in 11s
All checks were successful
Deploy to Server / deploy (push) Successful in 11s
This commit is contained in:
parent
e889b58aec
commit
a02b5d5f66
17 changed files with 993 additions and 15 deletions
3
.vscode/settings.json
vendored
Normal file
3
.vscode/settings.json
vendored
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
{
|
||||
"python.analysis.typeCheckingMode": "standard"
|
||||
}
|
||||
|
|
@ -4,10 +4,13 @@ import sys
|
|||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import django_stubs_ext
|
||||
import sentry_sdk
|
||||
from dotenv import load_dotenv
|
||||
from platformdirs import user_data_dir
|
||||
|
||||
django_stubs_ext.monkeypatch()
|
||||
|
||||
logger: logging.Logger = logging.getLogger("feedvault.settings")
|
||||
|
||||
load_dotenv(verbose=True)
|
||||
|
|
@ -224,3 +227,6 @@ CELERY_BROKER_URL: str = REDIS_URL_CELERY
|
|||
CELERY_RESULT_BACKEND = "django-db"
|
||||
CELERY_RESULT_EXTENDED = True
|
||||
CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
|
||||
|
||||
USER_AGENT = "FeedVault/1.0 (+https://feedvault.se/bot; archiving feeds; contact: Discord: TheLovinator#9276, Email: bot@feedvault.se)"
|
||||
BOT_CONTACT_EMAIL = "bot@feedvault.se"
|
||||
|
|
|
|||
0
feeds/management/__init__.py
Normal file
0
feeds/management/__init__.py
Normal file
0
feeds/management/commands/__init__.py
Normal file
0
feeds/management/commands/__init__.py
Normal file
34
feeds/management/commands/archive_feed.py
Normal file
34
feeds/management/commands/archive_feed.py
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from feeds.models import Feed
|
||||
from feeds.services import fetch_and_archive_feed
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django.core.management.base import CommandParser
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Django management command to fetch and archive a feed by URL."""

    help = "Fetch and archive a feed by URL."

    def add_arguments(self, parser: CommandParser) -> None:
        """Add URL argument to the command."""
        parser.add_argument("url", type=str, help="Feed URL to fetch and archive.")

    def handle(self, *args, **options) -> None:  # noqa: ARG002
        """Handle the command execution."""
        target_url: str = options["url"]

        # get_or_create so the command is idempotent for known URLs.
        feed_obj, was_created = Feed.objects.get_or_create(url=target_url)
        if was_created:
            self.stdout.write(self.style.SUCCESS(f"Created new feed for URL: {target_url}"))

        archived_count: int = fetch_and_archive_feed(feed_obj)
        if not archived_count:
            self.stdout.write(
                self.style.WARNING("\tFeed is up to date, but no new entries were archived."),
            )
            return

        suffix: str = "y" if archived_count == 1 else "ies"
        self.stdout.write(
            self.style.SUCCESS(
                f"Archived {archived_count} new entr{suffix} for URL: {target_url}",
            ),
        )
|
||||
186
feeds/migrations/0001_initial.py
Normal file
186
feeds/migrations/0001_initial.py
Normal file
|
|
@ -0,0 +1,186 @@
|
|||
# Generated by Django 6.0.3 on 2026-03-24 01:13
|
||||
|
||||
import django.contrib.postgres.indexes
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Initial migration for Feed and Entry models."""

    # NOTE: auto-generated by `makemigrations` — avoid hand-editing; create a
    # follow-up migration for any schema change instead.
    initial = True

    dependencies = []

    operations = [
        # Feed: one row per subscribed/archived RSS/Atom feed URL.
        migrations.CreateModel(
            name="Feed",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "url",
                    models.URLField(
                        help_text="The canonical URL of the RSS/Atom feed. Must be unique.",
                        max_length=2048,
                        unique=True,
                        verbose_name="Feed URL",
                    ),
                ),
                (
                    "domain",
                    models.CharField(
                        db_index=True,
                        help_text="Domain name extracted from the feed URL.",
                        max_length=255,
                        verbose_name="Domain",
                    ),
                ),
                # etag / last_modified support HTTP conditional requests
                # (If-None-Match / If-Modified-Since) to avoid refetching.
                (
                    "etag",
                    models.CharField(
                        blank=True,
                        default="",
                        help_text="HTTP ETag header for conditional requests.",
                        max_length=255,
                        verbose_name="ETag",
                    ),
                ),
                (
                    "last_modified",
                    models.CharField(
                        blank=True,
                        default="",
                        help_text="HTTP Last-Modified header for conditional requests.",
                        max_length=255,
                        verbose_name="Last Modified",
                    ),
                ),
                (
                    "is_active",
                    models.BooleanField(
                        default=True,
                        help_text="Whether this feed is currently being fetched.",
                        verbose_name="Is Active",
                    ),
                ),
                (
                    "created_at",
                    models.DateTimeField(
                        auto_now_add=True,
                        help_text="Timestamp when this feed was first added.",
                        verbose_name="Created At",
                    ),
                ),
                (
                    "last_fetched_at",
                    models.DateTimeField(
                        blank=True,
                        help_text="Timestamp when this feed was last fetched.",
                        null=True,
                        verbose_name="Last Fetched At",
                    ),
                ),
            ],
            options={
                "verbose_name": "Feed",
                "verbose_name_plural": "Feeds",
            },
        ),
        # Entry: one archived item/post belonging to a Feed.
        migrations.CreateModel(
            name="Entry",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "entry_id",
                    models.CharField(
                        db_index=True,
                        help_text="Unique entry ID (guid, id, or link) from the feed.",
                        max_length=512,
                        verbose_name="Entry ID",
                    ),
                ),
                (
                    "fetched_at",
                    models.DateTimeField(
                        auto_now_add=True,
                        db_index=True,
                        help_text="Timestamp when this entry was archived.",
                        verbose_name="Fetched At",
                    ),
                ),
                (
                    "published_at",
                    models.DateTimeField(
                        blank=True,
                        db_index=True,
                        help_text="Timestamp when this entry was published (if available).",
                        null=True,
                        verbose_name="Published At",
                    ),
                ),
                (
                    "content_hash",
                    models.BigIntegerField(
                        db_index=True,
                        help_text="xxhash64 integer of the entry content for deduplication.",
                        verbose_name="Content Hash",
                    ),
                ),
                (
                    "data",
                    models.JSONField(
                        blank=True,
                        help_text="Parsed entry data as JSON.",
                        null=True,
                        verbose_name="Entry Data",
                    ),
                ),
                (
                    "error_message",
                    models.TextField(
                        blank=True,
                        default="",
                        help_text="Error message if archiving failed.",
                        verbose_name="Error Message",
                    ),
                ),
                (
                    "feed",
                    models.ForeignKey(
                        help_text="The feed this entry was fetched from.",
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="entries",
                        to="feeds.feed",
                        verbose_name="Feed",
                    ),
                ),
            ],
            options={
                "verbose_name": "Entry",
                "verbose_name_plural": "Entries",
                # GIN index enables efficient JSON containment queries on `data`
                # (PostgreSQL only).
                "indexes": [
                    django.contrib.postgres.indexes.GinIndex(
                        fields=["data"], name="feeds_entry_data_c87562_gin"
                    )
                ],
                # Dedup key: same feed + entry id + content hash is one archive row.
                "unique_together": {("feed", "entry_id", "content_hash")},
            },
        ),
    ]
|
||||
136
feeds/models.py
136
feeds/models.py
|
|
@ -0,0 +1,136 @@
|
|||
import logging
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from django.contrib.postgres.indexes import GinIndex
|
||||
from django.db import models
|
||||
|
||||
logger: logging.Logger = logging.getLogger("feeds.models")
|
||||
|
||||
|
||||
class Feed(models.Model):
    """Represents the actual RSS/Atom feed URL and its metadata."""

    # Canonical feed URL; uniqueness makes get_or_create(url=...) idempotent.
    url = models.URLField(
        help_text="The canonical URL of the RSS/Atom feed. Must be unique.",
        verbose_name="Feed URL",
        max_length=2048,
        unique=True,
    )
    # Auto-populated from `url` in save() when left empty.
    domain = models.CharField(
        help_text="Domain name extracted from the feed URL.",
        verbose_name="Domain",
        max_length=255,
        db_index=True,
    )
    # etag / last_modified are echoed back as If-None-Match / If-Modified-Since
    # so unchanged feeds answer with 304 and are skipped.
    etag = models.CharField(
        help_text="HTTP ETag header for conditional requests.",
        verbose_name="ETag",
        max_length=255,
        blank=True,
        default="",
    )
    last_modified = models.CharField(
        help_text="HTTP Last-Modified header for conditional requests.",
        verbose_name="Last Modified",
        max_length=255,
        blank=True,
        default="",
    )
    is_active = models.BooleanField(
        help_text="Whether this feed is currently being fetched.",
        verbose_name="Is Active",
        default=True,
    )
    created_at = models.DateTimeField(
        help_text="Timestamp when this feed was first added.",
        verbose_name="Created At",
        auto_now_add=True,
    )
    # NULL until the first fetch completes.
    last_fetched_at = models.DateTimeField(
        help_text="Timestamp when this feed was last fetched.",
        verbose_name="Last Fetched At",
        blank=True,
        null=True,
    )

    class Meta:
        verbose_name = "Feed"
        verbose_name_plural = "Feeds"

    def __str__(self) -> str:
        """Return the feed URL as string representation."""
        return self.url

    def save(self, *args, **kwargs) -> None:
        """Override save to auto-populate domain from URL if not set."""
        # Only derive the domain when it is missing, so an explicitly set
        # domain is never overwritten.
        if not self.domain and self.url:
            self.domain = str(urlparse(str(self.url)).netloc)

            # Guarded so the format args are only built when DEBUG is enabled.
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(
                    "Auto-populated domain '%s' for feed URL: %s",
                    self.domain,
                    self.url,
                )

        super().save(*args, **kwargs)
|
||||
|
||||
|
||||
class Entry(models.Model):
    """An archived entry (item/post) from a feed."""

    feed = models.ForeignKey(
        to="Feed",
        help_text="The feed this entry was fetched from.",
        on_delete=models.CASCADE,
        related_name="entries",
        verbose_name="Feed",
    )
    # Best-effort identifier taken from guid/id/link, falling back to the
    # content hash; not unique on its own (see Meta.unique_together).
    entry_id = models.CharField(
        help_text="Unique entry ID (guid, id, or link) from the feed.",
        verbose_name="Entry ID",
        max_length=512,
        db_index=True,
    )
    fetched_at = models.DateTimeField(
        help_text="Timestamp when this entry was archived.",
        verbose_name="Fetched At",
        auto_now_add=True,
        db_index=True,
    )
    # NULL when the feed item carries no parseable publication date.
    published_at = models.DateTimeField(
        help_text="Timestamp when this entry was published (if available).",
        verbose_name="Published At",
        db_index=True,
        blank=True,
        null=True,
    )
    # 63-bit (sign-safe) xxhash64 of the raw entry dict repr.
    content_hash = models.BigIntegerField(
        help_text="xxhash64 integer of the entry content for deduplication.",
        verbose_name="Content Hash",
        db_index=True,
    )
    data = models.JSONField(
        help_text="Parsed entry data as JSON.",
        verbose_name="Entry Data",
        blank=True,
        null=True,
    )
    error_message = models.TextField(
        help_text="Error message if archiving failed.",
        verbose_name="Error Message",
        blank=True,
        default="",
    )

    class Meta:
        # NOTE(review): Meta.unique_together is soft-deprecated in modern
        # Django in favor of models.UniqueConstraint in Meta.constraints —
        # consider migrating before adding more constraints.
        unique_together = ("feed", "entry_id", "content_hash")
        # GIN index for JSON queries on `data` (PostgreSQL only).
        indexes = [
            GinIndex(fields=["data"]),
        ]
        verbose_name = "Entry"
        verbose_name_plural = "Entries"

    def __str__(self) -> str:
        """Return a string representation of the entry."""
        return f"{self.feed.domain} entry {self.entry_id} at {self.fetched_at}"
|
||||
191
feeds/services.py
Normal file
191
feeds/services.py
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
from typing import TYPE_CHECKING
|
||||
from typing import Any
|
||||
from xml.parsers.expat import ExpatError
|
||||
|
||||
import dateparser
|
||||
import niquests
|
||||
import xmltodict
|
||||
import xxhash
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
from feeds.models import Entry
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
|
||||
from feeds.models import Feed
|
||||
|
||||
HTTP_OK = 200
|
||||
HTTP_NOT_MODIFIED = 304
|
||||
|
||||
|
||||
def extract_id(val: str | dict | None) -> str | None:
|
||||
"""Extracts a string ID from a guid or id field, handling both string and dict formats.
|
||||
|
||||
Args:
|
||||
val (str | dict | None): The value to extract the ID from, which can be a string, a dict (with possible '#text' or '@id' keys), or None
|
||||
|
||||
Returns:
|
||||
str | None: The extracted ID as a string, or None if it cannot be extracted
|
||||
"""
|
||||
if isinstance(val, dict):
|
||||
# RSS guid or Atom id as dict: prefer '#text', fallback to str(val)
|
||||
return val.get("#text") or val.get("@id") or str(val)
|
||||
return val
|
||||
|
||||
|
||||
def fetch_and_archive_feed(feed: Feed) -> int:
    """Fetches the feed, parses entries, deduplicates, and archives new entries.

    Returns:
        The number of new entries archived.
    """
    # Conditional-request headers: the server answers 304 when nothing changed.
    request_headers: dict[str, str] = get_request_headers()
    if feed.etag:
        request_headers["If-None-Match"] = feed.etag
    if feed.last_modified:
        request_headers["If-Modified-Since"] = feed.last_modified

    try:
        response: niquests.Response = niquests.get(
            feed.url,
            headers=request_headers,
            timeout=10,
        )

        # 304 Not Modified: nothing new; just stamp the fetch time and stop.
        if response.status_code == HTTP_NOT_MODIFIED:
            feed.last_fetched_at = timezone.now()
            feed.save(update_fields=["last_fetched_at"])
            return 0

        raw_xml: bytes = response.content or b""
        error_msg: str = ""
        parsed_data: dict[str, Any] | None = None
        # NOTE(review): statuses other than 200/304 (e.g. 404, 500) fall
        # through here with parsed_data=None — no error Entry is recorded and
        # the etag/last_modified below are overwritten from the error
        # response's headers. Confirm this is intended.
        if response.status_code == HTTP_OK:
            try:
                # errors="replace" tolerates feeds with broken encodings.
                parsed_data = xmltodict.parse(
                    raw_xml.decode("utf-8", errors="replace"),
                    process_namespaces=False,
                )
            except ExpatError as e:
                # Parse failure is recorded per-entry via error_msg below
                # (no entries will exist, so it effectively marks nothing).
                error_msg = f"XML Parsing Error: {e!s}"

        # Extract entries from parsed_data
        entries: list[dict[str, Any]] = extract_feed_entries(parsed_data)

        new_count = 0
        for entry in entries:
            content_hash: int = calculate_content_hash(entry)

            # Identifier preference: RSS <guid>, Atom <id>, <link>, then the
            # content hash as a deterministic last resort.
            entry_id: str = (
                extract_id(entry.get("guid"))
                or extract_id(entry.get("id"))
                or entry.get("link")
                or str(content_hash)
            )
            if not isinstance(entry_id, str):
                entry_id = str(entry_id)

            # First date field that dateparser understands wins.
            published_at: datetime.datetime | None = None
            for date_field in ("published", "pubDate", "updated", "created"):
                if entry.get(date_field):
                    published_at = dateparser.parse(entry[date_field])
                    if published_at:
                        break

            # Deduplicate: skip if entry with same feed+entry_id+content_hash exists
            exists: bool = Entry.objects.filter(
                feed=feed,
                entry_id=entry_id,
                content_hash=content_hash,
            ).exists()
            if not exists:
                # NOTE(review): fetched_at has auto_now_add=True, so the
                # value passed here is ignored by Django.
                Entry.objects.create(
                    feed=feed,
                    entry_id=entry_id,
                    fetched_at=timezone.now(),
                    published_at=published_at,
                    content_hash=content_hash,
                    data=entry,
                    error_message=error_msg,
                )
                new_count += 1

        # Remember the validators for the next conditional request.
        feed.etag = response.headers.get("ETag", "")
        feed.last_modified = response.headers.get("Last-Modified", "")
        feed.last_fetched_at = timezone.now()
        feed.save()

    except niquests.exceptions.RequestException as e:
        # Network-level failure: archive a sentinel entry carrying the error.
        Entry.objects.create(
            feed=feed,
            entry_id="__error__",
            fetched_at=timezone.now(),
            published_at=None,
            content_hash=0,
            data=None,
            error_message=str(e),
        )
        return 0

    else:
        return new_count
|
||||
|
||||
|
||||
def calculate_content_hash(entry: dict[str, Any]) -> int:
    """Calculates a content hash for the entry using xxhash64.

    Args:
        entry (dict[str, Any]): The entry data as a dictionary.

    Returns:
        int: A 64-bit integer hash of the entry content, suitable for deduplication.
    """
    digest: int = xxhash.xxh64_intdigest(str(entry).encode("utf-8"))

    # Mask off the sign bit so the value always fits in a signed 64-bit
    # database column (Django BigIntegerField).
    return digest & 0x7FFFFFFFFFFFFFFF
|
||||
|
||||
|
||||
def extract_feed_entries(parsed_data: dict[str, Any] | None) -> list[dict[str, Any]]:
|
||||
"""Extracts a list of entries from the parsed feed data, handling both RSS and Atom formats.
|
||||
|
||||
Args:
|
||||
parsed_data (dict[str, Any] | None): The parsed feed data as a dictionary, or None if parsing failed
|
||||
|
||||
Returns:
|
||||
list[dict[str, Any]]: A list of entries extracted from the feed, where each entry is represented as a dictionary. If no entries are found or if parsed_data is None, an empty list is returned.
|
||||
"""
|
||||
entries: list[dict[str, Any]] = []
|
||||
if parsed_data:
|
||||
# RSS: channel > item; Atom: feed > entry
|
||||
items: list[dict[str, Any]] | dict[str, Any] = []
|
||||
if "rss" in parsed_data:
|
||||
items = parsed_data["rss"].get("channel", {}).get("item", [])
|
||||
elif "feed" in parsed_data:
|
||||
items = parsed_data["feed"].get("entry", [])
|
||||
if isinstance(items, dict):
|
||||
items = [items]
|
||||
entries = items
|
||||
return entries
|
||||
|
||||
|
||||
def get_request_headers() -> dict[str, str]:
    """Helper function to get standard request headers for fetching feeds.

    Returns:
        dict[str, str]: A dictionary of HTTP headers to include in feed fetch requests.
    """
    # https://blog.cloudflare.com/verified-bots-with-cryptography/
    # https://www.cloudflare.com/lp/verified-bots/
    # TODO(TheLovinator): We have to sign our requests # noqa: TD003
    return {
        "User-Agent": settings.USER_AGENT,
        "From": settings.BOT_CONTACT_EMAIL,
    }
|
||||
24
feeds/tasks.py
Normal file
24
feeds/tasks.py
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
from celery import shared_task
|
||||
|
||||
from feeds.models import Feed
|
||||
from feeds.services import fetch_and_archive_feed
|
||||
|
||||
|
||||
@shared_task
def archive_feed_task(feed_id: int) -> str:
    """Celery task to fetch and archive a feed by its ID.

    Args:
        feed_id: The ID of the Feed to archive.

    Returns:
        A message indicating the result of the archiving process.
    """
    try:
        target: Feed = Feed.objects.get(id=feed_id)
    except Feed.DoesNotExist:
        return f"Feed with id {feed_id} does not exist."

    archived: int = fetch_and_archive_feed(target)
    return (
        f"Archived {archived} new entries for {target.url}"
        if archived > 0
        else f"No new entries archived for {target.url}"
    )
|
||||
1
feeds/tests/__init__.py
Normal file
1
feeds/tests/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# This file marks the directory as a Python package.
|
||||
117
feeds/tests/test_entry_id.py
Normal file
117
feeds/tests/test_entry_id.py
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
import os
|
||||
import threading
|
||||
from http.server import HTTPServer
|
||||
from http.server import SimpleHTTPRequestHandler
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
from feeds.models import Entry
|
||||
from feeds.models import Feed
|
||||
from feeds.services import fetch_and_archive_feed
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_entry_id_string_guid_dict(tmp_path: Path) -> None:
    """Test that entry_id is always a string, even if guid is a dict."""
    from functools import partial

    # Prepare a fake RSS feed with guid as dict (attributes)
    feed_content = """
    <rss version="2.0">
    <channel>
    <title>Test Feed</title>
    <link>http://example.com/</link>
    <description>Test feed description</description>
    <item>
    <title>Item 1</title>
    <link>http://example.com/item1</link>
    <guid isPermaLink="true">http://example.com/item1</guid>
    </item>
    </channel>
    </rss>
    """
    feed_path: Path = tmp_path / "test_feed.xml"
    feed_path.write_text(feed_content, encoding="utf-8")

    # Fix: serve tmp_path via the handler's `directory` parameter instead of
    # os.chdir(tmp_path) — chdir mutates process-global state and breaks test
    # isolation (other tests / pytest-xdist workers inherit the new cwd).
    handler = partial(SimpleHTTPRequestHandler, directory=str(tmp_path))
    server = HTTPServer(("localhost", 0), handler)
    port: int = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    try:
        url: str = f"http://localhost:{port}/test_feed.xml"
        feed: Feed = Feed.objects.create(url=url, domain="localhost")
        fetch_and_archive_feed(feed)
        entry: Entry | None = Entry.objects.filter(feed=feed).first()
        assert entry is not None
        assert isinstance(entry.entry_id, str)
        assert entry.entry_id == "http://example.com/item1"
    finally:
        # Always stop the server and reap the thread, even when an assert fails.
        server.shutdown()
        thread.join()
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_entry_id_string_guid_string(tmp_path: Path) -> None:
    """Test that entry_id is a string when guid is a plain string."""
    from functools import partial

    feed_content = """
    <rss version="2.0">
    <channel>
    <title>Test Feed</title>
    <link>http://example.com/</link>
    <description>Test feed description</description>
    <item>
    <title>Item 2</title>
    <link>http://example.com/item2</link>
    <guid>http://example.com/item2</guid>
    </item>
    </channel>
    </rss>
    """
    feed_path: Path = tmp_path / "test_feed.xml"
    feed_path.write_text(feed_content, encoding="utf-8")

    # Fix: serve tmp_path via the handler's `directory` parameter instead of
    # os.chdir(tmp_path), which mutates process-global state and leaks into
    # other tests.
    handler = partial(SimpleHTTPRequestHandler, directory=str(tmp_path))
    server = HTTPServer(("localhost", 0), handler)
    port: int = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    try:
        url: str = f"http://localhost:{port}/test_feed.xml"
        feed: Feed = Feed.objects.create(url=url, domain="localhost")
        fetch_and_archive_feed(feed)
        entry: Entry | None = Entry.objects.filter(feed=feed).first()
        assert entry is not None
        assert isinstance(entry.entry_id, str)
        assert entry.entry_id == "http://example.com/item2"
    finally:
        server.shutdown()
        thread.join()
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_entry_id_fallback_to_link(tmp_path: Path) -> None:
    """Test that entry_id falls back to link if guid/id missing."""
    from functools import partial

    feed_content = """
    <rss version="2.0">
    <channel>
    <title>Test Feed</title>
    <link>http://example.com/</link>
    <description>Test feed description</description>
    <item>
    <title>Item 3</title>
    <link>http://example.com/item3</link>
    </item>
    </channel>
    </rss>
    """
    feed_path: Path = tmp_path / "test_feed.xml"
    feed_path.write_text(feed_content, encoding="utf-8")

    # Fix: serve tmp_path via the handler's `directory` parameter instead of
    # os.chdir(tmp_path), which mutates process-global state and leaks into
    # other tests.
    handler = partial(SimpleHTTPRequestHandler, directory=str(tmp_path))
    server = HTTPServer(("localhost", 0), handler)
    port: int = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    try:
        url: str = f"http://localhost:{port}/test_feed.xml"
        feed: Feed = Feed.objects.create(url=url, domain="localhost")
        fetch_and_archive_feed(feed)
        entry: Entry | None = Entry.objects.filter(feed=feed).first()
        assert entry is not None
        assert isinstance(entry.entry_id, str)
        assert entry.entry_id == "http://example.com/item3"
    finally:
        server.shutdown()
        thread.join()
|
||||
111
feeds/tests/test_entry_id_extra.py
Normal file
111
feeds/tests/test_entry_id_extra.py
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
import os
|
||||
import threading
|
||||
from http.server import HTTPServer
|
||||
from http.server import SimpleHTTPRequestHandler
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
from feeds.models import Entry
|
||||
from feeds.models import Feed
|
||||
from feeds.services import fetch_and_archive_feed
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_entry_id_id_dict(tmp_path: Path) -> None:
    """Test that entry_id is a string when id is a dict."""
    from functools import partial

    feed_content = """
    <feed xmlns='http://www.w3.org/2005/Atom'>
    <title>Test Atom Feed</title>
    <id>http://example.com/feed</id>
    <entry>
    <title>Entry 1</title>
    <id scheme='urn:uuid'>urn:uuid:1234</id>
    <link href='http://example.com/entry1'/>
    </entry>
    </feed>
    """
    feed_path: Path = tmp_path / "test_feed.xml"
    feed_path.write_text(feed_content, encoding="utf-8")

    # Fix: serve tmp_path via the handler's `directory` parameter instead of
    # os.chdir(tmp_path), which mutates process-global state and leaks into
    # other tests.
    handler = partial(SimpleHTTPRequestHandler, directory=str(tmp_path))
    server = HTTPServer(("localhost", 0), handler)
    port: int = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    try:
        url: str = f"http://localhost:{port}/test_feed.xml"
        feed: Feed = Feed.objects.create(url=url, domain="localhost")
        fetch_and_archive_feed(feed)
        entry: Entry | None = Entry.objects.filter(feed=feed).first()
        assert entry is not None
        assert isinstance(entry.entry_id, str)
        assert "urn:uuid:1234" in entry.entry_id
    finally:
        server.shutdown()
        thread.join()
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_entry_id_all_fields_missing(tmp_path: Path) -> None:
    """Test that entry_id falls back to content_hash if guid/id/link missing."""
    from functools import partial

    feed_content = """
    <rss version='2.0'>
    <channel>
    <title>Test Feed</title>
    <item>
    <title>Item with no id</title>
    </item>
    </channel>
    </rss>
    """
    feed_path: Path = tmp_path / "test_feed.xml"
    feed_path.write_text(feed_content, encoding="utf-8")

    # Fix: serve tmp_path via the handler's `directory` parameter instead of
    # os.chdir(tmp_path), which mutates process-global state and leaks into
    # other tests.
    handler = partial(SimpleHTTPRequestHandler, directory=str(tmp_path))
    server = HTTPServer(("localhost", 0), handler)
    port: int = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    try:
        url: str = f"http://localhost:{port}/test_feed.xml"
        feed: Feed = Feed.objects.create(url=url, domain="localhost")
        fetch_and_archive_feed(feed)
        entry: Entry | None = Entry.objects.filter(feed=feed).first()
        assert entry is not None
        assert isinstance(entry.entry_id, str)

        # Should be a hash string (digits only)
        assert entry.entry_id.isdigit() or entry.entry_id.lstrip("-").isdigit()
    finally:
        server.shutdown()
        thread.join()
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_entry_id_malformed_guid(tmp_path: Path) -> None:
    """Test that entry_id handles malformed guid/id gracefully."""
    from functools import partial

    feed_content = """
    <rss version='2.0'>
    <channel>
    <title>Test Feed</title>
    <item>
    <title>Malformed guid</title>
    <guid></guid>
    </item>
    </channel>
    </rss>
    """
    feed_path: Path = tmp_path / "test_feed.xml"
    feed_path.write_text(feed_content, encoding="utf-8")

    # Fix: serve tmp_path via the handler's `directory` parameter instead of
    # os.chdir(tmp_path), which mutates process-global state and leaks into
    # other tests.
    handler = partial(SimpleHTTPRequestHandler, directory=str(tmp_path))
    server = HTTPServer(("localhost", 0), handler)
    port: int = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    try:
        url: str = f"http://localhost:{port}/test_feed.xml"
        feed: Feed = Feed.objects.create(url=url, domain="localhost")
        fetch_and_archive_feed(feed)
        entry: Entry | None = Entry.objects.filter(feed=feed).first()
        assert entry is not None
        assert isinstance(entry.entry_id, str)

        # Should fallback to content_hash
        assert entry.entry_id.isdigit() or entry.entry_id.lstrip("-").isdigit()
    finally:
        server.shutdown()
        thread.join()
|
||||
64
feeds/tests/test_services.py
Normal file
64
feeds/tests/test_services.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
import os
|
||||
import threading
|
||||
from http.server import HTTPServer
|
||||
from http.server import SimpleHTTPRequestHandler
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from feeds.models import Entry
|
||||
from feeds.models import Feed
|
||||
from feeds.services import fetch_and_archive_feed
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_fetch_and_archive_feed_xml(tmp_path: Path) -> None:
    """Test fetching and archiving a simple XML feed using a local HTTP server."""
    from functools import partial

    # Use a local test XML file as a feed source
    test_feed_path: Path = tmp_path / "test_feed.xml"
    test_feed_path.write_text(
        encoding="utf-8",
        data="""
        <rss version='2.0'>
        <channel>
        <title>Test Feed</title>
        <link>http://example.com/</link>
        <description>Test feed description</description>
        <item>
        <title>Item 1</title>
        <link>http://example.com/item1</link>
        <description>Item 1 description</description>
        </item>
        </channel>
        </rss>
        """,
    )

    # Serve the file with the handler's `directory` parameter rather than
    # os.chdir(tmp_path): chdir mutates process-global state and breaks test
    # isolation (other tests / pytest-xdist workers inherit the new cwd).
    handler = partial(SimpleHTTPRequestHandler, directory=str(tmp_path))
    server = HTTPServer(("localhost", 0), handler)
    port: int = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    try:
        url: str = f"http://localhost:{port}/test_feed.xml"

        feed: Feed = Feed.objects.create(url=url, domain="localhost")
        new_entries: int = fetch_and_archive_feed(feed)
        assert new_entries == 1

        # Check that the entry was archived and contains the expected data
        entry: Entry | None = Entry.objects.filter(feed=feed).first()
        assert entry is not None
        assert entry.data is not None
        assert entry.data["title"] == "Item 1"
        assert Entry.objects.filter(feed=feed).count() == 1
    finally:
        # Clean up: stop the server and wait for the thread to finish, even
        # when an assertion above fails.
        server.shutdown()
        thread.join()
|
||||
39
feeds/tests/twitch-campaigns.xml
vendored
Normal file
39
feeds/tests/twitch-campaigns.xml
vendored
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -1,26 +1,15 @@
|
|||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.http import HttpResponse
|
||||
from django.urls import path
|
||||
|
||||
from . import views
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django.http import HttpRequest
|
||||
from django.urls import URLPattern
|
||||
from django.urls import URLResolver
|
||||
|
||||
|
||||
def index(request: HttpRequest) -> HttpResponse:
    """View for the index page.

    Args:
        request: The HTTP request object.

    Returns:
        HttpResponse: A simple HTTP response with a greeting message.
    """
    return HttpResponse("Hello, world!")


# NOTE(review): both `index` and `views.feed_list` are mapped to the empty
# path "". Django resolves patterns in order, so requests to "/" always hit
# `index` and `views.feed_list` is unreachable by URL (reverse("feed-list")
# still works). Confirm which view is meant to own the root URL and remove
# the other entry.
urlpatterns: list[URLPattern | URLResolver] = [
    path("", index, name="index"),
    path("", views.feed_list, name="feed-list"),
    path("feeds/<int:feed_id>/", views.feed_detail, name="feed-detail"),
]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,70 @@
|
|||
from typing import TYPE_CHECKING

from django.http import HttpResponse
from django.shortcuts import get_object_or_404

from feeds.models import Entry
from feeds.models import Feed

if TYPE_CHECKING:
    from django.db.models import QuerySet  # fixed: QuerySet lives in django.db.models, not pytest_django.asserts
    from django.http import HttpRequest
|
||||
|
||||
|
||||
def feed_list(request: HttpRequest) -> HttpResponse:
    """View to list all feeds.

    Args:
        request (HttpRequest): The HTTP request object.

    Returns:
        HttpResponse: An HTML response containing the list of feeds.
    """
    # Local import so this security fix is self-contained.
    from django.utils.html import escape

    feeds = Feed.objects.all().order_by("id")
    html = [
        "<!DOCTYPE html>",
        "<html><head><title>FeedVault - Feeds</title></head><body>",
        "<h1>Feed List</h1>",
        "<ul>",
    ]
    # Security fix: feed URLs are user/remote-supplied; escape them before
    # interpolating into HTML to prevent stored XSS.
    html.extend(
        f'<li><a href="/feeds/{feed.pk}/">{escape(feed.url)}</a></li>' for feed in feeds
    )
    html.extend(("</ul>", "</body></html>"))
    return HttpResponse("\n".join(html))
|
||||
|
||||
|
||||
def feed_detail(request: HttpRequest, feed_id: int) -> HttpResponse:
    """View to display the details of a specific feed.

    Args:
        request (HttpRequest): The HTTP request object.
        feed_id (int): The ID of the feed to display.

    Returns:
        HttpResponse: An HTML response containing the feed details and its entries.
    """
    # Local import so this security fix is self-contained.
    from django.utils.html import escape

    feed: Feed = get_object_or_404(Feed, id=feed_id)

    entries: QuerySet[Entry, Entry] = Entry.objects.filter(feed=feed).order_by(
        "-published_at",
        "-fetched_at",
    )[:50]
    # Security fix: feed URL/domain and entry fields come from remote feeds
    # (untrusted input) — escape everything interpolated into HTML to prevent
    # stored XSS.
    html: list[str] = [
        "<!DOCTYPE html>",
        f"<html><head><title>FeedVault - {escape(feed.url)}</title></head><body>",
        "<h1>Feed Detail</h1>",
        f"<p><b>URL:</b> {escape(feed.url)}</p>",
        f"<p><b>Domain:</b> {escape(feed.domain)}</p>",
        f"<p><b>Active:</b> {'yes' if feed.is_active else 'no'}</p>",
        f"<p><b>Created:</b> {feed.created_at}</p>",
        f"<p><b>Last fetched:</b> {feed.last_fetched_at}</p>",
        "<h2>Entries (latest 50)</h2>",
        "<ul>",
    ]
    for entry in entries:
        title: str | None = entry.data.get("title") if entry.data else None
        summary: str | None = entry.data.get("summary") if entry.data else None
        snippet: str = title or summary or "[no title]"
        html.append(
            f"<li><b>{entry.published_at or entry.fetched_at}:</b> {escape(snippet)} "
            f"<small>(id: {escape(entry.entry_id)})</small></li>",
        )
    html.extend(("</ul>", '<p><a href="/">Back to list</a></p>', "</body></html>"))
    return HttpResponse("\n".join(html))
|
||||
|
|
@ -11,18 +11,24 @@ dependencies = [
|
|||
"django-celery-results",
|
||||
"django-debug-toolbar",
|
||||
"django-silk[formatting]",
|
||||
"django-stubs-ext",
|
||||
"django",
|
||||
"flower",
|
||||
"gunicorn",
|
||||
"hiredis",
|
||||
"index-now-for-python",
|
||||
"niquests",
|
||||
"platformdirs",
|
||||
"psycopg[binary]",
|
||||
"pydantic",
|
||||
"python-dotenv",
|
||||
"redis",
|
||||
"sentry-sdk",
|
||||
"setproctitle",
|
||||
"sitemap-parser",
|
||||
"xmltodict",
|
||||
"dateparser>=1.3.0",
|
||||
"xxhash>=3.6.0",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
|
|
@ -36,6 +42,7 @@ dev = [
|
|||
"pytest-randomly",
|
||||
"pytest-xdist[psutil]",
|
||||
"pytest",
|
||||
"types-xmltodict",
|
||||
]
|
||||
[tool.pytest.ini_options]
|
||||
DJANGO_SETTINGS_MODULE = "config.settings"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue