Use dateparser to parse the date

This commit is contained in:
Joakim Hellsén 2024-03-15 14:15:27 +01:00
commit 61f05a9a23
No known key found for this signature in database
GPG key ID: D196AE66FEBE1DC9
3 changed files with 104 additions and 36 deletions

View file

@ -1,11 +1,10 @@
from __future__ import annotations from __future__ import annotations
import datetime
import logging import logging
from time import mktime, struct_time
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from urllib.parse import ParseResult, urlparse from urllib.parse import ParseResult, urlparse
import dateparser
import feedparser import feedparser
from django.utils import timezone from django.utils import timezone
from feedparser import FeedParserDict from feedparser import FeedParserDict
@ -13,6 +12,8 @@ from feedparser import FeedParserDict
from feedvault.models import Author, Domain, Entry, Feed, Generator, Publisher from feedvault.models import Author, Domain, Entry, Feed, Generator, Publisher
if TYPE_CHECKING: if TYPE_CHECKING:
import datetime
from django.contrib.auth.models import AbstractBaseUser, AnonymousUser from django.contrib.auth.models import AbstractBaseUser, AnonymousUser
logger: logging.Logger = logging.getLogger(__name__) logger: logging.Logger = logging.getLogger(__name__)
@ -141,21 +142,6 @@ def parse_feed(url: str | None) -> dict | None:
return parsed_feed return parsed_feed
def struct_time_to_datetime(struct_time: struct_time | None) -> datetime.datetime | None:
"""Convert a struct_time to a datetime."""
if not struct_time:
return None
if struct_time == "Mon, 01 Jan 0001 00:00:00 +0000":
return None
dt: datetime.datetime = datetime.datetime.fromtimestamp(mktime(struct_time), tz=datetime.UTC)
if not dt:
logger.error("Error converting struct_time to datetime: %s", struct_time)
return None
return dt
def add_entry(feed: Feed, entry: FeedParserDict) -> Entry | None: def add_entry(feed: Feed, entry: FeedParserDict) -> Entry | None:
"""Add an entry to the database. """Add an entry to the database.
@ -165,10 +151,25 @@ def add_entry(feed: Feed, entry: FeedParserDict) -> Entry | None:
""" """
author: Author = get_author(parsed_feed=entry) author: Author = get_author(parsed_feed=entry)
publisher: Publisher = get_publisher(parsed_feed=entry) publisher: Publisher = get_publisher(parsed_feed=entry)
updated_parsed: datetime | None = struct_time_to_datetime(struct_time=entry.get("updated_parsed")) # type: ignore # noqa: PGH003 pre_updated_parsed: str = str(entry.get("updated_parsed", ""))
published_parsed: datetime | None = struct_time_to_datetime(struct_time=entry.get("published_parsed")) # type: ignore # noqa: PGH003 updated_parsed: datetime.datetime | None = (
expired_parsed: datetime | None = struct_time_to_datetime(struct_time=entry.get("expired_parsed")) # type: ignore # noqa: PGH003 dateparser.parse(date_string=str(pre_updated_parsed)) if pre_updated_parsed else None
created_parsed: datetime | None = struct_time_to_datetime(struct_time=entry.get("created_parsed")) # type: ignore # noqa: PGH003 )
pre_published_parsed: str = str(entry.get("published_parsed", ""))
published_parsed: datetime.datetime | None = (
dateparser.parse(date_string=str(pre_published_parsed)) if pre_published_parsed else None
)
pre_expired_parsed: str = str(entry.get("expired_parsed", ""))
expired_parsed: datetime.datetime | None = (
dateparser.parse(date_string=str(pre_expired_parsed)) if pre_expired_parsed else None
)
pre_created_parsed = str(entry.get("created_parsed", ""))
created_parsed: datetime.datetime | None = (
dateparser.parse(date_string=str(pre_created_parsed)) if pre_created_parsed else None
)
_entry = Entry( _entry = Entry(
feed=feed, feed=feed,
@ -201,18 +202,14 @@ def add_entry(feed: Feed, entry: FeedParserDict) -> Entry | None:
) )
# Save the entry. # Save the entry.
try: _entry.save()
_entry.save()
except Exception:
logger.exception("Error saving entry for feed: %s", feed)
return None
logger.info("Created entry: %s", _entry) logger.info("Created entry: %s", _entry)
return _entry return _entry
def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed | None: def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed | None: # noqa: PLR0914
"""Add a feed to the database. """Add a feed to the database.
Args: Args:
@ -242,8 +239,18 @@ def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed |
generator: Generator = def_generator(parsed_feed=parsed_feed) generator: Generator = def_generator(parsed_feed=parsed_feed)
publisher: Publisher = get_publisher(parsed_feed=parsed_feed) publisher: Publisher = get_publisher(parsed_feed=parsed_feed)
published_parsed: datetime | None = struct_time_to_datetime(struct_time=parsed_feed.get("published_parsed")) # type: ignore # noqa: PGH003 pre_published_parsed: str = str(parsed_feed.get("published_parsed", ""))
updated_parsed: datetime | None = struct_time_to_datetime(struct_time=parsed_feed.get("updated_parsed")) # type: ignore # noqa: PGH003 published_parsed: datetime.datetime | None = (
dateparser.parse(date_string=str(pre_published_parsed)) if pre_published_parsed else None
)
pre_updated_parsed: str = str(parsed_feed.get("updated_parsed", ""))
updated_parsed: datetime.datetime | None = (
dateparser.parse(date_string=str(pre_updated_parsed)) if pre_updated_parsed else None
)
pre_modified: str = str(parsed_feed.get("modified", ""))
modified: timezone.datetime | None = dateparser.parse(date_string=pre_modified) if pre_modified else None
# Create the feed # Create the feed
feed = Feed( feed = Feed(
@ -257,7 +264,7 @@ def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed |
etag=parsed_feed.get("etag", ""), etag=parsed_feed.get("etag", ""),
headers=parsed_feed.get("headers", {}), headers=parsed_feed.get("headers", {}),
href=parsed_feed.get("href", ""), href=parsed_feed.get("href", ""),
modified=parsed_feed.get("modified"), modified=modified,
namespaces=parsed_feed.get("namespaces", {}), namespaces=parsed_feed.get("namespaces", {}),
status=parsed_feed.get("status", 0), status=parsed_feed.get("status", 0),
version=parsed_feed.get("version", ""), version=parsed_feed.get("version", ""),
@ -296,11 +303,7 @@ def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed |
) )
# Save the feed. # Save the feed.
try: feed.save()
feed.save()
except Exception:
logger.exception("Got exception while saving feed: %s", url)
return None
entries = parsed_feed.get("entries", []) entries = parsed_feed.get("entries", [])
for entry in entries: for entry in entries:

66
poetry.lock generated
View file

@ -175,6 +175,28 @@ editorconfig = ">=0.12.2"
jsbeautifier = "*" jsbeautifier = "*"
six = ">=1.13.0" six = ">=1.13.0"
[[package]]
name = "dateparser"
version = "1.2.0"
description = "Date parsing library designed to parse dates from HTML pages"
optional = false
python-versions = ">=3.7"
files = [
{file = "dateparser-1.2.0-py2.py3-none-any.whl", hash = "sha256:0b21ad96534e562920a0083e97fd45fa959882d4162acc358705144520a35830"},
{file = "dateparser-1.2.0.tar.gz", hash = "sha256:7975b43a4222283e0ae15be7b4999d08c9a70e2d378ac87385b1ccf2cffbbb30"},
]
[package.dependencies]
python-dateutil = "*"
pytz = "*"
regex = "<2019.02.19 || >2019.02.19,<2021.8.27 || >2021.8.27"
tzlocal = "*"
[package.extras]
calendars = ["convertdate", "hijri-converter"]
fasttext = ["fasttext"]
langdetect = ["langdetect"]
[[package]] [[package]]
name = "django" name = "django"
version = "5.0.3" version = "5.0.3"
@ -347,6 +369,20 @@ files = [
{file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
] ]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
description = "Extensions to the standard Python datetime module"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
files = [
{file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
{file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
]
[package.dependencies]
six = ">=1.5"
[[package]] [[package]]
name = "python-dotenv" name = "python-dotenv"
version = "1.0.1" version = "1.0.1"
@ -361,6 +397,17 @@ files = [
[package.extras] [package.extras]
cli = ["click (>=5.0)"] cli = ["click (>=5.0)"]
[[package]]
name = "pytz"
version = "2024.1"
description = "World timezone definitions, modern and historical"
optional = false
python-versions = "*"
files = [
{file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
{file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
]
[[package]] [[package]]
name = "pyyaml" name = "pyyaml"
version = "6.0.1" version = "6.0.1"
@ -617,7 +664,24 @@ files = [
{file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"},
] ]
[[package]]
name = "tzlocal"
version = "5.2"
description = "tzinfo object for the local timezone"
optional = false
python-versions = ">=3.8"
files = [
{file = "tzlocal-5.2-py3-none-any.whl", hash = "sha256:49816ef2fe65ea8ac19d19aa7a1ae0551c834303d5014c6d5a62e4cbda8047b8"},
{file = "tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e"},
]
[package.dependencies]
tzdata = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.12" python-versions = "^3.12"
content-hash = "dd4d8ba16bb5e34d2e0f94009d4ea86de094a6b1d6d1af3e6b69c14e881ccf3e" content-hash = "2617c6ec410cc30c300b46a5d653fa2a2aaa1737509851ab19b2e628b2838a65"

View file

@ -11,6 +11,7 @@ django = { extras = ["argon2"], version = "^5.0.3" }
python-dotenv = "^1.0.1" python-dotenv = "^1.0.1"
feedparser = "^6.0.11" feedparser = "^6.0.11"
gunicorn = "^21.2.0" gunicorn = "^21.2.0"
dateparser = "^1.2.0"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
ruff = "^0.3.0" ruff = "^0.3.0"