Use dateparser to parse the date

This commit is contained in:
Joakim Hellsén 2024-03-15 14:15:27 +01:00
commit 61f05a9a23
No known key found for this signature in database
GPG key ID: D196AE66FEBE1DC9
3 changed files with 104 additions and 36 deletions

View file

@ -1,11 +1,10 @@
from __future__ import annotations
import datetime
import logging
from time import mktime, struct_time
from typing import TYPE_CHECKING
from urllib.parse import ParseResult, urlparse
import dateparser
import feedparser
from django.utils import timezone
from feedparser import FeedParserDict
@ -13,6 +12,8 @@ from feedparser import FeedParserDict
from feedvault.models import Author, Domain, Entry, Feed, Generator, Publisher
if TYPE_CHECKING:
import datetime
from django.contrib.auth.models import AbstractBaseUser, AnonymousUser
logger: logging.Logger = logging.getLogger(__name__)
@ -141,21 +142,6 @@ def parse_feed(url: str | None) -> dict | None:
return parsed_feed
def struct_time_to_datetime(struct_time: struct_time | None) -> datetime.datetime | None:
"""Convert a struct_time to a datetime."""
if not struct_time:
return None
if struct_time == "Mon, 01 Jan 0001 00:00:00 +0000":
return None
dt: datetime.datetime = datetime.datetime.fromtimestamp(mktime(struct_time), tz=datetime.UTC)
if not dt:
logger.error("Error converting struct_time to datetime: %s", struct_time)
return None
return dt
def add_entry(feed: Feed, entry: FeedParserDict) -> Entry | None:
"""Add an entry to the database.
@ -165,10 +151,25 @@ def add_entry(feed: Feed, entry: FeedParserDict) -> Entry | None:
"""
author: Author = get_author(parsed_feed=entry)
publisher: Publisher = get_publisher(parsed_feed=entry)
updated_parsed: datetime | None = struct_time_to_datetime(struct_time=entry.get("updated_parsed")) # type: ignore # noqa: PGH003
published_parsed: datetime | None = struct_time_to_datetime(struct_time=entry.get("published_parsed")) # type: ignore # noqa: PGH003
expired_parsed: datetime | None = struct_time_to_datetime(struct_time=entry.get("expired_parsed")) # type: ignore # noqa: PGH003
created_parsed: datetime | None = struct_time_to_datetime(struct_time=entry.get("created_parsed")) # type: ignore # noqa: PGH003
pre_updated_parsed: str = str(entry.get("updated_parsed", ""))
updated_parsed: datetime.datetime | None = (
dateparser.parse(date_string=str(pre_updated_parsed)) if pre_updated_parsed else None
)
pre_published_parsed: str = str(entry.get("published_parsed", ""))
published_parsed: datetime.datetime | None = (
dateparser.parse(date_string=str(pre_published_parsed)) if pre_published_parsed else None
)
pre_expired_parsed: str = str(entry.get("expired_parsed", ""))
expired_parsed: datetime.datetime | None = (
dateparser.parse(date_string=str(pre_expired_parsed)) if pre_expired_parsed else None
)
pre_created_parsed = str(entry.get("created_parsed", ""))
created_parsed: datetime.datetime | None = (
dateparser.parse(date_string=str(pre_created_parsed)) if pre_created_parsed else None
)
_entry = Entry(
feed=feed,
@ -201,18 +202,14 @@ def add_entry(feed: Feed, entry: FeedParserDict) -> Entry | None:
)
# Save the entry.
try:
_entry.save()
except Exception:
logger.exception("Error saving entry for feed: %s", feed)
return None
_entry.save()
logger.info("Created entry: %s", _entry)
return _entry
def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed | None:
def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed | None: # noqa: PLR0914
"""Add a feed to the database.
Args:
@ -242,8 +239,18 @@ def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed |
generator: Generator = def_generator(parsed_feed=parsed_feed)
publisher: Publisher = get_publisher(parsed_feed=parsed_feed)
published_parsed: datetime | None = struct_time_to_datetime(struct_time=parsed_feed.get("published_parsed")) # type: ignore # noqa: PGH003
updated_parsed: datetime | None = struct_time_to_datetime(struct_time=parsed_feed.get("updated_parsed")) # type: ignore # noqa: PGH003
pre_published_parsed: str = str(parsed_feed.get("published_parsed", ""))
published_parsed: datetime.datetime | None = (
dateparser.parse(date_string=str(pre_published_parsed)) if pre_published_parsed else None
)
pre_updated_parsed: str = str(parsed_feed.get("updated_parsed", ""))
updated_parsed: datetime.datetime | None = (
dateparser.parse(date_string=str(pre_updated_parsed)) if pre_updated_parsed else None
)
pre_modified: str = str(parsed_feed.get("modified", ""))
modified: timezone.datetime | None = dateparser.parse(date_string=pre_modified) if pre_modified else None
# Create the feed
feed = Feed(
@ -257,7 +264,7 @@ def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed |
etag=parsed_feed.get("etag", ""),
headers=parsed_feed.get("headers", {}),
href=parsed_feed.get("href", ""),
modified=parsed_feed.get("modified"),
modified=modified,
namespaces=parsed_feed.get("namespaces", {}),
status=parsed_feed.get("status", 0),
version=parsed_feed.get("version", ""),
@ -296,11 +303,7 @@ def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed |
)
# Save the feed.
try:
feed.save()
except Exception:
logger.exception("Got exception while saving feed: %s", url)
return None
feed.save()
entries = parsed_feed.get("entries", [])
for entry in entries:

66
poetry.lock generated
View file

@ -175,6 +175,28 @@ editorconfig = ">=0.12.2"
jsbeautifier = "*"
six = ">=1.13.0"
[[package]]
name = "dateparser"
version = "1.2.0"
description = "Date parsing library designed to parse dates from HTML pages"
optional = false
python-versions = ">=3.7"
files = [
{file = "dateparser-1.2.0-py2.py3-none-any.whl", hash = "sha256:0b21ad96534e562920a0083e97fd45fa959882d4162acc358705144520a35830"},
{file = "dateparser-1.2.0.tar.gz", hash = "sha256:7975b43a4222283e0ae15be7b4999d08c9a70e2d378ac87385b1ccf2cffbbb30"},
]
[package.dependencies]
python-dateutil = "*"
pytz = "*"
regex = "<2019.02.19 || >2019.02.19,<2021.8.27 || >2021.8.27"
tzlocal = "*"
[package.extras]
calendars = ["convertdate", "hijri-converter"]
fasttext = ["fasttext"]
langdetect = ["langdetect"]
[[package]]
name = "django"
version = "5.0.3"
@ -347,6 +369,20 @@ files = [
{file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
description = "Extensions to the standard Python datetime module"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
files = [
{file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
{file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
]
[package.dependencies]
six = ">=1.5"
[[package]]
name = "python-dotenv"
version = "1.0.1"
@ -361,6 +397,17 @@ files = [
[package.extras]
cli = ["click (>=5.0)"]
[[package]]
name = "pytz"
version = "2024.1"
description = "World timezone definitions, modern and historical"
optional = false
python-versions = "*"
files = [
{file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
{file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
]
[[package]]
name = "pyyaml"
version = "6.0.1"
@ -617,7 +664,24 @@ files = [
{file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"},
]
[[package]]
name = "tzlocal"
version = "5.2"
description = "tzinfo object for the local timezone"
optional = false
python-versions = ">=3.8"
files = [
{file = "tzlocal-5.2-py3-none-any.whl", hash = "sha256:49816ef2fe65ea8ac19d19aa7a1ae0551c834303d5014c6d5a62e4cbda8047b8"},
{file = "tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e"},
]
[package.dependencies]
tzdata = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"]
[metadata]
lock-version = "2.0"
python-versions = "^3.12"
content-hash = "dd4d8ba16bb5e34d2e0f94009d4ea86de094a6b1d6d1af3e6b69c14e881ccf3e"
content-hash = "2617c6ec410cc30c300b46a5d653fa2a2aaa1737509851ab19b2e628b2838a65"

View file

@ -11,6 +11,7 @@ django = { extras = ["argon2"], version = "^5.0.3" }
python-dotenv = "^1.0.1"
feedparser = "^6.0.11"
gunicorn = "^21.2.0"
dateparser = "^1.2.0"
[tool.poetry.group.dev.dependencies]
ruff = "^0.3.0"