Use dateparser to parse the date
This commit is contained in:
parent
927e20c9bb
commit
61f05a9a23
3 changed files with 104 additions and 36 deletions
|
|
@ -1,11 +1,10 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import datetime
|
|
||||||
import logging
|
import logging
|
||||||
from time import mktime, struct_time
|
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
from urllib.parse import ParseResult, urlparse
|
from urllib.parse import ParseResult, urlparse
|
||||||
|
|
||||||
|
import dateparser
|
||||||
import feedparser
|
import feedparser
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from feedparser import FeedParserDict
|
from feedparser import FeedParserDict
|
||||||
|
|
@ -13,6 +12,8 @@ from feedparser import FeedParserDict
|
||||||
from feedvault.models import Author, Domain, Entry, Feed, Generator, Publisher
|
from feedvault.models import Author, Domain, Entry, Feed, Generator, Publisher
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
import datetime
|
||||||
|
|
||||||
from django.contrib.auth.models import AbstractBaseUser, AnonymousUser
|
from django.contrib.auth.models import AbstractBaseUser, AnonymousUser
|
||||||
|
|
||||||
logger: logging.Logger = logging.getLogger(__name__)
|
logger: logging.Logger = logging.getLogger(__name__)
|
||||||
|
|
@ -141,21 +142,6 @@ def parse_feed(url: str | None) -> dict | None:
|
||||||
return parsed_feed
|
return parsed_feed
|
||||||
|
|
||||||
|
|
||||||
def struct_time_to_datetime(struct_time: struct_time | None) -> datetime.datetime | None:
|
|
||||||
"""Convert a struct_time to a datetime."""
|
|
||||||
if not struct_time:
|
|
||||||
return None
|
|
||||||
|
|
||||||
if struct_time == "Mon, 01 Jan 0001 00:00:00 +0000":
|
|
||||||
return None
|
|
||||||
|
|
||||||
dt: datetime.datetime = datetime.datetime.fromtimestamp(mktime(struct_time), tz=datetime.UTC)
|
|
||||||
if not dt:
|
|
||||||
logger.error("Error converting struct_time to datetime: %s", struct_time)
|
|
||||||
return None
|
|
||||||
return dt
|
|
||||||
|
|
||||||
|
|
||||||
def add_entry(feed: Feed, entry: FeedParserDict) -> Entry | None:
|
def add_entry(feed: Feed, entry: FeedParserDict) -> Entry | None:
|
||||||
"""Add an entry to the database.
|
"""Add an entry to the database.
|
||||||
|
|
||||||
|
|
@ -165,10 +151,25 @@ def add_entry(feed: Feed, entry: FeedParserDict) -> Entry | None:
|
||||||
"""
|
"""
|
||||||
author: Author = get_author(parsed_feed=entry)
|
author: Author = get_author(parsed_feed=entry)
|
||||||
publisher: Publisher = get_publisher(parsed_feed=entry)
|
publisher: Publisher = get_publisher(parsed_feed=entry)
|
||||||
updated_parsed: datetime | None = struct_time_to_datetime(struct_time=entry.get("updated_parsed")) # type: ignore # noqa: PGH003
|
pre_updated_parsed: str = str(entry.get("updated_parsed", ""))
|
||||||
published_parsed: datetime | None = struct_time_to_datetime(struct_time=entry.get("published_parsed")) # type: ignore # noqa: PGH003
|
updated_parsed: datetime.datetime | None = (
|
||||||
expired_parsed: datetime | None = struct_time_to_datetime(struct_time=entry.get("expired_parsed")) # type: ignore # noqa: PGH003
|
dateparser.parse(date_string=str(pre_updated_parsed)) if pre_updated_parsed else None
|
||||||
created_parsed: datetime | None = struct_time_to_datetime(struct_time=entry.get("created_parsed")) # type: ignore # noqa: PGH003
|
)
|
||||||
|
|
||||||
|
pre_published_parsed: str = str(entry.get("published_parsed", ""))
|
||||||
|
published_parsed: datetime.datetime | None = (
|
||||||
|
dateparser.parse(date_string=str(pre_published_parsed)) if pre_published_parsed else None
|
||||||
|
)
|
||||||
|
|
||||||
|
pre_expired_parsed: str = str(entry.get("expired_parsed", ""))
|
||||||
|
expired_parsed: datetime.datetime | None = (
|
||||||
|
dateparser.parse(date_string=str(pre_expired_parsed)) if pre_expired_parsed else None
|
||||||
|
)
|
||||||
|
|
||||||
|
pre_created_parsed = str(entry.get("created_parsed", ""))
|
||||||
|
created_parsed: datetime.datetime | None = (
|
||||||
|
dateparser.parse(date_string=str(pre_created_parsed)) if pre_created_parsed else None
|
||||||
|
)
|
||||||
|
|
||||||
_entry = Entry(
|
_entry = Entry(
|
||||||
feed=feed,
|
feed=feed,
|
||||||
|
|
@ -201,18 +202,14 @@ def add_entry(feed: Feed, entry: FeedParserDict) -> Entry | None:
|
||||||
)
|
)
|
||||||
|
|
||||||
# Save the entry.
|
# Save the entry.
|
||||||
try:
|
_entry.save()
|
||||||
_entry.save()
|
|
||||||
except Exception:
|
|
||||||
logger.exception("Error saving entry for feed: %s", feed)
|
|
||||||
return None
|
|
||||||
|
|
||||||
logger.info("Created entry: %s", _entry)
|
logger.info("Created entry: %s", _entry)
|
||||||
|
|
||||||
return _entry
|
return _entry
|
||||||
|
|
||||||
|
|
||||||
def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed | None:
|
def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed | None: # noqa: PLR0914
|
||||||
"""Add a feed to the database.
|
"""Add a feed to the database.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -242,8 +239,18 @@ def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed |
|
||||||
generator: Generator = def_generator(parsed_feed=parsed_feed)
|
generator: Generator = def_generator(parsed_feed=parsed_feed)
|
||||||
publisher: Publisher = get_publisher(parsed_feed=parsed_feed)
|
publisher: Publisher = get_publisher(parsed_feed=parsed_feed)
|
||||||
|
|
||||||
published_parsed: datetime | None = struct_time_to_datetime(struct_time=parsed_feed.get("published_parsed")) # type: ignore # noqa: PGH003
|
pre_published_parsed: str = str(parsed_feed.get("published_parsed", ""))
|
||||||
updated_parsed: datetime | None = struct_time_to_datetime(struct_time=parsed_feed.get("updated_parsed")) # type: ignore # noqa: PGH003
|
published_parsed: datetime.datetime | None = (
|
||||||
|
dateparser.parse(date_string=str(pre_published_parsed)) if pre_published_parsed else None
|
||||||
|
)
|
||||||
|
|
||||||
|
pre_updated_parsed: str = str(parsed_feed.get("updated_parsed", ""))
|
||||||
|
updated_parsed: datetime.datetime | None = (
|
||||||
|
dateparser.parse(date_string=str(pre_updated_parsed)) if pre_updated_parsed else None
|
||||||
|
)
|
||||||
|
|
||||||
|
pre_modified: str = str(parsed_feed.get("modified", ""))
|
||||||
|
modified: timezone.datetime | None = dateparser.parse(date_string=pre_modified) if pre_modified else None
|
||||||
|
|
||||||
# Create the feed
|
# Create the feed
|
||||||
feed = Feed(
|
feed = Feed(
|
||||||
|
|
@ -257,7 +264,7 @@ def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed |
|
||||||
etag=parsed_feed.get("etag", ""),
|
etag=parsed_feed.get("etag", ""),
|
||||||
headers=parsed_feed.get("headers", {}),
|
headers=parsed_feed.get("headers", {}),
|
||||||
href=parsed_feed.get("href", ""),
|
href=parsed_feed.get("href", ""),
|
||||||
modified=parsed_feed.get("modified"),
|
modified=modified,
|
||||||
namespaces=parsed_feed.get("namespaces", {}),
|
namespaces=parsed_feed.get("namespaces", {}),
|
||||||
status=parsed_feed.get("status", 0),
|
status=parsed_feed.get("status", 0),
|
||||||
version=parsed_feed.get("version", ""),
|
version=parsed_feed.get("version", ""),
|
||||||
|
|
@ -296,11 +303,7 @@ def add_feed(url: str | None, user: AbstractBaseUser | AnonymousUser) -> Feed |
|
||||||
)
|
)
|
||||||
|
|
||||||
# Save the feed.
|
# Save the feed.
|
||||||
try:
|
feed.save()
|
||||||
feed.save()
|
|
||||||
except Exception:
|
|
||||||
logger.exception("Got exception while saving feed: %s", url)
|
|
||||||
return None
|
|
||||||
|
|
||||||
entries = parsed_feed.get("entries", [])
|
entries = parsed_feed.get("entries", [])
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
|
|
|
||||||
66
poetry.lock
generated
66
poetry.lock
generated
|
|
@ -175,6 +175,28 @@ editorconfig = ">=0.12.2"
|
||||||
jsbeautifier = "*"
|
jsbeautifier = "*"
|
||||||
six = ">=1.13.0"
|
six = ">=1.13.0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dateparser"
|
||||||
|
version = "1.2.0"
|
||||||
|
description = "Date parsing library designed to parse dates from HTML pages"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "dateparser-1.2.0-py2.py3-none-any.whl", hash = "sha256:0b21ad96534e562920a0083e97fd45fa959882d4162acc358705144520a35830"},
|
||||||
|
{file = "dateparser-1.2.0.tar.gz", hash = "sha256:7975b43a4222283e0ae15be7b4999d08c9a70e2d378ac87385b1ccf2cffbbb30"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
python-dateutil = "*"
|
||||||
|
pytz = "*"
|
||||||
|
regex = "<2019.02.19 || >2019.02.19,<2021.8.27 || >2021.8.27"
|
||||||
|
tzlocal = "*"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
calendars = ["convertdate", "hijri-converter"]
|
||||||
|
fasttext = ["fasttext"]
|
||||||
|
langdetect = ["langdetect"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "django"
|
name = "django"
|
||||||
version = "5.0.3"
|
version = "5.0.3"
|
||||||
|
|
@ -347,6 +369,20 @@ files = [
|
||||||
{file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
|
{file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "python-dateutil"
|
||||||
|
version = "2.9.0.post0"
|
||||||
|
description = "Extensions to the standard Python datetime module"
|
||||||
|
optional = false
|
||||||
|
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
|
||||||
|
files = [
|
||||||
|
{file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
|
||||||
|
{file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
six = ">=1.5"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "python-dotenv"
|
name = "python-dotenv"
|
||||||
version = "1.0.1"
|
version = "1.0.1"
|
||||||
|
|
@ -361,6 +397,17 @@ files = [
|
||||||
[package.extras]
|
[package.extras]
|
||||||
cli = ["click (>=5.0)"]
|
cli = ["click (>=5.0)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pytz"
|
||||||
|
version = "2024.1"
|
||||||
|
description = "World timezone definitions, modern and historical"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
|
||||||
|
{file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pyyaml"
|
name = "pyyaml"
|
||||||
version = "6.0.1"
|
version = "6.0.1"
|
||||||
|
|
@ -617,7 +664,24 @@ files = [
|
||||||
{file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"},
|
{file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tzlocal"
|
||||||
|
version = "5.2"
|
||||||
|
description = "tzinfo object for the local timezone"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "tzlocal-5.2-py3-none-any.whl", hash = "sha256:49816ef2fe65ea8ac19d19aa7a1ae0551c834303d5014c6d5a62e4cbda8047b8"},
|
||||||
|
{file = "tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
tzdata = {version = "*", markers = "platform_system == \"Windows\""}
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"]
|
||||||
|
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.12"
|
python-versions = "^3.12"
|
||||||
content-hash = "dd4d8ba16bb5e34d2e0f94009d4ea86de094a6b1d6d1af3e6b69c14e881ccf3e"
|
content-hash = "2617c6ec410cc30c300b46a5d653fa2a2aaa1737509851ab19b2e628b2838a65"
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ django = { extras = ["argon2"], version = "^5.0.3" }
|
||||||
python-dotenv = "^1.0.1"
|
python-dotenv = "^1.0.1"
|
||||||
feedparser = "^6.0.11"
|
feedparser = "^6.0.11"
|
||||||
gunicorn = "^21.2.0"
|
gunicorn = "^21.2.0"
|
||||||
|
dateparser = "^1.2.0"
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
ruff = "^0.3.0"
|
ruff = "^0.3.0"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue