From e8e5e87c07344edc8d48ebaca5a0c8cb47fb384d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joakim=20Hells=C3=A9n?= Date: Wed, 22 May 2024 22:22:08 +0200 Subject: [PATCH] Add scraper for https://github.com/rumca-js/RSS-Link-Database --- .vscode/settings.json | 1 + app/cli.py | 29 +++++++++++++++++++++---- app/scrapers/__init__.py | 0 app/scrapers/rss_link_database.py | 35 +++++++++++++++++++++++++++++++ poetry.lock | 2 +- pyproject.toml | 2 ++ 6 files changed, 64 insertions(+), 5 deletions(-) create mode 100644 app/scrapers/__init__.py create mode 100644 app/scrapers/rss_link_database.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 9ab1e45..5bd1cf7 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -65,6 +65,7 @@ "nyaa", "Nyanpasu", "Omnis", + "orjson", "pacman", "PGHOST", "PGID", diff --git a/app/cli.py b/app/cli.py index 4857ebf..ccfd8bb 100644 --- a/app/cli.py +++ b/app/cli.py @@ -7,15 +7,22 @@ from pathlib import Path from typing import TYPE_CHECKING import click +import typer from reader import Feed, ParseError, Reader, StorageError, UpdatedFeed, UpdateError, UpdateResult from app.dependencies import get_reader +from app.scrapers.rss_link_database import scrape if TYPE_CHECKING: from collections.abc import Iterable +app = typer.Typer( + name="FeedVault CLI", + no_args_is_help=True, +) -def add_broken_feed_to_csv(feed: Feed | UpdateResult | None) -> None: + +def _add_broken_feed_to_csv(feed: Feed | UpdateResult | None) -> None: """Add a broken feed to a CSV file.""" if feed is None: click.echo("Feed is None.", err=True) @@ -25,7 +32,10 @@ def add_broken_feed_to_csv(feed: Feed | UpdateResult | None) -> None: f.write(f"{feed.url}\n") -@click.command() +@app.command( + name="update_feeds", + help="Update all the feeds.", +) def update_feeds() -> None: """Update all the feeds.""" reader: Reader = get_reader() @@ -73,10 +83,21 @@ def update_feeds() -> None: click.echo(f"Assertion error: {feed.url}", err=True) traceback.print_exc(file=sys.stderr) reader.disable_feed_updates(feed) - add_broken_feed_to_csv(feed) + _add_broken_feed_to_csv(feed) click.echo("Feeds updated.") +@app.command( + name="grab_links", + help="Grab RSS feeds from different sources.", +) +def grab_links() -> None: + """Grab RSS feeds from different sources.""" + click.echo("Grabbing links...") + rss_links: str = scrape() + click.echo(rss_links) + + if __name__ == "__main__": - update_feeds() + app() diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/scrapers/rss_link_database.py b/app/scrapers/rss_link_database.py new file mode 100644 index 0000000..5c485c3 --- /dev/null +++ b/app/scrapers/rss_link_database.py @@ -0,0 +1,35 @@ +"""Scrape https://github.com/rumca-js/RSS-Link-Database for RSS links.""" + +from pathlib import Path + +import orjson +from click import echo + + +def scrape(): + """Scrape. + + Raises: + FileNotFoundError: If the RSS-Link-Database repository is not found. + """ + repository_path = Path("RSS-Link-Database") + if not repository_path.exists(): + msg = "RSS-Link-Database repository not found." + raise FileNotFoundError(msg) + + rss_links = [] + for file in repository_path.glob("*.json"): + echo(f"Scraping {file.name}...") + + with file.open("r", encoding="utf-8") as f: + data = orjson.loads(f.read()) + + for d in data: + if d.get("url"): + rss_links.append(d["url"]) + + if d.get("link"): + rss_links.append(d["link"]) + + rss_links = list(set(rss_links)) + return "\n".join(rss_links) diff --git a/poetry.lock b/poetry.lock index 48152df..7893e06 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1724,4 +1724,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "069d3a9e95892d8e057e97858f342469cb3543f8bde8d893c3592583b0d36948" +content-hash = "b2ab90fe77cb7df69d4fbbeaba5dcf37c30e5aa4606e2ce5a5442e02515eba6c" diff --git a/pyproject.toml b/pyproject.toml index 427a5f2..62221fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,8 @@ jinja2 = "^3.1.4" python-dotenv = "^1.0.1" python-multipart = "^0.0.9" reader = "^3.12" +orjson = "^3.10.3" +typer = {extras = ["all"], version = "^0.12.3"} [tool.poetry.group.dev.dependencies] ruff = "^0.4.4"