Add scraper for https://github.com/rumca-js/RSS-Link-Database
This commit is contained in:
parent
72a967d473
commit
e8e5e87c07
6 changed files with 64 additions and 5 deletions
1
.vscode/settings.json
vendored
1
.vscode/settings.json
vendored
|
|
@ -65,6 +65,7 @@
|
||||||
"nyaa",
|
"nyaa",
|
||||||
"Nyanpasu",
|
"Nyanpasu",
|
||||||
"Omnis",
|
"Omnis",
|
||||||
|
"orjson",
|
||||||
"pacman",
|
"pacman",
|
||||||
"PGHOST",
|
"PGHOST",
|
||||||
"PGID",
|
"PGID",
|
||||||
|
|
|
||||||
29
app/cli.py
29
app/cli.py
|
|
@ -7,15 +7,22 @@ from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import click
|
import click
|
||||||
|
import typer
|
||||||
from reader import Feed, ParseError, Reader, StorageError, UpdatedFeed, UpdateError, UpdateResult
|
from reader import Feed, ParseError, Reader, StorageError, UpdatedFeed, UpdateError, UpdateResult
|
||||||
|
|
||||||
from app.dependencies import get_reader
|
from app.dependencies import get_reader
|
||||||
|
from app.scrapers.rss_link_database import scrape
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
|
|
||||||
|
# Root Typer application; running it with no arguments prints the help text.
app = typer.Typer(name="FeedVault CLI", no_args_is_help=True)
|
||||||
|
|
||||||
def add_broken_feed_to_csv(feed: Feed | UpdateResult | None) -> None:
|
|
||||||
|
def _add_broken_feed_to_csv(feed: Feed | UpdateResult | None) -> None:
|
||||||
"""Add a broken feed to a CSV file."""
|
"""Add a broken feed to a CSV file."""
|
||||||
if feed is None:
|
if feed is None:
|
||||||
click.echo("Feed is None.", err=True)
|
click.echo("Feed is None.", err=True)
|
||||||
|
|
@ -25,7 +32,10 @@ def add_broken_feed_to_csv(feed: Feed | UpdateResult | None) -> None:
|
||||||
f.write(f"{feed.url}\n")
|
f.write(f"{feed.url}\n")
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
@app.command(
|
||||||
|
name="update_feeds",
|
||||||
|
help="Update all the feeds.",
|
||||||
|
)
|
||||||
def update_feeds() -> None:
|
def update_feeds() -> None:
|
||||||
"""Update all the feeds."""
|
"""Update all the feeds."""
|
||||||
reader: Reader = get_reader()
|
reader: Reader = get_reader()
|
||||||
|
|
@ -73,10 +83,21 @@ def update_feeds() -> None:
|
||||||
click.echo(f"Assertion error: {feed.url}", err=True)
|
click.echo(f"Assertion error: {feed.url}", err=True)
|
||||||
traceback.print_exc(file=sys.stderr)
|
traceback.print_exc(file=sys.stderr)
|
||||||
reader.disable_feed_updates(feed)
|
reader.disable_feed_updates(feed)
|
||||||
add_broken_feed_to_csv(feed)
|
_add_broken_feed_to_csv(feed)
|
||||||
|
|
||||||
click.echo("Feeds updated.")
|
click.echo("Feeds updated.")
|
||||||
|
|
||||||
|
|
||||||
|
@app.command(name="grab_links", help="Grab RSS feeds from different sources.")
def grab_links() -> None:
    """Run the RSS-Link-Database scraper and print the links it returns."""
    click.echo("Grabbing links...")
    # scrape() returns all links as one newline-separated string.
    click.echo(scrape())
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
update_feeds()
|
app()
|
||||||
|
|
|
||||||
0
app/scrapers/__init__.py
Normal file
0
app/scrapers/__init__.py
Normal file
35
app/scrapers/rss_link_database.py
Normal file
35
app/scrapers/rss_link_database.py
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
"""Scrape https://github.com/rumca-js/RSS-Link-Database for RSS links."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import orjson
|
||||||
|
from click import echo
|
||||||
|
|
||||||
|
|
||||||
|
def scrape() -> str:
    """Collect RSS links from a local checkout of RSS-Link-Database.

    Reads every ``*.json`` file directly inside the ``RSS-Link-Database``
    directory (resolved relative to the current working directory) and
    gathers the ``url`` and ``link`` values of each entry.

    Returns:
        The unique links joined by newlines, sorted so the output is
        identical between runs.

    Raises:
        FileNotFoundError: If the RSS-Link-Database repository is not found.
    """
    repository_path = Path("RSS-Link-Database")
    if not repository_path.is_dir():
        msg = "RSS-Link-Database repository not found."
        raise FileNotFoundError(msg)

    rss_links: set[str] = set()
    for file in sorted(repository_path.glob("*.json")):
        echo(f"Scraping {file.name}...")

        # orjson parses bytes directly, so skip the intermediate str decode.
        data = orjson.loads(file.read_bytes())
        if not isinstance(data, list):
            # A top-level object or scalar has no entries to iterate over.
            continue

        for entry in data:
            if not isinstance(entry, dict):
                continue
            for key in ("url", "link"):
                value = entry.get(key)
                # Keep only non-empty strings so the final join cannot fail.
                if isinstance(value, str) and value:
                    rss_links.add(value)

    # Sets have no stable order; sort to make the output deterministic.
    return "\n".join(sorted(rss_links))
|
||||||
2
poetry.lock
generated
2
poetry.lock
generated
|
|
@ -1724,4 +1724,4 @@ watchdog = ["watchdog (>=2.3)"]
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.12"
|
python-versions = "^3.12"
|
||||||
content-hash = "069d3a9e95892d8e057e97858f342469cb3543f8bde8d893c3592583b0d36948"
|
content-hash = "b2ab90fe77cb7df69d4fbbeaba5dcf37c30e5aa4606e2ce5a5442e02515eba6c"
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,8 @@ jinja2 = "^3.1.4"
|
||||||
python-dotenv = "^1.0.1"
|
python-dotenv = "^1.0.1"
|
||||||
python-multipart = "^0.0.9"
|
python-multipart = "^0.0.9"
|
||||||
reader = "^3.12"
|
reader = "^3.12"
|
||||||
|
orjson = "^3.10.3"
|
||||||
|
typer = {extras = ["all"], version = "^0.12.3"}
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
ruff = "^0.4.4"
|
ruff = "^0.4.4"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue