Add scraper for https://github.com/rumca-js/RSS-Link-Database
This commit is contained in:
parent
72a967d473
commit
e8e5e87c07
6 changed files with 64 additions and 5 deletions
35
app/scrapers/rss_link_database.py
Normal file
35
app/scrapers/rss_link_database.py
Normal file
|
|
@@ -0,0 +1,35 @@
|
|||
"""Scrape https://github.com/rumca-js/RSS-Link-Database for RSS links."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import orjson
|
||||
from click import echo
|
||||
|
||||
|
||||
def scrape() -> str:
    """Collect feed links from a local checkout of RSS-Link-Database.

    Reads every ``*.json`` file located directly inside the
    ``RSS-Link-Database`` directory (resolved relative to the current
    working directory) and gathers the ``url`` and ``link`` values of each
    entry.

    Returns:
        The unique links, sorted and joined with newlines. Sorting keeps the
        output reproducible between runs.

    Raises:
        FileNotFoundError: If the RSS-Link-Database repository is not found.
    """
    repository_path = Path("RSS-Link-Database")
    # A regular file with this name is not a usable checkout either.
    if not repository_path.is_dir():
        msg = "RSS-Link-Database repository not found."
        raise FileNotFoundError(msg)

    rss_links: set[str] = set()
    # Sort the files so progress messages appear in a stable order.
    for file in sorted(repository_path.glob("*.json")):
        echo(f"Scraping {file.name}...")

        # orjson parses UTF-8 bytes directly; no text decoding step is needed.
        data = orjson.loads(file.read_bytes())
        if not isinstance(data, list):
            # Only a list of entries is expected; skip anything else.
            continue

        for entry in data:
            if not isinstance(entry, dict):
                continue
            for key in ("url", "link"):
                value = entry.get(key)
                # Ignore missing, empty, or non-string values so the final
                # join cannot fail.
                if value and isinstance(value, str):
                    rss_links.add(value)

    return "\n".join(sorted(rss_links))
|
||||
Loading…
Add table
Add a link
Reference in a new issue