Add Atom feeds

This commit is contained in:
2025-05-10 16:34:42 +02:00
parent 1b87fccaf4
commit f7b6600089
7 changed files with 4605 additions and 8 deletions

12
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,12 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}

View File

@ -1,8 +1,11 @@
{
"cSpell.words": [
"aiofiles",
"Hellsén",
"httpx",
"Joakim",
"levelname",
"markupsafe",
"TheLovinator",
"Wuthering",
"wutheringwaves"

4121
articles_all.xml Normal file

File diff suppressed because one or more lines are too long

299
articles_latest.xml Normal file

File diff suppressed because one or more lines are too long

BIN
logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 74 KiB

View File

@ -4,4 +4,4 @@ version = "0.1.0"
description = "Wuthering Waves archive"
readme = "README.md"
requires-python = ">=3.13"
dependencies = ["aiofiles", "httpx"]
dependencies = ["aiofiles", "html-sanitizer", "httpx", "markupsafe"]

176
scrape.py
View File

@ -12,10 +12,12 @@ import subprocess # noqa: S404
import time
from datetime import UTC, datetime
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal, LiteralString
from typing import TYPE_CHECKING, Any, Literal
import aiofiles
import httpx
from html_sanitizer import Sanitizer # pyright: ignore[reportMissingTypeStubs]
from markupsafe import escape
if TYPE_CHECKING:
from collections.abc import Coroutine
@ -281,6 +283,170 @@ def batch_process_timestamps(menu_data: dict[Any, Any], output_dir: Path) -> Non
else:
logger.error("Failed to update timestamp for %s", file_path)
def strip_unsafe_tags(content: str) -> str:
"""Strip unsafe HTML tags and return the cleaned content.
Args:
content (str): The HTML content to clean.
Returns:
str: The cleaned HTML content.
"""
sanitizer = Sanitizer()
return sanitizer.sanitize(content) # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType]
def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
"""Generate an Atom feed from a list of articles.
Args:
articles (list[dict[Any, Any]]): The list of articles to include in the feed.
file_name (str): The name of the file to save the feed to.
Returns:
str: The generated Atom feed as a string.
"""
atom_entries: list[str] = []
latest_entry: str = datetime.now(UTC).isoformat()
# Get the latest entry date
if articles:
latest_entry = articles[0].get("createTime", "")
if latest_entry:
latest_entry = datetime.strptime(str(latest_entry), "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC).isoformat()
for article in articles:
article_id: str = str(article.get("articleId", ""))
# Use stable identifier based on article ID
entry_id: str = (
f"urn:article:{article_id}"
if article_id
else f"urn:wutheringwaves:unknown-article-{hash(article.get('articleTitle', '') + article.get('createTime', ''))}"
)
article_title: str = article.get("articleTitle", "No Title")
article_content: str = article.get("articleContent", article_title)
article_content = strip_unsafe_tags(article_content)
if not article_content:
article_content = article_title
article_url: str = f"https://wutheringwaves.kurogames.com/en/main/news/detail/{article_id}"
article_create_time: str = article.get("createTime", "")
published: str = ""
updated: str = latest_entry
if article_create_time:
timestamp: datetime = datetime.strptime(str(article_create_time), "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
iso_time: str = timestamp.isoformat()
published = f"<published>{iso_time}</published>"
# Use createTime as updated if available (more accurate than now)
updated = iso_time
article_category: str = article.get("articleTypeName", "Wuthering Waves")
category: str = f'<category term="{escape(article_category)}"/>' if article_category else ""
# Create entry using Atom format
atom_entries.append(
f"""
<entry>
<id>{entry_id}</id>
<title>{escape(article_title)}</title>
<link href="{article_url}" rel="alternate" type="text/html"/>
<content type="html">{escape(article_content)}</content>
{published}
<updated>{updated}</updated>
{category}
<author>
<name>Wuthering Waves</name>
<email>wutheringwaves_ensupport@kurogames.com</email>
<uri>https://wutheringwaves.kurogames.com</uri>
</author>
</entry>""",
)
# Create the complete Atom feed
atom_feed: str = f"""<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Wuthering Waves Articles</title>
<link href="https://wutheringwaves.kurogames.com/en/main/news/" rel="alternate" type="text/html"/>
<link href="https://git.lovinator.space/TheLovinator/wutheringwaves/raw/branch/master/{file_name}" rel="self" type="application/atom+xml"/>
<id>urn:wutheringwaves:feed</id>
<updated>{latest_entry}</updated>
<subtitle>Latest articles from Wuthering Waves</subtitle>
<icon>https://git.lovinator.space/TheLovinator/wutheringwaves/raw/branch/master/logo.png</icon>
<logo>https://git.lovinator.space/TheLovinator/wutheringwaves/raw/branch/master/logo.png</logo>
<rights>Copyright © {datetime.now(tz=UTC).year} Wuthering Waves</rights>
<generator uri="https://git.lovinator.space/TheLovinator/wutheringwaves" version="1.0">Python Script</generator>
<author>
<name>Wuthering Waves</name>
<email>wutheringwaves_ensupport@kurogames.com</email>
<uri>https://wutheringwaves.kurogames.com</uri>
</author>
{"".join(atom_entries)}
</feed>""" # noqa: E501
return atom_feed
def create_atom_feeds(output_dir: Path) -> None:
"""Create Atom feeds for the articles.
Current feeds are:
- Last 10 articles
- All articles
Args:
output_dir (Path): The directory to save the RSS feed files.
"""
menu_data: list[dict[Any, Any]] = []
# Load data from all the articles
for file in output_dir.glob("*.json"):
if file.stem == "ArticleMenu":
continue
with file.open("r", encoding="utf-8") as f:
try:
article_data: dict[Any, Any] = json.load(f)
menu_data.append(article_data)
except json.JSONDecodeError:
logger.exception("Error decoding JSON from %s", file)
continue
if not menu_data:
logger.error("Can't create Atom feeds, no articles found in %s", output_dir)
return
# Create the Atom feed for the latest articles
amount_of_articles: int = 20
atom_feed_path: Path = Path("articles_latest.xml")
latest_articles: list[dict[Any, Any]] = sorted(
menu_data,
key=lambda x: get_file_timestamp(x.get("createTime", "")),
reverse=True,
)[:amount_of_articles]
logger.info("Dates of the last %s articles:", len(latest_articles))
for article in latest_articles:
article_id: str = str(article.get("articleId", ""))
article_create_time: str = article.get("createTime", "")
logger.info("\tArticle ID: %s, Date: %s", article_id, article_create_time)
atom_feed: str = generate_atom_feed(articles=latest_articles, file_name=atom_feed_path.name)
with atom_feed_path.open("w", encoding="utf-8") as f:
f.write(atom_feed)
logger.info("Created Atom feed for the last %s articles: %s", len(latest_articles), atom_feed_path)
# Create the Atom feed for all articles
atom_feed_path_all: Path = Path("articles_all.xml")
atom_feed_all_articles: str = generate_atom_feed(articles=menu_data, file_name=atom_feed_path_all.name)
with atom_feed_path_all.open("w", encoding="utf-8") as f:
f.write(atom_feed_all_articles)
logger.info("Created Atom feed for all articles: %s", atom_feed_path_all)
def add_data_to_articles(menu_data: dict[Any, Any], output_dir: Path) -> None:
"""ArticleMenu.json contains data that should be added to the articles.
@ -340,7 +506,7 @@ async def main() -> Literal[1, 0]:
current_time = int(time.time() * 1000) # Current time in milliseconds
base_url = "https://hw-media-cdn-mingchao.kurogame.com/akiwebsite/website2.0/json/G152/en"
article_menu_url: str = f"{base_url}/ArticleMenu.json?t={current_time}"
article_base_url: LiteralString = f"{base_url}/article/"
article_base_url: str = f"{base_url}/article/"
output_dir = Path("articles")
output_dir.mkdir(exist_ok=True)
@ -408,13 +574,9 @@ async def main() -> Literal[1, 0]:
else:
logger.info("No new articles to download")
# Add data from ArticleMenu.json to /articles/*.json files
add_data_to_articles(menu_data, output_dir)
# Update the README
add_articles_to_readme(menu_data)
# Process timestamps in batch
create_atom_feeds(output_dir)
batch_process_timestamps(menu_data, output_dir)
logger.info("Script finished. Articles are in the '%s' directory.", output_dir)