Update XML links to use GitHub raw URLs and format Python code for consistency

2025-07-12 23:47:03 +02:00
parent 4c8d92aab6
commit 529b1a9cd4
3 changed files with 106 additions and 36 deletions

articles_all.xml (generated)

@@ -2,12 +2,12 @@
 <feed xmlns="http://www.w3.org/2005/Atom">
 <title>Wuthering Waves Articles</title>
 <link href="https://wutheringwaves.kurogames.com/en/main/news/" rel="alternate" type="text/html"/>
-<link href="https://git.lovinator.space/TheLovinator/wutheringwaves/raw/branch/master/articles_all.xml" rel="self" type="application/atom+xml"/>
+<link href="https://raw.githubusercontent.com/TheLovinator1/wutheringwaves/refs/heads/master/articles_all.xml" rel="self" type="application/atom+xml"/>
 <id>urn:wutheringwaves:feed</id>
 <updated>2025-07-09T18:03:53+00:00</updated>
 <subtitle>Latest articles from Wuthering Waves</subtitle>
-<icon>https://git.lovinator.space/TheLovinator/wutheringwaves/raw/branch/master/logo.png</icon>
-<logo>https://git.lovinator.space/TheLovinator/wutheringwaves/raw/branch/master/logo.png</logo>
+<icon>https://raw.githubusercontent.com/TheLovinator1/wutheringwaves/refs/heads/master/logo.png</icon>
+<logo>https://raw.githubusercontent.com/TheLovinator1/wutheringwaves/refs/heads/master/logo.png</logo>
 <rights>Copyright © 2025 Wuthering Waves</rights>
 <generator uri="https://git.lovinator.space/TheLovinator/wutheringwaves" version="1.0">Python Script</generator>
 <author>

articles_latest.xml (generated)

@@ -2,12 +2,12 @@
 <feed xmlns="http://www.w3.org/2005/Atom">
 <title>Wuthering Waves Articles</title>
 <link href="https://wutheringwaves.kurogames.com/en/main/news/" rel="alternate" type="text/html"/>
-<link href="https://git.lovinator.space/TheLovinator/wutheringwaves/raw/branch/master/articles_latest.xml" rel="self" type="application/atom+xml"/>
+<link href="https://raw.githubusercontent.com/TheLovinator1/wutheringwaves/refs/heads/master/articles_latest.xml" rel="self" type="application/atom+xml"/>
 <id>urn:wutheringwaves:feed</id>
 <updated>2025-07-09T18:03:53+00:00</updated>
 <subtitle>Latest articles from Wuthering Waves</subtitle>
-<icon>https://git.lovinator.space/TheLovinator/wutheringwaves/raw/branch/master/logo.png</icon>
-<logo>https://git.lovinator.space/TheLovinator/wutheringwaves/raw/branch/master/logo.png</logo>
+<icon>https://raw.githubusercontent.com/TheLovinator1/wutheringwaves/refs/heads/master/logo.png</icon>
+<logo>https://raw.githubusercontent.com/TheLovinator1/wutheringwaves/refs/heads/master/logo.png</logo>
 <rights>Copyright © 2025 Wuthering Waves</rights>
 <generator uri="https://git.lovinator.space/TheLovinator/wutheringwaves" version="1.0">Python Script</generator>
 <author>

scrape.py

@@ -83,7 +83,9 @@ def set_file_timestamp(filepath: Path, timestamp_str: str) -> bool:
     """
     try:
         # Parse the timestamp string
-        dt: datetime = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
+        dt: datetime = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S").replace(
+            tzinfo=UTC
+        )
 
         # Convert to Unix timestamp
         timestamp: float = dt.timestamp()
@@ -114,7 +116,9 @@ def get_file_timestamp(timestamp_str: str) -> float:
     try:
         # Parse the timestamp string
-        dt: datetime = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
+        dt: datetime = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S").replace(
+            tzinfo=UTC
+        )
 
         # Convert to Unix timestamp
         return dt.timestamp()
     except ValueError:
@@ -133,7 +137,13 @@ def commit_file_with_timestamp(filepath: Path) -> bool:  # noqa: PLR0911
     """
     # Check in Git history if we already have this file
-    git_log_cmd: list[str] = ["git", "log", "--pretty=format:%H", "--follow", str(filepath)]
+    git_log_cmd: list[str] = [
+        "git",
+        "log",
+        "--pretty=format:%H",
+        "--follow",
+        str(filepath),
+    ]
 
     try:
         git_log_output: str = subprocess.check_output(git_log_cmd, text=True).strip()  # noqa: S603
         if git_log_output:
@@ -157,7 +167,9 @@ def commit_file_with_timestamp(filepath: Path) -> bool:  # noqa: PLR0911
 
     # Get the file's modification time
     timestamp: float = filepath.stat().st_mtime
-    git_time: str = datetime.fromtimestamp(timestamp, tz=UTC).strftime("%Y-%m-%dT%H:%M:%S")
+    git_time: str = datetime.fromtimestamp(timestamp, tz=UTC).strftime(
+        "%Y-%m-%dT%H:%M:%S"
+    )
 
     # Stage the file
     subprocess.run([git_executable, "add", str(filepath)], check=True, text=True)  # noqa: S603
@@ -210,7 +222,9 @@ def add_articles_to_readme(articles: dict[Any, Any] | None = None) -> None:
     # Create new content
     new_lines: list[str] = []
     if articles_section_index >= 0:
-        new_lines = lines[: articles_section_index + 1]  # Keep everything up to "## Articles"
+        new_lines = lines[
+            : articles_section_index + 1
+        ]  # Keep everything up to "## Articles"
     else:
         new_lines = lines
     if new_lines and not new_lines[-1].endswith("\n"):
@@ -219,10 +233,14 @@ def add_articles_to_readme(articles: dict[Any, Any] | None = None) -> None:
 
     # Add articles
     new_lines.append("\n")  # Add a blank line after the heading
-    for article in sorted(articles, key=lambda x: x.get("createTime", ""), reverse=True):
+    for article in sorted(
+        articles, key=lambda x: x.get("createTime", ""), reverse=True
+    ):
         article_id: str = str(article.get("articleId", ""))
         article_title: str = article.get("articleTitle", "No Title")
-        article_url: str = f"https://wutheringwaves.kurogames.com/en/main/news/detail/{article_id}"
+        article_url: str = (
+            f"https://wutheringwaves.kurogames.com/en/main/news/detail/{article_id}"
+        )
         new_lines.append(
             f"- [{article_title}]({article_url}) [[json]](articles/{article_id}.json)\n",
         )
@@ -372,7 +390,11 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
     if articles:
         latest_entry = articles[0].get("createTime", "")
         if latest_entry:
-            latest_entry = datetime.strptime(str(latest_entry), "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC).isoformat()
+            latest_entry = (
+                datetime.strptime(str(latest_entry), "%Y-%m-%d %H:%M:%S")
+                .replace(tzinfo=UTC)
+                .isoformat()
+            )
 
     for article in articles:
         article_id: str = str(article.get("articleId", ""))
@@ -401,7 +423,9 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
             article_content_converted = "No content available"
 
         # Remove non-breaking spaces
-        xa0_removed: str = re.sub(r"\xa0", " ", article_content_converted)  # Replace non-breaking spaces with regular spaces
+        xa0_removed: str = re.sub(
+            r"\xa0", " ", article_content_converted
+        )  # Replace non-breaking spaces with regular spaces
 
         # Replace non-breaking spaces with regular spaces
         non_breaking_space_removed: str = xa0_removed.replace(
@@ -427,7 +451,12 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
         stars_converted: str = handle_stars(square_brackets_converted)
 
         # If `● Word` is in the content, replace it `## Word` instead with regex
-        ball_converted: str = re.sub(pattern=r"\s*●\s*(.*?)\n", repl=r"\n\n## \1\n\n", string=stars_converted, flags=re.MULTILINE)
+        ball_converted: str = re.sub(
+            pattern=r"\s*●\s*(.*?)\n",
+            repl=r"\n\n## \1\n\n",
+            string=stars_converted,
+            flags=re.MULTILINE,
+        )
 
         # If `※ Word` is in the content, replace it `* word * ` instead with regex
         reference_mark_converted: str = re.sub(
@@ -458,7 +487,12 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
             flags=re.MULTILINE,
         )
 
-        space_before_star_added: str = re.sub(pattern=r"\\\*(.*)", repl=r"* \1", string=reference_mark_converted, flags=re.MULTILINE)
+        space_before_star_added: str = re.sub(
+            pattern=r"\\\*(.*)",
+            repl=r"* \1",
+            string=reference_mark_converted,
+            flags=re.MULTILINE,
+        )
 
         markdown_formatted: str = mdformat.text(  # type: ignore  # noqa: PGH003
             space_before_star_added,
@@ -470,19 +504,25 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
         links_fixed: str = format_discord_links(markdown_formatted)
         article_escaped: Markup = escape(links_fixed)
 
-        article_url: str = f"https://wutheringwaves.kurogames.com/en/main/news/detail/{article_id}"
+        article_url: str = (
+            f"https://wutheringwaves.kurogames.com/en/main/news/detail/{article_id}"
+        )
         article_create_time: str = article.get("createTime", "")
 
         published: str = ""
         updated: str = latest_entry
         if article_create_time:
-            timestamp: datetime = datetime.strptime(str(article_create_time), "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
+            timestamp: datetime = datetime.strptime(
+                str(article_create_time), "%Y-%m-%d %H:%M:%S"
+            ).replace(tzinfo=UTC)
             iso_time: str = timestamp.isoformat()
             published = f"<published>{iso_time}</published>"
             updated = iso_time
 
         article_category: str = article.get("articleTypeName", "Wuthering Waves")
-        category: str = f'<category term="{escape(article_category)}"/>' if article_category else ""
+        category: str = (
+            f'<category term="{escape(article_category)}"/>' if article_category else ""
+        )
 
         html: str = markdown.markdown(
             text=article_escaped,
@@ -528,12 +568,12 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
 <feed xmlns="http://www.w3.org/2005/Atom">
 <title>Wuthering Waves Articles</title>
 <link href="https://wutheringwaves.kurogames.com/en/main/news/" rel="alternate" type="text/html"/>
-<link href="https://git.lovinator.space/TheLovinator/wutheringwaves/raw/branch/master/{file_name}" rel="self" type="application/atom+xml"/>
+<link href="https://raw.githubusercontent.com/TheLovinator1/wutheringwaves/refs/heads/master/{file_name}" rel="self" type="application/atom+xml"/>
 <id>urn:wutheringwaves:feed</id>
 <updated>{latest_entry}</updated>
 <subtitle>Latest articles from Wuthering Waves</subtitle>
-<icon>https://git.lovinator.space/TheLovinator/wutheringwaves/raw/branch/master/logo.png</icon>
-<logo>https://git.lovinator.space/TheLovinator/wutheringwaves/raw/branch/master/logo.png</logo>
+<icon>https://raw.githubusercontent.com/TheLovinator1/wutheringwaves/refs/heads/master/logo.png</icon>
+<logo>https://raw.githubusercontent.com/TheLovinator1/wutheringwaves/refs/heads/master/logo.png</logo>
 <rights>Copyright © {datetime.now(tz=UTC).year} Wuthering Waves</rights>
 <generator uri="https://git.lovinator.space/TheLovinator/wutheringwaves" version="1.0">Python Script</generator>
 <author>
@@ -593,14 +633,22 @@ def create_atom_feeds(output_dir: Path) -> None:
         article_create_time: str = article.get("createTime", "")
         logger.info("\tArticle ID: %s, Date: %s", article_id, article_create_time)
 
-    atom_feed: str = generate_atom_feed(articles=latest_articles, file_name=atom_feed_path.name)
+    atom_feed: str = generate_atom_feed(
+        articles=latest_articles, file_name=atom_feed_path.name
+    )
     with atom_feed_path.open("w", encoding="utf-8") as f:
         f.write(atom_feed)
-    logger.info("Created Atom feed for the last %s articles: %s", len(latest_articles), atom_feed_path)
+    logger.info(
+        "Created Atom feed for the last %s articles: %s",
+        len(latest_articles),
+        atom_feed_path,
+    )
 
     # Create the Atom feed for all articles
     atom_feed_path_all: Path = Path("articles_all.xml")
-    atom_feed_all_articles: str = generate_atom_feed(articles=articles_sorted, file_name=atom_feed_path_all.name)
+    atom_feed_all_articles: str = generate_atom_feed(
+        articles=articles_sorted, file_name=atom_feed_path_all.name
+    )
     with atom_feed_path_all.open("w", encoding="utf-8") as f:
         f.write(atom_feed_all_articles)
     logger.info("Created Atom feed for all articles: %s", atom_feed_path_all)
@@ -663,7 +711,9 @@ async def main() -> Literal[1, 0]:
     """
    # Setup
     current_time = int(time.time() * 1000)  # Current time in milliseconds
-    base_url = "https://hw-media-cdn-mingchao.kurogame.com/akiwebsite/website2.0/json/G152/en"
+    base_url = (
+        "https://hw-media-cdn-mingchao.kurogame.com/akiwebsite/website2.0/json/G152/en"
+    )
     article_menu_url: str = f"{base_url}/ArticleMenu.json?t={current_time}"
     article_base_url: str = f"{base_url}/article/"
     output_dir = Path("articles")
@@ -685,19 +735,29 @@ async def main() -> Literal[1, 0]:
 
         # Extract article IDs
         logger.info("Extracting article IDs...")
-        article_ids: list[str] = [str(item["articleId"]) for item in menu_data if item.get("articleId")]
+        article_ids: list[str] = [
+            str(item["articleId"]) for item in menu_data if item.get("articleId")
+        ]
 
         if not article_ids:
-            logger.warning("No article IDs found. Please check the JSON structure of ArticleMenu.json.")
+            logger.warning(
+                "No article IDs found. Please check the JSON structure of ArticleMenu.json."
+            )
             logger.warning("Full menu response for debugging:")
             logger.warning(json.dumps(menu_data, indent=2))
             return 1
 
         # Get list of already downloaded article IDs
-        existing_files: list[str] = [file.stem for file in output_dir.glob("*.json") if file.stem != "ArticleMenu"]
+        existing_files: list[str] = [
+            file.stem
+            for file in output_dir.glob("*.json")
+            if file.stem != "ArticleMenu"
+        ]
 
         # Filter out already downloaded articles
-        new_article_ids: list[str] = [article_id for article_id in article_ids if article_id not in existing_files]
+        new_article_ids: list[str] = [
+            article_id for article_id in article_ids if article_id not in existing_files
+        ]
 
         if new_article_ids:
             logger.info("Found %s new articles to download", len(new_article_ids))
@@ -705,14 +765,18 @@ async def main() -> Literal[1, 0]:
             # Download each new article
             download_tasks: list[Coroutine[Any, Any, dict[Any, Any] | None]] = []
             for article_id in new_article_ids:
-                article_url: str = f"{article_base_url}{article_id}.json?t={current_time}"
+                article_url: str = (
+                    f"{article_base_url}{article_id}.json?t={current_time}"
+                )
                 output_file: Path = output_dir / f"{article_id}.json"
 
                 logger.info("Downloading article %s from %s", article_id, article_url)
                 download_tasks.append(fetch_json(article_url, client))
 
             # Wait for all downloads to complete
-            results: list[dict[Any, Any] | BaseException | None] = await asyncio.gather(*download_tasks, return_exceptions=True)
+            results: list[dict[Any, Any] | BaseException | None] = await asyncio.gather(
+                *download_tasks, return_exceptions=True
+            )
 
             # Process the downloaded articles
             for i, result in enumerate(results):
@@ -724,12 +788,18 @@ async def main() -> Literal[1, 0]:
                     continue
 
                 if not result:
-                    logger.warning("Downloaded article %s is empty or invalid", article_id)
+                    logger.warning(
+                        "Downloaded article %s is empty or invalid", article_id
+                    )
                     continue
 
                 # Save the article JSON
-                if isinstance(result, dict) and await save_prettified_json(result, output_file):
-                    logger.info("Successfully downloaded and prettified %s", output_file)
+                if isinstance(result, dict) and await save_prettified_json(
+                    result, output_file
+                ):
+                    logger.info(
+                        "Successfully downloaded and prettified %s", output_file
+                    )
 
         else:
             logger.info("No new articles to download")