Enhance Markdown processing and link formatting

This commit is contained in:
2025-05-15 03:15:45 +02:00
parent 9c3bc8c153
commit 23cc6ca402
5 changed files with 9602 additions and 9880 deletions

View File

@ -7,6 +7,7 @@
"levelname",
"markdownify",
"markupsafe",
"mdformat",
"TheLovinator",
"Wuthering",
"wutheringwaves"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -10,4 +10,5 @@ dependencies = [
"httpx",
"markdownify",
"markupsafe",
"mdformat",
]

184
scrape.py
View File

@ -1,7 +1,8 @@
import asyncio
import asyncio # noqa: CPY001, D100
import json
import logging
import os
import re
import shutil
import subprocess # noqa: S404
import time
@ -11,8 +12,9 @@ from typing import TYPE_CHECKING, Any, Literal
import aiofiles
import httpx
from markdownify import MarkdownConverter
from markupsafe import escape
import mdformat
from markdownify import MarkdownConverter # pyright: ignore[reportMissingTypeStubs]
from markupsafe import Markup, escape
if TYPE_CHECKING:
from collections.abc import Coroutine
@ -104,6 +106,10 @@ def get_file_timestamp(timestamp_str: str) -> float:
float: The Unix timestamp, or 0 if conversion failed.
"""
if not timestamp_str:
logger.info("Empty timestamp string")
return 0.0
try:
# Parse the timestamp string
dt: datetime = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
@ -279,31 +285,80 @@ def batch_process_timestamps(menu_data: dict[Any, Any], output_dir: Path) -> Non
logger.error("Failed to update timestamp for %s", file_path)
class CustomLinkMarkdownConverter(MarkdownConverter):
"""Custom Markdown converter to handle links.
def format_discord_links(md: str) -> str:
"""Make links work in Discord.
Discord doesn't support links with titles, so we need to remove them.
This function also adds angle brackets around the URL to not embed it.
Args:
md (str): The Markdown text containing links.
Returns:
str: The modified Markdown text with simplified links.
This class is a subclass of MarkdownConverter
and overrides the convert_a method to customize
the conversion of <a> tags to Markdown links.
"""
def convert_a(self, el: Any, text: str, **kwargs) -> str: # type: ignore # noqa: ANN003, ANN401, ARG002, PGH003, PLR6301
"""Convert <a> tags.
def repl(match: re.Match[str]) -> str:
url: str | Any = match.group(2)
display: str = re.sub(pattern=r"^https?://(www\.)?", repl="", string=url)
return f"[{display}]({url})"
Args:
el (Any): The element to convert.
text (str): The text content of the element.
kwargs (Any): Additional arguments.
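# Before: [Link](https://example.com "https://example.com")  (title is the URL itself)
# After: [example.com](https://example.com)  (title dropped; link text becomes the URL without scheme/www)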
formatted_links_md = re.sub(
pattern=r'\[([^\]]+)\]\((https?://[^\s)]+) "\2"\)',
repl=repl,
string=md,
)
Returns:
str: The converted text.
# Before: [Link](https://example.com)
# After: [Link](<https://example.com>)
add_angle_brackets_md: str = re.sub(
pattern=r"\[([^\]]+)\]\((https?://[^\s)]+)\)",
repl=r"[\1](<\2>)",
string=formatted_links_md,
)
"""
href: str | None = el.get("href")
if not href:
return text
return add_angle_brackets_md
return f"[{text}](<{href}>)"
def handle_stars(text: str) -> str:
    """Convert star-decorated lines into Markdown headers and bullets.

    Every line is stripped of surrounding whitespace, blank lines are dropped,
    and the remaining lines are joined with one blank line between them.

    Args:
        text (str): The text to process.

    Returns:
        str: The processed text with stars replaced by headers or bullets.
    """
    # The marker must be a real character: an empty prefix/suffix matches every
    # line, which would turn all content into headers.
    star = "✦"
    bold_prefix = f"**{star}"
    bold_suffix = f"{star}**"

    output: list[str] = []
    for raw_line in text.strip().splitlines():
        line: str = raw_line.strip()
        if not line:
            continue

        # Before: ✦ Title ✦
        # After: # Title
        if line.startswith(star) and line.endswith(star):
            title: str = line.removeprefix(star).removesuffix(star).strip()
            output.append(f"# {title}")

        # Before: **✦ Title ✦**
        # After: # Title
        elif line.startswith(bold_prefix) and line.endswith(bold_suffix):
            title = line.removeprefix(bold_prefix).removesuffix(bold_suffix).strip()
            output.append(f"# {title}")

        # Before: ✦ Title
        # After: * Title
        elif line.startswith(star):
            title = line.removeprefix(star).strip()
            output.append(f"* {title}")

        else:
            output.append(line)

    return "\n\n".join(output)
def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str: # noqa: PLR0914
@ -341,14 +396,88 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
if not article_content:
article_content = article_title
converter: CustomLinkMarkdownConverter = CustomLinkMarkdownConverter(
converter: MarkdownConverter = MarkdownConverter(
heading_style="ATX",
bullets="-",
strip=["img"],
default_title="Link",
)
article_content = article_content.replace(" ", " ") # Replace non-breaking spaces with regular spaces # noqa: RUF001
article_content: str = converter.convert(article_content).strip() # type: ignore # noqa: PGH003
article_content = escape(article_content)
article_content_converted = str(converter.convert(article_content).strip()) # type: ignore # noqa: PGH003
if not article_content_converted:
msg: str = f"Article content is empty for article ID: {article_id}"
logger.warning(msg)
article_content_converted = "No content available"
# Remove non-breaking spaces
xa0_removed: str = re.sub(r"\xa0", " ", article_content_converted) # Replace non-breaking spaces with regular spaces
# Replace non-breaking spaces with regular spaces
non_breaking_space_removed: str = xa0_removed.replace(
" ", # noqa: RUF001
" ",
)
# Remove code blocks that have only spaces and newlines inside them
empty_code_block_removed: str = re.sub(
pattern=r"```[ \t]*\n[ \t]*\n```",
repl="",
string=non_breaking_space_removed, # type: ignore # noqa: PGH003
)
# [How to Update] should be # How to Update
square_brackets_converted: str = re.sub(
pattern=r"^\s*\[([^\]]+)\]\s*$",
repl=r"# \1",
string=empty_code_block_removed, # type: ignore # noqa: PGH003
flags=re.MULTILINE,
)
stars_converted: str = handle_stars(square_brackets_converted)
# If `● Word` is in the content, replace it with `## Word` using regex
ball_converted: str = re.sub(pattern=r"\s*(.*?)\n", repl=r"\n\n## \1\n\n", string=stars_converted, flags=re.MULTILINE)
# If a line is `※ Word`, replace it with `*Word*` (as its own paragraph) using regex
reference_mark_converted: str = re.sub(
pattern=r"^\s*※\s*(\S.*?)\s*$",
repl=r"\n\n*\1*\n\n",
string=ball_converted,
flags=re.MULTILINE,
)
# Replace circled Unicode numbers (①-⑩) with plain numbered text (e.g., "1. ", "2. ", ..., "10. ")
number_symbol: dict[str, str] = {
"": "1",
"": "2",
"": "3",
"": "4",
"": "5",
"": "6",
"": "7",
"": "8",
"": "9",
"": "10",
}
for symbol, number in number_symbol.items():
reference_mark_converted = re.sub(
pattern=rf"^\s*{re.escape(symbol)}\s*(.*?)\s*$",
repl=rf"\n\n{number}. \1\n\n",
string=reference_mark_converted,
flags=re.MULTILINE,
)
space_before_star_added: str = re.sub(pattern=r"\\\*(.*)", repl=r"* \1", string=reference_mark_converted, flags=re.MULTILINE)
markdown_formatted: str = mdformat.text( # type: ignore # noqa: PGH003
space_before_star_added,
options={
"number": True, # Allow 1., 2., 3. numbering
},
)
links_fixed: str = format_discord_links(markdown_formatted)
article_escaped: Markup = escape(links_fixed)
article_url: str = f"https://wutheringwaves.kurogames.com/en/main/news/detail/{article_id}"
article_create_time: str = article.get("createTime", "")
@ -361,6 +490,9 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
published = f"<published>{iso_time}</published>"
updated = iso_time
if article_id == "1004":
logger.info("Article ID: %s, Date: %s", article_id, article_create_time)
article_category: str = article.get("articleTypeName", "Wuthering Waves")
category: str = f'<category term="{escape(article_category)}"/>' if article_category else ""
atom_entries.append(
@ -369,7 +501,7 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
<id>{entry_id}</id>
<title>{escape(article_title)}</title>
<link href="{article_url}" rel="alternate" type="text/html"/>
<content type="text">{article_content}</content>
<content type="text">{article_escaped}</content>
{published}
<updated>{updated}</updated>
{category}