Enhance Markdown processing and link formatting

2025-05-15 03:15:45 +02:00
parent 9c3bc8c153
commit 23cc6ca402
5 changed files with 9602 additions and 9880 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -7,6 +7,7 @@
        "levelname",
        "markdownify",
        "markupsafe",
        "mdformat",
        "TheLovinator",
        "Wuthering",
        "wutheringwaves"
--- a/articles_all.xml
+++ b/articles_all.xml
--- a/articles_latest.xml
+++ b/articles_latest.xml
--- a/pyproject.toml
+++ b/pyproject.toml
@ -10,4 +10,5 @@ dependencies = [
    "httpx",
    "markdownify",
    "markupsafe",
    "mdformat",
 ]
--- a/scrape.py
+++ b/scrape.py
@ -1,7 +1,8 @@
-import asyncio
+import asyncio  # noqa: CPY001, D100
 import json
 import logging
 import os
 import re
 import shutil
 import subprocess  # noqa: S404
 import time
@ -11,8 +12,9 @@ from typing import TYPE_CHECKING, Any, Literal
 import aiofiles
 import httpx
-from markdownify import MarkdownConverter
+import mdformat
-from markupsafe import escape
+from markdownify import MarkdownConverter  # pyright: ignore[reportMissingTypeStubs]
 from markupsafe import Markup, escape
 if TYPE_CHECKING:
    from collections.abc import Coroutine
@ -104,6 +106,10 @@ def get_file_timestamp(timestamp_str: str) -> float:
        float: The Unix timestamp, or 0 if conversion failed.
    """
    if not timestamp_str:
        logger.info("Empty timestamp string")
        return 0.0
    try:
        # Parse the timestamp string
        dt: datetime = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
@ -279,31 +285,80 @@ def batch_process_timestamps(menu_data: dict[Any, Any], output_dir: Path) -> Non
            logger.error("Failed to update timestamp for %s", file_path)
-class CustomLinkMarkdownConverter(MarkdownConverter):
+def format_discord_links(md: str) -> str:
-    """Custom Markdown converter to handle links.
+    """Make links work in Discord.
-    This class is a subclass of MarkdownConverter
+    Discord doesn't support links with titles, so we need to remove them.
-    and overrides the convert_a method to customize
+    This function also adds angle brackets around the URL to not embed it.
    the conversion of <a> tags to Markdown links.
    """
    def convert_a(self, el: Any, text: str, **kwargs) -> str:  # type: ignore  # noqa: ANN003, ANN401, ARG002, PGH003, PLR6301
        """Convert <a> tags.
    Args:
-            el (Any): The element to convert.
+        md (str): The Markdown text containing links.
            text (str): The text content of the element.
            kwargs (Any): Additional arguments.
    Returns:
-            str: The converted text.
+        str: The modified Markdown text with simplified links.
    """
        href: str | None = el.get("href")
        if not href:
            return text
-        return f"[{text}](<{href}>)"
+    def repl(match: re.Match[str]) -> str:
        url: str | Any = match.group(2)
        display: str = re.sub(pattern=r"^https?://(www\.)?", repl="", string=url)
        return f"[{display}]({url})"
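    # Only matches links whose title repeats the URL; the link text is replaced by the URL without scheme/www.
    # Before: [Link](https://example.com "https://example.com")
    # After: [example.com](https://example.com)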
    formatted_links_md = re.sub(
        pattern=r'\[([^\]]+)\]\((https?://[^\s)]+) "\2"\)',
        repl=repl,
        string=md,
    )
    # Before: [Link](https://example.com)
    # After: [Link](<https://example.com>)
    add_angle_brackets_md: str = re.sub(
        pattern=r"\[([^\]]+)\]\((https?://[^\s)]+)\)",
        repl=r"[\1](<\2>)",
        string=formatted_links_md,
    )
    return add_angle_brackets_md
 def handle_stars(text: str) -> str:
    """Convert ✦-decorated lines into Markdown headings and bullets.

    Every line is stripped and handled on its own:

    - ``✦ Title ✦`` and ``**✦ Title ✦**`` become ``# Title``.
    - ``✦ Title`` becomes ``* Title``.
    - Blank lines are dropped, and so are decorated lines that have no title
      between the stars (they would otherwise produce an empty ``# `` heading).
    - Any other line is kept, stripped of surrounding whitespace.

    Args:
        text (str): The text to process.

    Returns:
        str: The processed lines joined by blank lines.
    """
    star = "✦"
    bold_open = f"**{star}"
    bold_close = f"{star}**"
    output: list[str] = []
    for raw_line in text.strip().splitlines():
        line: str = raw_line.strip()
        if not line:
            continue
        # Before: ✦ Title ✦
        # After: # Title
        if line.startswith(star) and line.endswith(star):
            marker = "#"
            title: str = line.removeprefix(star).removesuffix(star).strip()
        # Before: **✦ Title ✦**
        # After: # Title
        elif line.startswith(bold_open) and line.endswith(bold_close):
            marker = "#"
            title = line.removeprefix(bold_open).removesuffix(bold_close).strip()
        # Before: ✦ Title
        # After: * Title
        elif line.startswith(star):
            marker = "*"
            title = line.removeprefix(star).strip()
        else:
            output.append(line)
            continue
        # A decorated line with nothing between the stars carries no content.
        if title:
            output.append(f"{marker} {title}")
    return "\n\n".join(output)
 def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:  # noqa: PLR0914
@ -341,14 +396,88 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
        if not article_content:
            article_content = article_title
-        converter: CustomLinkMarkdownConverter = CustomLinkMarkdownConverter(
+        converter: MarkdownConverter = MarkdownConverter(
            heading_style="ATX",
            bullets="-",
            strip=["img"],
            default_title="Link",
        )
-        article_content = article_content.replace(" ", " ")  # Replace non-breaking spaces with regular spaces  # noqa: RUF001
+        article_content_converted = str(converter.convert(article_content).strip())  # type: ignore  # noqa: PGH003
-        article_content: str = converter.convert(article_content).strip()  # type: ignore  # noqa: PGH003
+
-        article_content = escape(article_content)
+        if not article_content_converted:
            msg: str = f"Article content is empty for article ID: {article_id}"
            logger.warning(msg)
            article_content_converted = "No content available"
        # Remove non-breaking spaces
        xa0_removed: str = re.sub(r"\xa0", " ", article_content_converted)  # Replace non-breaking spaces with regular spaces
        # Replace non-breaking spaces with regular spaces
        non_breaking_space_removed: str = xa0_removed.replace(
            " ",  # noqa: RUF001
            " ",
        )
        # Remove code blocks that contain only a blank (whitespace-only) line
        empty_code_block_removed: str = re.sub(
            pattern=r"```[ \t]*\n[ \t]*\n```",
            repl="",
            string=non_breaking_space_removed,  # type: ignore  # noqa: PGH003
        )
        # [How to Update] should be # How to Update
        square_brackets_converted: str = re.sub(
            pattern=r"^\s*\[([^\]]+)\]\s*$",
            repl=r"# \1",
            string=empty_code_block_removed,  # type: ignore  # noqa: PGH003
            flags=re.MULTILINE,
        )
        stars_converted: str = handle_stars(square_brackets_converted)
        # If `● Word` is in the content, replace it with a `## Word` heading
        ball_converted: str = re.sub(pattern=r"●\s*(.*?)\n", repl=r"\n\n## \1\n\n", string=stars_converted, flags=re.MULTILINE)
        # If a line is `※ Word`, replace it with italic `*Word*` in its own paragraph
        reference_mark_converted: str = re.sub(
            pattern=r"^\s*※\s*(\S.*?)\s*$",
            repl=r"\n\n*\1*\n\n",
            string=ball_converted,
            flags=re.MULTILINE,
        )
        # Replace circled Unicode numbers (①-⑩) with plain numbered text (e.g., "1. ", "2. ", ..., "10. ")
        number_symbol: dict[str, str] = {
            "①": "1",
            "②": "2",
            "③": "3",
            "④": "4",
            "⑤": "5",
            "⑥": "6",
            "⑦": "7",
            "⑧": "8",
            "⑨": "9",
            "⑩": "10",
        }
        for symbol, number in number_symbol.items():
            reference_mark_converted = re.sub(
                pattern=rf"^\s*{re.escape(symbol)}\s*(.*?)\s*$",
                repl=rf"\n\n{number}. \1\n\n",
                string=reference_mark_converted,
                flags=re.MULTILINE,
            )
        space_before_star_added: str = re.sub(pattern=r"\\\*(.*)", repl=r"* \1", string=reference_mark_converted, flags=re.MULTILINE)
        markdown_formatted: str = mdformat.text(  # type: ignore  # noqa: PGH003
            space_before_star_added,
            options={
                "number": True,  # Allow 1., 2., 3. numbering
            },
        )
        links_fixed: str = format_discord_links(markdown_formatted)
        article_escaped: Markup = escape(links_fixed)
        article_url: str = f"https://wutheringwaves.kurogames.com/en/main/news/detail/{article_id}"
        article_create_time: str = article.get("createTime", "")
@ -361,6 +490,9 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
            published = f"<published>{iso_time}</published>"
            updated = iso_time
        if article_id == "1004":
            logger.info("Article ID: %s, Date: %s", article_id, article_create_time)
        article_category: str = article.get("articleTypeName", "Wuthering Waves")
        category: str = f'<category term="{escape(article_category)}"/>' if article_category else ""
        atom_entries.append(
@ -369,7 +501,7 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
        <id>{entry_id}</id>
        <title>{escape(article_title)}</title>
        <link href="{article_url}" rel="alternate" type="text/html"/>
-        <content type="text">{article_content}</content>
+        <content type="text">{article_escaped}</content>
        {published}
        <updated>{updated}</updated>
        {category}