Use Markdown instead of HTML
1 .vscode/settings.json (vendored)
@@ -5,6 +5,7 @@
     "httpx",
     "Joakim",
     "levelname",
+    "markdownify",
     "markupsafe",
     "TheLovinator",
     "Wuthering",

13384 articles_all.xml
File diff suppressed because one or more lines are too long

1221 articles_latest.xml
File diff suppressed because one or more lines are too long

pyproject.toml
@@ -4,4 +4,10 @@ version = "0.1.0"
 description = "Wuthering Waves archive"
 readme = "README.md"
 requires-python = ">=3.13"
-dependencies = ["aiofiles", "beautifulsoup4", "httpx", "markupsafe"]
+dependencies = [
+    "aiofiles",
+    "beautifulsoup4",
+    "httpx",
+    "markdownify",
+    "markupsafe",
+]

136 scrape.py
@@ -11,8 +11,7 @@ from typing import TYPE_CHECKING, Any, Literal

 import aiofiles
 import httpx
-from bs4 import BeautifulSoup
-from bs4.element import PageElement, Tag
+from markdownify import MarkdownConverter
 from markupsafe import escape

 if TYPE_CHECKING:
@@ -280,119 +279,34 @@ def batch_process_timestamps(menu_data: dict[Any, Any], output_dir: Path) -> None:
         logger.error("Failed to update timestamp for %s", file_path)


-def clean_html(html: str) -> str:  # noqa: C901, PLR0912, PLR0915
-    """Clean HTML content by removing unwanted tags and formatting.
-
-    Args:
-        html (str): The HTML content to clean.
-
-    Returns:
-        str: The cleaned HTML content.
-    """
+class CustomLinkMarkdownConverter(MarkdownConverter):
+    """Custom Markdown converter to handle links.
+
+    This class is a subclass of MarkdownConverter
+    and overrides the convert_a method to customize
+    the conversion of <a> tags to Markdown links.
+    """
-    soup = BeautifulSoup(html, "html.parser")
-
-    # 1. Remove unwanted tags completely
-    tags_to_remove: list[str] = ["img", "pre"]
-    for tag_name in tags_to_remove:
-        for tag in soup.find_all(tag_name):
-            tag.decompose()  # Removes the tag and its content
+
+    def convert_a(self, el: Any, text: str, **kwargs) -> str:  # type: ignore  # noqa: ANN003, ANN401, ARG002, PGH003, PLR6301
+        """Convert <a> tags.
+
-    # 2. Unwrap tags whose content should be preserved directly
-    tags_to_unwrap: list[str] = ["div", "p"]
-    for tag_name in tags_to_unwrap:
-        for element in soup.find_all(tag_name):
-            if isinstance(element, Tag):
-                element.unwrap()  # Removes the tag, keeps its children
+        Args:
+            el (Any): The element to convert.
+            text (str): The text content of the element.
+            kwargs (Any): Additional arguments.
+
-    # 3. Process <span> tags: extract their text and <br> tags, then remove the <span>
-    for span_tag in soup.find_all("span"):
-        if not isinstance(span_tag, Tag):
-            continue
+        Returns:
+            str: The converted text.
+
-        content_to_insert: list[str | Tag] = []
-        for child in span_tag.contents:
-            if isinstance(child, Tag) and child.name == "br":
-                # Create a new <br> tag object to insert
-                br_tag = soup.new_tag("br")
-                content_to_insert.append(br_tag)
-            elif isinstance(child, str):  # It's a NavigableString (text)
-                # Add the text content directly
-                content_to_insert.append(child)
-            # Add handling for other nested tags within span if necessary
+        """
+        href: str | None = el.get("href")
+        if not href:
+            return text
+
-        # Insert the extracted content before the span tag, in order
-        for item in content_to_insert:
-            span_tag.insert_before(item)
-        # Remove the original span tag
-        span_tag.decompose()
-
-    # 4. Consolidate text nodes and handle <br> tag sequences
-    # Determine the list of elements to iterate over (direct children of the main parsed content)
-    content_nodes: list[PageElement] = []
-    if soup.body:  # If BeautifulSoup added <html><body> tags
-        content_nodes = soup.body.contents
-    elif soup.html:  # If only <html> tag was added
-        content_nodes = soup.html.contents
-    else:  # If it's a fragment and no top-level tags were added by BS
-        content_nodes = soup.contents
-
-    final_output_parts: list[str] = []
-    consecutive_br_count = 0
-
-    max_br_allowed = 2  # Maximum number of <br> tags to add in sequence
-    for element in content_nodes:
-        if isinstance(element, str):  # It's a NavigableString (text node)
-            # First, handle any accumulated <br> tags before this text
-            if consecutive_br_count > 0:
-                brs_to_add = 0
-                if consecutive_br_count == 1:
-                    brs_to_add = 1
-                elif consecutive_br_count >= max_br_allowed:
-                    brs_to_add = 2
-
-                final_output_parts.extend("<br/>" for _ in range(brs_to_add))
-                consecutive_br_count = 0
-
-            # Clean and add the text
-            text: str = element.replace("\xa0", " ").strip()  # \xa0 is &nbsp;
-            if text:
-                final_output_parts.append(text)
-
-        elif isinstance(element, Tag) and element.name == "br":  # It's a <br> tag
-            consecutive_br_count += 1
-
-        else:  # Handle other unexpected elements if any (e.g., leftover unknown tags)
-            # This part depends on how strictly you want to clean.
-            # For now, we'll try to get their text if they weren't removed.
-            if consecutive_br_count > 0:  # Process pending BRs first
-                brs_to_add = 0
-                if consecutive_br_count == 1:
-                    brs_to_add = 1
-                elif consecutive_br_count >= max_br_allowed:
-                    brs_to_add = 2
-                final_output_parts.extend("<br/>" for _ in range(brs_to_add))
-                consecutive_br_count = 0
-
-            if hasattr(element, "get_text"):
-                other_text = element.get_text(separator=" ", strip=True).replace("\xa0", " ")
-                if other_text:
-                    final_output_parts.append(other_text)
-
-    # Handle any trailing <br> tags accumulated at the very end of the content
-    if consecutive_br_count > 0:
-        brs_to_add = 0
-        if consecutive_br_count == 1:
-            brs_to_add = 1
-        elif consecutive_br_count >= max_br_allowed:
-            brs_to_add = 2
-
-        final_output_parts.extend("<br/>" for _ in range(brs_to_add))
-
-    return "".join(final_output_parts)
+        return f"[{text}](<{href}>)"


-def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
+def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:  # noqa: PLR0914
     """Generate an Atom feed from a list of articles.

     Args:
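For reference, a minimal standalone sketch of what the new converter does. It assumes the markdownify package added to the dependencies above; the sample HTML and the __main__ harness are illustrative and not part of the repository:

from typing import Any

from markdownify import MarkdownConverter


class CustomLinkMarkdownConverter(MarkdownConverter):
    """Same override as in the commit: emit links with the URL wrapped in angle brackets."""

    def convert_a(self, el: Any, text: str, **kwargs: Any) -> str:
        href: str | None = el.get("href")
        if not href:
            return text
        return f"[{text}](<{href}>)"


if __name__ == "__main__":
    converter = CustomLinkMarkdownConverter(heading_style="ATX", bullets="-", strip=["img"])
    html = '<p>Read the <a href="https://example.com/news">patch notes</a></p>'
    print(converter.convert(html).strip())
    # Prints something like: Read the [patch notes](<https://example.com/news>)

Wrapping the link target in angle brackets keeps URLs that contain spaces or parentheses intact when the Markdown is rendered.
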
@@ -427,7 +341,13 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
         if not article_content:
             article_content = article_title

-        article_content = clean_html(article_content)
+        converter: CustomLinkMarkdownConverter = CustomLinkMarkdownConverter(
+            heading_style="ATX",
+            bullets="-",
+            strip=["img"],
+        )
+        article_content = article_content.replace(" ", " ")  # Replace non-breaking spaces with regular spaces  # noqa: RUF001
+        article_content: str = converter.convert(article_content).strip()  # type: ignore  # noqa: PGH003

         article_url: str = f"https://wutheringwaves.kurogames.com/en/main/news/detail/{article_id}"
         article_create_time: str = article.get("createTime", "")
@@ -448,7 +368,7 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
             <id>{entry_id}</id>
             <title>{escape(article_title)}</title>
             <link href="{article_url}" rel="alternate" type="text/html"/>
-            <content type="html">{escape(article_content.strip()).replace("\n", "<br/>")}</content>
+            <content type="text">{article_content}</content>
             {published}
             <updated>{updated}</updated>
             {category}
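
Taken together, generate_atom_feed now turns the raw article HTML into Markdown and ships it in a plain-text content element instead of escaped HTML. A rough end-to-end sketch, assuming the CustomLinkMarkdownConverter class from the diff above is in scope and using placeholder article values:

from markupsafe import escape

# CustomLinkMarkdownConverter is the class added in scrape.py above.
converter = CustomLinkMarkdownConverter(heading_style="ATX", bullets="-", strip=["img"])

article_id = 1337  # placeholder
article_title = "Wuthering Waves <Patch Notes>"  # placeholder
article_html = '<p>See the <a href="https://example.com/news">full notes</a></p>'  # placeholder

article_content = article_html.replace("\xa0", " ")           # drop non-breaking spaces first
article_content = converter.convert(article_content).strip()  # HTML to Markdown

article_url = f"https://wutheringwaves.kurogames.com/en/main/news/detail/{article_id}"
entry = f"""    <entry>
        <title>{escape(article_title)}</title>
        <link href="{article_url}" rel="alternate" type="text/html"/>
        <content type="text">{article_content}</content>
    </entry>"""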