Remove HTML sanitization

2025-05-10 17:26:54 +02:00
parent 7035f3e72e
commit 230094adcd
4 changed files with 315 additions and 7858 deletions
--- a/articles_all.xml
+++ b/articles_all.xml
--- a/articles_latest.xml
+++ b/articles_latest.xml
--- a/pyproject.toml
+++ b/pyproject.toml
@ -4,4 +4,4 @@ version = "0.1.0"
 description = "Wuthering Waves archive"
 readme = "README.md"
 requires-python = ">=3.13"
-dependencies = ["aiofiles", "html-sanitizer", "httpx", "markupsafe"]
+dependencies = ["aiofiles", "httpx", "markupsafe"]
--- a/scrape.py
+++ b/scrape.py
@ -1,8 +1,3 @@
 """Fetch articles from the Wuthering Waves website and saves them locally in JSON format.
 It retrieves the article menu and individual articles, prettifies the JSON output, and sets file timestamps based on article creation dates.
 """  # noqa: CPY001
 import asyncio
 import json
 import logging
@ -16,7 +11,6 @@ from typing import TYPE_CHECKING, Any, Literal
 import aiofiles
 import httpx
 from html_sanitizer import Sanitizer  # pyright: ignore[reportMissingTypeStubs]
 from markupsafe import escape
 if TYPE_CHECKING:
@ -284,24 +278,6 @@ def batch_process_timestamps(menu_data: dict[Any, Any], output_dir: Path) -> Non
            logger.error("Failed to update timestamp for %s", file_path)
 def strip_unsafe_tags(content: str) -> str:
    """Run HTML content through the sanitizer and return the cleaned result.
    Newlines are swapped for ``<br>`` before sanitizing and swapped back to newlines afterwards.
    Args:
        content (str): The HTML content to clean.
    Returns:
        str: The cleaned HTML content.
    """
    allowed_tags = {"a", "br", "b", "strong", "i", "em", "code", "s", "strike", "del", "u"}
    sanitizer = Sanitizer({
        "tags": allowed_tags,
        "empty": {"a", "br"},
        "separate": {"br"},
    })
    # Line breaks are handed to the sanitizer as <br> markup rather than raw newlines.
    with_breaks = content.replace("\n", "<br>")
    cleaned = sanitizer.sanitize(with_breaks)  # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType]
    return cleaned.replace("<br>", "\n")  # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType]
 def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
    """Generate an Atom feed from a list of articles.
@ -334,7 +310,6 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
        article_title: str = article.get("articleTitle", "No Title")
        article_content: str = article.get("articleContent", article_title)
        article_content = strip_unsafe_tags(article_content)
        if not article_content:
            article_content = article_title
@ -347,19 +322,17 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
            timestamp: datetime = datetime.strptime(str(article_create_time), "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
            iso_time: str = timestamp.isoformat()
            published = f"<published>{iso_time}</published>"
            # Use createTime as updated if available (more accurate than now)
            updated = iso_time
        article_category: str = article.get("articleTypeName", "Wuthering Waves")
        category: str = f'<category term="{escape(article_category)}"/>' if article_category else ""
        # Create entry using Atom format
        atom_entries.append(
            f"""
    <entry>
        <id>{entry_id}</id>
        <title>{escape(article_title)}</title>
        <link href="{article_url}" rel="alternate" type="text/html"/>
-        <content type="html">{escape(article_content.strip())}</content>
+        <content type="html">{escape(article_content.strip()).replace("\n", "<br/>")}</content>
        {published}
        <updated>{updated}</updated>
        {category}