Remove HTML sanitization
This commit is contained in:
7356
articles_all.xml
7356
articles_all.xml
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -4,4 +4,4 @@ version = "0.1.0"
|
|||||||
description = "Wuthering Waves archive"
|
description = "Wuthering Waves archive"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.13"
|
requires-python = ">=3.13"
|
||||||
dependencies = ["aiofiles", "html-sanitizer", "httpx", "markupsafe"]
|
dependencies = ["aiofiles", "httpx", "markupsafe"]
|
||||||
|
29
scrape.py
29
scrape.py
@ -1,8 +1,3 @@
|
|||||||
"""Fetch articles from the Wuthering Waves website and saves them locally in JSON format.
|
|
||||||
|
|
||||||
It retrieves the article menu and individual articles, prettifies the JSON output, and sets file timestamps based on article creation dates.
|
|
||||||
""" # noqa: CPY001
|
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
@ -16,7 +11,6 @@ from typing import TYPE_CHECKING, Any, Literal
|
|||||||
|
|
||||||
import aiofiles
|
import aiofiles
|
||||||
import httpx
|
import httpx
|
||||||
from html_sanitizer import Sanitizer # pyright: ignore[reportMissingTypeStubs]
|
|
||||||
from markupsafe import escape
|
from markupsafe import escape
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@ -284,24 +278,6 @@ def batch_process_timestamps(menu_data: dict[Any, Any], output_dir: Path) -> Non
|
|||||||
logger.error("Failed to update timestamp for %s", file_path)
|
logger.error("Failed to update timestamp for %s", file_path)
|
||||||
|
|
||||||
|
|
||||||
def strip_unsafe_tags(content: str) -> str:
|
|
||||||
"""Strip unsafe HTML tags and return the cleaned content.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
content (str): The HTML content to clean.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: The cleaned HTML content.
|
|
||||||
|
|
||||||
"""
|
|
||||||
sanitizer = Sanitizer({
|
|
||||||
"tags": {"a", "br", "b", "strong", "i", "em", "code", "s", "strike", "del", "u"},
|
|
||||||
"empty": {"a", "br"},
|
|
||||||
"separate": {"br"},
|
|
||||||
})
|
|
||||||
return sanitizer.sanitize(content.replace("\n", "<br>")).replace("<br>", "\n") # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType]
|
|
||||||
|
|
||||||
|
|
||||||
def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
|
def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
|
||||||
"""Generate an Atom feed from a list of articles.
|
"""Generate an Atom feed from a list of articles.
|
||||||
|
|
||||||
@ -334,7 +310,6 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
|
|||||||
|
|
||||||
article_title: str = article.get("articleTitle", "No Title")
|
article_title: str = article.get("articleTitle", "No Title")
|
||||||
article_content: str = article.get("articleContent", article_title)
|
article_content: str = article.get("articleContent", article_title)
|
||||||
article_content = strip_unsafe_tags(article_content)
|
|
||||||
if not article_content:
|
if not article_content:
|
||||||
article_content = article_title
|
article_content = article_title
|
||||||
|
|
||||||
@ -347,19 +322,17 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
|
|||||||
timestamp: datetime = datetime.strptime(str(article_create_time), "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
|
timestamp: datetime = datetime.strptime(str(article_create_time), "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
|
||||||
iso_time: str = timestamp.isoformat()
|
iso_time: str = timestamp.isoformat()
|
||||||
published = f"<published>{iso_time}</published>"
|
published = f"<published>{iso_time}</published>"
|
||||||
# Use createTime as updated if available (more accurate than now)
|
|
||||||
updated = iso_time
|
updated = iso_time
|
||||||
|
|
||||||
article_category: str = article.get("articleTypeName", "Wuthering Waves")
|
article_category: str = article.get("articleTypeName", "Wuthering Waves")
|
||||||
category: str = f'<category term="{escape(article_category)}"/>' if article_category else ""
|
category: str = f'<category term="{escape(article_category)}"/>' if article_category else ""
|
||||||
# Create entry using Atom format
|
|
||||||
atom_entries.append(
|
atom_entries.append(
|
||||||
f"""
|
f"""
|
||||||
<entry>
|
<entry>
|
||||||
<id>{entry_id}</id>
|
<id>{entry_id}</id>
|
||||||
<title>{escape(article_title)}</title>
|
<title>{escape(article_title)}</title>
|
||||||
<link href="{article_url}" rel="alternate" type="text/html"/>
|
<link href="{article_url}" rel="alternate" type="text/html"/>
|
||||||
<content type="html">{escape(article_content.strip())}</content>
|
<content type="html">{escape(article_content.strip()).replace("\n", "<br/>")}</content>
|
||||||
{published}
|
{published}
|
||||||
<updated>{updated}</updated>
|
<updated>{updated}</updated>
|
||||||
{category}
|
{category}
|
||||||
|
Reference in New Issue
Block a user