Use Markdown instead of HTML

2025-05-10 19:07:11 +02:00
parent 81742c35ce
commit e58751e464
5 changed files with 14328 additions and 422 deletions

@@ -5,6 +5,7 @@
     "httpx",
     "Joakim",
     "levelname",
+    "markdownify",
     "markupsafe",
     "TheLovinator",
     "Wuthering",

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@@ -4,4 +4,10 @@ version = "0.1.0"
 description = "Wuthering Waves archive"
 readme = "README.md"
 requires-python = ">=3.13"
-dependencies = ["aiofiles", "beautifulsoup4", "httpx", "markupsafe"]
+dependencies = [
+    "aiofiles",
+    "beautifulsoup4",
+    "httpx",
+    "markdownify",
+    "markupsafe",
+]
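
The only new dependency is markdownify, which provides the HTML-to-Markdown conversion that replaces the hand-rolled clean_html removed below. For orientation, a minimal sketch of markdownify's two entry points (both are its real API; the sample HTML is invented):

    from markdownify import MarkdownConverter, markdownify as md

    html = "<h1>Title</h1><p>Some <b>bold</b> text.</p>"  # invented sample input
    print(md(html))  # one-shot helper; headings default to the underlined style
    print(MarkdownConverter(heading_style="ATX").convert(html))  # "# Title" style

The class form is the one scrape.py subclasses below to customize link handling.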

scrape.py (136 changed lines)

@@ -11,8 +11,7 @@ from typing import TYPE_CHECKING, Any, Literal
 
 import aiofiles
 import httpx
-from bs4 import BeautifulSoup
-from bs4.element import PageElement, Tag
+from markdownify import MarkdownConverter
 from markupsafe import escape
 
 if TYPE_CHECKING:
@@ -280,119 +279,34 @@ def batch_process_timestamps(menu_data: dict[Any, Any], output_dir: Path) -> None:
             logger.error("Failed to update timestamp for %s", file_path)
 
 
-def clean_html(html: str) -> str:  # noqa: C901, PLR0912, PLR0915
-    """Clean HTML content by removing unwanted tags and formatting.
-
-    Args:
-        html (str): The HTML content to clean.
-
-    Returns:
-        str: The cleaned HTML content.
-    """
-    soup = BeautifulSoup(html, "html.parser")
-
-    # 1. Remove unwanted tags completely
-    tags_to_remove: list[str] = ["img", "pre"]
-    for tag_name in tags_to_remove:
-        for tag in soup.find_all(tag_name):
-            tag.decompose()  # Removes the tag and its content
-
-    # 2. Unwrap tags whose content should be preserved directly
-    tags_to_unwrap: list[str] = ["div", "p"]
-    for tag_name in tags_to_unwrap:
-        for element in soup.find_all(tag_name):
-            if isinstance(element, Tag):
-                element.unwrap()  # Removes the tag, keeps its children
-
-    # 3. Process <span> tags: extract their text and <br> tags, then remove the <span>
-    for span_tag in soup.find_all("span"):
-        if not isinstance(span_tag, Tag):
-            continue
-
-        content_to_insert: list[str | Tag] = []
-        for child in span_tag.contents:
-            if isinstance(child, Tag) and child.name == "br":
-                # Create a new <br> tag object to insert
-                br_tag = soup.new_tag("br")
-                content_to_insert.append(br_tag)
-            elif isinstance(child, str):  # It's a NavigableString (text)
-                # Add the text content directly
-                content_to_insert.append(child)
-            # Add handling for other nested tags within span if necessary
-
-        # Insert the extracted content before the span tag, in order
-        for item in content_to_insert:
-            span_tag.insert_before(item)
-
-        # Remove the original span tag
-        span_tag.decompose()
-
-    # 4. Consolidate text nodes and handle <br> tag sequences
-    # Determine the list of elements to iterate over (direct children of the main parsed content)
-    content_nodes: list[PageElement] = []
-    if soup.body:  # If BeautifulSoup added <html><body> tags
-        content_nodes = soup.body.contents
-    elif soup.html:  # If only <html> tag was added
-        content_nodes = soup.html.contents
-    else:  # If it's a fragment and no top-level tags were added by BS
-        content_nodes = soup.contents
-
-    final_output_parts: list[str] = []
-    consecutive_br_count = 0
-    max_br_allowed = 2  # Maximum number of <br> tags to add in sequence
-
-    for element in content_nodes:
-        if isinstance(element, str):  # It's a NavigableString (text node)
-            # First, handle any accumulated <br> tags before this text
-            if consecutive_br_count > 0:
-                brs_to_add = 0
-                if consecutive_br_count == 1:
-                    brs_to_add = 1
-                elif consecutive_br_count >= max_br_allowed:
-                    brs_to_add = 2
-                final_output_parts.extend("<br/>" for _ in range(brs_to_add))
-                consecutive_br_count = 0
-
-            # Clean and add the text
-            text: str = element.replace("\xa0", " ").strip()  # \xa0 is &nbsp;
-            if text:
-                final_output_parts.append(text)
-        elif isinstance(element, Tag) and element.name == "br":  # It's a <br> tag
-            consecutive_br_count += 1
-        else:  # Handle other unexpected elements if any (e.g., leftover unknown tags)
-            # This part depends on how strictly you want to clean.
-            # For now, we'll try to get their text if they weren't removed.
-            if consecutive_br_count > 0:  # Process pending BRs first
-                brs_to_add = 0
-                if consecutive_br_count == 1:
-                    brs_to_add = 1
-                elif consecutive_br_count >= max_br_allowed:
-                    brs_to_add = 2
-                final_output_parts.extend("<br/>" for _ in range(brs_to_add))
-                consecutive_br_count = 0
-            if hasattr(element, "get_text"):
-                other_text = element.get_text(separator=" ", strip=True).replace("\xa0", " ")
-                if other_text:
-                    final_output_parts.append(other_text)
-
-    # Handle any trailing <br> tags accumulated at the very end of the content
-    if consecutive_br_count > 0:
-        brs_to_add = 0
-        if consecutive_br_count == 1:
-            brs_to_add = 1
-        elif consecutive_br_count >= max_br_allowed:
-            brs_to_add = 2
-        final_output_parts.extend("<br/>" for _ in range(brs_to_add))
-
-    return "".join(final_output_parts)
+class CustomLinkMarkdownConverter(MarkdownConverter):
+    """Custom Markdown converter to handle links.
+
+    This class is a subclass of MarkdownConverter
+    and overrides the convert_a method to customize
+    the conversion of <a> tags to Markdown links.
+    """
+
+    def convert_a(self, el: Any, text: str, **kwargs) -> str:  # type: ignore  # noqa: ANN003, ANN401, ARG002, PGH003, PLR6301
+        """Convert <a> tags.
+
+        Args:
+            el (Any): The element to convert.
+            text (str): The text content of the element.
+            kwargs (Any): Additional arguments.
+
+        Returns:
+            str: The converted text.
+        """
+        href: str | None = el.get("href")
+        if not href:
+            return text
+
+        return f"[{text}](<{href}>)"
 
 
-def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
+def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:  # noqa: PLR0914
     """Generate an Atom feed from a list of articles.
 
     Args:
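
The only behavioral difference from stock markdownify is the link destination: wrapping the href in angle brackets keeps URLs containing parentheses or spaces intact. A minimal before/after sketch, assuming the class above is in scope (the URL is invented):

    from markdownify import MarkdownConverter

    html = '<a href="https://example.com/notes_(1.0)">patch notes</a>'

    # Stock converter: [patch notes](https://example.com/notes_(1.0))
    # A bare ")" inside the URL can end the link early in stricter parsers.
    print(MarkdownConverter().convert(html))

    # Custom converter: [patch notes](<https://example.com/notes_(1.0)>)
    # The <...> form marks the whole URL as the link destination.
    print(CustomLinkMarkdownConverter().convert(html))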
@@ -427,7 +341,13 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
         if not article_content:
             article_content = article_title
 
-        article_content = clean_html(article_content)
+        converter: CustomLinkMarkdownConverter = CustomLinkMarkdownConverter(
+            heading_style="ATX",
+            bullets="-",
+            strip=["img"],
+        )
+        article_content = article_content.replace(" ", " ")  # Replace non-breaking spaces with regular spaces  # noqa: RUF001
+        article_content: str = converter.convert(article_content).strip()  # type: ignore  # noqa: PGH003
 
         article_url: str = f"https://wutheringwaves.kurogames.com/en/main/news/detail/{article_id}"
         article_create_time: str = article.get("createTime", "")
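
For reference, roughly what those constructor options do (the sample HTML is invented and the output spacing approximate):

    converter = CustomLinkMarkdownConverter(
        heading_style="ATX",  # "## Heading" instead of underlined setext headings
        bullets="-",          # "-" list bullets rather than markdownify's default "*"
        strip=["img"],        # drop <img> tags entirely instead of converting them
    )
    sample = '<h2>Patch Notes</h2><ul><li>Balance <img src="icon.png"/> tweaks</li></ul>'
    print(converter.convert(sample).strip())
    # ## Patch Notes
    #
    # - Balance tweaks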
@@ -448,7 +368,7 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
     <id>{entry_id}</id>
     <title>{escape(article_title)}</title>
     <link href="{article_url}" rel="alternate" type="text/html"/>
-    <content type="html">{escape(article_content.strip()).replace("\n", "<br/>")}</content>
+    <content type="text">{article_content}</content>
     {published}
     <updated>{updated}</updated>
     {category}
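
The type switch matters in Atom: type="html" declares escaped HTML that readers unescape and render, while type="text" is displayed as-is, which fits Markdown. A sketch of the difference with a made-up entry body (note that even with type="text", XML-special characters such as & or < would still need escaping if they can occur in the converted Markdown):

    from markupsafe import escape

    md_body = "**Fixed** the [login](<https://example.com/login>) flow"  # invented

    old_entry = f'<content type="html">{escape(md_body)}</content>'  # escaped HTML, as before
    new_entry = f'<content type="text">{md_body}</content>'          # plain text, this commit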