Enhance Markdown processing and link formatting
This commit is contained in:
1
.vscode/settings.json
vendored
1
.vscode/settings.json
vendored
@ -7,6 +7,7 @@
|
|||||||
"levelname",
|
"levelname",
|
||||||
"markdownify",
|
"markdownify",
|
||||||
"markupsafe",
|
"markupsafe",
|
||||||
|
"mdformat",
|
||||||
"TheLovinator",
|
"TheLovinator",
|
||||||
"Wuthering",
|
"Wuthering",
|
||||||
"wutheringwaves"
|
"wutheringwaves"
|
||||||
|
8181
articles_all.xml
8181
articles_all.xml
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -10,4 +10,5 @@ dependencies = [
|
|||||||
"httpx",
|
"httpx",
|
||||||
"markdownify",
|
"markdownify",
|
||||||
"markupsafe",
|
"markupsafe",
|
||||||
|
"mdformat",
|
||||||
]
|
]
|
||||||
|
182
scrape.py
182
scrape.py
@ -1,7 +1,8 @@
|
|||||||
import asyncio
|
import asyncio # noqa: CPY001, D100
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess # noqa: S404
|
import subprocess # noqa: S404
|
||||||
import time
|
import time
|
||||||
@ -11,8 +12,9 @@ from typing import TYPE_CHECKING, Any, Literal
|
|||||||
|
|
||||||
import aiofiles
|
import aiofiles
|
||||||
import httpx
|
import httpx
|
||||||
from markdownify import MarkdownConverter
|
import mdformat
|
||||||
from markupsafe import escape
|
from markdownify import MarkdownConverter # pyright: ignore[reportMissingTypeStubs]
|
||||||
|
from markupsafe import Markup, escape
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from collections.abc import Coroutine
|
from collections.abc import Coroutine
|
||||||
@ -104,6 +106,10 @@ def get_file_timestamp(timestamp_str: str) -> float:
|
|||||||
float: The Unix timestamp, or 0 if conversion failed.
|
float: The Unix timestamp, or 0 if conversion failed.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
if not timestamp_str:
|
||||||
|
logger.info("Empty timestamp string")
|
||||||
|
return 0.0
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Parse the timestamp string
|
# Parse the timestamp string
|
||||||
dt: datetime = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
|
dt: datetime = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
|
||||||
@ -279,31 +285,80 @@ def batch_process_timestamps(menu_data: dict[Any, Any], output_dir: Path) -> Non
|
|||||||
logger.error("Failed to update timestamp for %s", file_path)
|
logger.error("Failed to update timestamp for %s", file_path)
|
||||||
|
|
||||||
|
|
||||||
class CustomLinkMarkdownConverter(MarkdownConverter):
|
def format_discord_links(md: str) -> str:
|
||||||
"""Custom Markdown converter to handle links.
|
"""Make links work in Discord.
|
||||||
|
|
||||||
This class is a subclass of MarkdownConverter
|
Discord doesn't support links with titles, so we need to remove them.
|
||||||
and overrides the convert_a method to customize
|
This function also adds angle brackets around the URL to not embed it.
|
||||||
the conversion of <a> tags to Markdown links.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def convert_a(self, el: Any, text: str, **kwargs) -> str: # type: ignore # noqa: ANN003, ANN401, ARG002, PGH003, PLR6301
|
|
||||||
"""Convert <a> tags.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
el (Any): The element to convert.
|
md (str): The Markdown text containing links.
|
||||||
text (str): The text content of the element.
|
|
||||||
kwargs (Any): Additional arguments.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: The converted text.
|
str: The modified Markdown text with simplified links.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
href: str | None = el.get("href")
|
|
||||||
if not href:
|
|
||||||
return text
|
|
||||||
|
|
||||||
return f"[{text}](<{href}>)"
|
def repl(match: re.Match[str]) -> str:
|
||||||
|
url: str | Any = match.group(2)
|
||||||
|
display: str = re.sub(pattern=r"^https?://(www\.)?", repl="", string=url)
|
||||||
|
return f"[{display}]({url})"
|
||||||
|
|
||||||
|
# Before: [Link](https://example.com "https://example.com")
|
||||||
|
# After: [example.com](https://example.com)
|
||||||
|
formatted_links_md = re.sub(
|
||||||
|
pattern=r'\[([^\]]+)\]\((https?://[^\s)]+) "\2"\)',
|
||||||
|
repl=repl,
|
||||||
|
string=md,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Before: [Link](https://example.com)
|
||||||
|
# After: [Link](<https://example.com>)
|
||||||
|
add_angle_brackets_md: str = re.sub(
|
||||||
|
pattern=r"\[([^\]]+)\]\((https?://[^\s)]+)\)",
|
||||||
|
repl=r"[\1](<\2>)",
|
||||||
|
string=formatted_links_md,
|
||||||
|
)
|
||||||
|
|
||||||
|
return add_angle_brackets_md
|
||||||
|
|
||||||
|
|
||||||
|
def handle_stars(text: str) -> str:
    """Convert ✦-decorated lines into Markdown headings and bullets.

    Every line is stripped of surrounding whitespace and blank lines are
    dropped. The remaining lines are rewritten as follows and then joined
    with a blank line between them:

    - ``✦ Title ✦`` and ``**✦ Title ✦**`` become ``# Title``.
    - ``✦ Title`` becomes ``* Title``.
    - Any other line is kept as is.

    A line that consists only of star markers (for example a lone ``✦``)
    has no title to promote, so it is kept unchanged instead of being
    turned into an empty heading or an empty bullet.

    Args:
        text (str): The text to process.

    Returns:
        str: The processed text with star markers replaced by headers and bullets.

    """
    star = "✦"
    bold_open = f"**{star}"
    bold_close = f"{star}**"
    output: list[str] = []

    for raw_line in text.strip().splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # The length checks make sure the opening and closing markers are two
        # distinct occurrences; otherwise a lone "✦" or "**✦**" would count as
        # "wrapped" and produce a heading made of leftover marker characters.
        if len(line) >= 2 * len(star) and line.startswith(star) and line.endswith(star):
            # Before: ✦ Title ✦
            # After: # Title
            title = line.removeprefix(star).removesuffix(star).strip()
            converted = f"# {title}"
        elif (
            len(line) >= len(bold_open) + len(bold_close)
            and line.startswith(bold_open)
            and line.endswith(bold_close)
        ):
            # Before: **✦ Title ✦**
            # After: # Title
            title = line.removeprefix(bold_open).removesuffix(bold_close).strip()
            converted = f"# {title}"
        elif line.startswith(star):
            # Before: ✦ Title
            # After: * Title
            title = line.removeprefix(star).strip()
            converted = f"* {title}"
        else:
            title = line
            converted = line

        # Nothing left after removing the markers: keep the line verbatim
        # rather than emitting an empty "# " heading or "* " bullet.
        output.append(converted if title else line)

    return "\n\n".join(output)
|
||||||
|
|
||||||
|
|
||||||
def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str: # noqa: PLR0914
|
def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str: # noqa: PLR0914
|
||||||
@ -341,14 +396,88 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
|
|||||||
if not article_content:
|
if not article_content:
|
||||||
article_content = article_title
|
article_content = article_title
|
||||||
|
|
||||||
converter: CustomLinkMarkdownConverter = CustomLinkMarkdownConverter(
|
converter: MarkdownConverter = MarkdownConverter(
|
||||||
heading_style="ATX",
|
heading_style="ATX",
|
||||||
bullets="-",
|
bullets="-",
|
||||||
strip=["img"],
|
strip=["img"],
|
||||||
|
default_title="Link",
|
||||||
)
|
)
|
||||||
article_content = article_content.replace(" ", " ") # Replace non-breaking spaces with regular spaces # noqa: RUF001
|
article_content_converted = str(converter.convert(article_content).strip()) # type: ignore # noqa: PGH003
|
||||||
article_content: str = converter.convert(article_content).strip() # type: ignore # noqa: PGH003
|
|
||||||
article_content = escape(article_content)
|
if not article_content_converted:
|
||||||
|
msg: str = f"Article content is empty for article ID: {article_id}"
|
||||||
|
logger.warning(msg)
|
||||||
|
article_content_converted = "No content available"
|
||||||
|
|
||||||
|
# Replace non-breaking spaces (\xa0) with regular spaces
|
||||||
|
xa0_removed: str = re.sub(r"\xa0", " ", article_content_converted) # Replace non-breaking spaces with regular spaces
|
||||||
|
|
||||||
|
# Replace non-breaking spaces with regular spaces
|
||||||
|
non_breaking_space_removed: str = xa0_removed.replace(
|
||||||
|
" ", # noqa: RUF001
|
||||||
|
" ",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Remove code blocks that have only spaces and newlines inside them
|
||||||
|
empty_code_block_removed: str = re.sub(
|
||||||
|
pattern=r"```[ \t]*\n[ \t]*\n```",
|
||||||
|
repl="",
|
||||||
|
string=non_breaking_space_removed, # type: ignore # noqa: PGH003
|
||||||
|
)
|
||||||
|
|
||||||
|
# [How to Update] should be # How to Update
|
||||||
|
square_brackets_converted: str = re.sub(
|
||||||
|
pattern=r"^\s*\[([^\]]+)\]\s*$",
|
||||||
|
repl=r"# \1",
|
||||||
|
string=empty_code_block_removed, # type: ignore # noqa: PGH003
|
||||||
|
flags=re.MULTILINE,
|
||||||
|
)
|
||||||
|
|
||||||
|
stars_converted: str = handle_stars(square_brackets_converted)
|
||||||
|
|
||||||
|
# Replace each `● Word` line with a `## Word` heading using regex
|
||||||
|
ball_converted: str = re.sub(pattern=r"●\s*(.*?)\n", repl=r"\n\n## \1\n\n", string=stars_converted, flags=re.MULTILINE)
|
||||||
|
|
||||||
|
# Replace each `※ Word` line with italic `*Word*` using regex
|
||||||
|
reference_mark_converted: str = re.sub(
|
||||||
|
pattern=r"^\s*※\s*(\S.*?)\s*$",
|
||||||
|
repl=r"\n\n*\1*\n\n",
|
||||||
|
string=ball_converted,
|
||||||
|
flags=re.MULTILINE,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Replace circled Unicode numbers (①-⑩) with plain numbered text (e.g., "1. ", "2. ", ..., "10. ")
|
||||||
|
number_symbol: dict[str, str] = {
|
||||||
|
"①": "1",
|
||||||
|
"②": "2",
|
||||||
|
"③": "3",
|
||||||
|
"④": "4",
|
||||||
|
"⑤": "5",
|
||||||
|
"⑥": "6",
|
||||||
|
"⑦": "7",
|
||||||
|
"⑧": "8",
|
||||||
|
"⑨": "9",
|
||||||
|
"⑩": "10",
|
||||||
|
}
|
||||||
|
for symbol, number in number_symbol.items():
|
||||||
|
reference_mark_converted = re.sub(
|
||||||
|
pattern=rf"^\s*{re.escape(symbol)}\s*(.*?)\s*$",
|
||||||
|
repl=rf"\n\n{number}. \1\n\n",
|
||||||
|
string=reference_mark_converted,
|
||||||
|
flags=re.MULTILINE,
|
||||||
|
)
|
||||||
|
|
||||||
|
space_before_star_added: str = re.sub(pattern=r"\\\*(.*)", repl=r"* \1", string=reference_mark_converted, flags=re.MULTILINE)
|
||||||
|
|
||||||
|
markdown_formatted: str = mdformat.text( # type: ignore # noqa: PGH003
|
||||||
|
space_before_star_added,
|
||||||
|
options={
|
||||||
|
"number": True, # Allow 1., 2., 3. numbering
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
links_fixed: str = format_discord_links(markdown_formatted)
|
||||||
|
article_escaped: Markup = escape(links_fixed)
|
||||||
|
|
||||||
article_url: str = f"https://wutheringwaves.kurogames.com/en/main/news/detail/{article_id}"
|
article_url: str = f"https://wutheringwaves.kurogames.com/en/main/news/detail/{article_id}"
|
||||||
article_create_time: str = article.get("createTime", "")
|
article_create_time: str = article.get("createTime", "")
|
||||||
@ -361,6 +490,9 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
|
|||||||
published = f"<published>{iso_time}</published>"
|
published = f"<published>{iso_time}</published>"
|
||||||
updated = iso_time
|
updated = iso_time
|
||||||
|
|
||||||
|
if article_id == "1004":
|
||||||
|
logger.info("Article ID: %s, Date: %s", article_id, article_create_time)
|
||||||
|
|
||||||
article_category: str = article.get("articleTypeName", "Wuthering Waves")
|
article_category: str = article.get("articleTypeName", "Wuthering Waves")
|
||||||
category: str = f'<category term="{escape(article_category)}"/>' if article_category else ""
|
category: str = f'<category term="{escape(article_category)}"/>' if article_category else ""
|
||||||
atom_entries.append(
|
atom_entries.append(
|
||||||
@ -369,7 +501,7 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
|
|||||||
<id>{entry_id}</id>
|
<id>{entry_id}</id>
|
||||||
<title>{escape(article_title)}</title>
|
<title>{escape(article_title)}</title>
|
||||||
<link href="{article_url}" rel="alternate" type="text/html"/>
|
<link href="{article_url}" rel="alternate" type="text/html"/>
|
||||||
<content type="text">{article_content}</content>
|
<content type="text">{article_escaped}</content>
|
||||||
{published}
|
{published}
|
||||||
<updated>{updated}</updated>
|
<updated>{updated}</updated>
|
||||||
{category}
|
{category}
|
||||||
|
Reference in New Issue
Block a user