mirror of
https://github.com/TheLovinator1/wutheringwaves.git
synced 2025-11-28 04:59:49 +01:00
Optimize regex operations and remove unnecessary HTML prettification
Co-authored-by: TheLovinator1 <4153203+TheLovinator1@users.noreply.github.com>
This commit is contained in:
BIN
__pycache__/scrape.cpython-312.pyc
Normal file
BIN
__pycache__/scrape.cpython-312.pyc
Normal file
Binary file not shown.
102
scrape.py
102
scrape.py
@@ -14,7 +14,6 @@ import aiofiles
|
|||||||
import httpx
|
import httpx
|
||||||
import markdown
|
import markdown
|
||||||
import mdformat
|
import mdformat
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from markdownify import MarkdownConverter # pyright: ignore[reportMissingTypeStubs]
|
from markdownify import MarkdownConverter # pyright: ignore[reportMissingTypeStubs]
|
||||||
from markupsafe import Markup, escape
|
from markupsafe import Markup, escape
|
||||||
|
|
||||||
@@ -28,6 +27,21 @@ logging.basicConfig(
|
|||||||
|
|
||||||
logger: logging.Logger = logging.getLogger("wutheringwaves")
|
logger: logging.Logger = logging.getLogger("wutheringwaves")
|
||||||
|
|
||||||
|
# Compile regex patterns for better performance
|
||||||
|
DISCORD_LINK_PATTERN = re.compile(r'\[([^\]]+)\]\((https?://[^\s)]+) "\2"\)')
|
||||||
|
SQUARE_BRACKETS_PATTERN = re.compile(r"^\s*\[([^\]]+)\]\s*$", re.MULTILINE)
|
||||||
|
BALL_PATTERN = re.compile(r"●\s*(.*?)\n", re.MULTILINE)
|
||||||
|
REFERENCE_MARK_PATTERN = re.compile(r"^\s*※\s*(\S.*?)\s*$", re.MULTILINE)
|
||||||
|
ESCAPED_STAR_PATTERN = re.compile(r"\\\*(.*)", re.MULTILINE)
|
||||||
|
NON_BREAKING_SPACE_PATTERN = re.compile(r"[\xa0 ]") # noqa: RUF001
|
||||||
|
EMPTY_CODE_BLOCK_PATTERN = re.compile(r"```[ \t]*\n[ \t]*\n```")
|
||||||
|
|
||||||
|
# Circled number patterns
|
||||||
|
CIRCLED_NUMBERS = {
|
||||||
|
"①": "1", "②": "2", "③": "3", "④": "4", "⑤": "5",
|
||||||
|
"⑥": "6", "⑦": "7", "⑧": "8", "⑨": "9", "⑩": "10",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
async def fetch_json(url: str, client: httpx.AsyncClient) -> dict[Any, Any] | None:
|
async def fetch_json(url: str, client: httpx.AsyncClient) -> dict[Any, Any] | None:
|
||||||
"""Fetch JSON data from a URL.
|
"""Fetch JSON data from a URL.
|
||||||
@@ -325,13 +339,7 @@ def format_discord_links(md: str) -> str:
|
|||||||
|
|
||||||
# Before: [Link](https://example.com "Link")
|
# Before: [Link](https://example.com "Link")
|
||||||
# After: [Link](https://example.com)
|
# After: [Link](https://example.com)
|
||||||
formatted_links_md: str = re.sub(
|
return DISCORD_LINK_PATTERN.sub(repl, md)
|
||||||
pattern=r'\[([^\]]+)\]\((https?://[^\s)]+) "\2"\)',
|
|
||||||
repl=repl,
|
|
||||||
string=md,
|
|
||||||
)
|
|
||||||
|
|
||||||
return formatted_links_md
|
|
||||||
|
|
||||||
|
|
||||||
def handle_stars(text: str) -> str:
|
def handle_stars(text: str) -> str:
|
||||||
@@ -422,80 +430,36 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
|
|||||||
logger.warning(msg)
|
logger.warning(msg)
|
||||||
article_content_converted = "No content available"
|
article_content_converted = "No content available"
|
||||||
|
|
||||||
# Remove non-breaking spaces
|
# Combine non-breaking space replacements in one pass
|
||||||
xa0_removed: str = re.sub(
|
content = NON_BREAKING_SPACE_PATTERN.sub(" ", article_content_converted)
|
||||||
r"\xa0", " ", article_content_converted
|
|
||||||
) # Replace non-breaking spaces with regular spaces
|
|
||||||
|
|
||||||
# Replace non-breaking spaces with regular spaces
|
# Remove empty code blocks
|
||||||
non_breaking_space_removed: str = xa0_removed.replace(
|
content = EMPTY_CODE_BLOCK_PATTERN.sub("", content)
|
||||||
" ", # noqa: RUF001
|
|
||||||
" ",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Remove code blocks that has only spaces and newlines inside them
|
|
||||||
empty_code_block_removed: str = re.sub(
|
|
||||||
pattern=r"```[ \t]*\n[ \t]*\n```",
|
|
||||||
repl="",
|
|
||||||
string=non_breaking_space_removed, # type: ignore # noqa: PGH003
|
|
||||||
)
|
|
||||||
|
|
||||||
# [How to Update] should be # How to Update
|
# [How to Update] should be # How to Update
|
||||||
square_brackets_converted: str = re.sub(
|
content = SQUARE_BRACKETS_PATTERN.sub(r"# \1", content)
|
||||||
pattern=r"^\s*\[([^\]]+)\]\s*$",
|
|
||||||
repl=r"# \1",
|
|
||||||
string=empty_code_block_removed, # type: ignore # noqa: PGH003
|
|
||||||
flags=re.MULTILINE,
|
|
||||||
)
|
|
||||||
|
|
||||||
stars_converted: str = handle_stars(square_brackets_converted)
|
content = handle_stars(content)
|
||||||
|
|
||||||
# If `● Word` is in the content, replace it `## Word` instead with regex
|
# If `● Word` is in the content, replace it with `## Word` instead
|
||||||
ball_converted: str = re.sub(
|
content = BALL_PATTERN.sub(r"\n\n## \1\n\n", content)
|
||||||
pattern=r"●\s*(.*?)\n",
|
|
||||||
repl=r"\n\n## \1\n\n",
|
|
||||||
string=stars_converted,
|
|
||||||
flags=re.MULTILINE,
|
|
||||||
)
|
|
||||||
|
|
||||||
# If `※ Word` is in the content, replace it `* word * ` instead with regex
|
# If `※ Word` is in the content, replace it with `*Word*` instead
|
||||||
reference_mark_converted: str = re.sub(
|
content = REFERENCE_MARK_PATTERN.sub(r"\n\n*\1*\n\n", content)
|
||||||
pattern=r"^\s*※\s*(\S.*?)\s*$",
|
|
||||||
repl=r"\n\n*\1*\n\n",
|
|
||||||
string=ball_converted,
|
|
||||||
flags=re.MULTILINE,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Replace circled Unicode numbers (①-⑳) with plain numbered text (e.g., "1. ", "2. ", ..., "20. ")
|
# Replace circled Unicode numbers with plain numbered text
|
||||||
number_symbol: dict[str, str] = {
|
for symbol, number in CIRCLED_NUMBERS.items():
|
||||||
"①": "1",
|
content = re.sub(
|
||||||
"②": "2",
|
|
||||||
"③": "3",
|
|
||||||
"④": "4",
|
|
||||||
"⑤": "5",
|
|
||||||
"⑥": "6",
|
|
||||||
"⑦": "7",
|
|
||||||
"⑧": "8",
|
|
||||||
"⑨": "9",
|
|
||||||
"⑩": "10",
|
|
||||||
}
|
|
||||||
for symbol, number in number_symbol.items():
|
|
||||||
reference_mark_converted = re.sub(
|
|
||||||
pattern=rf"^\s*{re.escape(symbol)}\s*(.*?)\s*$",
|
pattern=rf"^\s*{re.escape(symbol)}\s*(.*?)\s*$",
|
||||||
repl=rf"\n\n{number}. \1\n\n",
|
repl=rf"\n\n{number}. \1\n\n",
|
||||||
string=reference_mark_converted,
|
string=content,
|
||||||
flags=re.MULTILINE,
|
flags=re.MULTILINE,
|
||||||
)
|
)
|
||||||
|
|
||||||
space_before_star_added: str = re.sub(
|
content = ESCAPED_STAR_PATTERN.sub(r"* \1", content)
|
||||||
pattern=r"\\\*(.*)",
|
|
||||||
repl=r"* \1",
|
|
||||||
string=reference_mark_converted,
|
|
||||||
flags=re.MULTILINE,
|
|
||||||
)
|
|
||||||
|
|
||||||
markdown_formatted: str = mdformat.text( # type: ignore # noqa: PGH003
|
markdown_formatted: str = mdformat.text( # type: ignore # noqa: PGH003
|
||||||
space_before_star_added,
|
content,
|
||||||
options={
|
options={
|
||||||
"number": True, # Allow 1., 2., 3. numbering
|
"number": True, # Allow 1., 2., 3. numbering
|
||||||
},
|
},
|
||||||
@@ -556,7 +520,7 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
|
|||||||
html_file: Path = html_dir / f"{article_id}.html"
|
html_file: Path = html_dir / f"{article_id}.html"
|
||||||
if not html_file.is_file():
|
if not html_file.is_file():
|
||||||
with html_file.open("w", encoding="utf-8") as f:
|
with html_file.open("w", encoding="utf-8") as f:
|
||||||
f.write(str(BeautifulSoup(html, "html.parser").prettify()))
|
f.write(html)
|
||||||
logger.info("Saved HTML for article %s to %s", article_id, html_file)
|
logger.info("Saved HTML for article %s to %s", article_id, html_file)
|
||||||
|
|
||||||
# Set the file timestamp
|
# Set the file timestamp
|
||||||
|
|||||||
Reference in New Issue
Block a user