Add article data caching and precompiled regex patterns for circled numbers

Co-authored-by: TheLovinator1 <4153203+TheLovinator1@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2025-11-03 05:54:24 +00:00
parent c972592a2f
commit f1924f38ad
3 changed files with 55 additions and 29 deletions

5
.gitignore vendored
View File

@@ -1 +1,6 @@
articles/ArticleMenu.json articles/ArticleMenu.json
__pycache__/
*.pyc
*.pyo
*.pyd
.Python

View File

Binary file not shown.

View File

@@ -36,13 +36,22 @@ ESCAPED_STAR_PATTERN = re.compile(r"\\\*(.*)", re.MULTILINE)
NON_BREAKING_SPACE_PATTERN = re.compile(r"[\xa0 ]") # noqa: RUF001 NON_BREAKING_SPACE_PATTERN = re.compile(r"[\xa0 ]") # noqa: RUF001
EMPTY_CODE_BLOCK_PATTERN = re.compile(r"```[ \t]*\n[ \t]*\n```") EMPTY_CODE_BLOCK_PATTERN = re.compile(r"```[ \t]*\n[ \t]*\n```")
# Circled number patterns # Circled number patterns - precompile for better performance
CIRCLED_NUMBERS = { CIRCLED_NUMBERS = {
"": "1", "": "2", "": "3", "": "4", "": "5", "": ("1", re.compile(r"^\s*①\s*(.*?)\s*$", re.MULTILINE)),
"": "6", "": "7", "": "8", "": "9", "": "10", "": ("2", re.compile(r"^\s*②\s*(.*?)\s*$", re.MULTILINE)),
"": ("3", re.compile(r"^\s*③\s*(.*?)\s*$", re.MULTILINE)),
"": ("4", re.compile(r"^\s*④\s*(.*?)\s*$", re.MULTILINE)),
"": ("5", re.compile(r"^\s*⑤\s*(.*?)\s*$", re.MULTILINE)),
"": ("6", re.compile(r"^\s*⑥\s*(.*?)\s*$", re.MULTILINE)),
"": ("7", re.compile(r"^\s*⑦\s*(.*?)\s*$", re.MULTILINE)),
"": ("8", re.compile(r"^\s*⑧\s*(.*?)\s*$", re.MULTILINE)),
"": ("9", re.compile(r"^\s*⑨\s*(.*?)\s*$", re.MULTILINE)),
"": ("10", re.compile(r"^\s*⑩\s*(.*?)\s*$", re.MULTILINE)),
} }
async def fetch_json(url: str, client: httpx.AsyncClient) -> dict[Any, Any] | None: async def fetch_json(url: str, client: httpx.AsyncClient) -> dict[Any, Any] | None:
"""Fetch JSON data from a URL. """Fetch JSON data from a URL.
@@ -447,14 +456,9 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
# If `※ Word` is in the content, replace it with `*Word*` instead # If `※ Word` is in the content, replace it with `*Word*` instead
content = REFERENCE_MARK_PATTERN.sub(r"\n\n*\1*\n\n", content) content = REFERENCE_MARK_PATTERN.sub(r"\n\n*\1*\n\n", content)
# Replace circled Unicode numbers with plain numbered text # Replace circled Unicode numbers with plain numbered text (using precompiled patterns)
for symbol, number in CIRCLED_NUMBERS.items(): for symbol, (number, pattern) in CIRCLED_NUMBERS.items():
content = re.sub( content = pattern.sub(rf"\n\n{number}. \1\n\n", content)
pattern=rf"^\s*{re.escape(symbol)}\s*(.*?)\s*$",
repl=rf"\n\n{number}. \1\n\n",
string=content,
flags=re.MULTILINE,
)
content = ESCAPED_STAR_PATTERN.sub(r"* \1", content) content = ESCAPED_STAR_PATTERN.sub(r"* \1", content)
@@ -552,7 +556,30 @@ def generate_atom_feed(articles: list[dict[Any, Any]], file_name: str) -> str:
return atom_feed return atom_feed
def create_atom_feeds(output_dir: Path) -> None: def load_all_articles(output_dir: Path) -> list[dict[Any, Any]]:
"""Load all article JSON files from the output directory.
Args:
output_dir (Path): The directory containing article JSON files.
Returns:
list[dict[Any, Any]]: List of article data dictionaries.
"""
articles: list[dict[Any, Any]] = []
for file in output_dir.glob("*.json"):
if file.stem == "ArticleMenu":
continue
with file.open("r", encoding="utf-8") as f:
try:
article_data: dict[Any, Any] = json.load(f)
articles.append(article_data)
except json.JSONDecodeError:
logger.exception("Error decoding JSON from %s", file)
continue
return articles
def create_atom_feeds(articles: list[dict[Any, Any]], output_dir: Path) -> None:
"""Create Atom feeds for the articles. """Create Atom feeds for the articles.
Current feeds are: Current feeds are:
@@ -560,28 +587,19 @@ def create_atom_feeds(output_dir: Path) -> None:
- All articles - All articles
Args: Args:
articles (list[dict[Any, Any]]): List of article data.
output_dir (Path): The directory to save the Atom feed files. output_dir (Path): The directory to save the Atom feed files.
""" """
menu_data: list[dict[Any, Any]] = [] if not articles:
# Load data from all the articles logger.error("Can't create Atom feeds, no articles provided")
for file in output_dir.glob("*.json"): return
if file.stem == "ArticleMenu": if not articles:
continue logger.error("Can't create Atom feeds, no articles provided")
with file.open("r", encoding="utf-8") as f:
try:
article_data: dict[Any, Any] = json.load(f)
menu_data.append(article_data)
except json.JSONDecodeError:
logger.exception("Error decoding JSON from %s", file)
continue
if not menu_data:
logger.error("Can't create Atom feeds, no articles found in %s", output_dir)
return return
articles_sorted: list[dict[Any, Any]] = sorted( articles_sorted: list[dict[Any, Any]] = sorted(
menu_data, articles,
key=lambda x: get_file_timestamp(x.get("createTime", "")), key=lambda x: get_file_timestamp(x.get("createTime", "")),
reverse=True, reverse=True,
) )
@@ -767,9 +785,12 @@ async def main() -> Literal[1, 0]:
else: else:
logger.info("No new articles to download") logger.info("No new articles to download")
# Load all articles once for efficient processing
all_articles = load_all_articles(output_dir)
add_data_to_articles(menu_data, output_dir) add_data_to_articles(menu_data, output_dir)
add_articles_to_readme(menu_data) add_articles_to_readme(menu_data)
create_atom_feeds(output_dir) create_atom_feeds(all_articles, output_dir)
batch_process_timestamps(menu_data, output_dir) batch_process_timestamps(menu_data, output_dir)
logger.info("Script finished. Articles are in the '%s' directory.", output_dir) logger.info("Script finished. Articles are in the '%s' directory.", output_dir)