diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a3c42c0..908367d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,7 +38,7 @@ repos: # An extremely fast Python linter and formatter. - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.5 + rev: v0.11.2 hooks: - id: ruff-format - id: ruff diff --git a/discord_rss_bot/filter/blacklist.py b/discord_rss_bot/filter/blacklist.py index 808d7c9..87b4913 100644 --- a/discord_rss_bot/filter/blacklist.py +++ b/discord_rss_bot/filter/blacklist.py @@ -2,7 +2,7 @@ from __future__ import annotations from typing import TYPE_CHECKING -from discord_rss_bot.filter.utils import is_word_in_text +from discord_rss_bot.filter.utils import is_regex_match, is_word_in_text if TYPE_CHECKING: from reader import Entry, Feed, Reader @@ -12,9 +12,14 @@ def feed_has_blacklist_tags(custom_reader: Reader, feed: Feed) -> bool: """Return True if the feed has blacklist tags. The following tags are checked: - - blacklist_title + - blacklist_author + - blacklist_content - blacklist_summary - - blacklist_content. + - blacklist_title + - regex_blacklist_author + - regex_blacklist_content + - regex_blacklist_summary + - regex_blacklist_title Args: custom_reader: The reader. @@ -23,14 +28,29 @@ def feed_has_blacklist_tags(custom_reader: Reader, feed: Feed) -> bool: Returns: bool: If the feed has any of the tags. """ - blacklist_title: str = str(custom_reader.get_tag(feed, "blacklist_title", "")) - blacklist_summary: str = str(custom_reader.get_tag(feed, "blacklist_summary", "")) - blacklist_content: str = str(custom_reader.get_tag(feed, "blacklist_content", "")) + blacklist_author: str = str(custom_reader.get_tag(feed, "blacklist_author", "")).strip() + blacklist_content: str = str(custom_reader.get_tag(feed, "blacklist_content", "")).strip() + blacklist_summary: str = str(custom_reader.get_tag(feed, "blacklist_summary", "")).strip() + blacklist_title: str = str(custom_reader.get_tag(feed, "blacklist_title", "")).strip() - return bool(blacklist_title or blacklist_summary or blacklist_content) + regex_blacklist_author: str = str(custom_reader.get_tag(feed, "regex_blacklist_author", "")).strip() + regex_blacklist_content: str = str(custom_reader.get_tag(feed, "regex_blacklist_content", "")).strip() + regex_blacklist_summary: str = str(custom_reader.get_tag(feed, "regex_blacklist_summary", "")).strip() + regex_blacklist_title: str = str(custom_reader.get_tag(feed, "regex_blacklist_title", "")).strip() + + return bool( + blacklist_title + or blacklist_author + or blacklist_content + or blacklist_summary + or regex_blacklist_author + or regex_blacklist_content + or regex_blacklist_summary + or regex_blacklist_title, + ) -def entry_should_be_skipped(custom_reader: Reader, entry: Entry) -> bool: +def entry_should_be_skipped(custom_reader: Reader, entry: Entry) -> bool: # noqa: PLR0911 """Return True if the entry is in the blacklist. Args: @@ -40,21 +60,58 @@ def entry_should_be_skipped(custom_reader: Reader, entry: Entry) -> bool: Returns: bool: If the entry is in the blacklist. """ - blacklist_title: str = str(custom_reader.get_tag(entry.feed, "blacklist_title", "")) - blacklist_summary: str = str(custom_reader.get_tag(entry.feed, "blacklist_summary", "")) - blacklist_content: str = str(custom_reader.get_tag(entry.feed, "blacklist_content", "")) - blacklist_author: str = str(custom_reader.get_tag(entry.feed, "blacklist_author", "")) + feed = entry.feed + + blacklist_title: str = str(custom_reader.get_tag(feed, "blacklist_title", "")).strip() + blacklist_summary: str = str(custom_reader.get_tag(feed, "blacklist_summary", "")).strip() + blacklist_content: str = str(custom_reader.get_tag(feed, "blacklist_content", "")).strip() + blacklist_author: str = str(custom_reader.get_tag(feed, "blacklist_author", "")).strip() + + regex_blacklist_title: str = str(custom_reader.get_tag(feed, "regex_blacklist_title", "")).strip() + regex_blacklist_summary: str = str(custom_reader.get_tag(feed, "regex_blacklist_summary", "")).strip() + regex_blacklist_content: str = str(custom_reader.get_tag(feed, "regex_blacklist_content", "")).strip() + regex_blacklist_author: str = str(custom_reader.get_tag(feed, "regex_blacklist_author", "")).strip() # TODO(TheLovinator): Also add support for entry_text and more. + # Check regular blacklist if entry.title and blacklist_title and is_word_in_text(blacklist_title, entry.title): return True if entry.summary and blacklist_summary and is_word_in_text(blacklist_summary, entry.summary): return True + if ( + entry.content + and entry.content[0].value + and blacklist_content + and is_word_in_text(blacklist_content, entry.content[0].value) + ): + return True if entry.author and blacklist_author and is_word_in_text(blacklist_author, entry.author): return True + if ( + entry.content + and entry.content[0].value + and blacklist_content + and is_word_in_text(blacklist_content, entry.content[0].value) + ): + return True + + # Check regex blacklist + if entry.title and regex_blacklist_title and is_regex_match(regex_blacklist_title, entry.title): + return True + if entry.summary and regex_blacklist_summary and is_regex_match(regex_blacklist_summary, entry.summary): + return True + if ( + entry.content + and entry.content[0].value + and regex_blacklist_content + and is_regex_match(regex_blacklist_content, entry.content[0].value) + ): + return True + if entry.author and regex_blacklist_author and is_regex_match(regex_blacklist_author, entry.author): + return True return bool( entry.content and entry.content[0].value - and blacklist_content - and is_word_in_text(blacklist_content, entry.content[0].value), + and regex_blacklist_content + and is_regex_match(regex_blacklist_content, entry.content[0].value), ) diff --git a/discord_rss_bot/filter/utils.py b/discord_rss_bot/filter/utils.py index 090518d..ff93e59 100644 --- a/discord_rss_bot/filter/utils.py +++ b/discord_rss_bot/filter/utils.py @@ -1,7 +1,10 @@ from __future__ import annotations +import logging import re +logger: logging.Logger = logging.getLogger(__name__) + def is_word_in_text(word_string: str, text: str) -> bool: """Check if any of the words are in the text. @@ -20,3 +23,50 @@ def is_word_in_text(word_string: str, text: str) -> bool: # Check if any pattern matches the text. return any(pattern.search(text) for pattern in patterns) + + +def is_regex_match(regex_string: str, text: str) -> bool: + """Check if any of the regex patterns match the text. + + Args: + regex_string: A string containing regex patterns, separated by newlines or commas. + text: The text to search in. + + Returns: + bool: True if any regex pattern matches the text, otherwise False. + """ + if not regex_string or not text: + return False + + # Split by newlines first, then by commas (for backward compatibility) + regex_list: list[str] = [] + + # First split by newlines + lines: list[str] = regex_string.split("\n") + for line in lines: + stripped_line: str = line.strip() + if stripped_line: + # For backward compatibility, also split by commas if there are any + if "," in stripped_line: + regex_list.extend([part.strip() for part in stripped_line.split(",") if part.strip()]) + else: + regex_list.append(stripped_line) + + # Attempt to compile and apply each regex pattern + for pattern_str in regex_list: + if not pattern_str: + logger.warning("Empty regex pattern found in the list.") + continue + + try: + pattern: re.Pattern[str] = re.compile(pattern_str, re.IGNORECASE) + if pattern.search(text): + logger.info("Regex pattern matched: %s", pattern_str) + return True + except re.error: + logger.warning("Invalid regex pattern: %s", pattern_str) + continue + + logger.info("No regex patterns matched.") + + return False diff --git a/discord_rss_bot/filter/whitelist.py b/discord_rss_bot/filter/whitelist.py index a55a514..b4b5c23 100644 --- a/discord_rss_bot/filter/whitelist.py +++ b/discord_rss_bot/filter/whitelist.py @@ -2,7 +2,7 @@ from __future__ import annotations from typing import TYPE_CHECKING -from discord_rss_bot.filter.utils import is_word_in_text +from discord_rss_bot.filter.utils import is_regex_match, is_word_in_text if TYPE_CHECKING: from reader import Entry, Feed, Reader @@ -12,9 +12,14 @@ def has_white_tags(custom_reader: Reader, feed: Feed) -> bool: """Return True if the feed has whitelist tags. The following tags are checked: - - whitelist_title + - regex_whitelist_author + - regex_whitelist_content + - regex_whitelist_summary + - regex_whitelist_title + - whitelist_author + - whitelist_content - whitelist_summary - - whitelist_content. + - whitelist_title Args: custom_reader: The reader. @@ -23,14 +28,29 @@ def has_white_tags(custom_reader: Reader, feed: Feed) -> bool: Returns: bool: If the feed has any of the tags. """ - whitelist_title: str = str(custom_reader.get_tag(feed, "whitelist_title", "")) - whitelist_summary: str = str(custom_reader.get_tag(feed, "whitelist_summary", "")) - whitelist_content: str = str(custom_reader.get_tag(feed, "whitelist_content", "")) + whitelist_title: str = str(custom_reader.get_tag(feed, "whitelist_title", "")).strip() + whitelist_summary: str = str(custom_reader.get_tag(feed, "whitelist_summary", "")).strip() + whitelist_content: str = str(custom_reader.get_tag(feed, "whitelist_content", "")).strip() + whitelist_author: str = str(custom_reader.get_tag(feed, "whitelist_author", "")).strip() - return bool(whitelist_title or whitelist_summary or whitelist_content) + regex_whitelist_title: str = str(custom_reader.get_tag(feed, "regex_whitelist_title", "")).strip() + regex_whitelist_summary: str = str(custom_reader.get_tag(feed, "regex_whitelist_summary", "")).strip() + regex_whitelist_content: str = str(custom_reader.get_tag(feed, "regex_whitelist_content", "")).strip() + regex_whitelist_author: str = str(custom_reader.get_tag(feed, "regex_whitelist_author", "")).strip() + + return bool( + whitelist_title + or whitelist_author + or whitelist_content + or whitelist_summary + or regex_whitelist_author + or regex_whitelist_content + or regex_whitelist_summary + or regex_whitelist_title, + ) -def should_be_sent(custom_reader: Reader, entry: Entry) -> bool: +def should_be_sent(custom_reader: Reader, entry: Entry) -> bool: # noqa: PLR0911 """Return True if the entry is in the whitelist. Args: @@ -41,20 +61,43 @@ def should_be_sent(custom_reader: Reader, entry: Entry) -> bool: bool: If the entry is in the whitelist. """ feed: Feed = entry.feed - whitelist_title: str = str(custom_reader.get_tag(feed, "whitelist_title", "")) - whitelist_summary: str = str(custom_reader.get_tag(feed, "whitelist_summary", "")) - whitelist_content: str = str(custom_reader.get_tag(feed, "whitelist_content", "")) - whitelist_author: str = str(custom_reader.get_tag(feed, "whitelist_author", "")) + # Regular whitelist tags + whitelist_title: str = str(custom_reader.get_tag(feed, "whitelist_title", "")).strip() + whitelist_summary: str = str(custom_reader.get_tag(feed, "whitelist_summary", "")).strip() + whitelist_content: str = str(custom_reader.get_tag(feed, "whitelist_content", "")).strip() + whitelist_author: str = str(custom_reader.get_tag(feed, "whitelist_author", "")).strip() + # Regex whitelist tags + regex_whitelist_title: str = str(custom_reader.get_tag(feed, "regex_whitelist_title", "")).strip() + regex_whitelist_summary: str = str(custom_reader.get_tag(feed, "regex_whitelist_summary", "")).strip() + regex_whitelist_content: str = str(custom_reader.get_tag(feed, "regex_whitelist_content", "")).strip() + regex_whitelist_author: str = str(custom_reader.get_tag(feed, "regex_whitelist_author", "")).strip() + + # Check regular whitelist if entry.title and whitelist_title and is_word_in_text(whitelist_title, entry.title): return True if entry.summary and whitelist_summary and is_word_in_text(whitelist_summary, entry.summary): return True if entry.author and whitelist_author and is_word_in_text(whitelist_author, entry.author): return True - return bool( + if ( entry.content and entry.content[0].value and whitelist_content - and is_word_in_text(whitelist_content, entry.content[0].value), + and is_word_in_text(whitelist_content, entry.content[0].value) + ): + return True + + # Check regex whitelist + if entry.title and regex_whitelist_title and is_regex_match(regex_whitelist_title, entry.title): + return True + if entry.summary and regex_whitelist_summary and is_regex_match(regex_whitelist_summary, entry.summary): + return True + if entry.author and regex_whitelist_author and is_regex_match(regex_whitelist_author, entry.author): + return True + return bool( + entry.content + and entry.content[0].value + and regex_whitelist_content + and is_regex_match(regex_whitelist_content, entry.content[0].value), ) diff --git a/discord_rss_bot/main.py b/discord_rss_bot/main.py index 3a1f0ca..a7c6510 100644 --- a/discord_rss_bot/main.py +++ b/discord_rss_bot/main.py @@ -43,7 +43,7 @@ from discord_rss_bot.search import create_html_for_search_results from discord_rss_bot.settings import get_reader if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import AsyncGenerator, Iterable from reader.types import JSONType @@ -88,8 +88,15 @@ reader: Reader = get_reader() @asynccontextmanager -async def lifespan(app: FastAPI) -> typing.AsyncGenerator[None]: - """This is needed for the ASGI server to run.""" +async def lifespan(app: FastAPI) -> AsyncGenerator[None]: + """Lifespan for the FastAPI app. + + Args: + app: The FastAPI app. + + Yields: + None: Nothing. + """ add_missing_tags(reader) scheduler: AsyncIOScheduler = AsyncIOScheduler() @@ -250,6 +257,10 @@ async def post_set_whitelist( whitelist_summary: Annotated[str, Form()] = "", whitelist_content: Annotated[str, Form()] = "", whitelist_author: Annotated[str, Form()] = "", + regex_whitelist_title: Annotated[str, Form()] = "", + regex_whitelist_summary: Annotated[str, Form()] = "", + regex_whitelist_content: Annotated[str, Form()] = "", + regex_whitelist_author: Annotated[str, Form()] = "", feed_url: Annotated[str, Form()] = "", ) -> RedirectResponse: """Set what the whitelist should be sent, if you have this set only words in the whitelist will be sent. @@ -259,6 +270,10 @@ async def post_set_whitelist( whitelist_summary: Whitelisted words for when checking the summary. whitelist_content: Whitelisted words for when checking the content. whitelist_author: Whitelisted words for when checking the author. + regex_whitelist_title: Whitelisted regex for when checking the title. + regex_whitelist_summary: Whitelisted regex for when checking the summary. + regex_whitelist_content: Whitelisted regex for when checking the content. + regex_whitelist_author: Whitelisted regex for when checking the author. feed_url: The feed we should set the whitelist for. Returns: @@ -269,6 +284,10 @@ async def post_set_whitelist( reader.set_tag(clean_feed_url, "whitelist_summary", whitelist_summary) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "whitelist_content", whitelist_content) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "whitelist_author", whitelist_author) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_whitelist_title", regex_whitelist_title) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_whitelist_summary", regex_whitelist_summary) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_whitelist_content", regex_whitelist_content) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_whitelist_author", regex_whitelist_author) # pyright: ignore[reportArgumentType][call-overload] return RedirectResponse(url=f"/feed?feed_url={urllib.parse.quote(clean_feed_url)}", status_code=303) @@ -287,11 +306,14 @@ async def get_whitelist(feed_url: str, request: Request): clean_feed_url: str = feed_url.strip() feed: Feed = reader.get_feed(urllib.parse.unquote(clean_feed_url)) - # Get previous data, this is used when creating the form. whitelist_title: str = str(reader.get_tag(feed, "whitelist_title", "")) whitelist_summary: str = str(reader.get_tag(feed, "whitelist_summary", "")) whitelist_content: str = str(reader.get_tag(feed, "whitelist_content", "")) whitelist_author: str = str(reader.get_tag(feed, "whitelist_author", "")) + regex_whitelist_title: str = str(reader.get_tag(feed, "regex_whitelist_title", "")) + regex_whitelist_summary: str = str(reader.get_tag(feed, "regex_whitelist_summary", "")) + regex_whitelist_content: str = str(reader.get_tag(feed, "regex_whitelist_content", "")) + regex_whitelist_author: str = str(reader.get_tag(feed, "regex_whitelist_author", "")) context = { "request": request, @@ -300,6 +322,10 @@ async def get_whitelist(feed_url: str, request: Request): "whitelist_summary": whitelist_summary, "whitelist_content": whitelist_content, "whitelist_author": whitelist_author, + "regex_whitelist_title": regex_whitelist_title, + "regex_whitelist_summary": regex_whitelist_summary, + "regex_whitelist_content": regex_whitelist_content, + "regex_whitelist_author": regex_whitelist_author, } return templates.TemplateResponse(request=request, name="whitelist.html", context=context) @@ -310,6 +336,10 @@ async def post_set_blacklist( blacklist_summary: Annotated[str, Form()] = "", blacklist_content: Annotated[str, Form()] = "", blacklist_author: Annotated[str, Form()] = "", + regex_blacklist_title: Annotated[str, Form()] = "", + regex_blacklist_summary: Annotated[str, Form()] = "", + regex_blacklist_content: Annotated[str, Form()] = "", + regex_blacklist_author: Annotated[str, Form()] = "", feed_url: Annotated[str, Form()] = "", ) -> RedirectResponse: """Set the blacklist. @@ -322,6 +352,10 @@ async def post_set_blacklist( blacklist_summary: Blacklisted words for when checking the summary. blacklist_content: Blacklisted words for when checking the content. blacklist_author: Blacklisted words for when checking the author. + regex_blacklist_title: Blacklisted regex for when checking the title. + regex_blacklist_summary: Blacklisted regex for when checking the summary. + regex_blacklist_content: Blacklisted regex for when checking the content. + regex_blacklist_author: Blacklisted regex for when checking the author. feed_url: What feed we should set the blacklist for. Returns: @@ -332,7 +366,10 @@ async def post_set_blacklist( reader.set_tag(clean_feed_url, "blacklist_summary", blacklist_summary) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "blacklist_content", blacklist_content) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "blacklist_author", blacklist_author) # pyright: ignore[reportArgumentType][call-overload] - + reader.set_tag(clean_feed_url, "regex_blacklist_title", regex_blacklist_title) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_blacklist_summary", regex_blacklist_summary) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_blacklist_content", regex_blacklist_content) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_blacklist_author", regex_blacklist_author) # pyright: ignore[reportArgumentType][call-overload] return RedirectResponse(url=f"/feed?feed_url={urllib.parse.quote(clean_feed_url)}", status_code=303) @@ -349,11 +386,14 @@ async def get_blacklist(feed_url: str, request: Request): """ feed: Feed = reader.get_feed(urllib.parse.unquote(feed_url)) - # Get previous data, this is used when creating the form. blacklist_title: str = str(reader.get_tag(feed, "blacklist_title", "")) blacklist_summary: str = str(reader.get_tag(feed, "blacklist_summary", "")) blacklist_content: str = str(reader.get_tag(feed, "blacklist_content", "")) blacklist_author: str = str(reader.get_tag(feed, "blacklist_author", "")) + regex_blacklist_title: str = str(reader.get_tag(feed, "regex_blacklist_title", "")) + regex_blacklist_summary: str = str(reader.get_tag(feed, "regex_blacklist_summary", "")) + regex_blacklist_content: str = str(reader.get_tag(feed, "regex_blacklist_content", "")) + regex_blacklist_author: str = str(reader.get_tag(feed, "regex_blacklist_author", "")) context = { "request": request, @@ -362,6 +402,10 @@ async def get_blacklist(feed_url: str, request: Request): "blacklist_summary": blacklist_summary, "blacklist_content": blacklist_content, "blacklist_author": blacklist_author, + "regex_blacklist_title": regex_blacklist_title, + "regex_blacklist_summary": regex_blacklist_summary, + "regex_blacklist_content": regex_blacklist_content, + "regex_blacklist_author": regex_blacklist_author, } return templates.TemplateResponse(request=request, name="blacklist.html", context=context) @@ -461,7 +505,7 @@ async def get_embed_page(feed_url: str, request: Request): @app.post("/embed", response_class=HTMLResponse) -async def post_embed( # noqa: PLR0913, PLR0917 +async def post_embed( feed_url: Annotated[str, Form()], title: Annotated[str, Form()] = "", description: Annotated[str, Form()] = "", diff --git a/discord_rss_bot/templates/blacklist.html b/discord_rss_bot/templates/blacklist.html index 3632277..ec16bce 100644 --- a/discord_rss_bot/templates/blacklist.html +++ b/discord_rss_bot/templates/blacklist.html @@ -42,6 +42,49 @@ + +
+
+^New Release:.*
+\b(update|version|patch)\s+\d+\.\d+
+.*\[(important|notice)\].*
+
+
+
+
+^New Release:.*
+\b(update|version|patch)\s+\d+\.\d+
+.*\[(important|notice)\].*
+
+
+