From ac63041b28d1ce87685523a7957c3b3360c8229c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joakim=20Hells=C3=A9n?= Date: Thu, 3 Apr 2025 05:44:50 +0200 Subject: [PATCH] =?UTF-8?q?Add=20regex=20support=20to=20blacklist=20and=20?= =?UTF-8?q?whitelist=20filters.=20Strong=20code,=20many=20bananas!=20?= =?UTF-8?q?=F0=9F=A6=8D=F0=9F=A6=8D=F0=9F=A6=8D=F0=9F=A6=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .pre-commit-config.yaml | 2 +- discord_rss_bot/filter/blacklist.py | 85 +++++++++++++++++++---- discord_rss_bot/filter/utils.py | 50 ++++++++++++++ discord_rss_bot/filter/whitelist.py | 71 +++++++++++++++---- discord_rss_bot/main.py | 58 ++++++++++++++-- discord_rss_bot/templates/blacklist.html | 43 ++++++++++++ discord_rss_bot/templates/whitelist.html | 45 ++++++++++++- pyproject.toml | 4 +- tests/test_blacklist.py | 86 ++++++++++++++++++++++++ tests/test_utils.py | 50 +++++++++++++- tests/test_whitelist.py | 71 +++++++++++++++++++ 11 files changed, 526 insertions(+), 39 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a3c42c0..908367d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,7 +38,7 @@ repos: # An extremely fast Python linter and formatter. - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.5 + rev: v0.11.2 hooks: - id: ruff-format - id: ruff diff --git a/discord_rss_bot/filter/blacklist.py b/discord_rss_bot/filter/blacklist.py index 808d7c9..87b4913 100644 --- a/discord_rss_bot/filter/blacklist.py +++ b/discord_rss_bot/filter/blacklist.py @@ -2,7 +2,7 @@ from __future__ import annotations from typing import TYPE_CHECKING -from discord_rss_bot.filter.utils import is_word_in_text +from discord_rss_bot.filter.utils import is_regex_match, is_word_in_text if TYPE_CHECKING: from reader import Entry, Feed, Reader @@ -12,9 +12,14 @@ def feed_has_blacklist_tags(custom_reader: Reader, feed: Feed) -> bool: """Return True if the feed has blacklist tags. The following tags are checked: - - blacklist_title + - blacklist_author + - blacklist_content - blacklist_summary - - blacklist_content. + - blacklist_title + - regex_blacklist_author + - regex_blacklist_content + - regex_blacklist_summary + - regex_blacklist_title Args: custom_reader: The reader. @@ -23,14 +28,29 @@ def feed_has_blacklist_tags(custom_reader: Reader, feed: Feed) -> bool: Returns: bool: If the feed has any of the tags. """ - blacklist_title: str = str(custom_reader.get_tag(feed, "blacklist_title", "")) - blacklist_summary: str = str(custom_reader.get_tag(feed, "blacklist_summary", "")) - blacklist_content: str = str(custom_reader.get_tag(feed, "blacklist_content", "")) + blacklist_author: str = str(custom_reader.get_tag(feed, "blacklist_author", "")).strip() + blacklist_content: str = str(custom_reader.get_tag(feed, "blacklist_content", "")).strip() + blacklist_summary: str = str(custom_reader.get_tag(feed, "blacklist_summary", "")).strip() + blacklist_title: str = str(custom_reader.get_tag(feed, "blacklist_title", "")).strip() - return bool(blacklist_title or blacklist_summary or blacklist_content) + regex_blacklist_author: str = str(custom_reader.get_tag(feed, "regex_blacklist_author", "")).strip() + regex_blacklist_content: str = str(custom_reader.get_tag(feed, "regex_blacklist_content", "")).strip() + regex_blacklist_summary: str = str(custom_reader.get_tag(feed, "regex_blacklist_summary", "")).strip() + regex_blacklist_title: str = str(custom_reader.get_tag(feed, "regex_blacklist_title", "")).strip() + + return bool( + blacklist_title + or blacklist_author + or blacklist_content + or blacklist_summary + or regex_blacklist_author + or regex_blacklist_content + or regex_blacklist_summary + or regex_blacklist_title, + ) -def entry_should_be_skipped(custom_reader: Reader, entry: Entry) -> bool: +def entry_should_be_skipped(custom_reader: Reader, entry: Entry) -> bool: # noqa: PLR0911 """Return True if the entry is in the blacklist. Args: @@ -40,21 +60,58 @@ def entry_should_be_skipped(custom_reader: Reader, entry: Entry) -> bool: Returns: bool: If the entry is in the blacklist. """ - blacklist_title: str = str(custom_reader.get_tag(entry.feed, "blacklist_title", "")) - blacklist_summary: str = str(custom_reader.get_tag(entry.feed, "blacklist_summary", "")) - blacklist_content: str = str(custom_reader.get_tag(entry.feed, "blacklist_content", "")) - blacklist_author: str = str(custom_reader.get_tag(entry.feed, "blacklist_author", "")) + feed = entry.feed + + blacklist_title: str = str(custom_reader.get_tag(feed, "blacklist_title", "")).strip() + blacklist_summary: str = str(custom_reader.get_tag(feed, "blacklist_summary", "")).strip() + blacklist_content: str = str(custom_reader.get_tag(feed, "blacklist_content", "")).strip() + blacklist_author: str = str(custom_reader.get_tag(feed, "blacklist_author", "")).strip() + + regex_blacklist_title: str = str(custom_reader.get_tag(feed, "regex_blacklist_title", "")).strip() + regex_blacklist_summary: str = str(custom_reader.get_tag(feed, "regex_blacklist_summary", "")).strip() + regex_blacklist_content: str = str(custom_reader.get_tag(feed, "regex_blacklist_content", "")).strip() + regex_blacklist_author: str = str(custom_reader.get_tag(feed, "regex_blacklist_author", "")).strip() # TODO(TheLovinator): Also add support for entry_text and more. + # Check regular blacklist if entry.title and blacklist_title and is_word_in_text(blacklist_title, entry.title): return True if entry.summary and blacklist_summary and is_word_in_text(blacklist_summary, entry.summary): return True + if ( + entry.content + and entry.content[0].value + and blacklist_content + and is_word_in_text(blacklist_content, entry.content[0].value) + ): + return True if entry.author and blacklist_author and is_word_in_text(blacklist_author, entry.author): return True + if ( + entry.content + and entry.content[0].value + and blacklist_content + and is_word_in_text(blacklist_content, entry.content[0].value) + ): + return True + + # Check regex blacklist + if entry.title and regex_blacklist_title and is_regex_match(regex_blacklist_title, entry.title): + return True + if entry.summary and regex_blacklist_summary and is_regex_match(regex_blacklist_summary, entry.summary): + return True + if ( + entry.content + and entry.content[0].value + and regex_blacklist_content + and is_regex_match(regex_blacklist_content, entry.content[0].value) + ): + return True + if entry.author and regex_blacklist_author and is_regex_match(regex_blacklist_author, entry.author): + return True return bool( entry.content and entry.content[0].value - and blacklist_content - and is_word_in_text(blacklist_content, entry.content[0].value), + and regex_blacklist_content + and is_regex_match(regex_blacklist_content, entry.content[0].value), ) diff --git a/discord_rss_bot/filter/utils.py b/discord_rss_bot/filter/utils.py index 090518d..ff93e59 100644 --- a/discord_rss_bot/filter/utils.py +++ b/discord_rss_bot/filter/utils.py @@ -1,7 +1,10 @@ from __future__ import annotations +import logging import re +logger: logging.Logger = logging.getLogger(__name__) + def is_word_in_text(word_string: str, text: str) -> bool: """Check if any of the words are in the text. @@ -20,3 +23,50 @@ def is_word_in_text(word_string: str, text: str) -> bool: # Check if any pattern matches the text. return any(pattern.search(text) for pattern in patterns) + + +def is_regex_match(regex_string: str, text: str) -> bool: + """Check if any of the regex patterns match the text. + + Args: + regex_string: A string containing regex patterns, separated by newlines or commas. + text: The text to search in. + + Returns: + bool: True if any regex pattern matches the text, otherwise False. + """ + if not regex_string or not text: + return False + + # Split by newlines first, then by commas (for backward compatibility) + regex_list: list[str] = [] + + # First split by newlines + lines: list[str] = regex_string.split("\n") + for line in lines: + stripped_line: str = line.strip() + if stripped_line: + # For backward compatibility, also split by commas if there are any + if "," in stripped_line: + regex_list.extend([part.strip() for part in stripped_line.split(",") if part.strip()]) + else: + regex_list.append(stripped_line) + + # Attempt to compile and apply each regex pattern + for pattern_str in regex_list: + if not pattern_str: + logger.warning("Empty regex pattern found in the list.") + continue + + try: + pattern: re.Pattern[str] = re.compile(pattern_str, re.IGNORECASE) + if pattern.search(text): + logger.info("Regex pattern matched: %s", pattern_str) + return True + except re.error: + logger.warning("Invalid regex pattern: %s", pattern_str) + continue + + logger.info("No regex patterns matched.") + + return False diff --git a/discord_rss_bot/filter/whitelist.py b/discord_rss_bot/filter/whitelist.py index a55a514..b4b5c23 100644 --- a/discord_rss_bot/filter/whitelist.py +++ b/discord_rss_bot/filter/whitelist.py @@ -2,7 +2,7 @@ from __future__ import annotations from typing import TYPE_CHECKING -from discord_rss_bot.filter.utils import is_word_in_text +from discord_rss_bot.filter.utils import is_regex_match, is_word_in_text if TYPE_CHECKING: from reader import Entry, Feed, Reader @@ -12,9 +12,14 @@ def has_white_tags(custom_reader: Reader, feed: Feed) -> bool: """Return True if the feed has whitelist tags. The following tags are checked: - - whitelist_title + - regex_whitelist_author + - regex_whitelist_content + - regex_whitelist_summary + - regex_whitelist_title + - whitelist_author + - whitelist_content - whitelist_summary - - whitelist_content. + - whitelist_title Args: custom_reader: The reader. @@ -23,14 +28,29 @@ def has_white_tags(custom_reader: Reader, feed: Feed) -> bool: Returns: bool: If the feed has any of the tags. """ - whitelist_title: str = str(custom_reader.get_tag(feed, "whitelist_title", "")) - whitelist_summary: str = str(custom_reader.get_tag(feed, "whitelist_summary", "")) - whitelist_content: str = str(custom_reader.get_tag(feed, "whitelist_content", "")) + whitelist_title: str = str(custom_reader.get_tag(feed, "whitelist_title", "")).strip() + whitelist_summary: str = str(custom_reader.get_tag(feed, "whitelist_summary", "")).strip() + whitelist_content: str = str(custom_reader.get_tag(feed, "whitelist_content", "")).strip() + whitelist_author: str = str(custom_reader.get_tag(feed, "whitelist_author", "")).strip() - return bool(whitelist_title or whitelist_summary or whitelist_content) + regex_whitelist_title: str = str(custom_reader.get_tag(feed, "regex_whitelist_title", "")).strip() + regex_whitelist_summary: str = str(custom_reader.get_tag(feed, "regex_whitelist_summary", "")).strip() + regex_whitelist_content: str = str(custom_reader.get_tag(feed, "regex_whitelist_content", "")).strip() + regex_whitelist_author: str = str(custom_reader.get_tag(feed, "regex_whitelist_author", "")).strip() + + return bool( + whitelist_title + or whitelist_author + or whitelist_content + or whitelist_summary + or regex_whitelist_author + or regex_whitelist_content + or regex_whitelist_summary + or regex_whitelist_title, + ) -def should_be_sent(custom_reader: Reader, entry: Entry) -> bool: +def should_be_sent(custom_reader: Reader, entry: Entry) -> bool: # noqa: PLR0911 """Return True if the entry is in the whitelist. Args: @@ -41,20 +61,43 @@ def should_be_sent(custom_reader: Reader, entry: Entry) -> bool: bool: If the entry is in the whitelist. """ feed: Feed = entry.feed - whitelist_title: str = str(custom_reader.get_tag(feed, "whitelist_title", "")) - whitelist_summary: str = str(custom_reader.get_tag(feed, "whitelist_summary", "")) - whitelist_content: str = str(custom_reader.get_tag(feed, "whitelist_content", "")) - whitelist_author: str = str(custom_reader.get_tag(feed, "whitelist_author", "")) + # Regular whitelist tags + whitelist_title: str = str(custom_reader.get_tag(feed, "whitelist_title", "")).strip() + whitelist_summary: str = str(custom_reader.get_tag(feed, "whitelist_summary", "")).strip() + whitelist_content: str = str(custom_reader.get_tag(feed, "whitelist_content", "")).strip() + whitelist_author: str = str(custom_reader.get_tag(feed, "whitelist_author", "")).strip() + # Regex whitelist tags + regex_whitelist_title: str = str(custom_reader.get_tag(feed, "regex_whitelist_title", "")).strip() + regex_whitelist_summary: str = str(custom_reader.get_tag(feed, "regex_whitelist_summary", "")).strip() + regex_whitelist_content: str = str(custom_reader.get_tag(feed, "regex_whitelist_content", "")).strip() + regex_whitelist_author: str = str(custom_reader.get_tag(feed, "regex_whitelist_author", "")).strip() + + # Check regular whitelist if entry.title and whitelist_title and is_word_in_text(whitelist_title, entry.title): return True if entry.summary and whitelist_summary and is_word_in_text(whitelist_summary, entry.summary): return True if entry.author and whitelist_author and is_word_in_text(whitelist_author, entry.author): return True - return bool( + if ( entry.content and entry.content[0].value and whitelist_content - and is_word_in_text(whitelist_content, entry.content[0].value), + and is_word_in_text(whitelist_content, entry.content[0].value) + ): + return True + + # Check regex whitelist + if entry.title and regex_whitelist_title and is_regex_match(regex_whitelist_title, entry.title): + return True + if entry.summary and regex_whitelist_summary and is_regex_match(regex_whitelist_summary, entry.summary): + return True + if entry.author and regex_whitelist_author and is_regex_match(regex_whitelist_author, entry.author): + return True + return bool( + entry.content + and entry.content[0].value + and regex_whitelist_content + and is_regex_match(regex_whitelist_content, entry.content[0].value), ) diff --git a/discord_rss_bot/main.py b/discord_rss_bot/main.py index 3a1f0ca..a7c6510 100644 --- a/discord_rss_bot/main.py +++ b/discord_rss_bot/main.py @@ -43,7 +43,7 @@ from discord_rss_bot.search import create_html_for_search_results from discord_rss_bot.settings import get_reader if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import AsyncGenerator, Iterable from reader.types import JSONType @@ -88,8 +88,15 @@ reader: Reader = get_reader() @asynccontextmanager -async def lifespan(app: FastAPI) -> typing.AsyncGenerator[None]: - """This is needed for the ASGI server to run.""" +async def lifespan(app: FastAPI) -> AsyncGenerator[None]: + """Lifespan for the FastAPI app. + + Args: + app: The FastAPI app. + + Yields: + None: Nothing. + """ add_missing_tags(reader) scheduler: AsyncIOScheduler = AsyncIOScheduler() @@ -250,6 +257,10 @@ async def post_set_whitelist( whitelist_summary: Annotated[str, Form()] = "", whitelist_content: Annotated[str, Form()] = "", whitelist_author: Annotated[str, Form()] = "", + regex_whitelist_title: Annotated[str, Form()] = "", + regex_whitelist_summary: Annotated[str, Form()] = "", + regex_whitelist_content: Annotated[str, Form()] = "", + regex_whitelist_author: Annotated[str, Form()] = "", feed_url: Annotated[str, Form()] = "", ) -> RedirectResponse: """Set what the whitelist should be sent, if you have this set only words in the whitelist will be sent. @@ -259,6 +270,10 @@ async def post_set_whitelist( whitelist_summary: Whitelisted words for when checking the summary. whitelist_content: Whitelisted words for when checking the content. whitelist_author: Whitelisted words for when checking the author. + regex_whitelist_title: Whitelisted regex for when checking the title. + regex_whitelist_summary: Whitelisted regex for when checking the summary. + regex_whitelist_content: Whitelisted regex for when checking the content. + regex_whitelist_author: Whitelisted regex for when checking the author. feed_url: The feed we should set the whitelist for. Returns: @@ -269,6 +284,10 @@ async def post_set_whitelist( reader.set_tag(clean_feed_url, "whitelist_summary", whitelist_summary) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "whitelist_content", whitelist_content) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "whitelist_author", whitelist_author) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_whitelist_title", regex_whitelist_title) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_whitelist_summary", regex_whitelist_summary) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_whitelist_content", regex_whitelist_content) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_whitelist_author", regex_whitelist_author) # pyright: ignore[reportArgumentType][call-overload] return RedirectResponse(url=f"/feed?feed_url={urllib.parse.quote(clean_feed_url)}", status_code=303) @@ -287,11 +306,14 @@ async def get_whitelist(feed_url: str, request: Request): clean_feed_url: str = feed_url.strip() feed: Feed = reader.get_feed(urllib.parse.unquote(clean_feed_url)) - # Get previous data, this is used when creating the form. whitelist_title: str = str(reader.get_tag(feed, "whitelist_title", "")) whitelist_summary: str = str(reader.get_tag(feed, "whitelist_summary", "")) whitelist_content: str = str(reader.get_tag(feed, "whitelist_content", "")) whitelist_author: str = str(reader.get_tag(feed, "whitelist_author", "")) + regex_whitelist_title: str = str(reader.get_tag(feed, "regex_whitelist_title", "")) + regex_whitelist_summary: str = str(reader.get_tag(feed, "regex_whitelist_summary", "")) + regex_whitelist_content: str = str(reader.get_tag(feed, "regex_whitelist_content", "")) + regex_whitelist_author: str = str(reader.get_tag(feed, "regex_whitelist_author", "")) context = { "request": request, @@ -300,6 +322,10 @@ async def get_whitelist(feed_url: str, request: Request): "whitelist_summary": whitelist_summary, "whitelist_content": whitelist_content, "whitelist_author": whitelist_author, + "regex_whitelist_title": regex_whitelist_title, + "regex_whitelist_summary": regex_whitelist_summary, + "regex_whitelist_content": regex_whitelist_content, + "regex_whitelist_author": regex_whitelist_author, } return templates.TemplateResponse(request=request, name="whitelist.html", context=context) @@ -310,6 +336,10 @@ async def post_set_blacklist( blacklist_summary: Annotated[str, Form()] = "", blacklist_content: Annotated[str, Form()] = "", blacklist_author: Annotated[str, Form()] = "", + regex_blacklist_title: Annotated[str, Form()] = "", + regex_blacklist_summary: Annotated[str, Form()] = "", + regex_blacklist_content: Annotated[str, Form()] = "", + regex_blacklist_author: Annotated[str, Form()] = "", feed_url: Annotated[str, Form()] = "", ) -> RedirectResponse: """Set the blacklist. @@ -322,6 +352,10 @@ async def post_set_blacklist( blacklist_summary: Blacklisted words for when checking the summary. blacklist_content: Blacklisted words for when checking the content. blacklist_author: Blacklisted words for when checking the author. + regex_blacklist_title: Blacklisted regex for when checking the title. + regex_blacklist_summary: Blacklisted regex for when checking the summary. + regex_blacklist_content: Blacklisted regex for when checking the content. + regex_blacklist_author: Blacklisted regex for when checking the author. feed_url: What feed we should set the blacklist for. Returns: @@ -332,7 +366,10 @@ async def post_set_blacklist( reader.set_tag(clean_feed_url, "blacklist_summary", blacklist_summary) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "blacklist_content", blacklist_content) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "blacklist_author", blacklist_author) # pyright: ignore[reportArgumentType][call-overload] - + reader.set_tag(clean_feed_url, "regex_blacklist_title", regex_blacklist_title) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_blacklist_summary", regex_blacklist_summary) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_blacklist_content", regex_blacklist_content) # pyright: ignore[reportArgumentType][call-overload] + reader.set_tag(clean_feed_url, "regex_blacklist_author", regex_blacklist_author) # pyright: ignore[reportArgumentType][call-overload] return RedirectResponse(url=f"/feed?feed_url={urllib.parse.quote(clean_feed_url)}", status_code=303) @@ -349,11 +386,14 @@ async def get_blacklist(feed_url: str, request: Request): """ feed: Feed = reader.get_feed(urllib.parse.unquote(feed_url)) - # Get previous data, this is used when creating the form. blacklist_title: str = str(reader.get_tag(feed, "blacklist_title", "")) blacklist_summary: str = str(reader.get_tag(feed, "blacklist_summary", "")) blacklist_content: str = str(reader.get_tag(feed, "blacklist_content", "")) blacklist_author: str = str(reader.get_tag(feed, "blacklist_author", "")) + regex_blacklist_title: str = str(reader.get_tag(feed, "regex_blacklist_title", "")) + regex_blacklist_summary: str = str(reader.get_tag(feed, "regex_blacklist_summary", "")) + regex_blacklist_content: str = str(reader.get_tag(feed, "regex_blacklist_content", "")) + regex_blacklist_author: str = str(reader.get_tag(feed, "regex_blacklist_author", "")) context = { "request": request, @@ -362,6 +402,10 @@ async def get_blacklist(feed_url: str, request: Request): "blacklist_summary": blacklist_summary, "blacklist_content": blacklist_content, "blacklist_author": blacklist_author, + "regex_blacklist_title": regex_blacklist_title, + "regex_blacklist_summary": regex_blacklist_summary, + "regex_blacklist_content": regex_blacklist_content, + "regex_blacklist_author": regex_blacklist_author, } return templates.TemplateResponse(request=request, name="blacklist.html", context=context) @@ -461,7 +505,7 @@ async def get_embed_page(feed_url: str, request: Request): @app.post("/embed", response_class=HTMLResponse) -async def post_embed( # noqa: PLR0913, PLR0917 +async def post_embed( feed_url: Annotated[str, Form()], title: Annotated[str, Form()] = "", description: Annotated[str, Form()] = "", diff --git a/discord_rss_bot/templates/blacklist.html b/discord_rss_bot/templates/blacklist.html index 3632277..ec16bce 100644 --- a/discord_rss_bot/templates/blacklist.html +++ b/discord_rss_bot/templates/blacklist.html @@ -42,6 +42,49 @@ + +
+
+
    +
  • + Regular expression patterns for advanced filtering. Each pattern should be on a new + line. +
  • +
  • Patterns are case-insensitive.
  • +
  • + Examples: + +
    +^New Release:.*
    +\b(update|version|patch)\s+\d+\.\d+
    +.*\[(important|notice)\].*
    +
    +
    +
  • +
+
+ + + + + + + + + + + +
diff --git a/discord_rss_bot/templates/whitelist.html b/discord_rss_bot/templates/whitelist.html index 5a958f6..61755e2 100644 --- a/discord_rss_bot/templates/whitelist.html +++ b/discord_rss_bot/templates/whitelist.html @@ -1,6 +1,6 @@ {% extends "base.html" %} {% block title %} -| Blacklist +| Whitelist {% endblock title %} {% block content %}
@@ -42,6 +42,49 @@ + +
+
+
    +
  • + Regular expression patterns for advanced filtering. Each pattern should be on a new + line. +
  • +
  • Patterns are case-insensitive.
  • +
  • + Examples: + +
    +^New Release:.*
    +\b(update|version|patch)\s+\d+\.\d+
    +.*\[(important|notice)\].*
    +
    +
    +
  • +
+
+ + + + + + + + + + + +
diff --git a/pyproject.toml b/pyproject.toml index 4cda1f6..21ab35a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ platformdirs = "*" python-dotenv = "*" python-multipart = "*" reader = "*" -sentry-sdk = {version = "*", extras = ["fastapi"]} +sentry-sdk = { version = "*", extras = ["fastapi"] } uvicorn = "*" [tool.poetry.group.dev.dependencies] @@ -86,6 +86,8 @@ lint.ignore = [ "PLR6301", # Checks for the presence of unused self parameter in methods definitions. "RUF029", # Checks for functions declared async that do not await or otherwise use features requiring the function to be declared async. "TD003", # Checks that a TODO comment is associated with a link to a relevant issue or ticket. + "PLR0913", # Checks for function definitions that include too many arguments. + "PLR0917", # Checks for function definitions that include too many positional arguments. # Conflicting lint rules when using Ruff's formatter # https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules diff --git a/tests/test_blacklist.py b/tests/test_blacklist.py index 4f5a317..d2a785b 100644 --- a/tests/test_blacklist.py +++ b/tests/test_blacklist.py @@ -39,6 +39,13 @@ def test_has_black_tags() -> None: check_if_has_tag(reader, feed, "blacklist_title") check_if_has_tag(reader, feed, "blacklist_summary") check_if_has_tag(reader, feed, "blacklist_content") + check_if_has_tag(reader, feed, "blacklist_author") + + # Test regex blacklist tags + check_if_has_tag(reader, feed, "regex_blacklist_title") + check_if_has_tag(reader, feed, "regex_blacklist_summary") + check_if_has_tag(reader, feed, "regex_blacklist_content") + check_if_has_tag(reader, feed, "regex_blacklist_author") # Clean up reader.delete_feed(feed_url) @@ -74,6 +81,7 @@ def test_should_be_skipped() -> None: # Test entry without any blacklists assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" + # Test standard blacklist functionality reader.set_tag(feed, "blacklist_title", "fvnnnfnfdnfdnfd") # pyright: ignore[reportArgumentType] assert entry_should_be_skipped(reader, first_entry[0]) is True, f"Entry should be skipped: {first_entry[0]}" reader.delete_tag(feed, "blacklist_title") @@ -113,3 +121,81 @@ def test_should_be_skipped() -> None: assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" reader.delete_tag(feed, "blacklist_author") assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" + + +def test_regex_should_be_skipped() -> None: + """Test the regex filtering functionality for blacklist.""" + reader: Reader = get_reader() + + # Add feed and update entries + reader.add_feed(feed_url) + feed: Feed = reader.get_feed(feed_url) + reader.update_feeds() + + # Get first entry + first_entry: list[Entry] = [] + entries: Iterable[Entry] = reader.get_entries(feed=feed) + assert entries is not None, f"Entries should not be None: {entries}" + for entry in entries: + first_entry.append(entry) + break + assert len(first_entry) == 1, f"First entry should be added: {first_entry}" + + # Test entry without any regex blacklists + assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" + + # Test regex blacklist for title + reader.set_tag(feed, "regex_blacklist_title", r"fvnnn\w+") # pyright: ignore[reportArgumentType] + assert entry_should_be_skipped(reader, first_entry[0]) is True, ( + f"Entry should be skipped with regex title match: {first_entry[0]}" + ) + reader.delete_tag(feed, "regex_blacklist_title") + assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" + + # Test regex blacklist for summary + reader.set_tag(feed, "regex_blacklist_summary", r"ffdnfdn\w+") # pyright: ignore[reportArgumentType] + assert entry_should_be_skipped(reader, first_entry[0]) is True, ( + f"Entry should be skipped with regex summary match: {first_entry[0]}" + ) + reader.delete_tag(feed, "regex_blacklist_summary") + assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" + + # Test regex blacklist for content + reader.set_tag(feed, "regex_blacklist_content", r"ffdnfdnfdn\w+") # pyright: ignore[reportArgumentType] + assert entry_should_be_skipped(reader, first_entry[0]) is True, ( + f"Entry should be skipped with regex content match: {first_entry[0]}" + ) + reader.delete_tag(feed, "regex_blacklist_content") + assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" + + # Test regex blacklist for author + reader.set_tag(feed, "regex_blacklist_author", r"TheLovinator\d*") # pyright: ignore[reportArgumentType] + assert entry_should_be_skipped(reader, first_entry[0]) is True, ( + f"Entry should be skipped with regex author match: {first_entry[0]}" + ) + reader.delete_tag(feed, "regex_blacklist_author") + assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" + + # Test invalid regex pattern (should not raise an exception) + reader.set_tag(feed, "regex_blacklist_title", r"[incomplete") # pyright: ignore[reportArgumentType] + assert entry_should_be_skipped(reader, first_entry[0]) is False, ( + f"Entry should not be skipped with invalid regex: {first_entry[0]}" + ) + reader.delete_tag(feed, "regex_blacklist_title") + + # Test multiple regex patterns separated by commas + reader.set_tag(feed, "regex_blacklist_author", r"pattern1,TheLovinator\d*,pattern3") # pyright: ignore[reportArgumentType] + assert entry_should_be_skipped(reader, first_entry[0]) is True, ( + f"Entry should be skipped with one matching pattern in list: {first_entry[0]}" + ) + reader.delete_tag(feed, "regex_blacklist_author") + assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" + + # Test newline-separated regex patterns + newline_patterns = "pattern1\nTheLovinator\\d*\npattern3" + reader.set_tag(feed, "regex_blacklist_author", newline_patterns) # pyright: ignore[reportArgumentType] + assert entry_should_be_skipped(reader, first_entry[0]) is True, ( + f"Entry should be skipped with newline-separated patterns: {first_entry[0]}" + ) + reader.delete_tag(feed, "regex_blacklist_author") + assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" diff --git a/tests/test_utils.py b/tests/test_utils.py index 0bccb6b..5274eb8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,6 @@ from __future__ import annotations -from discord_rss_bot.filter.utils import is_word_in_text +from discord_rss_bot.filter.utils import is_regex_match, is_word_in_text def test_is_word_in_text() -> None: @@ -14,3 +14,51 @@ def test_is_word_in_text() -> None: assert is_word_in_text("Alert,Forma", "Outbreak - Mutagen Mass - Rhea (Saturn)") is False, msg_false assert is_word_in_text("Alert,Forma", "Outbreak - Mutagen Mass - Rhea (Saturn)") is False, msg_false assert is_word_in_text("word1,word2", "This is a sample text containing none of the words.") is False, msg_false + + +def test_is_regex_match() -> None: + msg_true = "Should return True" + msg_false = "Should return False" + + # Test basic regex patterns + assert is_regex_match(r"word\d+", "This text contains word123") is True, msg_true + assert is_regex_match(r"^Hello", "Hello world") is True, msg_true + assert is_regex_match(r"world$", "Hello world") is True, msg_true + + # Test case insensitivity + assert is_regex_match(r"hello", "This text contains HELLO") is True, msg_true + + # Test comma-separated patterns + assert is_regex_match(r"pattern1,pattern2", "This contains pattern2") is True, msg_true + assert is_regex_match(r"pattern1, pattern2", "This contains pattern1") is True, msg_true + + # Test regex that shouldn't match + assert is_regex_match(r"^start", "This doesn't start with the pattern") is False, msg_false + assert is_regex_match(r"end$", "This doesn't end with the pattern") is False, msg_false + + # Test with empty input + assert is_regex_match("", "Some text") is False, msg_false + assert is_regex_match("pattern", "") is False, msg_false + + # Test with invalid regex (should not raise an exception and return False) + assert is_regex_match(r"[incomplete", "Some text") is False, msg_false + + # Test with multiple patterns where one is invalid + assert is_regex_match(r"valid, [invalid, \w+", "Contains word") is True, msg_true + + # Test newline-separated patterns + newline_patterns = "pattern1\n^start\ncontains\\d+" + assert is_regex_match(newline_patterns, "This contains123 text") is True, msg_true + assert is_regex_match(newline_patterns, "start of line") is True, msg_true + assert is_regex_match(newline_patterns, "pattern1 is here") is True, msg_true + assert is_regex_match(newline_patterns, "None of these match") is False, msg_false + + # Test mixed newline and comma patterns (for backward compatibility) + mixed_patterns = "pattern1\npattern2,pattern3\npattern4" + assert is_regex_match(mixed_patterns, "Contains pattern3") is True, msg_true + assert is_regex_match(mixed_patterns, "Contains pattern4") is True, msg_true + + # Test with empty lines and spaces + whitespace_patterns = "\\s+\n \n\npattern\n\n" + assert is_regex_match(whitespace_patterns, "text with spaces") is True, msg_true + assert is_regex_match(whitespace_patterns, "text with pattern") is True, msg_true diff --git a/tests/test_whitelist.py b/tests/test_whitelist.py index cf39aa0..9fbb712 100644 --- a/tests/test_whitelist.py +++ b/tests/test_whitelist.py @@ -38,6 +38,13 @@ def test_has_white_tags() -> None: check_if_has_tag(reader, feed, "whitelist_title") check_if_has_tag(reader, feed, "whitelist_summary") check_if_has_tag(reader, feed, "whitelist_content") + check_if_has_tag(reader, feed, "whitelist_author") + + # Test regex whitelist tags + check_if_has_tag(reader, feed, "regex_whitelist_title") + check_if_has_tag(reader, feed, "regex_whitelist_summary") + check_if_has_tag(reader, feed, "regex_whitelist_content") + check_if_has_tag(reader, feed, "regex_whitelist_author") # Clean up reader.delete_feed(feed_url) @@ -109,3 +116,67 @@ def test_should_be_sent() -> None: assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent" reader.delete_tag(feed, "whitelist_author") assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent" + + +def test_regex_should_be_sent() -> None: + """Test the regex filtering functionality for whitelist.""" + reader: Reader = get_reader() + + # Add feed and update entries + reader.add_feed(feed_url) + feed: Feed = reader.get_feed(feed_url) + reader.update_feeds() + + # Get first entry + first_entry: list[Entry] = [] + entries: Iterable[Entry] = reader.get_entries(feed=feed) + assert entries is not None, "Entries should not be None" + for entry in entries: + first_entry.append(entry) + break + assert len(first_entry) == 1, "First entry should be added" + + # Test entry without any regex whitelists + assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent" + + # Test regex whitelist for title + reader.set_tag(feed, "regex_whitelist_title", r"fvnnn\w+") # pyright: ignore[reportArgumentType] + assert should_be_sent(reader, first_entry[0]) is True, "Entry should be sent with regex title match" + reader.delete_tag(feed, "regex_whitelist_title") + assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent" + + # Test regex whitelist for summary + reader.set_tag(feed, "regex_whitelist_summary", r"ffdnfdn\w+") # pyright: ignore[reportArgumentType] + assert should_be_sent(reader, first_entry[0]) is True, "Entry should be sent with regex summary match" + reader.delete_tag(feed, "regex_whitelist_summary") + assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent" + + # Test regex whitelist for content + reader.set_tag(feed, "regex_whitelist_content", r"ffdnfdnfdn\w+") # pyright: ignore[reportArgumentType] + assert should_be_sent(reader, first_entry[0]) is True, "Entry should be sent with regex content match" + reader.delete_tag(feed, "regex_whitelist_content") + assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent" + + # Test regex whitelist for author + reader.set_tag(feed, "regex_whitelist_author", r"TheLovinator\d*") # pyright: ignore[reportArgumentType] + assert should_be_sent(reader, first_entry[0]) is True, "Entry should be sent with regex author match" + reader.delete_tag(feed, "regex_whitelist_author") + assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent" + + # Test invalid regex pattern (should not raise an exception) + reader.set_tag(feed, "regex_whitelist_title", r"[incomplete") # pyright: ignore[reportArgumentType] + assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent with invalid regex" + reader.delete_tag(feed, "regex_whitelist_title") + + # Test multiple regex patterns separated by commas + reader.set_tag(feed, "regex_whitelist_author", r"pattern1,TheLovinator\d*,pattern3") # pyright: ignore[reportArgumentType] + assert should_be_sent(reader, first_entry[0]) is True, "Entry should be sent with one matching pattern in list" + reader.delete_tag(feed, "regex_whitelist_author") + assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent" + + # Test newline-separated regex patterns + newline_patterns = "pattern1\nTheLovinator\\d*\npattern3" + reader.set_tag(feed, "regex_whitelist_author", newline_patterns) # pyright: ignore[reportArgumentType] + assert should_be_sent(reader, first_entry[0]) is True, "Entry should be sent with newline-separated patterns" + reader.delete_tag(feed, "regex_whitelist_author") + assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent"