Add regex support to blacklist and whitelist filters. Strong code, many bananas! 🦍🦍🦍🦍

This commit is contained in:
2025-04-03 05:44:50 +02:00
parent 84e39c9f79
commit ac63041b28
11 changed files with 526 additions and 39 deletions

View File

@ -38,7 +38,7 @@ repos:
# An extremely fast Python linter and formatter. # An extremely fast Python linter and formatter.
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.5 rev: v0.11.2
hooks: hooks:
- id: ruff-format - id: ruff-format
- id: ruff - id: ruff

View File

@ -2,7 +2,7 @@ from __future__ import annotations
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from discord_rss_bot.filter.utils import is_word_in_text from discord_rss_bot.filter.utils import is_regex_match, is_word_in_text
if TYPE_CHECKING: if TYPE_CHECKING:
from reader import Entry, Feed, Reader from reader import Entry, Feed, Reader
@ -12,9 +12,14 @@ def feed_has_blacklist_tags(custom_reader: Reader, feed: Feed) -> bool:
"""Return True if the feed has blacklist tags. """Return True if the feed has blacklist tags.
The following tags are checked: The following tags are checked:
- blacklist_title - blacklist_author
- blacklist_content
- blacklist_summary - blacklist_summary
- blacklist_content. - blacklist_title
- regex_blacklist_author
- regex_blacklist_content
- regex_blacklist_summary
- regex_blacklist_title
Args: Args:
custom_reader: The reader. custom_reader: The reader.
@ -23,14 +28,29 @@ def feed_has_blacklist_tags(custom_reader: Reader, feed: Feed) -> bool:
Returns: Returns:
bool: If the feed has any of the tags. bool: If the feed has any of the tags.
""" """
blacklist_title: str = str(custom_reader.get_tag(feed, "blacklist_title", "")) blacklist_author: str = str(custom_reader.get_tag(feed, "blacklist_author", "")).strip()
blacklist_summary: str = str(custom_reader.get_tag(feed, "blacklist_summary", "")) blacklist_content: str = str(custom_reader.get_tag(feed, "blacklist_content", "")).strip()
blacklist_content: str = str(custom_reader.get_tag(feed, "blacklist_content", "")) blacklist_summary: str = str(custom_reader.get_tag(feed, "blacklist_summary", "")).strip()
blacklist_title: str = str(custom_reader.get_tag(feed, "blacklist_title", "")).strip()
return bool(blacklist_title or blacklist_summary or blacklist_content) regex_blacklist_author: str = str(custom_reader.get_tag(feed, "regex_blacklist_author", "")).strip()
regex_blacklist_content: str = str(custom_reader.get_tag(feed, "regex_blacklist_content", "")).strip()
regex_blacklist_summary: str = str(custom_reader.get_tag(feed, "regex_blacklist_summary", "")).strip()
regex_blacklist_title: str = str(custom_reader.get_tag(feed, "regex_blacklist_title", "")).strip()
return bool(
blacklist_title
or blacklist_author
or blacklist_content
or blacklist_summary
or regex_blacklist_author
or regex_blacklist_content
or regex_blacklist_summary
or regex_blacklist_title,
)
def entry_should_be_skipped(custom_reader: Reader, entry: Entry) -> bool: def entry_should_be_skipped(custom_reader: Reader, entry: Entry) -> bool: # noqa: PLR0911
"""Return True if the entry is in the blacklist. """Return True if the entry is in the blacklist.
Args: Args:
@ -40,21 +60,58 @@ def entry_should_be_skipped(custom_reader: Reader, entry: Entry) -> bool:
Returns: Returns:
bool: If the entry is in the blacklist. bool: If the entry is in the blacklist.
""" """
blacklist_title: str = str(custom_reader.get_tag(entry.feed, "blacklist_title", "")) feed = entry.feed
blacklist_summary: str = str(custom_reader.get_tag(entry.feed, "blacklist_summary", ""))
blacklist_content: str = str(custom_reader.get_tag(entry.feed, "blacklist_content", "")) blacklist_title: str = str(custom_reader.get_tag(feed, "blacklist_title", "")).strip()
blacklist_author: str = str(custom_reader.get_tag(entry.feed, "blacklist_author", "")) blacklist_summary: str = str(custom_reader.get_tag(feed, "blacklist_summary", "")).strip()
blacklist_content: str = str(custom_reader.get_tag(feed, "blacklist_content", "")).strip()
blacklist_author: str = str(custom_reader.get_tag(feed, "blacklist_author", "")).strip()
regex_blacklist_title: str = str(custom_reader.get_tag(feed, "regex_blacklist_title", "")).strip()
regex_blacklist_summary: str = str(custom_reader.get_tag(feed, "regex_blacklist_summary", "")).strip()
regex_blacklist_content: str = str(custom_reader.get_tag(feed, "regex_blacklist_content", "")).strip()
regex_blacklist_author: str = str(custom_reader.get_tag(feed, "regex_blacklist_author", "")).strip()
# TODO(TheLovinator): Also add support for entry_text and more. # TODO(TheLovinator): Also add support for entry_text and more.
# Check regular blacklist
if entry.title and blacklist_title and is_word_in_text(blacklist_title, entry.title): if entry.title and blacklist_title and is_word_in_text(blacklist_title, entry.title):
return True return True
if entry.summary and blacklist_summary and is_word_in_text(blacklist_summary, entry.summary): if entry.summary and blacklist_summary and is_word_in_text(blacklist_summary, entry.summary):
return True return True
if (
entry.content
and entry.content[0].value
and blacklist_content
and is_word_in_text(blacklist_content, entry.content[0].value)
):
return True
if entry.author and blacklist_author and is_word_in_text(blacklist_author, entry.author): if entry.author and blacklist_author and is_word_in_text(blacklist_author, entry.author):
return True return True
if (
entry.content
and entry.content[0].value
and blacklist_content
and is_word_in_text(blacklist_content, entry.content[0].value)
):
return True
# Check regex blacklist
if entry.title and regex_blacklist_title and is_regex_match(regex_blacklist_title, entry.title):
return True
if entry.summary and regex_blacklist_summary and is_regex_match(regex_blacklist_summary, entry.summary):
return True
if (
entry.content
and entry.content[0].value
and regex_blacklist_content
and is_regex_match(regex_blacklist_content, entry.content[0].value)
):
return True
if entry.author and regex_blacklist_author and is_regex_match(regex_blacklist_author, entry.author):
return True
return bool( return bool(
entry.content entry.content
and entry.content[0].value and entry.content[0].value
and blacklist_content and regex_blacklist_content
and is_word_in_text(blacklist_content, entry.content[0].value), and is_regex_match(regex_blacklist_content, entry.content[0].value),
) )

View File

@ -1,7 +1,10 @@
from __future__ import annotations from __future__ import annotations
import logging
import re import re
logger: logging.Logger = logging.getLogger(__name__)
def is_word_in_text(word_string: str, text: str) -> bool: def is_word_in_text(word_string: str, text: str) -> bool:
"""Check if any of the words are in the text. """Check if any of the words are in the text.
@ -20,3 +23,50 @@ def is_word_in_text(word_string: str, text: str) -> bool:
# Check if any pattern matches the text. # Check if any pattern matches the text.
return any(pattern.search(text) for pattern in patterns) return any(pattern.search(text) for pattern in patterns)
def _split_regex_line(line: str) -> list[str]:
    """Split one non-empty, stripped line of a regex tag into individual patterns.

    Older configurations separated patterns with commas, so a line may hold several of them.
    A comma is only treated as a separator when it sits outside ``()``, ``{}`` and ``[]`` and is
    not escaped with a backslash; otherwise patterns such as ``\\d{2,4}`` or ``[a,b]`` would be
    cut into invalid fragments.

    This is a small scanner, not a regex parser. When the brackets on the line are unbalanced
    (or the line ends in a lone backslash) it falls back to the legacy behaviour of splitting on
    every comma.

    Args:
        line: A single line taken from the regex tag.

    Returns:
        list[str]: The non-empty, stripped patterns found on the line.
    """
    if "," not in line:
        return [line]

    parts: list[str] = []
    current: list[str] = []
    depth = 0  # Nesting level of "(" and "{".
    in_class = False  # Inside a "[...]" character class.
    escaped = False  # Previous character was an unescaped backslash.

    for char in line:
        if escaped:
            escaped = False
        elif char == "\\":
            escaped = True
        elif in_class:
            in_class = char != "]"
        elif char == "[":
            in_class = True
        elif char in "({":
            depth += 1
        elif char in ")}":
            depth = max(depth - 1, 0)
        elif char == "," and depth == 0:
            # Top-level comma: close the current pattern and drop the separator itself.
            parts.append("".join(current))
            current = []
            continue
        current.append(char)
    parts.append("".join(current))

    if depth or in_class or escaped:
        # Unbalanced input: keep the legacy "split on every comma" behaviour.
        parts = line.split(",")

    return [part.strip() for part in parts if part.strip()]


def is_regex_match(regex_string: str, text: str) -> bool:
    """Check if any of the regex patterns match the text.

    Patterns are compiled case-insensitively and applied with ``search``, so they may match
    anywhere in the text. Invalid patterns are logged and skipped instead of raising.

    Args:
        regex_string: A string containing regex patterns, separated by newlines. For backward
            compatibility, commas that are not part of a group, quantifier, character class or
            escape sequence also separate patterns.
        text: The text to search in.

    Returns:
        bool: True if any regex pattern matches the text, otherwise False.
    """
    if not regex_string or not text:
        return False

    # One pattern per line; each line may additionally hold legacy comma-separated patterns.
    regex_list: list[str] = []
    for line in regex_string.split("\n"):
        stripped_line: str = line.strip()
        if stripped_line:
            regex_list.extend(_split_regex_line(stripped_line))

    for pattern_str in regex_list:
        try:
            pattern: re.Pattern[str] = re.compile(pattern_str, re.IGNORECASE)
        except re.error:
            logger.warning("Invalid regex pattern: %s", pattern_str)
            continue

        if pattern.search(text):
            logger.info("Regex pattern matched: %s", pattern_str)
            return True

    logger.info("No regex patterns matched.")
    return False

View File

@ -2,7 +2,7 @@ from __future__ import annotations
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from discord_rss_bot.filter.utils import is_word_in_text from discord_rss_bot.filter.utils import is_regex_match, is_word_in_text
if TYPE_CHECKING: if TYPE_CHECKING:
from reader import Entry, Feed, Reader from reader import Entry, Feed, Reader
@ -12,9 +12,14 @@ def has_white_tags(custom_reader: Reader, feed: Feed) -> bool:
"""Return True if the feed has whitelist tags. """Return True if the feed has whitelist tags.
The following tags are checked: The following tags are checked:
- whitelist_title - regex_whitelist_author
- regex_whitelist_content
- regex_whitelist_summary
- regex_whitelist_title
- whitelist_author
- whitelist_content
- whitelist_summary - whitelist_summary
- whitelist_content. - whitelist_title
Args: Args:
custom_reader: The reader. custom_reader: The reader.
@ -23,14 +28,29 @@ def has_white_tags(custom_reader: Reader, feed: Feed) -> bool:
Returns: Returns:
bool: If the feed has any of the tags. bool: If the feed has any of the tags.
""" """
whitelist_title: str = str(custom_reader.get_tag(feed, "whitelist_title", "")) whitelist_title: str = str(custom_reader.get_tag(feed, "whitelist_title", "")).strip()
whitelist_summary: str = str(custom_reader.get_tag(feed, "whitelist_summary", "")) whitelist_summary: str = str(custom_reader.get_tag(feed, "whitelist_summary", "")).strip()
whitelist_content: str = str(custom_reader.get_tag(feed, "whitelist_content", "")) whitelist_content: str = str(custom_reader.get_tag(feed, "whitelist_content", "")).strip()
whitelist_author: str = str(custom_reader.get_tag(feed, "whitelist_author", "")).strip()
return bool(whitelist_title or whitelist_summary or whitelist_content) regex_whitelist_title: str = str(custom_reader.get_tag(feed, "regex_whitelist_title", "")).strip()
regex_whitelist_summary: str = str(custom_reader.get_tag(feed, "regex_whitelist_summary", "")).strip()
regex_whitelist_content: str = str(custom_reader.get_tag(feed, "regex_whitelist_content", "")).strip()
regex_whitelist_author: str = str(custom_reader.get_tag(feed, "regex_whitelist_author", "")).strip()
return bool(
whitelist_title
or whitelist_author
or whitelist_content
or whitelist_summary
or regex_whitelist_author
or regex_whitelist_content
or regex_whitelist_summary
or regex_whitelist_title,
)
def should_be_sent(custom_reader: Reader, entry: Entry) -> bool: def should_be_sent(custom_reader: Reader, entry: Entry) -> bool: # noqa: PLR0911
"""Return True if the entry is in the whitelist. """Return True if the entry is in the whitelist.
Args: Args:
@ -41,20 +61,43 @@ def should_be_sent(custom_reader: Reader, entry: Entry) -> bool:
bool: If the entry is in the whitelist. bool: If the entry is in the whitelist.
""" """
feed: Feed = entry.feed feed: Feed = entry.feed
whitelist_title: str = str(custom_reader.get_tag(feed, "whitelist_title", "")) # Regular whitelist tags
whitelist_summary: str = str(custom_reader.get_tag(feed, "whitelist_summary", "")) whitelist_title: str = str(custom_reader.get_tag(feed, "whitelist_title", "")).strip()
whitelist_content: str = str(custom_reader.get_tag(feed, "whitelist_content", "")) whitelist_summary: str = str(custom_reader.get_tag(feed, "whitelist_summary", "")).strip()
whitelist_author: str = str(custom_reader.get_tag(feed, "whitelist_author", "")) whitelist_content: str = str(custom_reader.get_tag(feed, "whitelist_content", "")).strip()
whitelist_author: str = str(custom_reader.get_tag(feed, "whitelist_author", "")).strip()
# Regex whitelist tags
regex_whitelist_title: str = str(custom_reader.get_tag(feed, "regex_whitelist_title", "")).strip()
regex_whitelist_summary: str = str(custom_reader.get_tag(feed, "regex_whitelist_summary", "")).strip()
regex_whitelist_content: str = str(custom_reader.get_tag(feed, "regex_whitelist_content", "")).strip()
regex_whitelist_author: str = str(custom_reader.get_tag(feed, "regex_whitelist_author", "")).strip()
# Check regular whitelist
if entry.title and whitelist_title and is_word_in_text(whitelist_title, entry.title): if entry.title and whitelist_title and is_word_in_text(whitelist_title, entry.title):
return True return True
if entry.summary and whitelist_summary and is_word_in_text(whitelist_summary, entry.summary): if entry.summary and whitelist_summary and is_word_in_text(whitelist_summary, entry.summary):
return True return True
if entry.author and whitelist_author and is_word_in_text(whitelist_author, entry.author): if entry.author and whitelist_author and is_word_in_text(whitelist_author, entry.author):
return True return True
return bool( if (
entry.content entry.content
and entry.content[0].value and entry.content[0].value
and whitelist_content and whitelist_content
and is_word_in_text(whitelist_content, entry.content[0].value), and is_word_in_text(whitelist_content, entry.content[0].value)
):
return True
# Check regex whitelist
if entry.title and regex_whitelist_title and is_regex_match(regex_whitelist_title, entry.title):
return True
if entry.summary and regex_whitelist_summary and is_regex_match(regex_whitelist_summary, entry.summary):
return True
if entry.author and regex_whitelist_author and is_regex_match(regex_whitelist_author, entry.author):
return True
return bool(
entry.content
and entry.content[0].value
and regex_whitelist_content
and is_regex_match(regex_whitelist_content, entry.content[0].value),
) )

View File

@ -43,7 +43,7 @@ from discord_rss_bot.search import create_html_for_search_results
from discord_rss_bot.settings import get_reader from discord_rss_bot.settings import get_reader
if TYPE_CHECKING: if TYPE_CHECKING:
from collections.abc import Iterable from collections.abc import AsyncGenerator, Iterable
from reader.types import JSONType from reader.types import JSONType
@ -88,8 +88,15 @@ reader: Reader = get_reader()
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI) -> typing.AsyncGenerator[None]: async def lifespan(app: FastAPI) -> AsyncGenerator[None]:
"""This is needed for the ASGI server to run.""" """Lifespan for the FastAPI app.
Args:
app: The FastAPI app.
Yields:
None: Nothing.
"""
add_missing_tags(reader) add_missing_tags(reader)
scheduler: AsyncIOScheduler = AsyncIOScheduler() scheduler: AsyncIOScheduler = AsyncIOScheduler()
@ -250,6 +257,10 @@ async def post_set_whitelist(
whitelist_summary: Annotated[str, Form()] = "", whitelist_summary: Annotated[str, Form()] = "",
whitelist_content: Annotated[str, Form()] = "", whitelist_content: Annotated[str, Form()] = "",
whitelist_author: Annotated[str, Form()] = "", whitelist_author: Annotated[str, Form()] = "",
regex_whitelist_title: Annotated[str, Form()] = "",
regex_whitelist_summary: Annotated[str, Form()] = "",
regex_whitelist_content: Annotated[str, Form()] = "",
regex_whitelist_author: Annotated[str, Form()] = "",
feed_url: Annotated[str, Form()] = "", feed_url: Annotated[str, Form()] = "",
) -> RedirectResponse: ) -> RedirectResponse:
"""Set what the whitelist should be sent, if you have this set only words in the whitelist will be sent. """Set what the whitelist should be sent, if you have this set only words in the whitelist will be sent.
@ -259,6 +270,10 @@ async def post_set_whitelist(
whitelist_summary: Whitelisted words for when checking the summary. whitelist_summary: Whitelisted words for when checking the summary.
whitelist_content: Whitelisted words for when checking the content. whitelist_content: Whitelisted words for when checking the content.
whitelist_author: Whitelisted words for when checking the author. whitelist_author: Whitelisted words for when checking the author.
regex_whitelist_title: Whitelisted regex for when checking the title.
regex_whitelist_summary: Whitelisted regex for when checking the summary.
regex_whitelist_content: Whitelisted regex for when checking the content.
regex_whitelist_author: Whitelisted regex for when checking the author.
feed_url: The feed we should set the whitelist for. feed_url: The feed we should set the whitelist for.
Returns: Returns:
@ -269,6 +284,10 @@ async def post_set_whitelist(
reader.set_tag(clean_feed_url, "whitelist_summary", whitelist_summary) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "whitelist_summary", whitelist_summary) # pyright: ignore[reportArgumentType][call-overload]
reader.set_tag(clean_feed_url, "whitelist_content", whitelist_content) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "whitelist_content", whitelist_content) # pyright: ignore[reportArgumentType][call-overload]
reader.set_tag(clean_feed_url, "whitelist_author", whitelist_author) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "whitelist_author", whitelist_author) # pyright: ignore[reportArgumentType][call-overload]
reader.set_tag(clean_feed_url, "regex_whitelist_title", regex_whitelist_title) # pyright: ignore[reportArgumentType][call-overload]
reader.set_tag(clean_feed_url, "regex_whitelist_summary", regex_whitelist_summary) # pyright: ignore[reportArgumentType][call-overload]
reader.set_tag(clean_feed_url, "regex_whitelist_content", regex_whitelist_content) # pyright: ignore[reportArgumentType][call-overload]
reader.set_tag(clean_feed_url, "regex_whitelist_author", regex_whitelist_author) # pyright: ignore[reportArgumentType][call-overload]
return RedirectResponse(url=f"/feed?feed_url={urllib.parse.quote(clean_feed_url)}", status_code=303) return RedirectResponse(url=f"/feed?feed_url={urllib.parse.quote(clean_feed_url)}", status_code=303)
@ -287,11 +306,14 @@ async def get_whitelist(feed_url: str, request: Request):
clean_feed_url: str = feed_url.strip() clean_feed_url: str = feed_url.strip()
feed: Feed = reader.get_feed(urllib.parse.unquote(clean_feed_url)) feed: Feed = reader.get_feed(urllib.parse.unquote(clean_feed_url))
# Get previous data, this is used when creating the form.
whitelist_title: str = str(reader.get_tag(feed, "whitelist_title", "")) whitelist_title: str = str(reader.get_tag(feed, "whitelist_title", ""))
whitelist_summary: str = str(reader.get_tag(feed, "whitelist_summary", "")) whitelist_summary: str = str(reader.get_tag(feed, "whitelist_summary", ""))
whitelist_content: str = str(reader.get_tag(feed, "whitelist_content", "")) whitelist_content: str = str(reader.get_tag(feed, "whitelist_content", ""))
whitelist_author: str = str(reader.get_tag(feed, "whitelist_author", "")) whitelist_author: str = str(reader.get_tag(feed, "whitelist_author", ""))
regex_whitelist_title: str = str(reader.get_tag(feed, "regex_whitelist_title", ""))
regex_whitelist_summary: str = str(reader.get_tag(feed, "regex_whitelist_summary", ""))
regex_whitelist_content: str = str(reader.get_tag(feed, "regex_whitelist_content", ""))
regex_whitelist_author: str = str(reader.get_tag(feed, "regex_whitelist_author", ""))
context = { context = {
"request": request, "request": request,
@ -300,6 +322,10 @@ async def get_whitelist(feed_url: str, request: Request):
"whitelist_summary": whitelist_summary, "whitelist_summary": whitelist_summary,
"whitelist_content": whitelist_content, "whitelist_content": whitelist_content,
"whitelist_author": whitelist_author, "whitelist_author": whitelist_author,
"regex_whitelist_title": regex_whitelist_title,
"regex_whitelist_summary": regex_whitelist_summary,
"regex_whitelist_content": regex_whitelist_content,
"regex_whitelist_author": regex_whitelist_author,
} }
return templates.TemplateResponse(request=request, name="whitelist.html", context=context) return templates.TemplateResponse(request=request, name="whitelist.html", context=context)
@ -310,6 +336,10 @@ async def post_set_blacklist(
blacklist_summary: Annotated[str, Form()] = "", blacklist_summary: Annotated[str, Form()] = "",
blacklist_content: Annotated[str, Form()] = "", blacklist_content: Annotated[str, Form()] = "",
blacklist_author: Annotated[str, Form()] = "", blacklist_author: Annotated[str, Form()] = "",
regex_blacklist_title: Annotated[str, Form()] = "",
regex_blacklist_summary: Annotated[str, Form()] = "",
regex_blacklist_content: Annotated[str, Form()] = "",
regex_blacklist_author: Annotated[str, Form()] = "",
feed_url: Annotated[str, Form()] = "", feed_url: Annotated[str, Form()] = "",
) -> RedirectResponse: ) -> RedirectResponse:
"""Set the blacklist. """Set the blacklist.
@ -322,6 +352,10 @@ async def post_set_blacklist(
blacklist_summary: Blacklisted words for when checking the summary. blacklist_summary: Blacklisted words for when checking the summary.
blacklist_content: Blacklisted words for when checking the content. blacklist_content: Blacklisted words for when checking the content.
blacklist_author: Blacklisted words for when checking the author. blacklist_author: Blacklisted words for when checking the author.
regex_blacklist_title: Blacklisted regex for when checking the title.
regex_blacklist_summary: Blacklisted regex for when checking the summary.
regex_blacklist_content: Blacklisted regex for when checking the content.
regex_blacklist_author: Blacklisted regex for when checking the author.
feed_url: What feed we should set the blacklist for. feed_url: What feed we should set the blacklist for.
Returns: Returns:
@ -332,7 +366,10 @@ async def post_set_blacklist(
reader.set_tag(clean_feed_url, "blacklist_summary", blacklist_summary) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "blacklist_summary", blacklist_summary) # pyright: ignore[reportArgumentType][call-overload]
reader.set_tag(clean_feed_url, "blacklist_content", blacklist_content) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "blacklist_content", blacklist_content) # pyright: ignore[reportArgumentType][call-overload]
reader.set_tag(clean_feed_url, "blacklist_author", blacklist_author) # pyright: ignore[reportArgumentType][call-overload] reader.set_tag(clean_feed_url, "blacklist_author", blacklist_author) # pyright: ignore[reportArgumentType][call-overload]
reader.set_tag(clean_feed_url, "regex_blacklist_title", regex_blacklist_title) # pyright: ignore[reportArgumentType][call-overload]
reader.set_tag(clean_feed_url, "regex_blacklist_summary", regex_blacklist_summary) # pyright: ignore[reportArgumentType][call-overload]
reader.set_tag(clean_feed_url, "regex_blacklist_content", regex_blacklist_content) # pyright: ignore[reportArgumentType][call-overload]
reader.set_tag(clean_feed_url, "regex_blacklist_author", regex_blacklist_author) # pyright: ignore[reportArgumentType][call-overload]
return RedirectResponse(url=f"/feed?feed_url={urllib.parse.quote(clean_feed_url)}", status_code=303) return RedirectResponse(url=f"/feed?feed_url={urllib.parse.quote(clean_feed_url)}", status_code=303)
@ -349,11 +386,14 @@ async def get_blacklist(feed_url: str, request: Request):
""" """
feed: Feed = reader.get_feed(urllib.parse.unquote(feed_url)) feed: Feed = reader.get_feed(urllib.parse.unquote(feed_url))
# Get previous data, this is used when creating the form.
blacklist_title: str = str(reader.get_tag(feed, "blacklist_title", "")) blacklist_title: str = str(reader.get_tag(feed, "blacklist_title", ""))
blacklist_summary: str = str(reader.get_tag(feed, "blacklist_summary", "")) blacklist_summary: str = str(reader.get_tag(feed, "blacklist_summary", ""))
blacklist_content: str = str(reader.get_tag(feed, "blacklist_content", "")) blacklist_content: str = str(reader.get_tag(feed, "blacklist_content", ""))
blacklist_author: str = str(reader.get_tag(feed, "blacklist_author", "")) blacklist_author: str = str(reader.get_tag(feed, "blacklist_author", ""))
regex_blacklist_title: str = str(reader.get_tag(feed, "regex_blacklist_title", ""))
regex_blacklist_summary: str = str(reader.get_tag(feed, "regex_blacklist_summary", ""))
regex_blacklist_content: str = str(reader.get_tag(feed, "regex_blacklist_content", ""))
regex_blacklist_author: str = str(reader.get_tag(feed, "regex_blacklist_author", ""))
context = { context = {
"request": request, "request": request,
@ -362,6 +402,10 @@ async def get_blacklist(feed_url: str, request: Request):
"blacklist_summary": blacklist_summary, "blacklist_summary": blacklist_summary,
"blacklist_content": blacklist_content, "blacklist_content": blacklist_content,
"blacklist_author": blacklist_author, "blacklist_author": blacklist_author,
"regex_blacklist_title": regex_blacklist_title,
"regex_blacklist_summary": regex_blacklist_summary,
"regex_blacklist_content": regex_blacklist_content,
"regex_blacklist_author": regex_blacklist_author,
} }
return templates.TemplateResponse(request=request, name="blacklist.html", context=context) return templates.TemplateResponse(request=request, name="blacklist.html", context=context)
@ -461,7 +505,7 @@ async def get_embed_page(feed_url: str, request: Request):
@app.post("/embed", response_class=HTMLResponse) @app.post("/embed", response_class=HTMLResponse)
async def post_embed( # noqa: PLR0913, PLR0917 async def post_embed(
feed_url: Annotated[str, Form()], feed_url: Annotated[str, Form()],
title: Annotated[str, Form()] = "", title: Annotated[str, Form()] = "",
description: Annotated[str, Form()] = "", description: Annotated[str, Form()] = "",

View File

@ -42,6 +42,49 @@
<label for="blacklist_author" class="col-sm-6 col-form-label">Blacklist - Author</label> <label for="blacklist_author" class="col-sm-6 col-form-label">Blacklist - Author</label>
<input name="blacklist_author" type="text" class="form-control bg-dark border-dark text-muted" <input name="blacklist_author" type="text" class="form-control bg-dark border-dark text-muted"
id="blacklist_author" value="{%- if blacklist_author -%}{{ blacklist_author }}{%- endif -%}" /> id="blacklist_author" value="{%- if blacklist_author -%}{{ blacklist_author }}{%- endif -%}" />
<div class="mt-4">
    {# Regex blacklist section: help text followed by one textarea per entry field. #}
    <div class="form-text">
        <ul class="list-inline">
            <li>
                Regular expression patterns for advanced filtering. Each pattern should be on a new
                line.
            </li>
            <li>Patterns are case-insensitive.</li>
            <li>
                Examples:
                {# <pre> must wrap <code>, not the other way round; the content stays at column 0 so no indentation is rendered. #}
                <pre><code>^New Release:.*
\b(update|version|patch)\s+\d+\.\d+
.*\[(important|notice)\].*</code></pre>
            </li>
        </ul>
    </div>
    <label for="regex_blacklist_title" class="col-sm-6 col-form-label">Regex Blacklist - Title</label>
    <textarea name="regex_blacklist_title" class="form-control bg-dark border-dark text-muted"
              id="regex_blacklist_title"
              rows="3">{%- if regex_blacklist_title -%}{{ regex_blacklist_title }}{%- endif -%}</textarea>
    <label for="regex_blacklist_summary" class="col-sm-6 col-form-label">Regex Blacklist - Summary</label>
    <textarea name="regex_blacklist_summary" class="form-control bg-dark border-dark text-muted"
              id="regex_blacklist_summary"
              rows="3">{%- if regex_blacklist_summary -%}{{ regex_blacklist_summary }}{%- endif -%}</textarea>
    <label for="regex_blacklist_content" class="col-sm-6 col-form-label">Regex Blacklist - Content</label>
    <textarea name="regex_blacklist_content" class="form-control bg-dark border-dark text-muted"
              id="regex_blacklist_content"
              rows="3">{%- if regex_blacklist_content -%}{{ regex_blacklist_content }}{%- endif -%}</textarea>
    <label for="regex_blacklist_author" class="col-sm-6 col-form-label">Regex Blacklist - Author</label>
    <textarea name="regex_blacklist_author" class="form-control bg-dark border-dark text-muted"
              id="regex_blacklist_author"
              rows="3">{%- if regex_blacklist_author -%}{{ regex_blacklist_author }}{%- endif -%}</textarea>
</div>
</div> </div>
</div> </div>
<!-- Add a hidden feed_url field to the form --> <!-- Add a hidden feed_url field to the form -->

View File

@ -1,6 +1,6 @@
{% extends "base.html" %} {% extends "base.html" %}
{% block title %} {% block title %}
| Blacklist | Whitelist
{% endblock title %} {% endblock title %}
{% block content %} {% block content %}
<div class="p-2 border border-dark"> <div class="p-2 border border-dark">
@ -42,6 +42,49 @@
<label for="whitelist_author" class="col-sm-6 col-form-label">Whitelist - Author</label> <label for="whitelist_author" class="col-sm-6 col-form-label">Whitelist - Author</label>
<input name="whitelist_author" type="text" class="form-control bg-dark border-dark text-muted" <input name="whitelist_author" type="text" class="form-control bg-dark border-dark text-muted"
id="whitelist_author" value="{%- if whitelist_author -%} {{ whitelist_author }} {%- endif -%}" /> id="whitelist_author" value="{%- if whitelist_author -%} {{ whitelist_author }} {%- endif -%}" />
<div class="mt-4">
    {# Regex whitelist section: help text followed by one textarea per entry field. #}
    <div class="form-text">
        <ul class="list-inline">
            <li>
                Regular expression patterns for advanced filtering. Each pattern should be on a new
                line.
            </li>
            <li>Patterns are case-insensitive.</li>
            <li>
                Examples:
                {# <pre> must wrap <code>, not the other way round; the content stays at column 0 so no indentation is rendered. #}
                <pre><code>^New Release:.*
\b(update|version|patch)\s+\d+\.\d+
.*\[(important|notice)\].*</code></pre>
            </li>
        </ul>
    </div>
    <label for="regex_whitelist_title" class="col-sm-6 col-form-label">Regex Whitelist - Title</label>
    <textarea name="regex_whitelist_title" class="form-control bg-dark border-dark text-muted"
              id="regex_whitelist_title"
              rows="3">{%- if regex_whitelist_title -%}{{ regex_whitelist_title }}{%- endif -%}</textarea>
    <label for="regex_whitelist_summary" class="col-sm-6 col-form-label">Regex Whitelist - Summary</label>
    <textarea name="regex_whitelist_summary" class="form-control bg-dark border-dark text-muted"
              id="regex_whitelist_summary"
              rows="3">{%- if regex_whitelist_summary -%}{{ regex_whitelist_summary }}{%- endif -%}</textarea>
    <label for="regex_whitelist_content" class="col-sm-6 col-form-label">Regex Whitelist - Content</label>
    <textarea name="regex_whitelist_content" class="form-control bg-dark border-dark text-muted"
              id="regex_whitelist_content"
              rows="3">{%- if regex_whitelist_content -%}{{ regex_whitelist_content }}{%- endif -%}</textarea>
    <label for="regex_whitelist_author" class="col-sm-6 col-form-label">Regex Whitelist - Author</label>
    <textarea name="regex_whitelist_author" class="form-control bg-dark border-dark text-muted"
              id="regex_whitelist_author"
              rows="3">{%- if regex_whitelist_author -%}{{ regex_whitelist_author }}{%- endif -%}</textarea>
</div>
</div> </div>
</div> </div>
<!-- Add a hidden feed_url field to the form --> <!-- Add a hidden feed_url field to the form -->

View File

@ -42,7 +42,7 @@ platformdirs = "*"
python-dotenv = "*" python-dotenv = "*"
python-multipart = "*" python-multipart = "*"
reader = "*" reader = "*"
sentry-sdk = {version = "*", extras = ["fastapi"]} sentry-sdk = { version = "*", extras = ["fastapi"] }
uvicorn = "*" uvicorn = "*"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
@ -86,6 +86,8 @@ lint.ignore = [
"PLR6301", # Checks for the presence of unused self parameter in methods definitions. "PLR6301", # Checks for the presence of unused self parameter in methods definitions.
"RUF029", # Checks for functions declared async that do not await or otherwise use features requiring the function to be declared async. "RUF029", # Checks for functions declared async that do not await or otherwise use features requiring the function to be declared async.
"TD003", # Checks that a TODO comment is associated with a link to a relevant issue or ticket. "TD003", # Checks that a TODO comment is associated with a link to a relevant issue or ticket.
"PLR0913", # Checks for function definitions that include too many arguments.
"PLR0917", # Checks for function definitions that include too many positional arguments.
# Conflicting lint rules when using Ruff's formatter # Conflicting lint rules when using Ruff's formatter
# https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules # https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules

View File

@ -39,6 +39,13 @@ def test_has_black_tags() -> None:
check_if_has_tag(reader, feed, "blacklist_title") check_if_has_tag(reader, feed, "blacklist_title")
check_if_has_tag(reader, feed, "blacklist_summary") check_if_has_tag(reader, feed, "blacklist_summary")
check_if_has_tag(reader, feed, "blacklist_content") check_if_has_tag(reader, feed, "blacklist_content")
check_if_has_tag(reader, feed, "blacklist_author")
# Test regex blacklist tags
check_if_has_tag(reader, feed, "regex_blacklist_title")
check_if_has_tag(reader, feed, "regex_blacklist_summary")
check_if_has_tag(reader, feed, "regex_blacklist_content")
check_if_has_tag(reader, feed, "regex_blacklist_author")
# Clean up # Clean up
reader.delete_feed(feed_url) reader.delete_feed(feed_url)
@ -74,6 +81,7 @@ def test_should_be_skipped() -> None:
# Test entry without any blacklists # Test entry without any blacklists
assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}"
# Test standard blacklist functionality
reader.set_tag(feed, "blacklist_title", "fvnnnfnfdnfdnfd") # pyright: ignore[reportArgumentType] reader.set_tag(feed, "blacklist_title", "fvnnnfnfdnfdnfd") # pyright: ignore[reportArgumentType]
assert entry_should_be_skipped(reader, first_entry[0]) is True, f"Entry should be skipped: {first_entry[0]}" assert entry_should_be_skipped(reader, first_entry[0]) is True, f"Entry should be skipped: {first_entry[0]}"
reader.delete_tag(feed, "blacklist_title") reader.delete_tag(feed, "blacklist_title")
@ -113,3 +121,81 @@ def test_should_be_skipped() -> None:
assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}"
reader.delete_tag(feed, "blacklist_author") reader.delete_tag(feed, "blacklist_author")
assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}" assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}"
def test_regex_should_be_skipped() -> None:
    """Exercise the regex_blacklist_* feed tags through entry_should_be_skipped.

    For every tag under test the pattern is set on the feed, the first entry is
    asserted to be skipped, the tag is deleted again and the entry is asserted
    to no longer be skipped. An invalid pattern is asserted not to skip the entry.
    """
    reader: Reader = get_reader()

    # Register the feed and fetch its entries.
    reader.add_feed(feed_url)
    feed: Feed = reader.get_feed(feed_url)
    reader.update_feeds()

    # Keep only the first entry yielded for the feed.
    entries: Iterable[Entry] = reader.get_entries(feed=feed)
    assert entries is not None, f"Entries should not be None: {entries}"
    first_entry: list[Entry] = []
    for candidate in entries:
        first_entry.append(candidate)
        break
    assert len(first_entry) == 1, f"First entry should be added: {first_entry}"

    def assert_skipped_only_while_tagged(tag: str, patterns: str, reason: str) -> None:
        """Set `tag`, expect the entry to be skipped, delete `tag`, expect it not to be."""
        reader.set_tag(feed, tag, patterns)  # pyright: ignore[reportArgumentType]
        assert entry_should_be_skipped(reader, first_entry[0]) is True, f"{reason}: {first_entry[0]}"
        reader.delete_tag(feed, tag)
        assert entry_should_be_skipped(reader, first_entry[0]) is False, (
            f"Entry should not be skipped: {first_entry[0]}"
        )

    # Baseline: no regex blacklist tag is set yet.
    assert entry_should_be_skipped(reader, first_entry[0]) is False, f"Entry should not be skipped: {first_entry[0]}"

    # One single-pattern check per regex blacklist tag.
    assert_skipped_only_while_tagged(
        "regex_blacklist_title",
        r"fvnnn\w+",
        "Entry should be skipped with regex title match",
    )
    assert_skipped_only_while_tagged(
        "regex_blacklist_summary",
        r"ffdnfdn\w+",
        "Entry should be skipped with regex summary match",
    )
    assert_skipped_only_while_tagged(
        "regex_blacklist_content",
        r"ffdnfdnfdn\w+",
        "Entry should be skipped with regex content match",
    )
    assert_skipped_only_while_tagged(
        "regex_blacklist_author",
        r"TheLovinator\d*",
        "Entry should be skipped with regex author match",
    )

    # An invalid pattern must not raise and must not cause the entry to be skipped.
    reader.set_tag(feed, "regex_blacklist_title", r"[incomplete")  # pyright: ignore[reportArgumentType]
    assert entry_should_be_skipped(reader, first_entry[0]) is False, (
        f"Entry should not be skipped with invalid regex: {first_entry[0]}"
    )
    reader.delete_tag(feed, "regex_blacklist_title")

    # Several patterns in one tag value, separated by commas.
    assert_skipped_only_while_tagged(
        "regex_blacklist_author",
        r"pattern1,TheLovinator\d*,pattern3",
        "Entry should be skipped with one matching pattern in list",
    )

    # Several patterns in one tag value, separated by newlines.
    newline_patterns = "pattern1\nTheLovinator\\d*\npattern3"
    assert_skipped_only_while_tagged(
        "regex_blacklist_author",
        newline_patterns,
        "Entry should be skipped with newline-separated patterns",
    )

View File

@ -1,6 +1,6 @@
from __future__ import annotations from __future__ import annotations
from discord_rss_bot.filter.utils import is_word_in_text from discord_rss_bot.filter.utils import is_regex_match, is_word_in_text
def test_is_word_in_text() -> None: def test_is_word_in_text() -> None:
@ -14,3 +14,51 @@ def test_is_word_in_text() -> None:
assert is_word_in_text("Alert,Forma", "Outbreak - Mutagen Mass - Rhea (Saturn)") is False, msg_false assert is_word_in_text("Alert,Forma", "Outbreak - Mutagen Mass - Rhea (Saturn)") is False, msg_false
assert is_word_in_text("Alert,Forma", "Outbreak - Mutagen Mass - Rhea (Saturn)") is False, msg_false assert is_word_in_text("Alert,Forma", "Outbreak - Mutagen Mass - Rhea (Saturn)") is False, msg_false
assert is_word_in_text("word1,word2", "This is a sample text containing none of the words.") is False, msg_false assert is_word_in_text("word1,word2", "This is a sample text containing none of the words.") is False, msg_false
def test_is_regex_match() -> None:
    """Check is_regex_match against a table of (patterns, text, expected) cases.

    The cases are evaluated in order; each one asserts that the call returns
    exactly the expected boolean.
    """
    msg_true = "Should return True"
    msg_false = "Should return False"

    newline_patterns = "pattern1\n^start\ncontains\\d+"
    mixed_patterns = "pattern1\npattern2,pattern3\npattern4"
    whitespace_patterns = "\\s+\n \n\npattern\n\n"

    cases: list[tuple[str, str, bool]] = [
        # Basic regex patterns
        (r"word\d+", "This text contains word123", True),
        (r"^Hello", "Hello world", True),
        (r"world$", "Hello world", True),
        # Case insensitivity
        (r"hello", "This text contains HELLO", True),
        # Comma-separated patterns
        (r"pattern1,pattern2", "This contains pattern2", True),
        (r"pattern1, pattern2", "This contains pattern1", True),
        # Patterns that shouldn't match
        (r"^start", "This doesn't start with the pattern", False),
        (r"end$", "This doesn't end with the pattern", False),
        # Empty input on either side
        ("", "Some text", False),
        ("pattern", "", False),
        # Invalid regex (should not raise an exception and return False)
        (r"[incomplete", "Some text", False),
        # Multiple patterns where one is invalid
        (r"valid, [invalid, \w+", "Contains word", True),
        # Newline-separated patterns
        (newline_patterns, "This contains123 text", True),
        (newline_patterns, "start of line", True),
        (newline_patterns, "pattern1 is here", True),
        (newline_patterns, "None of these match", False),
        # Mixed newline and comma patterns (for backward compatibility)
        (mixed_patterns, "Contains pattern3", True),
        (mixed_patterns, "Contains pattern4", True),
        # Empty lines and spaces between patterns
        (whitespace_patterns, "text with spaces", True),
        (whitespace_patterns, "text with pattern", True),
    ]

    for patterns, text, expected in cases:
        failure_msg = msg_true if expected else msg_false
        assert is_regex_match(patterns, text) is expected, failure_msg

View File

@ -38,6 +38,13 @@ def test_has_white_tags() -> None:
check_if_has_tag(reader, feed, "whitelist_title") check_if_has_tag(reader, feed, "whitelist_title")
check_if_has_tag(reader, feed, "whitelist_summary") check_if_has_tag(reader, feed, "whitelist_summary")
check_if_has_tag(reader, feed, "whitelist_content") check_if_has_tag(reader, feed, "whitelist_content")
check_if_has_tag(reader, feed, "whitelist_author")
# Test regex whitelist tags
check_if_has_tag(reader, feed, "regex_whitelist_title")
check_if_has_tag(reader, feed, "regex_whitelist_summary")
check_if_has_tag(reader, feed, "regex_whitelist_content")
check_if_has_tag(reader, feed, "regex_whitelist_author")
# Clean up # Clean up
reader.delete_feed(feed_url) reader.delete_feed(feed_url)
@ -109,3 +116,67 @@ def test_should_be_sent() -> None:
assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent" assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent"
reader.delete_tag(feed, "whitelist_author") reader.delete_tag(feed, "whitelist_author")
assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent" assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent"
def test_regex_should_be_sent() -> None:
    """Exercise the regex_whitelist_* feed tags through should_be_sent.

    For every tag under test the pattern is set on the feed, the first entry is
    asserted to be sent, the tag is deleted again and the entry is asserted to
    no longer be sent. An invalid pattern is asserted not to send the entry.
    """
    reader: Reader = get_reader()

    # Register the feed and fetch its entries.
    reader.add_feed(feed_url)
    feed: Feed = reader.get_feed(feed_url)
    reader.update_feeds()

    # Keep only the first entry yielded for the feed.
    entries: Iterable[Entry] = reader.get_entries(feed=feed)
    assert entries is not None, "Entries should not be None"
    first_entry: list[Entry] = []
    for candidate in entries:
        first_entry.append(candidate)
        break
    assert len(first_entry) == 1, "First entry should be added"

    def assert_sent_only_while_tagged(tag: str, patterns: str, reason: str) -> None:
        """Set `tag`, expect the entry to be sent, delete `tag`, expect it not to be."""
        reader.set_tag(feed, tag, patterns)  # pyright: ignore[reportArgumentType]
        assert should_be_sent(reader, first_entry[0]) is True, reason
        reader.delete_tag(feed, tag)
        assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent"

    # Baseline: no regex whitelist tag is set yet.
    assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent"

    # One single-pattern check per regex whitelist tag.
    assert_sent_only_while_tagged(
        "regex_whitelist_title",
        r"fvnnn\w+",
        "Entry should be sent with regex title match",
    )
    assert_sent_only_while_tagged(
        "regex_whitelist_summary",
        r"ffdnfdn\w+",
        "Entry should be sent with regex summary match",
    )
    assert_sent_only_while_tagged(
        "regex_whitelist_content",
        r"ffdnfdnfdn\w+",
        "Entry should be sent with regex content match",
    )
    assert_sent_only_while_tagged(
        "regex_whitelist_author",
        r"TheLovinator\d*",
        "Entry should be sent with regex author match",
    )

    # An invalid pattern must not raise and must not cause the entry to be sent.
    reader.set_tag(feed, "regex_whitelist_title", r"[incomplete")  # pyright: ignore[reportArgumentType]
    assert should_be_sent(reader, first_entry[0]) is False, "Entry should not be sent with invalid regex"
    reader.delete_tag(feed, "regex_whitelist_title")

    # Several patterns in one tag value, separated by commas.
    assert_sent_only_while_tagged(
        "regex_whitelist_author",
        r"pattern1,TheLovinator\d*,pattern3",
        "Entry should be sent with one matching pattern in list",
    )

    # Several patterns in one tag value, separated by newlines.
    newline_patterns = "pattern1\nTheLovinator\\d*\npattern3"
    assert_sent_only_while_tagged(
        "regex_whitelist_author",
        newline_patterns,
        "Entry should be sent with newline-separated patterns",
    )