From c41780fca09f2acc1ac140548afdd5048bc1a368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joakim=20Hells=C3=A9n?= Date: Tue, 30 Jan 2024 22:40:41 +0100 Subject: [PATCH] Validate URLs before adding --- .vscode/settings.json | 5 + feeds/admin.py | 63 ++++++++++ feeds/migrations/0002_blocklist.py | 25 ++++ feeds/migrations/0003_blocklist_active.py | 18 +++ feeds/migrations/0004_alter_blocklist_url.py | 18 +++ feeds/models.py | 18 +++ feeds/validator.py | 126 +++++++++++++++++++ feeds/views.py | 90 +++++++++++-- feedvault/settings.py | 22 ++++ static/Feed.svg | 1 - static/style.css | 8 +- templates/feeds.html | 8 +- 12 files changed, 386 insertions(+), 16 deletions(-) create mode 100644 feeds/migrations/0002_blocklist.py create mode 100644 feeds/migrations/0003_blocklist_active.py create mode 100644 feeds/migrations/0004_alter_blocklist_url.py create mode 100644 feeds/validator.py delete mode 100644 static/Feed.svg diff --git a/.vscode/settings.json b/.vscode/settings.json index b1ba086..f933597 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,13 +1,18 @@ { "cSpell.words": [ + "arpa", + "blocklist", + "blocklists", "chartboost", "feedburner", "feedparser", "feedvault", "leftright", + "levelname", "PGHOST", "PGPORT", "PGUSER", + "regexes", "webmail" ] } diff --git a/feeds/admin.py b/feeds/admin.py index 8163cdc..08b85b9 100644 --- a/feeds/admin.py +++ b/feeds/admin.py @@ -4,3 +4,66 @@ https://docs.djangoproject.com/en/5.0/ref/contrib/admin/ """ from __future__ import annotations + +from typing import TYPE_CHECKING, ClassVar + +from django.contrib import admin + +from feeds.models import ( + Author, + Blocklist, + Cloud, + Contributor, + Feed, + Generator, + Image, + Info, + Link, + Publisher, + Rights, + Subtitle, + Tags, + TextInput, + Title, +) +from feeds.validator import update_blocklist + +if TYPE_CHECKING: + from django.db.models.query import QuerySet + from django.http import HttpRequest + +admin.site.register(Author) +admin.site.register(Cloud) +admin.site.register(Contributor) +admin.site.register(Feed) +admin.site.register(Generator) +admin.site.register(Image) +admin.site.register(Info) +admin.site.register(Link) +admin.site.register(Publisher) +admin.site.register(Rights) +admin.site.register(Subtitle) +admin.site.register(Tags) +admin.site.register(TextInput) +admin.site.register(Title) + + +# Add button to update blocklist on the admin page +@admin.register(Blocklist) +class BlocklistAdmin(admin.ModelAdmin): + """Admin interface for blocklist.""" + + actions: ClassVar[list[str]] = ["_update_blocklist", "delete_all_blocklist"] + list_display: ClassVar[list[str]] = ["url", "active"] + + @admin.action(description="Update blocklist") + def _update_blocklist(self: admin.ModelAdmin, request: HttpRequest, queryset: QuerySet) -> None: # noqa: ARG002 + """Update blocklist.""" + msg: str = update_blocklist() + self.message_user(request=request, message=msg) + + @admin.action(description="Delete all blocklists") + def delete_all_blocklist(self: admin.ModelAdmin, request: HttpRequest, queryset: QuerySet) -> None: # noqa: ARG002 + """Delete all blocklist from database.""" + Blocklist.objects.all().delete() + self.message_user(request=request, message="Deleted all blocklists") diff --git a/feeds/migrations/0002_blocklist.py b/feeds/migrations/0002_blocklist.py new file mode 100644 index 0000000..d3d1b4b --- /dev/null +++ b/feeds/migrations/0002_blocklist.py @@ -0,0 +1,25 @@ +# Generated by Django 5.0.1 on 2024-01-30 16:41 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('feeds', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='Blocklist', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('url', models.URLField(help_text='The URL to block.', unique=True)), + ], + options={ + 'verbose_name': 'Blocklist', + 'verbose_name_plural': 'Blocklists', + 'db_table_comment': 'A list of URLs to block.', + }, + ), + ] diff --git a/feeds/migrations/0003_blocklist_active.py b/feeds/migrations/0003_blocklist_active.py new file mode 100644 index 0000000..6198e40 --- /dev/null +++ b/feeds/migrations/0003_blocklist_active.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.1 on 2024-01-30 16:45 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('feeds', '0002_blocklist'), + ] + + operations = [ + migrations.AddField( + model_name='blocklist', + name='active', + field=models.BooleanField(default=True, help_text='Is this URL still blocked?'), + ), + ] diff --git a/feeds/migrations/0004_alter_blocklist_url.py b/feeds/migrations/0004_alter_blocklist_url.py new file mode 100644 index 0000000..bfa2ab2 --- /dev/null +++ b/feeds/migrations/0004_alter_blocklist_url.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.1 on 2024-01-30 18:22 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('feeds', '0003_blocklist_active'), + ] + + operations = [ + migrations.AlterField( + model_name='blocklist', + name='url', + field=models.CharField(help_text='The URL to block.', max_length=2000, unique=True), + ), + ] diff --git a/feeds/models.py b/feeds/models.py index 9472e39..12c05c7 100644 --- a/feeds/models.py +++ b/feeds/models.py @@ -963,3 +963,21 @@ class Feed(models.Model): def __str__(self: Feed) -> str: """Feed URL.""" return f"{self.url}" + + +class Blocklist(models.Model): + """A list of URLs to block.""" + + url = models.CharField(max_length=2000, unique=True, help_text="The URL to block.") + active = models.BooleanField(default=True, help_text="Is this URL still blocked?") + + class Meta: + """Blocklist meta.""" + + verbose_name: typing.ClassVar[str] = "Blocklist" + verbose_name_plural: typing.ClassVar[str] = "Blocklists" + db_table_comment: typing.ClassVar[str] = "A list of URLs to block." + + def __str__(self: Blocklist) -> str: + """Blocklist URL.""" + return f"{self.url}" diff --git a/feeds/validator.py b/feeds/validator.py new file mode 100644 index 0000000..6a1a8da --- /dev/null +++ b/feeds/validator.py @@ -0,0 +1,126 @@ +"""Validate feeds before adding them to the database.""" + +from __future__ import annotations + +import ipaddress +import logging +import re +from urllib.parse import urlparse + +import requests +from django.core.exceptions import ValidationError +from django.core.validators import URLValidator + +from feeds.models import Blocklist + +BLOCKLISTS: list[str] = [ + "https://malware-filter.gitlab.io/malware-filter/urlhaus-filter-dnscrypt-blocked-names.txt", + "https://malware-filter.gitlab.io/malware-filter/phishing-filter-dnscrypt-blocked-names.txt", +] + +logger: logging.Logger = logging.getLogger(__name__) + + +def validate_scheme(feed_url: str) -> bool: + """Validate the scheme of a URL. Only allow http and https. + + Args: + feed_url: The URL to validate. + + Returns: + True if the URL is valid, False otherwise. + """ + validator = URLValidator(schemes=["http", "https"]) + # TODO(TheLovinator): Should we allow other schemes? # noqa: TD003 + try: + validator(feed_url) + except ValidationError: + return False + else: + return True + + +def is_ip(feed_url: str) -> bool: + """Check if feed is an IP address.""" + try: + ipaddress.ip_address(feed_url) + except ValueError: + logger.info(f"{feed_url} is not an IP address") # noqa: G004 + return False + else: + logger.info(f"{feed_url} is an IP address") # noqa: G004 + return True + + +def update_blocklist() -> str: + """Download the blocklist and add to database.""" + # URLs found in the blocklist + found_urls = set() + + for _blocklist in BLOCKLISTS: + with requests.get(url=_blocklist, timeout=10) as r: + r.raise_for_status() + + logger.debug(f"Downloaded {_blocklist}") # noqa: G004 + + # Split the blocklist into a list of URLs + blocked_urls = set(r.text.splitlines()) + + # Remove comments and whitespace + blocked_urls = {url for url in blocked_urls if not url.startswith("#")} + blocked_urls = {url.strip() for url in blocked_urls} + + logger.debug(f"Found {len(blocked_urls)} URLs in {_blocklist}") # noqa: G004 + + # Add URLs to the found URLs set + found_urls.update(blocked_urls) + + logger.debug(f"Found {len(found_urls)} URLs in total") # noqa: G004 + + # Mark all URLs as inactive + Blocklist.objects.all().update(active=False) + + logger.debug("Marked all URLs as inactive") + + # Bulk create the blocklist + Blocklist.objects.bulk_create( + [Blocklist(url=url, active=True) for url in found_urls], + update_conflicts=True, + unique_fields=["url"], + update_fields=["active"], + batch_size=1000, + ) + + logger.debug(f"Added {len(found_urls)} URLs to the blocklist") # noqa: G004 + return f"Added {len(found_urls)} URLs to the blocklist" + + +def is_local(feed_url: str) -> bool: + """Check if feed is a local address.""" + # Regexes from https://github.com/gwarser/filter-lists + regexes: list[str] = [ + # 10.0.0.0 - 10.255.255.255 + r"^\w+:\/\/10\.(?:(?:[1-9]?\d|1\d\d|2(?:[0-4]\d|5[0-5]))\.){2}(?:[1-9]?\d|1\d\d|2(?:[0-4]\d|5[0-5]))[:/]", + # 172.16.0.0 - 172.31.255.255 + r"^\w+:\/\/172\.(?:1[6-9]|2\d|3[01])(?:\.(?:[1-9]?\d|1\d\d|2(?:[0-4]\d|5[0-5]))){2}[:/]", + # 192.168.0.0 - 192.168.255.255 + r"^\w+:\/\/192\.168(?:\.(?:[1-9]?\d|1\d\d|2(?:[0-4]\d|5[0-5]))){2}[:/]", + # https://en.wikipedia.org/wiki/Private_network#Link-local_addresses + r"^\w+:\/\/169\.254\.(?:[1-9]\d?|1\d{2}|2(?:[0-4]\d|5[0-4]))\.(?:[1-9]?\d|1\d{2}|2(?:[0-4]\d|5[0-5]))[:/]", + # https://en.wikipedia.org/wiki/IPv6_address#Transition_from_IPv4 + r"^\w+:\/\/\[::ffff:(?:7f[0-9a-f]{2}|a[0-9a-f]{2}|ac1[0-9a-f]|c0a8|a9fe):[0-9a-f]{1,4}\][:/]", + # localhost + r"^\w+:\/\/127\.(?:(?:[1-9]?\d|1\d\d|2(?:[0-4]\d|5[0-5]))\.){2}(?:[1-9]?\d|1\d\d|2(?:[0-4]\d|5[0-5]))[:/]", + ] + + domain: str | None = urlparse(feed_url).hostname + if not domain: + return False + + if domain in {"localhost", "127.0.0.1", "::1", "0.0.0.0", "::", "local", "[::1]"}: # noqa: S104 + return True + + if domain.endswith((".local", ".home.arpa")): + return True + + return any(re.match(regex, feed_url) for regex in regexes) diff --git a/feeds/views.py b/feeds/views.py index d3be0d6..b012264 100644 --- a/feeds/views.py +++ b/feeds/views.py @@ -6,18 +6,24 @@ FeedsView - /feeds from __future__ import annotations +import logging import typing +from urllib import parse from django.contrib import messages +from django.core.exceptions import ValidationError from django.db import connection -from django.shortcuts import redirect +from django.shortcuts import redirect, render from django.views.generic.base import TemplateView from django.views.generic.list import ListView -from feeds.models import Feed +from feeds.models import Blocklist, Feed +from feeds.validator import is_ip, is_local, validate_scheme if typing.TYPE_CHECKING: - from django.http import HttpRequest, HttpResponseRedirect + from django.http import HttpRequest, HttpResponse + +logger: logging.Logger = logging.getLogger(__name__) def get_database_size() -> int: @@ -52,6 +58,9 @@ class IndexView(TemplateView): context: dict = super().get_context_data(**kwargs) context["feed_count"] = Feed.objects.count() context["database_size"] = get_database_size() + + logger.info(f"Found {context['feed_count']} feeds in the database") # noqa: G004 + logger.info(f"Database size is {context['database_size']} MB") # noqa: G004 return context @@ -72,33 +81,90 @@ class FeedsView(ListView): return context -def add_feeds(request: HttpRequest) -> HttpResponseRedirect: +def add_feeds(request: HttpRequest) -> HttpResponse: """Add feeds to the database. Args: request: The request object. Returns: - A redirect to the index page. + A redirect to the index page if there are errors, otherwise a redirect to the feeds page. """ if request.method == "POST": - urls = request.POST.get("urls") + urls: str | None = request.POST.get("urls") if not urls: messages.error(request, "No URLs provided") - return redirect("feeds:index", permanent=False) + return render(request, "index.html") if urls == "Test": - messages.error(request, "Hello, world!") - return redirect("feeds:index", permanent=False) + messages.error(request, "Test test hello") + return render(request, "index.html") for url in urls.splitlines(): - print(f"Adding {url} to the database...") # noqa: T201 + check_feeds(feed_urls=[url], request=request) - return redirect("feeds:feeds", permanent=False) + return render(request, "index.html") msg: str = f"You must use a POST request. You used a {request.method} request. You can find out how to use this endpoint here: http://127.0.0.1:8000/. If you think this is a mistake, please contact the administrator." # noqa: E501 messages.error(request, msg) - return redirect("feeds:index", permanent=False) + return render(request, "index.html") + + +def check_feeds(feed_urls: list[str], request: HttpRequest) -> HttpResponse: + """Check feeds before adding them to the database. + + Args: + feed_urls: The feed URLs to check. + request: The request object. + + Returns: + A redirect to the index page if there are errors, otherwise a redirect to the feeds page. + """ + for url in feed_urls: + url_html: str = f"{url}" + if Feed.objects.filter(url=url).exists(): + msg: str = f"{url_html} is already in the database." + messages.error(request, msg) + continue + + # Only allow HTTP and HTTPS URLs + if not validate_scheme(feed_url=url): + msg = f"{url_html} is not a HTTP or HTTPS URL." + messages.error(request, msg) + continue + + # Don't allow IP addresses + if is_ip(feed_url=url): + msg = f"{url_html} is an IP address. IP addresses are not allowed." + messages.error(request, msg) + continue + + # Check if in blocklist + domain: str = parse.urlparse(url).netloc + if Blocklist.objects.filter(url=domain).exists(): + msg = f"{url_html} is in the blocklist." + messages.error(request, msg) + continue + + # Check if local URL + if is_local(feed_url=url): + msg = f"{url_html} is a local URL." + messages.error(request, msg) + continue + + # Create feed + try: + Feed.objects.create(url=url) + msg = f"{url_html} was added to the database." + messages.success(request, msg) + except ValidationError: + msg = f"{url_html} is not a valid URL." + messages.error(request, msg) + + # Return to feeds page if no errors + # TODO(TheLovinator): Return to search page with our new feeds # noqa: TD003 + logger.info(f"Added {len(feed_urls)} feeds to the database") # noqa: G004 + return redirect("feeds:feeds") class APIView(TemplateView): diff --git a/feedvault/settings.py b/feedvault/settings.py index 3a7ccca..f415994 100644 --- a/feedvault/settings.py +++ b/feedvault/settings.py @@ -189,3 +189,25 @@ else: "BACKEND": "django.core.cache.backends.dummy.DummyCache", }, } + + +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "handlers": { + "console": { + "class": "logging.StreamHandler", + }, + }, + "root": { + "handlers": ["console"], + "level": "DEBUG", + }, + "loggers": { + "django": { + "handlers": ["console"], + "level": os.getenv("DJANGO_LOG_LEVEL", "INFO"), + "propagate": False, + }, + }, +} diff --git a/static/Feed.svg b/static/Feed.svg deleted file mode 100644 index bfa1926..0000000 --- a/static/Feed.svg +++ /dev/null @@ -1 +0,0 @@ - diff --git a/static/style.css b/static/style.css index d863cf0..f1b1a7d 100644 --- a/static/style.css +++ b/static/style.css @@ -44,10 +44,14 @@ textarea { resize: vertical; } +.messages { + list-style-type: none; +} + .error { color: red; } -.messages { - list-style-type: none; +.success { + color: green; } diff --git a/templates/feeds.html b/templates/feeds.html index 86998e7..ee8d58d 100644 --- a/templates/feeds.html +++ b/templates/feeds.html @@ -6,7 +6,13 @@ {% for feed in feeds %}

- {{ feed.title }} + + {% if feed.title %} + {{ feed.title }} + {% else %} + {{ feed.url }} + {% endif %} +

{{ feed.description }}