Validate URLs before adding

This commit is contained in:
Joakim Hellsén 2024-01-30 22:40:41 +01:00
commit c41780fca0
12 changed files with 386 additions and 16 deletions

View file

@ -1,13 +1,18 @@
{ {
"cSpell.words": [ "cSpell.words": [
"arpa",
"blocklist",
"blocklists",
"chartboost", "chartboost",
"feedburner", "feedburner",
"feedparser", "feedparser",
"feedvault", "feedvault",
"leftright", "leftright",
"levelname",
"PGHOST", "PGHOST",
"PGPORT", "PGPORT",
"PGUSER", "PGUSER",
"regexes",
"webmail" "webmail"
] ]
} }

View file

@ -4,3 +4,66 @@ https://docs.djangoproject.com/en/5.0/ref/contrib/admin/
""" """
from __future__ import annotations from __future__ import annotations
from typing import TYPE_CHECKING, ClassVar
from django.contrib import admin
from feeds.models import (
Author,
Blocklist,
Cloud,
Contributor,
Feed,
Generator,
Image,
Info,
Link,
Publisher,
Rights,
Subtitle,
Tags,
TextInput,
Title,
)
from feeds.validator import update_blocklist
if TYPE_CHECKING:
from django.db.models.query import QuerySet
from django.http import HttpRequest
admin.site.register(Author)
admin.site.register(Cloud)
admin.site.register(Contributor)
admin.site.register(Feed)
admin.site.register(Generator)
admin.site.register(Image)
admin.site.register(Info)
admin.site.register(Link)
admin.site.register(Publisher)
admin.site.register(Rights)
admin.site.register(Subtitle)
admin.site.register(Tags)
admin.site.register(TextInput)
admin.site.register(Title)
# Add button to update blocklist on the admin page
@admin.register(Blocklist)
class BlocklistAdmin(admin.ModelAdmin):
    """Admin page for blocklist entries, with actions to refresh or clear the whole list."""

    list_display: ClassVar[list[str]] = ["url", "active"]
    actions: ClassVar[list[str]] = ["_update_blocklist", "delete_all_blocklist"]

    @admin.action(description="Update blocklist")
    def _update_blocklist(self: admin.ModelAdmin, request: HttpRequest, queryset: QuerySet) -> None:  # noqa: ARG002
        """Refresh the blocklist and show the resulting summary to the admin user.

        The selected rows (``queryset``) are ignored; the action always refreshes everything.
        """
        self.message_user(request=request, message=update_blocklist())

    @admin.action(description="Delete all blocklists")
    def delete_all_blocklist(self: admin.ModelAdmin, request: HttpRequest, queryset: QuerySet) -> None:  # noqa: ARG002
        """Delete every blocklist row, regardless of which rows were selected."""
        Blocklist.objects.all().delete()
        self.message_user(request=request, message="Deleted all blocklists")

View file

@ -0,0 +1,25 @@
# Generated by Django 5.0.1 on 2024-01-30 16:41
from django.db import migrations, models
class Migration(migrations.Migration):
    """Create the ``Blocklist`` model: an auto ``id`` primary key and a unique ``url`` column."""

    # Must run after the initial schema of the feeds app.
    dependencies = [
        ('feeds', '0001_initial'),
    ]

    operations = [
        migrations.CreateModel(
            name='Blocklist',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # unique=True: each blocked URL is stored at most once.
                ('url', models.URLField(help_text='The URL to block.', unique=True)),
            ],
            options={
                'verbose_name': 'Blocklist',
                'verbose_name_plural': 'Blocklists',
                'db_table_comment': 'A list of URLs to block.',
            },
        ),
    ]

View file

@ -0,0 +1,18 @@
# Generated by Django 5.0.1 on 2024-01-30 16:45
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the ``active`` flag to ``Blocklist``."""

    dependencies = [
        ('feeds', '0002_blocklist'),
    ]

    operations = [
        migrations.AddField(
            model_name='blocklist',
            name='active',
            # default=True: rows that already exist are treated as still blocked.
            field=models.BooleanField(default=True, help_text='Is this URL still blocked?'),
        ),
    ]

View file

@ -0,0 +1,18 @@
# Generated by Django 5.0.1 on 2024-01-30 18:22
from django.db import migrations, models
class Migration(migrations.Migration):
    """Redefine ``Blocklist.url`` as a ``CharField`` (max length 2000), still unique."""

    dependencies = [
        ('feeds', '0003_blocklist_active'),
    ]

    operations = [
        migrations.AlterField(
            model_name='blocklist',
            name='url',
            field=models.CharField(help_text='The URL to block.', max_length=2000, unique=True),
        ),
    ]

View file

@ -963,3 +963,21 @@ class Feed(models.Model):
def __str__(self: Feed) -> str: def __str__(self: Feed) -> str:
"""Feed URL.""" """Feed URL."""
return f"{self.url}" return f"{self.url}"
class Blocklist(models.Model):
    """One blocked entry; a feed whose host matches ``url`` is refused."""

    # The blocked value. Unique, so each entry is stored at most once.
    url = models.CharField(max_length=2000, unique=True, help_text="The URL to block.")
    # Entries are kept when they stop being blocked; this flag records whether the block still applies.
    active = models.BooleanField(default=True, help_text="Is this URL still blocked?")

    class Meta:
        """Display names and table comment for the blocklist."""

        db_table_comment: typing.ClassVar[str] = "A list of URLs to block."
        verbose_name: typing.ClassVar[str] = "Blocklist"
        verbose_name_plural: typing.ClassVar[str] = "Blocklists"

    def __str__(self: Blocklist) -> str:
        """Return the blocked URL."""
        return str(self.url)

126
feeds/validator.py Normal file
View file

@ -0,0 +1,126 @@
"""Validate feeds before adding them to the database."""
from __future__ import annotations
import ipaddress
import logging
import re
from urllib.parse import urlparse
import requests
from django.core.exceptions import ValidationError
from django.core.validators import URLValidator
from feeds.models import Blocklist
# Remote text files downloaded by update_blocklist(). Each line that is not a
# "#" comment is stored as one Blocklist row.
# NOTE(review): the file names suggest one blocked host name per line - confirm against the upstream format.
BLOCKLISTS: list[str] = [
    "https://malware-filter.gitlab.io/malware-filter/urlhaus-filter-dnscrypt-blocked-names.txt",
    "https://malware-filter.gitlab.io/malware-filter/phishing-filter-dnscrypt-blocked-names.txt",
]
logger: logging.Logger = logging.getLogger(__name__)
def validate_scheme(feed_url: str) -> bool:
    """Check that a URL is well-formed and uses the http or https scheme.

    Args:
        feed_url: The URL to validate.

    Returns:
        True if Django's URLValidator accepts the URL, False if it raises ValidationError.
    """
    # TODO(TheLovinator): Should we allow other schemes? # noqa: TD003
    http_only = URLValidator(schemes=["http", "https"])
    try:
        http_only(feed_url)
    except ValidationError:
        return False
    return True
def is_ip(feed_url: str) -> bool:
    """Check if a feed URL points at a literal IP address.

    The host is extracted from the URL before it is tested. A full URL such as
    ``http://192.0.2.1/feed`` is never itself a valid IP address, so testing the
    whole string would always answer "not an IP". A bare address without a
    scheme is still accepted.

    Args:
        feed_url: The feed URL, or a bare IPv4/IPv6 address.

    Returns:
        True if the host is an IPv4 or IPv6 address, False otherwise.
    """
    try:
        # urlparse() only yields a hostname when the string has a "//" authority
        # part; fall back to the raw string so bare addresses keep working.
        host: str = urlparse(feed_url).hostname or feed_url
        ipaddress.ip_address(host)
    except ValueError:
        # Raised for hosts that are not IP addresses and for URLs urlparse() rejects.
        logger.info(f"{feed_url} is not an IP address")  # noqa: G004
        return False
    else:
        logger.info(f"{feed_url} is an IP address")  # noqa: G004
        return True
def update_blocklist() -> str:
    """Download every list in BLOCKLISTS and sync the entries into the database.

    All downloads finish before the database is touched, so a failed download
    leaves the stored blocklist unchanged. Rows that are no longer present in
    any list are kept, but end up with ``active=False``.

    Returns:
        A summary of how many entries were found.

    Raises:
        requests.RequestException: If a list cannot be downloaded, including
            an HTTP error status (via ``raise_for_status``) or a timeout.
    """
    # Entries collected from all blocklists
    found_urls: set[str] = set()

    for _blocklist in BLOCKLISTS:
        with requests.get(url=_blocklist, timeout=10) as r:
            r.raise_for_status()
            logger.debug(f"Downloaded {_blocklist}")  # noqa: G004

            # Strip whitespace first, then drop blank lines and comments. Filtering
            # before stripping would store "" and indented "#" lines as entries.
            blocked_urls: set[str] = {
                entry
                for line in r.text.splitlines()
                if (entry := line.strip()) and not entry.startswith("#")
            }
            logger.debug(f"Found {len(blocked_urls)} URLs in {_blocklist}")  # noqa: G004

            found_urls.update(blocked_urls)

    logger.debug(f"Found {len(found_urls)} URLs in total")  # noqa: G004

    # Mark everything inactive; entries that are still listed are re-activated below.
    Blocklist.objects.all().update(active=False)
    logger.debug("Marked all URLs as inactive")

    # Upsert: new entries are inserted, existing ones (matched on url) get active=True again.
    Blocklist.objects.bulk_create(
        [Blocklist(url=url, active=True) for url in found_urls],
        update_conflicts=True,
        unique_fields=["url"],
        update_fields=["active"],
        batch_size=1000,
    )

    logger.debug(f"Added {len(found_urls)} URLs to the blocklist")  # noqa: G004
    return f"Added {len(found_urls)} URLs to the blocklist"
def is_local(feed_url: str) -> bool:
    """Check if a feed URL points at a local or private network address.

    The decision is made on the parsed host name rather than on the raw URL
    text. A private address is therefore detected whether or not the URL has a
    port, a path or a ``user@`` prefix.

    Args:
        feed_url: The feed URL to check.

    Returns:
        True if the host is a local name (``localhost``, ``local``, ``*.local``,
        ``*.home.arpa``) or a private, loopback, link-local or unspecified IP
        address, including IPv4-mapped IPv6 addresses. False otherwise, and
        also when the URL has no host part.
    """
    # hostname is lower-cased and comes without port, userinfo or IPv6 brackets.
    domain: str | None = urlparse(feed_url).hostname
    if not domain:
        return False

    # A fully qualified name may carry a trailing dot ("localhost.").
    domain = domain.rstrip(".")

    if domain in {"localhost", "local"}:
        return True
    if domain.endswith((".local", ".home.arpa")):
        return True

    try:
        address = ipaddress.ip_address(domain)
    except ValueError:
        # Not an IP literal, and not one of the local names checked above.
        return False

    # Judge IPv4-mapped IPv6 addresses (::ffff:a.b.c.d) by the IPv4 address they embed.
    if isinstance(address, ipaddress.IPv6Address) and address.ipv4_mapped is not None:
        address = address.ipv4_mapped

    return address.is_private or address.is_loopback or address.is_link_local or address.is_unspecified

View file

@ -6,18 +6,24 @@ FeedsView - /feeds
from __future__ import annotations from __future__ import annotations
import logging
import typing import typing
from urllib import parse
from django.contrib import messages from django.contrib import messages
from django.core.exceptions import ValidationError
from django.db import connection from django.db import connection
from django.shortcuts import redirect from django.shortcuts import redirect, render
from django.views.generic.base import TemplateView from django.views.generic.base import TemplateView
from django.views.generic.list import ListView from django.views.generic.list import ListView
from feeds.models import Feed from feeds.models import Blocklist, Feed
from feeds.validator import is_ip, is_local, validate_scheme
if typing.TYPE_CHECKING: if typing.TYPE_CHECKING:
from django.http import HttpRequest, HttpResponseRedirect from django.http import HttpRequest, HttpResponse
logger: logging.Logger = logging.getLogger(__name__)
def get_database_size() -> int: def get_database_size() -> int:
@ -52,6 +58,9 @@ class IndexView(TemplateView):
context: dict = super().get_context_data(**kwargs) context: dict = super().get_context_data(**kwargs)
context["feed_count"] = Feed.objects.count() context["feed_count"] = Feed.objects.count()
context["database_size"] = get_database_size() context["database_size"] = get_database_size()
logger.info(f"Found {context['feed_count']} feeds in the database") # noqa: G004
logger.info(f"Database size is {context['database_size']} MB") # noqa: G004
return context return context
@ -72,33 +81,90 @@ class FeedsView(ListView):
return context return context
def add_feeds(request: HttpRequest) -> HttpResponseRedirect: def add_feeds(request: HttpRequest) -> HttpResponse:
"""Add feeds to the database. """Add feeds to the database.
Args: Args:
request: The request object. request: The request object.
Returns: Returns:
A redirect to the index page. A redirect to the index page if there are errors, otherwise a redirect to the feeds page.
""" """
if request.method == "POST": if request.method == "POST":
urls = request.POST.get("urls") urls: str | None = request.POST.get("urls")
if not urls: if not urls:
messages.error(request, "No URLs provided") messages.error(request, "No URLs provided")
return redirect("feeds:index", permanent=False) return render(request, "index.html")
if urls == "Test": if urls == "Test":
messages.error(request, "Hello, world!") messages.error(request, "Test test hello")
return redirect("feeds:index", permanent=False) return render(request, "index.html")
for url in urls.splitlines(): for url in urls.splitlines():
print(f"Adding {url} to the database...") # noqa: T201 check_feeds(feed_urls=[url], request=request)
return redirect("feeds:feeds", permanent=False) return render(request, "index.html")
msg: str = f"You must use a POST request. You used a {request.method} request. You can find out how to use this endpoint here: <a href=''>http://127.0.0.1:8000/</a>. If you think this is a mistake, please contact the administrator." # noqa: E501 msg: str = f"You must use a POST request. You used a {request.method} request. You can find out how to use this endpoint here: <a href=''>http://127.0.0.1:8000/</a>. If you think this is a mistake, please contact the administrator." # noqa: E501
messages.error(request, msg) messages.error(request, msg)
return redirect("feeds:index", permanent=False) return render(request, "index.html")
def check_feeds(feed_urls: list[str], request: HttpRequest) -> HttpResponse:
    """Check feeds before adding them to the database.

    Every URL is checked on its own. The outcome for each one (added, or the
    reason it was rejected) is reported through the Django messages framework.

    Args:
        feed_urls: The feed URLs to check.
        request: The request object the messages are attached to.

    Returns:
        A redirect to the feeds page. Rejected URLs are reported as messages
        only; they do not change the response.
    """
    # The URLs are untrusted user input that ends up inside HTML markup, so escape them.
    from html import escape  # noqa: PLC0415

    added: int = 0
    for url in feed_urls:
        safe_url: str = escape(url)
        url_html: str = f"<a href='{safe_url}'>{safe_url}</a>"

        if Feed.objects.filter(url=url).exists():
            msg: str = f"{url_html} is already in the database."
            messages.error(request, msg)
            continue

        # Only allow HTTP and HTTPS URLs
        if not validate_scheme(feed_url=url):
            # No link here: the scheme was rejected, so the URL must not become a clickable href.
            msg = f"{safe_url} is not a HTTP or HTTPS URL."
            messages.error(request, msg)
            continue

        # Host name without port or userinfo; netloc would keep both and miss blocklist matches.
        hostname: str = parse.urlparse(url).hostname or ""

        # Don't allow IP addresses. The host is passed because a full URL never parses as an IP address.
        if is_ip(feed_url=hostname):
            msg = f"{url_html} is an IP address. IP addresses are not allowed."
            messages.error(request, msg)
            continue

        # Check if in blocklist. Only active entries are still blocked.
        if Blocklist.objects.filter(url=hostname, active=True).exists():
            msg = f"{url_html} is in the blocklist."
            messages.error(request, msg)
            continue

        # Check if local URL
        if is_local(feed_url=url):
            msg = f"{url_html} is a local URL."
            messages.error(request, msg)
            continue

        # Create feed
        try:
            Feed.objects.create(url=url)
        except ValidationError:
            msg = f"{url_html} is not a valid URL."
            messages.error(request, msg)
        else:
            added += 1
            msg = f"{url_html} was added to the database."
            messages.success(request, msg)

    # TODO(TheLovinator): Return to search page with our new feeds # noqa: TD003
    logger.info(f"Added {added} of {len(feed_urls)} feeds to the database")  # noqa: G004
    return redirect("feeds:feeds")
class APIView(TemplateView): class APIView(TemplateView):

View file

@ -189,3 +189,25 @@ else:
"BACKEND": "django.core.cache.backends.dummy.DummyCache", "BACKEND": "django.core.cache.backends.dummy.DummyCache",
}, },
} }
# Logging configuration in logging.config.dictConfig format.
LOGGING = {
    "version": 1,
    # Keep loggers that were created before this configuration is applied.
    "disable_existing_loggers": False,
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
        },
    },
    # Everything that reaches the root logger is written to the console at DEBUG and above.
    "root": {
        "handlers": ["console"],
        "level": "DEBUG",
    },
    "loggers": {
        # Django's own loggers: level comes from DJANGO_LOG_LEVEL (default INFO).
        # propagate=False stops their records from also being handled by the root logger.
        "django": {
            "handlers": ["console"],
            "level": os.getenv("DJANGO_LOG_LEVEL", "INFO"),
            "propagate": False,
        },
    },
}

View file

@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" width="128" height="128" viewBox="0 0 256 256" cursor="default"><defs><linearGradient x1=".085" y1=".085" x2=".915" y2=".915" id="prefix__a"><stop offset="0" stop-color="#E3702D"/><stop offset=".107" stop-color="#EA7D31"/><stop offset=".35" stop-color="#F69537"/><stop offset=".5" stop-color="#FB9E3A"/><stop offset=".702" stop-color="#EA7C31"/><stop offset=".887" stop-color="#DE642B"/><stop offset="1" stop-color="#D95B29"/></linearGradient></defs><rect width="256" height="256" rx="55" ry="55" fill="#CC5D15"/><rect width="246" height="246" rx="50" ry="50" x="5" y="5" fill="#F49C52"/><rect width="236" height="236" rx="47" ry="47" x="10" y="10" fill="url(#prefix__a)"/><circle cx="68" cy="189" r="24" fill="#FFF"/><path d="M160 213h-34a82 82 0 00-82-82V97a116 116 0 01116 116z" fill="#FFF"/><path d="M184 213A140 140 0 0044 73V38a175 175 0 01175 175z" fill="#FFF"/></svg>

Before

Width:  |  Height:  |  Size: 916 B

View file

@ -44,10 +44,14 @@ textarea {
resize: vertical; resize: vertical;
} }
.messages {
list-style-type: none;
}
.error { .error {
color: red; color: red;
} }
.messages { .success {
list-style-type: none; color: green;
} }

View file

@ -6,7 +6,13 @@
{% for feed in feeds %} {% for feed in feeds %}
<div class="feed"> <div class="feed">
<h2> <h2>
<a href="{{ feed.url }}">{{ feed.title }}</a> <a href="{{ feed.url }}">
{% if feed.title %}
{{ feed.title }}
{% else %}
{{ feed.url }}
{% endif %}
</a>
</h2> </h2>
<p>{{ feed.description }}</p> <p>{{ feed.description }}</p>
<p> <p>