Add tldextract for improved domain extraction and add new tests for extract_domain function

This commit is contained in:
2025-04-16 13:32:31 +02:00
parent 8b50003eda
commit cd0f63d59a
3 changed files with 25 additions and 6 deletions

View File

@@ -7,6 +7,7 @@ import re
from typing import TYPE_CHECKING
from urllib.parse import ParseResult, urlparse
import tldextract
from discord_webhook import DiscordEmbed, DiscordWebhook
from fastapi import HTTPException
from reader import Entry, EntryNotFoundError, Feed, FeedExistsError, Reader, ReaderError, StorageError, TagNotFoundError
@@ -70,12 +71,10 @@ def extract_domain(url: str) -> str: # noqa: PLR0911
if domain in domain_mapping:
return domain_mapping[domain]
# For other domains, capitalize the first part before the TLD
parts: list[str] = domain.split(".")
min_domain_parts = 2
if len(parts) >= min_domain_parts:
return parts[0].capitalize()
# Use tldextract to get the domain (SLD)
ext = tldextract.extract(url)
if ext.domain:
return ext.domain.capitalize()
return domain.capitalize()
except (ValueError, AttributeError, TypeError) as e:
logger.warning("Error extracting domain from %s: %s", url, e)