Add tldextract for improved domain extraction and add new tests for extract_domain function
This commit is contained in:
@ -7,6 +7,7 @@ import re
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.parse import ParseResult, urlparse
|
||||
|
||||
import tldextract
|
||||
from discord_webhook import DiscordEmbed, DiscordWebhook
|
||||
from fastapi import HTTPException
|
||||
from reader import Entry, EntryNotFoundError, Feed, FeedExistsError, Reader, ReaderError, StorageError, TagNotFoundError
|
||||
@ -70,12 +71,10 @@ def extract_domain(url: str) -> str: # noqa: PLR0911
|
||||
if domain in domain_mapping:
|
||||
return domain_mapping[domain]
|
||||
|
||||
# For other domains, capitalize the first part before the TLD
|
||||
parts: list[str] = domain.split(".")
|
||||
min_domain_parts = 2
|
||||
if len(parts) >= min_domain_parts:
|
||||
return parts[0].capitalize()
|
||||
|
||||
# Use tldextract to get the registered domain label, i.e. the part directly left of the public suffix ("bbc" in "forums.bbc.co.uk")
|
||||
ext = tldextract.extract(url)
|
||||
if ext.domain:
|
||||
return ext.domain.capitalize()
|
||||
return domain.capitalize()
|
||||
except (ValueError, AttributeError, TypeError) as e:
|
||||
logger.warning("Error extracting domain from %s: %s", url, e)
|
||||
|
Reference in New Issue
Block a user