From 155b94cff23e09fcb383b7764bd9ba0cc3ee701c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joakim=20Hells=C3=A9n?= Date: Sun, 29 Jan 2023 01:07:36 +0100 Subject: [PATCH] Remove images and URLs without any text when converting HTML to Markdown --- discord_rss_bot/markdown.py | 13 +++++++++++-- tests/test_markdown.py | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/discord_rss_bot/markdown.py b/discord_rss_bot/markdown.py index ac159b1..5e48b9b 100644 --- a/discord_rss_bot/markdown.py +++ b/discord_rss_bot/markdown.py @@ -10,6 +10,9 @@ def convert_html_to_md(html: str) -> str: Returns: Our markdown. """ + if not html: + return html + soup: BeautifulSoup = BeautifulSoup(html, features="lxml") for bold in soup.find_all("b") + soup.find_all("strong"): @@ -24,9 +27,15 @@ def convert_html_to_md(html: str) -> str: for code in soup.find_all("code") + soup.find_all("pre"): code.replace_with(f"`{code.text}`") + for image in soup.find_all("img"): + image.decompose() + for link in soup.find_all("a") + soup.find_all("link"): - link_text: str = link.text or link.get("href") or "Link" - link.replace_with(f"[{link_text}]({link.get('href')})") + if not link.get_text().strip(): + link.decompose() + else: + link_text: str = link.text or link.get("href") + link.replace_with(f"[{link_text}]({link.get('href')})") for strikethrough in soup.find_all("s") + soup.find_all("del") + soup.find_all("strike"): strikethrough.replace_with(f"~~{strikethrough.text}~~") diff --git a/tests/test_markdown.py b/tests/test_markdown.py index 7c51fa1..8f7c233 100644 --- a/tests/test_markdown.py +++ b/tests/test_markdown.py @@ -63,5 +63,5 @@ def test_convert_to_md() -> None: assert ( convert_html_to_md(nvidia_entry) == "[NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)\n" # noqa: E501 - "Plus new options to mirror your camera and take a selfie.[https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)" # noqa: E501 + "Plus new options to mirror your camera and take a selfie." )