Remove images and URLs without any text when converting HTML to Markdown

2023-01-29 01:07:36 +01:00 · 2023-01-29 01:07:36 +01:00 · 155b94cff2
commit 155b94cff2
parent 4d8a0e9ded
2 changed files with 12 additions and 3 deletions
--- a/discord_rss_bot/markdown.py
+++ b/discord_rss_bot/markdown.py
@@ -10,6 +10,9 @@ def convert_html_to_md(html: str) -> str:
    Returns:
        Our markdown.
    """
+    if not html:
+        return html
+
    soup: BeautifulSoup = BeautifulSoup(html, features="lxml")

    for bold in soup.find_all("b") + soup.find_all("strong"):
@@ -24,8 +27,14 @@ def convert_html_to_md(html: str) -> str:
    for code in soup.find_all("code") + soup.find_all("pre"):
        code.replace_with(f"`{code.text}`")

+    for image in soup.find_all("img"):
+        image.decompose()
+
    for link in soup.find_all("a") + soup.find_all("link"):
-        link_text: str = link.text or link.get("href") or "Link"
+        if not link.get_text().strip():
+            link.decompose()
+        else:
+            link_text: str = link.text or link.get("href")
            link.replace_with(f"[{link_text}]({link.get('href')})")

    for strikethrough in soup.find_all("s") + soup.find_all("del") + soup.find_all("strike"):
--- a/tests/test_markdown.py
+++ b/tests/test_markdown.py
@@ -63,5 +63,5 @@ def test_convert_to_md() -> None:
    assert (
        convert_html_to_md(nvidia_entry)
        == "[NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)\n"  # noqa: E501
-        "Plus new options to mirror your camera and take a selfie.[https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)"  # noqa: E501
+        "Plus new options to mirror your camera and take a selfie."
    )