From 4d8a0e9ded069c61e9b439126fc0556b96b478d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Joakim=20Hells=C3=A9n?= <tlovinator@gmail.com>
Date: Sat, 28 Jan 2023 23:01:17 +0100
Subject: [PATCH] Update HTML to markdown converter

---
 discord_rss_bot/markdown.py | 36 +++++++++++-----------------------
 tests/test_markdown.py      | 39 +++++++++++++++++++++----------------
 2 files changed, 33 insertions(+), 42 deletions(-)
diff --git a/discord_rss_bot/markdown.py b/discord_rss_bot/markdown.py
index 88ed723..ac159b1 100644
--- a/discord_rss_bot/markdown.py
+++ b/discord_rss_bot/markdown.py
@@ -1,57 +1,43 @@
-from functools import lru_cache
-
 from bs4 import BeautifulSoup
 
 
-@lru_cache(maxsize=2048)
 def convert_html_to_md(html: str) -> str:
-    """Convert HTML to Markdown.
+    """Convert HTML to markdown.
 
-    Discord supports:
-    - Bold with **text**
-    - Italic with *text*
-    - Blockquote with >>> text
-    - Code with `text`
-        - Fence code with ```text```
-    - Links with [text](url)
-    - Syntax highlighting with ```language
-    - Strikethrough with ~~text~~
+    Args:
+        html: The HTML to convert.
+
+    Returns:
+        The markdown text produced from the HTML.
     """
     soup: BeautifulSoup = BeautifulSoup(html, features="lxml")
 
-    # Bold
     for bold in soup.find_all("b") + soup.find_all("strong"):
         bold.replace_with(f"**{bold.text}**")
 
-    # Italic
     for italic in soup.find_all("i") + soup.find_all("em"):
         italic.replace_with(f"*{italic.text}*")
 
-    # Blockquote
     for blockquote in soup.find_all("blockquote") + soup.find_all("q"):
         blockquote.replace_with(f">>> {blockquote.text}")
 
-    # Code
     for code in soup.find_all("code") + soup.find_all("pre"):
         code.replace_with(f"`{code.text}`")
 
-    # Links
     for link in soup.find_all("a") + soup.find_all("link"):
-        link_text = link.text or link.get("href") or "Link"
+        link_text: str = link.text or link.get("href") or "Link"
         link.replace_with(f"[{link_text}]({link.get('href')})")
 
-    # Strikethrough
     for strikethrough in soup.find_all("s") + soup.find_all("del") + soup.find_all("strike"):
         strikethrough.replace_with(f"~~{strikethrough.text}~~")
 
-    # <br> tags
     for br in soup.find_all("br"):
         br.replace_with("\n")
 
+    clean_soup: BeautifulSoup = BeautifulSoup(str(soup).replace("</p>", "</p>\n"), features="lxml")
+
     # Remove all other tags
-    for tag in soup.find_all(True):
+    for tag in clean_soup.find_all(True):
         tag.replace_with(tag.text)
 
-    # Remove all leading and trailing whitespace
-    soup_text: str = soup.text
-    return soup_text.strip()
+    return clean_soup.text.strip()
diff --git a/tests/test_markdown.py b/tests/test_markdown.py
index 4020316..7c51fa1 100644
--- a/tests/test_markdown.py
+++ b/tests/test_markdown.py
@@ -1,7 +1,7 @@
 from discord_rss_bot.markdown import convert_html_to_md
 
 
-def test_convert_to_md():
+def test_convert_to_md() -> None:
     # Test bold
     assert convert_html_to_md("<b>bold</b>") == "**bold**"
 
@@ -28,22 +28,16 @@ def test_convert_to_md():
 
     # Test multiple tags
     assert (
-        convert_html_to_md(
-            '<b>bold</b> <i>italic</i> <a href="https://example.com">link</a> <code>code</code> <s>strikethrough</s>'
-        )
+        convert_html_to_md('<b>bold</b> <i>italic</i> <a href="https://example.com">link</a> <code>code</code> <s>strikethrough</s>')  # noqa: E501
         == "**bold** *italic* [link](https://example.com) `code` ~~strikethrough~~"
     )
 
     # Test removing all other tags
     assert convert_html_to_md("<p>paragraph</p>") == "paragraph"
-    assert convert_html_to_md("<p>paragraph</p><p>paragraph</p>") == "paragraphparagraph"
+    assert convert_html_to_md("<p>paragraph</p><p>paragraph</p>") == "paragraph\nparagraph"
 
     # Test <br> tags
-    assert (
-        convert_html_to_md("<p>paragraph<br>paragraph</p>")
-        == """paragraph
-paragraph"""
-    )
+    assert convert_html_to_md("<p>paragraph<br>paragraph</p>") == "paragraph\nparagraph"
 
     # Test removing trailing newline
     assert convert_html_to_md("paragraph ") == "paragraph"
@@ -52,11 +46,22 @@ paragraph"""
     assert convert_html_to_md(" paragraph ") == "paragraph"
 
     # Test removing leading and trailing whitespace and trailing newline
-    assert (
-        convert_html_to_md(
-            """ paragraph
-                              
-                                """  # noqa: W293
-        )
-        == "paragraph"
+    assert convert_html_to_md(" paragraph\n \n") == "paragraph"
+
+    # Test a real-world feed entry (NVIDIA news item with nested divs and an image link)
+    nvidia_entry: str = (
+        '<p><a href="https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/">'
+        "NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements</a></p>"
+        '<div class="field field-name-field-short-description field-type-text-long field-label-hidden">'
+        '<div class="field-items"><div class="field-item even">Plus new options to mirror your camera and take a selfie.</div>'  # noqa: E501
+        '</div></div><div class="field field-name-field-thumbnail-image field-type-image field-label-hidden">'
+        '<div class="field-items"><div class="field-item even"><a href="https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/">'  # noqa: E501
+        '<img width="210" src="https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/news/jan-2023-nvidia-broadcast-update/broadcast-owned-asset-625x330-newsfeed.png"'  # noqa: E501
+        ' title="NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements" '
+        'alt="NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements"></a></div></div></div>'  # noqa: E501
+    )
+    assert (
+        convert_html_to_md(nvidia_entry)
+        == "[NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)\n"  # noqa: E501
+        "Plus new options to mirror your camera and take a selfie.[https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)"  # noqa: E501
     )