diff --git a/discord_rss_bot/markdown.py b/discord_rss_bot/markdown.py index 88ed723..ac159b1 100644 --- a/discord_rss_bot/markdown.py +++ b/discord_rss_bot/markdown.py @@ -1,57 +1,43 @@ -from functools import lru_cache - from bs4 import BeautifulSoup -@lru_cache(maxsize=2048) def convert_html_to_md(html: str) -> str: - """Convert HTML to Markdown. + """Convert HTML to markdown. - Discord supports: - - Bold with **text** - - Italic with *text* - - Blockquote with >>> text - - Code with `text` - - Fence code with ```text``` - - Links with [text](url) - - Syntax highlighting with ```language - - Strikethrough with ~~text~~ + Args: + html: The HTML to convert. + + Returns: + Our markdown. """ soup: BeautifulSoup = BeautifulSoup(html, features="lxml") - # Bold for bold in soup.find_all("b") + soup.find_all("strong"): bold.replace_with(f"**{bold.text}**") - # Italic for italic in soup.find_all("i") + soup.find_all("em"): italic.replace_with(f"*{italic.text}*") - # Blockquote for blockquote in soup.find_all("blockquote") + soup.find_all("q"): blockquote.replace_with(f">>> {blockquote.text}") - # Code for code in soup.find_all("code") + soup.find_all("pre"): code.replace_with(f"`{code.text}`") - # Links for link in soup.find_all("a") + soup.find_all("link"): - link_text = link.text or link.get("href") or "Link" + link_text: str = link.text or link.get("href") or "Link" link.replace_with(f"[{link_text}]({link.get('href')})") - # Strikethrough for strikethrough in soup.find_all("s") + soup.find_all("del") + soup.find_all("strike"): strikethrough.replace_with(f"~~{strikethrough.text}~~") - #
<br> tags for br in soup.find_all("br"): br.replace_with("\n") + clean_soup: BeautifulSoup = BeautifulSoup(str(soup).replace("</p>", "</p>\n"), features="lxml") + # Remove all other tags - for tag in soup.find_all(True): + for tag in clean_soup.find_all(True): tag.replace_with(tag.text) - # Remove all leading and trailing whitespace - soup_text: str = soup.text - return soup_text.strip() + return clean_soup.text.strip() diff --git a/tests/test_markdown.py b/tests/test_markdown.py index 4020316..7c51fa1 100644 --- a/tests/test_markdown.py +++ b/tests/test_markdown.py @@ -1,7 +1,7 @@ from discord_rss_bot.markdown import convert_html_to_md -def test_convert_to_md(): +def test_convert_to_md() -> None: # Test bold assert convert_html_to_md("<b>bold</b>") == "**bold**" @@ -28,22 +28,16 @@ def test_convert_to_md(): # Test multiple tags assert ( - convert_html_to_md( - '<b>bold</b> <i>italic</i> <a href="https://example.com">link</a> <code>code</code> <s>strikethrough</s>' - ) + convert_html_to_md('<b>bold</b> <i>italic</i> <a href="https://example.com">link</a> <code>code</code> <s>strikethrough</s>') # noqa: E501 == "**bold** *italic* [link](https://example.com) `code` ~~strikethrough~~" ) # Test removing all other tags assert convert_html_to_md("<p>paragraph</p>") == "paragraph" - assert convert_html_to_md("

<p>paragraph</p><p>paragraph</p>") == "paragraphparagraph" + assert convert_html_to_md("<p>paragraph</p><p>paragraph</p>") == "paragraph\nparagraph" # Test <br> tags - assert ( - convert_html_to_md("<p>paragraph<br>paragraph</p>") - == """paragraph -paragraph""" - ) + assert convert_html_to_md("<p>paragraph<br>paragraph</p>") == "paragraph\nparagraph" # Test removing trailing newline assert convert_html_to_md("paragraph ") == "paragraph" @@ -52,11 +46,22 @@ paragraph""" assert convert_html_to_md(" paragraph ") == "paragraph" # Test removing leading and trailing whitespace and trailing newline - assert ( - convert_html_to_md( - """ paragraph - - """ # noqa: W293 - ) - == "paragraph" + assert convert_html_to_md(" paragraph\n \n") == "paragraph" + + # Test real entry + nvidia_entry: str = ( + '

' + "NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements

" + '
' + '
Plus new options to mirror your camera and take a selfie.
' # noqa: E501 + '
' + '
' # noqa: E501 + ) + assert ( + convert_html_to_md(nvidia_entry) + == "[NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)\n" # noqa: E501 + "Plus new options to mirror your camera and take a selfie.[https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)" # noqa: E501 )