From 4d8a0e9ded069c61e9b439126fc0556b96b478d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Joakim=20Hells=C3=A9n?=
Date: Sat, 28 Jan 2023 23:01:17 +0100
Subject: [PATCH] Update HTML to markdown converter
---
discord_rss_bot/markdown.py | 36 +++++++++++-----------------------
tests/test_markdown.py | 39 +++++++++++++++++++++----------------
2 files changed, 33 insertions(+), 42 deletions(-)
diff --git a/discord_rss_bot/markdown.py b/discord_rss_bot/markdown.py
index 88ed723..ac159b1 100644
--- a/discord_rss_bot/markdown.py
+++ b/discord_rss_bot/markdown.py
@@ -1,57 +1,43 @@
-from functools import lru_cache
-
from bs4 import BeautifulSoup
-@lru_cache(maxsize=2048)
def convert_html_to_md(html: str) -> str:
- """Convert HTML to Markdown.
+ """Convert HTML to markdown.
- Discord supports:
- - Bold with **text**
- - Italic with *text*
- - Blockquote with >>> text
- - Code with `text`
- - Fence code with ```text```
- - Links with [text](url)
- - Syntax highlighting with ```language
- - Strikethrough with ~~text~~
+ Args:
+ html: The HTML to convert.
+
+ Returns:
+ Our markdown.
"""
soup: BeautifulSoup = BeautifulSoup(html, features="lxml")
- # Bold
for bold in soup.find_all("b") + soup.find_all("strong"):
bold.replace_with(f"**{bold.text}**")
- # Italic
for italic in soup.find_all("i") + soup.find_all("em"):
italic.replace_with(f"*{italic.text}*")
- # Blockquote
for blockquote in soup.find_all("blockquote") + soup.find_all("q"):
blockquote.replace_with(f">>> {blockquote.text}")
- # Code
for code in soup.find_all("code") + soup.find_all("pre"):
code.replace_with(f"`{code.text}`")
- # Links
for link in soup.find_all("a") + soup.find_all("link"):
- link_text = link.text or link.get("href") or "Link"
+ link_text: str = link.text or link.get("href") or "Link"
link.replace_with(f"[{link_text}]({link.get('href')})")
- # Strikethrough
for strikethrough in soup.find_all("s") + soup.find_all("del") + soup.find_all("strike"):
strikethrough.replace_with(f"~~{strikethrough.text}~~")
- # <br> tags
for br in soup.find_all("br"):
br.replace_with("\n")
+ clean_soup: BeautifulSoup = BeautifulSoup(str(soup).replace("
", "\n"), features="lxml")
+
# Remove all other tags
- for tag in soup.find_all(True):
+ for tag in clean_soup.find_all(True):
tag.replace_with(tag.text)
- # Remove all leading and trailing whitespace
- soup_text: str = soup.text
- return soup_text.strip()
+ return clean_soup.text.strip()
diff --git a/tests/test_markdown.py b/tests/test_markdown.py
index 4020316..7c51fa1 100644
--- a/tests/test_markdown.py
+++ b/tests/test_markdown.py
@@ -1,7 +1,7 @@
from discord_rss_bot.markdown import convert_html_to_md
-def test_convert_to_md():
+def test_convert_to_md() -> None:
# Test bold
assert convert_html_to_md("'
+ '
Plus new options to mirror your camera and take a selfie.
' # noqa: E501
+ '