Update HTML to markdown converter
This commit is contained in:
@ -1,57 +1,43 @@
|
|||||||
from functools import lru_cache
|
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=2048)
|
|
||||||
def convert_html_to_md(html: str) -> str:
|
def convert_html_to_md(html: str) -> str:
|
||||||
"""Convert HTML to Markdown.
|
"""Convert HTML to markdown.
|
||||||
|
|
||||||
Discord supports:
|
Args:
|
||||||
- Bold with **text**
|
html: The HTML to convert.
|
||||||
- Italic with *text*
|
|
||||||
- Blockquote with >>> text
|
Returns:
|
||||||
- Code with `text`
|
The converted Markdown text.
|
||||||
- Fence code with ```text```
|
|
||||||
- Links with [text](url)
|
|
||||||
- Syntax highlighting with ```language
|
|
||||||
- Strikethrough with ~~text~~
|
|
||||||
"""
|
"""
|
||||||
soup: BeautifulSoup = BeautifulSoup(html, features="lxml")
|
soup: BeautifulSoup = BeautifulSoup(html, features="lxml")
|
||||||
|
|
||||||
# Bold
|
|
||||||
for bold in soup.find_all("b") + soup.find_all("strong"):
|
for bold in soup.find_all("b") + soup.find_all("strong"):
|
||||||
bold.replace_with(f"**{bold.text}**")
|
bold.replace_with(f"**{bold.text}**")
|
||||||
|
|
||||||
# Italic
|
|
||||||
for italic in soup.find_all("i") + soup.find_all("em"):
|
for italic in soup.find_all("i") + soup.find_all("em"):
|
||||||
italic.replace_with(f"*{italic.text}*")
|
italic.replace_with(f"*{italic.text}*")
|
||||||
|
|
||||||
# Blockquote
|
|
||||||
for blockquote in soup.find_all("blockquote") + soup.find_all("q"):
|
for blockquote in soup.find_all("blockquote") + soup.find_all("q"):
|
||||||
blockquote.replace_with(f">>> {blockquote.text}")
|
blockquote.replace_with(f">>> {blockquote.text}")
|
||||||
|
|
||||||
# Code
|
|
||||||
for code in soup.find_all("code") + soup.find_all("pre"):
|
for code in soup.find_all("code") + soup.find_all("pre"):
|
||||||
code.replace_with(f"`{code.text}`")
|
code.replace_with(f"`{code.text}`")
|
||||||
|
|
||||||
# Links
|
|
||||||
for link in soup.find_all("a") + soup.find_all("link"):
|
for link in soup.find_all("a") + soup.find_all("link"):
|
||||||
link_text = link.text or link.get("href") or "Link"
|
link_text: str = link.text or link.get("href") or "Link"
|
||||||
link.replace_with(f"[{link_text}]({link.get('href')})")
|
link.replace_with(f"[{link_text}]({link.get('href')})")
|
||||||
|
|
||||||
# Strikethrough
|
|
||||||
for strikethrough in soup.find_all("s") + soup.find_all("del") + soup.find_all("strike"):
|
for strikethrough in soup.find_all("s") + soup.find_all("del") + soup.find_all("strike"):
|
||||||
strikethrough.replace_with(f"~~{strikethrough.text}~~")
|
strikethrough.replace_with(f"~~{strikethrough.text}~~")
|
||||||
|
|
||||||
# <br> tags
|
|
||||||
for br in soup.find_all("br"):
|
for br in soup.find_all("br"):
|
||||||
br.replace_with("\n")
|
br.replace_with("\n")
|
||||||
|
|
||||||
|
clean_soup: BeautifulSoup = BeautifulSoup(str(soup).replace("</p>", "</p>\n"), features="lxml")
|
||||||
|
|
||||||
# Remove all other tags
|
# Remove all other tags
|
||||||
for tag in soup.find_all(True):
|
for tag in clean_soup.find_all(True):
|
||||||
tag.replace_with(tag.text)
|
tag.replace_with(tag.text)
|
||||||
|
|
||||||
# Remove all leading and trailing whitespace
|
return clean_soup.text.strip()
|
||||||
soup_text: str = soup.text
|
|
||||||
return soup_text.strip()
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
from discord_rss_bot.markdown import convert_html_to_md
|
from discord_rss_bot.markdown import convert_html_to_md
|
||||||
|
|
||||||
|
|
||||||
def test_convert_to_md():
|
def test_convert_to_md() -> None:
|
||||||
# Test bold
|
# Test bold
|
||||||
assert convert_html_to_md("<b>bold</b>") == "**bold**"
|
assert convert_html_to_md("<b>bold</b>") == "**bold**"
|
||||||
|
|
||||||
@ -28,22 +28,16 @@ def test_convert_to_md():
|
|||||||
|
|
||||||
# Test multiple tags
|
# Test multiple tags
|
||||||
assert (
|
assert (
|
||||||
convert_html_to_md(
|
convert_html_to_md('<b>bold</b> <i>italic</i> <a href="https://example.com">link</a> <code>code</code> <s>strikethrough</s>') # noqa: E501
|
||||||
'<b>bold</b> <i>italic</i> <a href="https://example.com">link</a> <code>code</code> <s>strikethrough</s>'
|
|
||||||
)
|
|
||||||
== "**bold** *italic* [link](https://example.com) `code` ~~strikethrough~~"
|
== "**bold** *italic* [link](https://example.com) `code` ~~strikethrough~~"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Test removing all other tags
|
# Test removing all other tags
|
||||||
assert convert_html_to_md("<p>paragraph</p>") == "paragraph"
|
assert convert_html_to_md("<p>paragraph</p>") == "paragraph"
|
||||||
assert convert_html_to_md("<p>paragraph</p><p>paragraph</p>") == "paragraphparagraph"
|
assert convert_html_to_md("<p>paragraph</p><p>paragraph</p>") == "paragraph\nparagraph"
|
||||||
|
|
||||||
# Test <br> tags
|
# Test <br> tags
|
||||||
assert (
|
assert convert_html_to_md("<p>paragraph<br>paragraph</p>") == "paragraph\nparagraph"
|
||||||
convert_html_to_md("<p>paragraph<br>paragraph</p>")
|
|
||||||
== """paragraph
|
|
||||||
paragraph"""
|
|
||||||
)
|
|
||||||
|
|
||||||
# Test removing trailing newline
|
# Test removing trailing newline
|
||||||
assert convert_html_to_md("paragraph ") == "paragraph"
|
assert convert_html_to_md("paragraph ") == "paragraph"
|
||||||
@ -52,11 +46,22 @@ paragraph"""
|
|||||||
assert convert_html_to_md(" paragraph ") == "paragraph"
|
assert convert_html_to_md(" paragraph ") == "paragraph"
|
||||||
|
|
||||||
# Test removing leading and trailing whitespace and trailing newline
|
# Test removing leading and trailing whitespace and trailing newline
|
||||||
assert (
|
assert convert_html_to_md(" paragraph\n \n") == "paragraph"
|
||||||
convert_html_to_md(
|
|
||||||
""" paragraph
|
|
||||||
|
|
||||||
""" # noqa: W293
|
# Test real entry
|
||||||
)
|
nvidia_entry: str = (
|
||||||
== "paragraph"
|
'<p><a href="https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/">'
|
||||||
|
"NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements</a></p>"
|
||||||
|
'<div class="field field-name-field-short-description field-type-text-long field-label-hidden">'
|
||||||
|
'<div class="field-items"><div class="field-item even">Plus new options to mirror your camera and take a selfie.</div>' # noqa: E501
|
||||||
|
'</div></div><div class="field field-name-field-thumbnail-image field-type-image field-label-hidden">'
|
||||||
|
'<div class="field-items"><div class="field-item even"><a href="https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/">' # noqa: E501
|
||||||
|
'<img width="210" src="https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/news/jan-2023-nvidia-broadcast-update/broadcast-owned-asset-625x330-newsfeed.png"' # noqa: E501
|
||||||
|
' title="NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements" '
|
||||||
|
'alt="NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements"></a></div></div></div>' # noqa: E501
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
convert_html_to_md(nvidia_entry)
|
||||||
|
== "[NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)\n" # noqa: E501
|
||||||
|
"Plus new options to mirror your camera and take a selfie.[https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)" # noqa: E501
|
||||||
)
|
)
|
||||||
|
Reference in New Issue
Block a user