Remove images and URLs without any text when converting HTML to Markdown
This commit is contained in:
@@ -10,6 +10,9 @@ def convert_html_to_md(html: str) -> str:
|
||||
Returns:
|
||||
Our markdown.
|
||||
"""
|
||||
if not html:
|
||||
return html
|
||||
|
||||
soup: BeautifulSoup = BeautifulSoup(html, features="lxml")
|
||||
|
||||
for bold in soup.find_all("b") + soup.find_all("strong"):
|
||||
@@ -24,8 +27,14 @@ def convert_html_to_md(html: str) -> str:
|
||||
for code in soup.find_all("code") + soup.find_all("pre"):
|
||||
code.replace_with(f"`{code.text}`")
|
||||
|
||||
for image in soup.find_all("img"):
|
||||
image.decompose()
|
||||
|
||||
for link in soup.find_all("a") + soup.find_all("link"):
|
||||
link_text: str = link.text or link.get("href") or "Link"
|
||||
if not link.get_text().strip():
|
||||
link.decompose()
|
||||
else:
|
||||
link_text: str = link.text or link.get("href")
|
||||
link.replace_with(f"[{link_text}]({link.get('href')})")
|
||||
|
||||
for strikethrough in soup.find_all("s") + soup.find_all("del") + soup.find_all("strike"):
|
||||
|
@@ -63,5 +63,5 @@ def test_convert_to_md() -> None:
|
||||
assert (
|
||||
convert_html_to_md(nvidia_entry)
|
||||
== "[NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)\n" # noqa: E501
|
||||
"Plus new options to mirror your camera and take a selfie.[https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)" # noqa: E501
|
||||
"Plus new options to mirror your camera and take a selfie."
|
||||
)
|
||||
|
Reference in New Issue
Block a user