Remove images and URLs without any text when converting HTML to Markdown

This commit is contained in:
2023-01-29 01:07:36 +01:00
parent 4d8a0e9ded
commit 155b94cff2
2 changed files with 12 additions and 3 deletions

View File

@@ -10,6 +10,9 @@ def convert_html_to_md(html: str) -> str:
Returns:
Our markdown.
"""
if not html:
return html
soup: BeautifulSoup = BeautifulSoup(html, features="lxml")
for bold in soup.find_all("b") + soup.find_all("strong"):
@@ -24,9 +27,15 @@ def convert_html_to_md(html: str) -> str:
for code in soup.find_all("code") + soup.find_all("pre"):
code.replace_with(f"`{code.text}`")
for image in soup.find_all("img"):
image.decompose()
for link in soup.find_all("a") + soup.find_all("link"):
link_text: str = link.text or link.get("href") or "Link"
link.replace_with(f"[{link_text}]({link.get('href')})")
if not link.get_text().strip():
link.decompose()
else:
link_text: str = link.text or link.get("href")
link.replace_with(f"[{link_text}]({link.get('href')})")
for strikethrough in soup.find_all("s") + soup.find_all("del") + soup.find_all("strike"):
strikethrough.replace_with(f"~~{strikethrough.text}~~")