discord-rss-bot/discord_rss_bot/markdown.py

from bs4 import BeautifulSoup


def convert_html_to_md(html: str) -> str:
    """Convert HTML to markdown.

    Inline formatting tags are rewritten to their markdown equivalents, images
    and links without visible text are dropped, and every remaining tag is
    stripped so that only text is left.

    Args:
        html: The HTML to convert.

    Returns:
        Our markdown. A falsy input (e.g. an empty string) is returned unchanged.
    """
    if not html:
        return html

    soup: BeautifulSoup = BeautifulSoup(html, features="lxml")

    for bold in soup.find_all("b") + soup.find_all("strong"):
        bold.replace_with(f"**{bold.text}**")

    for italic in soup.find_all("i") + soup.find_all("em"):
        italic.replace_with(f"*{italic.text}*")

    for blockquote in soup.find_all("blockquote") + soup.find_all("q"):
        blockquote.replace_with(f">>> {blockquote.text}")

    for code in soup.find_all("code") + soup.find_all("pre"):
        code.replace_with(f"`{code.text}`")

    # Images are removed before links are handled, so a link that only wrapped
    # an image has no text left and is dropped by the loop below.
    for image in soup.find_all("img"):
        image.decompose()

    for link in soup.find_all("a") + soup.find_all("link"):
        link_text: str = link.get_text()
        if not link_text.strip():
            # Nothing visible to show for this link.
            link.decompose()
            continue

        href = link.get("href")
        if not href:
            # An anchor without a target (e.g. <a name="...">) would otherwise
            # be rendered as "[text](None)"; keep just its text instead.
            link.replace_with(link_text)
            continue

        # Strip the scheme from the visible text only; the target keeps it.
        link_text = link_text.replace("http://", "").replace("https://", "")
        link.replace_with(f"[{link_text}]({href})")

    for strikethrough in soup.find_all("s") + soup.find_all("del") + soup.find_all("strike"):
        strikethrough.replace_with(f"~~{strikethrough.text}~~")

    for br in soup.find_all("br"):
        br.replace_with("\n")

    # Re-parse with a newline after every closing </p> so paragraphs stay on
    # separate lines once the tags themselves are stripped.
    clean_soup: BeautifulSoup = BeautifulSoup(str(soup).replace("</p>", "</p>\n"), features="lxml")

    # Remove all other tags
    for tag in clean_soup.find_all(True):
        tag.replace_with(tag.text)

    return clean_soup.text.strip()