Create our own HTML to Markdown converter

2023-01-21 22:48:00 +01:00
parent 0bcc551a3f
commit d4fd70965e
7 changed files with 203 additions and 78 deletions
--- a/discord_rss_bot/custom_filters.py
+++ b/discord_rss_bot/custom_filters.py
@ -1,7 +1,6 @@
 import urllib.parse
 from functools import lru_cache

-import html2text
 from reader import Entry, Reader

 from discord_rss_bot.filter.blacklist import has_black_tags, should_be_skipped
@ -54,14 +53,3 @@ def entry_is_blacklisted(entry_to_check: Entry) -> bool:

    """
    return bool(has_black_tags(reader, entry_to_check.feed) and should_be_skipped(reader, entry_to_check))
-
-
-@lru_cache()
-def convert_to_md(thing: str) -> str:
-    """Discord does not support tables so we need to remove them from the markdown."""
-    text_maker: html2text.HTML2Text = html2text.HTML2Text()
-
-    # Ignore tables
-    text_maker.ignore_tables = True
-
-    return text_maker.handle(thing) if thing else ""
--- a/discord_rss_bot/custom_message.py
+++ b/discord_rss_bot/custom_message.py
@ -1,32 +1,43 @@
-import re
-from functools import lru_cache
-
+from bs4 import BeautifulSoup
 from reader import Entry, Feed, Reader, TagNotFoundError

-from discord_rss_bot.custom_filters import convert_to_md
+from discord_rss_bot.markdown import convert_html_to_md
 from discord_rss_bot.settings import get_reader


-def get_images_from_entry(entry: Entry, summary: bool = False) -> list[str]:
+def get_images_from_entry(entry: Entry) -> list[tuple[str, str]]:
-    """Get images from a entry.
+    """Get images from an entry.

    Args:
        entry: The entry to get the images from.
-        summary: Whether to get the images from the summary or the content.

    Returns:
-        Returns a list of images.
+        Returns a list of (image URL, alt text) tuples. The list is empty when no image is found.
    """
-    # This regex will match any markdown image that follows the format of ![alt text](image url).
-    image_regex = r"!\[(.*)\]\((.*)\)"

-    if summary:
-        return re.findall(image_regex, convert_to_md(entry.summary)) if entry.summary else []
+    def return_image(found_images) -> list[tuple[str, str]]:
+        # Collect every <img> that has a usable src instead of stopping at the first one.
+        # Images without a src are skipped, and a missing alt text falls back to a generic label.
+        soup: BeautifulSoup = BeautifulSoup(found_images, "html.parser")
+        return [
+            (image["src"], image.get("alt") or "Link to image")
+            for image in soup.find_all("img")
+            if image.get("src")
+        ]

-    return re.findall(image_regex, convert_to_md(entry.content[0].value)) if entry.content else []
+    # Get the images from the content with beautiful soup.
+    # The content takes precedence, so callers that read images[0] keep getting the content image.
+    if entry.content:
+        if images := return_image(entry.content[0].value):
+            return images
+
+    # Fall back to the summary so that a content without images does not hide the summary images.
+    if entry.summary:
+        return return_image(entry.summary)
+
+    # No images found
+    return []


-@lru_cache()
 def try_to_replace(custom_message: str, template: str, replace_with: str) -> str:
    """Try to replace a tag in custom_message.

@ -45,19 +56,6 @@ def try_to_replace(custom_message: str, template: str, replace_with: str) -> str
        return custom_message


-@lru_cache()
-def remove_image_tags(message: str) -> str:
-    """Remove image tags from message.
-
-    Args:
-        message: The message to remove the tags from.
-
-    Returns:
-        Returns the message with the image tags removed.
-    """
-    return re.sub(r"!\[(.*)\]\((.*)\)", "", message)
-
-
 def replace_tags(feed: Feed, entry: Entry) -> str:
    """Replace tags in custom_message.

@ -75,17 +73,15 @@ def replace_tags(feed: Feed, entry: Entry) -> str:
    content = ""
    if entry.summary:
        summary: str = entry.summary
-        summary = convert_to_md(summary)
-        summary = remove_image_tags(message=summary)
+        summary = convert_html_to_md(summary)

    if entry.content:
        for content_item in entry.content:
            content: str = content_item.value
-            content = convert_to_md(content)
-            content = remove_image_tags(message=content)
+            content = convert_html_to_md(content)

    if images := get_images_from_entry(entry=entry):
-        first_image: str = images[0][1]
+        first_image: str = images[0][0]
    else:
        first_image = ""

@ -123,10 +119,7 @@ def replace_tags(feed: Feed, entry: Entry) -> str:
        for template, replace_with in replacement.items():
            custom_message = try_to_replace(custom_message, template, replace_with)

-    # Replace \\n with newlines.
-    custom_message_with_newlines = custom_message.replace("\\n", "\n")
-
-    return custom_message_with_newlines
+    return custom_message.replace("\\n", "\n")


 def get_custom_message(custom_reader: Reader, feed: Feed) -> str:
--- a/discord_rss_bot/main.py
+++ b/discord_rss_bot/main.py
@ -12,11 +12,12 @@ from reader import Entry, EntryCounts, EntrySearchCounts, EntrySearchResult, Fee
 from starlette.responses import RedirectResponse

 from discord_rss_bot import settings
-from discord_rss_bot.custom_filters import convert_to_md, encode_url, entry_is_blacklisted, entry_is_whitelisted
-from discord_rss_bot.custom_message import get_custom_message, get_images_from_entry, remove_image_tags
+from discord_rss_bot.custom_filters import encode_url, entry_is_blacklisted, entry_is_whitelisted
+from discord_rss_bot.custom_message import get_custom_message, get_images_from_entry, replace_tags
 from discord_rss_bot.feeds import get_entry_from_id, send_entry_to_discord, send_to_discord
 from discord_rss_bot.filter.blacklist import get_blacklist_content, get_blacklist_summary, get_blacklist_title
 from discord_rss_bot.filter.whitelist import get_whitelist_content, get_whitelist_summary, get_whitelist_title
+from discord_rss_bot.markdown import convert_html_to_md
 from discord_rss_bot.search import create_html_for_search_results
 from discord_rss_bot.settings import default_custom_message, get_reader, list_webhooks

@ -30,8 +31,7 @@ reader: Reader = get_reader()
 templates.env.filters["encode_url"] = encode_url
 templates.env.filters["entry_is_whitelisted"] = entry_is_whitelisted
 templates.env.filters["entry_is_blacklisted"] = entry_is_blacklisted
-templates.env.filters["discord_markdown"] = convert_to_md
-templates.env.filters["remove_image_tags"] = remove_image_tags
+templates.env.filters["discord_markdown"] = convert_html_to_md


@app.post("/add_webhook")
@ -429,19 +429,13 @@ def create_html_for_feed(entries: Iterable[Entry]) -> str:
        first_image = ""
        first_image_text = ""
        if images := get_images_from_entry(entry=entry):
-            first_image: str = images[0][1]
-            first_image_text: str = images[0][0]
+            first_image: str = images[0][0]
+            first_image_text: str = images[0][1]

        # Get the text from the entry.
-        text = "<div class='text-muted'>No content available.</div>"
-        if entry.summary:
-            summary: str = convert_to_md(entry.summary)
-            summary = remove_image_tags(message=summary)
-            text: str = f"<div class='text-muted'>{summary}</div>"
-        elif entry.content:
-            content: str = convert_to_md(entry.content[0].value)
-            content = remove_image_tags(message=content)
-            text = f"<div class='text-muted'>{content}</div>"
+        text = replace_tags(entry.feed, entry)
+        if not text:
+            text = "<div class='text-muted'>No content available.</div>"

        published = ""
        if entry.published:
--- a/discord_rss_bot/markdown.py
+++ b/discord_rss_bot/markdown.py
@ -0,0 +1,68 @@
+from functools import lru_cache
+
+from bs4 import BeautifulSoup
+
+
+@lru_cache(maxsize=2048)
+def convert_html_to_md(html: str) -> str:
+    """Convert HTML to Markdown.
+
+    Discord supports:
+    - Bold with **text**
+    - Italic with *text*
+    - Blockquote with >>> text
+    - Code with `text`
+        - Fence code with ```text```
+    - Links with [text](url)
+    - Syntax highlighting with ```language
+    - Strikethrough with ~~text~~
+
+    Args:
+        html: The HTML to convert.
+
+    Returns:
+        The text of the document where the supported tags are rewritten as Markdown and all other tags are
+        replaced by their text.
+    """
+    soup: BeautifulSoup = BeautifulSoup(html, features="lxml")
+
+    # Every group below is searched with a list of tag names so that the matches come back in document order.
+    # An outer tag is therefore replaced before the tags nested in it, which keeps nested tags of the same
+    # group (for example <pre><code>) from being wrapped in the Markdown markers twice.
+
+    # Bold
+    for bold in soup.find_all(["b", "strong"]):
+        bold.replace_with(f"**{bold.text}**")
+
+    # Italic
+    for italic in soup.find_all(["i", "em"]):
+        italic.replace_with(f"*{italic.text}*")
+
+    # Blockquote
+    for blockquote in soup.find_all(["blockquote", "q"]):
+        blockquote.replace_with(f">>> {blockquote.text}")
+
+    # Code
+    for code in soup.find_all(["pre", "code"]):
+        code.replace_with(f"`{code.text}`")
+
+    # Links
+    for link in soup.find_all(["a", "link"]):
+        link_href = link.get("href")
+        if link_href:
+            link_text = link.text or link_href
+            link.replace_with(f"[{link_text}]({link_href})")
+        else:
+            # Without a target there is nothing to link to, so keep only the text instead of "(None)".
+            link.replace_with(link.text)
+
+    # Strikethrough
+    for strikethrough in soup.find_all(["s", "del", "strike"]):
+        strikethrough.replace_with(f"~~{strikethrough.text}~~")
+
+    # <br> tags
+    for br in soup.find_all("br"):
+        br.replace_with("\n")
+
+    # Remove all other tags
+    for tag in soup.find_all(True):
+        tag.replace_with(tag.text)
+
+    return soup.text
+
+
+# Manual smoke test: run this module directly to print the Markdown produced for a small HTML sample.
+if __name__ == "__main__":
+    sample_markup: str = """
+    <p><b>bold</b> <i>italic</i> <a href="https://example.com">link</a> <code>code</code> <s>strikethrough</s></p>
+    <blockquote>blockquote</blockquote>
+    <pre><code>pre code</code></pre>
+    <strong>strong</strong>
+    """
+    rendered: str = convert_html_to_md(sample_markup)
+    print(rendered)
--- a/discord_rss_bot/templates/custom.html
+++ b/discord_rss_bot/templates/custom.html
@ -119,7 +119,7 @@
                                        {% raw %}
                                            {{entry_content}}
                                        {% endraw %}
-                                    </code>{{entry.content[0].value|discord_markdown|remove_image_tags}}
+                                    </code>{{entry.content[0].value|discord_markdown}}
                                </li>
                                <li>
                                    <code>
@ -175,7 +175,7 @@
                                        {% raw %}
                                            {{entry_summary}}
                                        {% endraw %}
-                                    </code>{{entry.summary|discord_markdown|remove_image_tags}}
+                                    </code>{{entry.summary|discord_markdown}}
                                </li>
                                <li>
                                    <code>
@ -222,7 +222,7 @@
                                    <code>
                                        <pre>
 {{feed.title -}}
-{{- entry.content[0].value|discord_markdown|remove_image_tags -}}
+{{- entry.content[0].value|discord_markdown -}}
                            </pre>
                                    </code>
                                </li>