Fix badly rendered Markdown by replacing the custom HTML converter with markdownify

2024-05-24 01:11:38 +02:00
parent 1368607e29
commit 73b171dbfd
8 changed files with 79 additions and 151 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,8 +1,3 @@
 {
-    "cSpell.words": [
-        "botuser",
-        "Genshins",
-        "levelname",
-        "pipx"
-    ]
+  "cSpell.words": ["botuser", "Genshins", "levelname", "markdownify", "pipx"]
 }
--- a/discord_rss_bot/custom_message.py
+++ b/discord_rss_bot/custom_message.py
@@ -5,10 +5,10 @@ from dataclasses import dataclass
 from typing import TYPE_CHECKING

 from bs4 import BeautifulSoup
+from markdownify import markdownify
 from reader import Entry, Feed, Reader, TagNotFoundError

 from discord_rss_bot.is_url_valid import is_url_valid
-from discord_rss_bot.markdown import convert_html_to_md
 from discord_rss_bot.settings import get_reader, logger

 if TYPE_CHECKING:
@@ -68,8 +68,8 @@ def replace_tags_in_text_message(entry: Entry) -> str:

    first_image: str = get_first_image(summary, content)

-    summary = convert_html_to_md(summary)
-    content = convert_html_to_md(content)
+    summary = markdownify(summary)
+    content = markdownify(content)

    list_of_replacements = [
        {"{{feed_author}}": feed.author},
@@ -96,7 +96,7 @@ def replace_tags_in_text_message(entry: Entry) -> str:
        {"{{entry_read_modified}}": entry.read_modified},
        {"{{entry_summary}}": summary},
        {"{{entry_summary_raw}}": entry.summary or ""},
-        {"{{entry_text}}": content or summary},
+        {"{{entry_text}}": summary or content},
        {"{{entry_title}}": entry.title},
        {"{{entry_updated}}": entry.updated},
        {"{{image_1}}": first_image},
@@ -106,7 +106,8 @@ def replace_tags_in_text_message(entry: Entry) -> str:
        for template, replace_with in replacement.items():
            custom_message = try_to_replace(custom_message, template, replace_with)

-    return custom_message.replace("\\n", "\n")
+    our_custom_message = custom_message.replace("\\n", "\n")
+    return our_custom_message  # noqa: RET504


 def get_first_image(summary: str | None, content: str | None) -> str:
@@ -163,8 +164,8 @@ def replace_tags_in_embed(feed: Feed, entry: Entry) -> CustomEmbed:

    first_image: str = get_first_image(summary, content)

-    summary = convert_html_to_md(summary)
-    content = convert_html_to_md(content)
+    summary = markdownify(summary)
+    content = markdownify(content)

    feed_added: str = feed.added.strftime("%Y-%m-%d %H:%M:%S") if feed.added else "Never"
    feed_last_updated: str = feed.last_updated.strftime("%Y-%m-%d %H:%M:%S") if feed.last_updated else "Never"
@@ -198,7 +199,7 @@ def replace_tags_in_embed(feed: Feed, entry: Entry) -> CustomEmbed:
        {"{{entry_read_modified}}": entry_read_modified or ""},
        {"{{entry_summary}}": summary or ""},
        {"{{entry_summary_raw}}": entry.summary or ""},
-        {"{{entry_text}}": content or summary or ""},
+        {"{{entry_text}}": summary or content or ""},
        {"{{entry_title}}": entry.title or ""},
        {"{{entry_updated}}": entry_updated or ""},
        {"{{image_1}}": first_image or ""},
--- a/discord_rss_bot/feeds.py
+++ b/discord_rss_bot/feeds.py
@@ -2,12 +2,11 @@ from __future__ import annotations

 import datetime
 import pprint
-import textwrap
 from typing import TYPE_CHECKING

 from discord_webhook import DiscordEmbed, DiscordWebhook
 from fastapi import HTTPException
-from reader import Entry, Feed, FeedExistsError, Reader, TagNotFoundError
+from reader import Entry, EntryNotFoundError, Feed, FeedExistsError, Reader, StorageError, TagNotFoundError

 from discord_rss_bot import custom_message
 from discord_rss_bot.filter.blacklist import should_be_skipped
@@ -43,8 +42,6 @@ def send_entry_to_discord(entry: Entry, custom_reader: Reader | None = None) ->
    # This has to be a string for some reason so don't change it to "not custom_message.get_custom_message()"
    if custom_message.get_custom_message(reader, entry.feed) != "":  # noqa: PLC1901
        webhook_message = custom_message.replace_tags_in_text_message(entry=entry)
-    else:
-        webhook_message: str = str(default_custom_message)

    if not webhook_message:
        webhook_message = "No message found."
@@ -62,6 +59,38 @@ def send_entry_to_discord(entry: Entry, custom_reader: Reader | None = None) ->
    return None


+def set_description(custom_embed: custom_message.CustomEmbed, discord_embed: DiscordEmbed) -> None:
+    """Set the description of the embed, cutting it to 2000 characters plus "..." when it is longer.
+
+    Args:
+        custom_embed (custom_message.CustomEmbed): The custom embed to get the description from.
+        discord_embed (DiscordEmbed): The Discord embed to set the description on; untouched if the text is empty.
+    """
+    # It's actually 2048 (TODO confirm against current Discord docs), but we will use 2000 to be safe.
+    max_description_length: int = 2000
+    embed_description: str = custom_embed.description
+    embed_description = (
+        embed_description[:max_description_length] + "..."
+        if len(embed_description) > max_description_length
+        else embed_description
+    )
+    discord_embed.set_description(embed_description) if embed_description else None
+
+
+def set_title(custom_embed: custom_message.CustomEmbed, discord_embed: DiscordEmbed) -> None:
+    """Set the title of the embed, cutting it to 200 characters plus "..." when it is longer.
+
+    Args:
+        custom_embed: The custom embed to get the title from.
+        discord_embed: The Discord embed to set the title on; left untouched if the title is empty.
+    """
+    # It's actually 256 (TODO confirm against current Discord docs), but we will use 200 to be safe.
+    max_title_length: int = 200
+    embed_title: str = custom_embed.title
+    embed_title = embed_title[:max_title_length] + "..." if len(embed_title) > max_title_length else embed_title
+    discord_embed.set_title(embed_title) if embed_title else None
+
+
 def create_embed_webhook(webhook_url: str, entry: Entry) -> DiscordWebhook:
    """Create a webhook with an embed.

@@ -80,11 +109,8 @@ def create_embed_webhook(webhook_url: str, entry: Entry) -> DiscordWebhook:

    discord_embed: DiscordEmbed = DiscordEmbed()

-    embed_title: str = textwrap.shorten(custom_embed.title, width=200, placeholder="...")
-    discord_embed.set_title(embed_title) if embed_title else None
-
-    webhook_message: str = textwrap.shorten(custom_embed.description, width=2000, placeholder="...")
-    discord_embed.set_description(webhook_message) if webhook_message else None
+    set_description(custom_embed=custom_embed, discord_embed=discord_embed)
+    set_title(custom_embed=custom_embed, discord_embed=discord_embed)

    custom_embed_author_url: str | None = custom_embed.author_url
    if not is_url_valid(custom_embed_author_url):
@@ -158,7 +184,14 @@ def send_to_discord(custom_reader: Reader | None = None, feed: Feed | None = Non
            continue

        # Set the webhook to read, so we don't send it again.
-        reader.set_entry_read(entry, True)
+        try:
+            reader.set_entry_read(entry, True)
+        except EntryNotFoundError as e:
+            logger.error("Error setting entry to read: %s", e)
+            continue
+        except StorageError as e:
+            logger.error("Error setting entry to read: %s", e)
+            continue

        # Get the webhook URL for the entry. If it is None, we will continue to the next entry.
        webhook_url: str = str(reader.get_tag(entry.feed_url, "webhook", ""))
@@ -175,8 +208,13 @@ def send_to_discord(custom_reader: Reader | None = None, feed: Feed | None = Non
            else:
                webhook_message: str = str(default_custom_message)

-            # Truncate the webhook_message to 2000 characters
-            webhook_message = textwrap.shorten(webhook_message, width=2000, placeholder="...")
+            # Discord rejects message content over 2000 characters; cut at 1900 so the added "..." still fits.
+            max_content_length: int = 1900
+            webhook_message = (
+                webhook_message[:max_content_length] + "..."
+                if len(webhook_message) > max_content_length
+                else webhook_message
+            )

            # Create the webhook.
            webhook: DiscordWebhook = DiscordWebhook(url=webhook_url, content=webhook_message, rate_limit_retry=True)
--- a/discord_rss_bot/main.py
+++ b/discord_rss_bot/main.py
@@ -17,6 +17,7 @@ from fastapi.responses import HTMLResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
 from httpx import Response
+from markdownify import markdownify
 from reader import Entry, Feed, FeedNotFoundError, Reader, TagNotFoundError
 from reader.types import JSONType
 from starlette.responses import RedirectResponse
@@ -36,7 +37,6 @@ from discord_rss_bot.custom_message import (
    save_embed,
 )
 from discord_rss_bot.feeds import create_feed, send_entry_to_discord, send_to_discord
-from discord_rss_bot.markdown import convert_html_to_md
 from discord_rss_bot.missing_tags import add_missing_tags
 from discord_rss_bot.search import create_html_for_search_results
 from discord_rss_bot.settings import get_reader
@@ -73,7 +73,7 @@ templates: Jinja2Templates = Jinja2Templates(directory="discord_rss_bot/template
 templates.env.filters["encode_url"] = encode_url
 templates.env.filters["entry_is_whitelisted"] = entry_is_whitelisted
 templates.env.filters["entry_is_blacklisted"] = entry_is_blacklisted
-templates.env.filters["discord_markdown"] = convert_html_to_md
+templates.env.filters["discord_markdown"] = markdownify


@app.post("/add_webhook")
--- a/discord_rss_bot/markdown.py
+++ b/discord_rss_bot/markdown.py
@@ -1,53 +0,0 @@
-from bs4 import BeautifulSoup
-
-
-def convert_html_to_md(html: str) -> str:
-    """Convert HTML to markdown.
-
-    Args:
-        html: The HTML to convert.
-
-    Returns:
-        Our markdown.
-    """
-    if not html:
-        return html
-
-    soup: BeautifulSoup = BeautifulSoup(html, features="lxml")
-
-    for bold in soup.find_all("b") + soup.find_all("strong"):
-        bold.replace_with(f"**{bold.text}**")
-
-    for italic in soup.find_all("i") + soup.find_all("em"):
-        italic.replace_with(f"*{italic.text}*")
-
-    for blockquote in soup.find_all("blockquote") + soup.find_all("q"):
-        blockquote.replace_with(f">>> {blockquote.text}")
-
-    for code in soup.find_all("code") + soup.find_all("pre"):
-        code.replace_with(f"`{code.text}`")
-
-    for image in soup.find_all("img"):
-        image.decompose()
-
-    for link in soup.find_all("a") + soup.find_all("link"):
-        if not link.get_text().strip():
-            link.decompose()
-        else:
-            link_text: str = link.text or link.get("href")
-            link_text = link_text.replace("http://", "").replace("https://", "")
-            link.replace_with(f"[{link_text}]({link.get('href')})")
-
-    for strikethrough in soup.find_all("s") + soup.find_all("del") + soup.find_all("strike"):
-        strikethrough.replace_with(f"~~{strikethrough.text}~~")
-
-    for br in soup.find_all("br"):
-        br.replace_with("\n")
-
-    clean_soup: BeautifulSoup = BeautifulSoup(str(soup).replace("</p>", "</p>\n"), features="lxml")
-
-    # Remove all other tags
-    for tag in clean_soup.find_all(True):
-        tag.replace_with(tag.text)
-
-    return clean_soup.text.strip()
--- a/poetry.lock
+++ b/poetry.lock
@@ -675,6 +675,21 @@ html5 = ["html5lib"]
 htmlsoup = ["BeautifulSoup4"]
 source = ["Cython (==0.29.37)"]

+[[package]]
+name = "markdownify"
+version = "0.12.1"
+description = "Convert HTML to markdown."
+optional = false
+python-versions = "*"
+files = [
+    {file = "markdownify-0.12.1-py3-none-any.whl", hash = "sha256:a3805abd8166dbb7b27783c5599d91f54f10d79894b2621404d85b333c7ce561"},
+    {file = "markdownify-0.12.1.tar.gz", hash = "sha256:1fb08c618b30e0ee7a31a39b998f44a18fb28ab254f55f4af06b6d35a2179e27"},
+]
+
+[package.dependencies]
+beautifulsoup4 = ">=4.9,<5"
+six = ">=1.15,<2"
+
 [[package]]
 name = "markupsafe"
 version = "2.1.5"
@@ -1673,4 +1688,4 @@ watchdog = ["watchdog (>=2.3)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "1a20eeb21e0dad90c4116b164c8d7a796e53b2bfad916ed494970ee84ee2de52"
+content-hash = "80dda8c54105faacac42a5eb722aa0e985bf42443bf2b8d32d2bda90e4fb5756"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ python-multipart = "^0.0.9"
 reader = "^3.12"
 tomlkit = "^0.12.0"
 uvicorn = { extras = ["standard"], version = "^0.29.0" }
+markdownify = "^0.12.1"

 [tool.poetry.group.dev.dependencies]
 djlint = "^1.34.1"
--- a/tests/test_markdown.py
+++ b/tests/test_markdown.py
@@ -1,69 +0,0 @@
-from discord_rss_bot.markdown import convert_html_to_md
-
-
-def test_convert_to_md() -> None:
-    # Test bold
-    assert convert_html_to_md("<b>bold</b>") == "**bold**"
-
-    # Test italic
-    assert convert_html_to_md("<i>italic</i>") == "*italic*"
-
-    # Test blockquote
-    assert convert_html_to_md("<blockquote>blockquote</blockquote>") == ">>> blockquote"
-
-    # Test code
-    assert convert_html_to_md("<code>code</code>") == "`code`"
-
-    # Test strikethrough
-    assert convert_html_to_md("<s>strikethrough</s>") == "~~strikethrough~~"
-
-    # Test link
-    assert convert_html_to_md('<a href="https://example.com">link</a>') == "[link](https://example.com)"
-
-    # Test pre code
-    assert convert_html_to_md("<pre><code>pre code</code></pre>") == "``pre code``"
-
-    # Test strong
-    assert convert_html_to_md("<strong>strong</strong>") == "**strong**"
-
-    # Test multiple tags
-    assert (
-        convert_html_to_md(
-            '<b>bold</b> <i>italic</i> <a href="https://example.com">link</a> <code>code</code> <s>strikethrough</s>',
-        )
-        == "**bold** *italic* [link](https://example.com) `code` ~~strikethrough~~"
-    )
-
-    # Test removing all other tags
-    assert convert_html_to_md("<p>paragraph</p>") == "paragraph"
-    assert convert_html_to_md("<p>paragraph</p><p>paragraph</p>") == "paragraph\nparagraph"
-
-    # Test <br> tags
-    assert convert_html_to_md("<p>paragraph<br>paragraph</p>") == "paragraph\nparagraph"
-
-    # Test removing trailing newline
-    assert convert_html_to_md("paragraph ") == "paragraph"
-
-    # Test removing leading and trailing whitespace
-    assert convert_html_to_md(" paragraph ") == "paragraph"
-
-    # Test removing leading and trailing whitespace and trailing newline
-    assert convert_html_to_md(" paragraph\n \n") == "paragraph"
-
-    # Test real entry
-    nvidia_entry: str = (
-        '<p><a href="https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/">'
-        "NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements</a></p>"
-        '<div class="field field-name-field-short-description field-type-text-long field-label-hidden">'
-        '<div class="field-items"><div class="field-item even">Plus new options to mirror your camera and take a selfie.</div>'  # noqa: E501
-        '</div></div><div class="field field-name-field-thumbnail-image field-type-image field-label-hidden">'
-        '<div class="field-items"><div class="field-item even"><a href="https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/">'
-        '<img width="210" src="https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/news/jan-2023-nvidia-broadcast-update/broadcast-owned-asset-625x330-newsfeed.png"'
-        ' title="NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements" '
-        'alt="NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements"></a></div></div></div>'  # noqa: E501
-    )
-    assert (
-        convert_html_to_md(nvidia_entry)
-        == "[NVIDIA Broadcast 1.4 Adds Eye Contact and Vignette Effects With Virtual Background Enhancements](https://www.nvidia.com/en-us/geforce/news/jan-2023-nvidia-broadcast-update/)\n"
-        "Plus new options to mirror your camera and take a selfie."
-    )