Create our own HTML to Markdown converter

This commit is contained in:
2023-01-21 22:48:00 +01:00
parent 0bcc551a3f
commit d4fd70965e
7 changed files with 203 additions and 78 deletions

View File

@ -1,7 +1,6 @@
import urllib.parse
from functools import lru_cache
import html2text
from reader import Entry, Reader
from discord_rss_bot.filter.blacklist import has_black_tags, should_be_skipped
@ -54,14 +53,3 @@ def entry_is_blacklisted(entry_to_check: Entry) -> bool:
"""
return bool(has_black_tags(reader, entry_to_check.feed) and should_be_skipped(reader, entry_to_check))
@lru_cache()
def convert_to_md(thing: str) -> str:
    """Convert HTML to Markdown using html2text with tables ignored.

    Discord does not support tables, so the converter is told to ignore them.

    Args:
        thing: The HTML to convert.

    Returns:
        The text returned by html2text, or an empty string when thing is empty.
    """
    converter: html2text.HTML2Text = html2text.HTML2Text()
    # Ignore tables
    converter.ignore_tables = True

    if not thing:
        return ""
    return converter.handle(thing)

View File

@ -1,32 +1,43 @@
import re
from functools import lru_cache
from bs4 import BeautifulSoup
from reader import Entry, Feed, Reader, TagNotFoundError
from discord_rss_bot.custom_filters import convert_to_md
from discord_rss_bot.markdown import convert_html_to_md
from discord_rss_bot.settings import get_reader
def get_images_from_entry(entry: Entry, summary: bool = False) -> list[str]:
def get_images_from_entry(entry: Entry):
"""Get images from a entry.
Args:
entry: The entry to get the images from.
summary: Whether to get the images from the summary or the content.
Returns:
Returns a list of images.
"""
# This regex will match any markdown image that follows the format of ![alt text](image url).
image_regex = r"!\[(.*)\]\((.*)\)"
if summary:
return re.findall(image_regex, convert_to_md(entry.summary)) if entry.summary else []
def return_image(found_images):
soup: BeautifulSoup = BeautifulSoup(found_images, "html.parser")
images = soup.find_all("img")
for image in images:
image_src = image["src"] or ""
image_alt: str = "Link to image"
if image.get("alt"):
image_alt = image.get("alt")
return [(image_src, image_alt)]
return re.findall(image_regex, convert_to_md(entry.content[0].value)) if entry.content else []
images = []
# Get the images from the summary with beautiful soup
if entry.summary:
images = return_image(entry.summary)
# Get the images from the content with beautiful soup
if entry.content:
images = return_image(entry.content[0].value)
# No images found
return images
@lru_cache()
def try_to_replace(custom_message: str, template: str, replace_with: str) -> str:
"""Try to replace a tag in custom_message.
@ -45,19 +56,6 @@ def try_to_replace(custom_message: str, template: str, replace_with: str) -> str
return custom_message
@lru_cache()
def remove_image_tags(message: str) -> str:
    """Remove Markdown image tags from message.

    An image tag has the form ![alt text](image url).

    Args:
        message: The message to remove the tags from.

    Returns:
        Returns the message with the image tags removed.
    """
    # Lazy quantifiers keep each match to a single tag. A greedy ".*" would run
    # from the first "![" to the last ")" on the line and delete any text
    # sitting between two images.
    return re.sub(r"!\[.*?\]\(.*?\)", "", message)
def replace_tags(feed: Feed, entry: Entry) -> str:
"""Replace tags in custom_message.
@ -75,17 +73,15 @@ def replace_tags(feed: Feed, entry: Entry) -> str:
content = ""
if entry.summary:
summary: str = entry.summary
summary = convert_to_md(summary)
summary = remove_image_tags(message=summary)
summary = convert_html_to_md(summary)
if entry.content:
for content_item in entry.content:
content: str = content_item.value
content = convert_to_md(content)
content = remove_image_tags(message=content)
content = convert_html_to_md(content)
if images := get_images_from_entry(entry=entry):
first_image: str = images[0][1]
first_image: str = images[0][0]
else:
first_image = ""
@ -123,10 +119,7 @@ def replace_tags(feed: Feed, entry: Entry) -> str:
for template, replace_with in replacement.items():
custom_message = try_to_replace(custom_message, template, replace_with)
# Replace \\n with newlines.
custom_message_with_newlines = custom_message.replace("\\n", "\n")
return custom_message_with_newlines
return custom_message.replace("\\n", "\n")
def get_custom_message(custom_reader: Reader, feed: Feed) -> str:

View File

@ -12,11 +12,12 @@ from reader import Entry, EntryCounts, EntrySearchCounts, EntrySearchResult, Fee
from starlette.responses import RedirectResponse
from discord_rss_bot import settings
from discord_rss_bot.custom_filters import convert_to_md, encode_url, entry_is_blacklisted, entry_is_whitelisted
from discord_rss_bot.custom_message import get_custom_message, get_images_from_entry, remove_image_tags
from discord_rss_bot.custom_filters import encode_url, entry_is_blacklisted, entry_is_whitelisted
from discord_rss_bot.custom_message import get_custom_message, get_images_from_entry, replace_tags
from discord_rss_bot.feeds import get_entry_from_id, send_entry_to_discord, send_to_discord
from discord_rss_bot.filter.blacklist import get_blacklist_content, get_blacklist_summary, get_blacklist_title
from discord_rss_bot.filter.whitelist import get_whitelist_content, get_whitelist_summary, get_whitelist_title
from discord_rss_bot.markdown import convert_html_to_md
from discord_rss_bot.search import create_html_for_search_results
from discord_rss_bot.settings import default_custom_message, get_reader, list_webhooks
@ -30,8 +31,7 @@ reader: Reader = get_reader()
templates.env.filters["encode_url"] = encode_url
templates.env.filters["entry_is_whitelisted"] = entry_is_whitelisted
templates.env.filters["entry_is_blacklisted"] = entry_is_blacklisted
templates.env.filters["discord_markdown"] = convert_to_md
templates.env.filters["remove_image_tags"] = remove_image_tags
templates.env.filters["discord_markdown"] = convert_html_to_md
@app.post("/add_webhook")
@ -429,19 +429,13 @@ def create_html_for_feed(entries: Iterable[Entry]) -> str:
first_image = ""
first_image_text = ""
if images := get_images_from_entry(entry=entry):
first_image: str = images[0][1]
first_image_text: str = images[0][0]
first_image: str = images[0][0]
first_image_text: str = images[0][1]
# Get the text from the entry.
text = "<div class='text-muted'>No content available.</div>"
if entry.summary:
summary: str = convert_to_md(entry.summary)
summary = remove_image_tags(message=summary)
text: str = f"<div class='text-muted'>{summary}</div>"
elif entry.content:
content: str = convert_to_md(entry.content[0].value)
content = remove_image_tags(message=content)
text = f"<div class='text-muted'>{content}</div>"
text = replace_tags(entry.feed, entry)
if not text:
text = "<div class='text-muted'>No content available.</div>"
published = ""
if entry.published:

View File

@ -0,0 +1,68 @@
from functools import lru_cache
from bs4 import BeautifulSoup
@lru_cache(maxsize=2048)
def convert_html_to_md(html: str) -> str:
    """Convert HTML to Markdown.

    Discord supports:
    - Bold with **text**
    - Italic with *text*
    - Blockquote with >>> text
    - Code with `text`
    - Fence code with ```text```
    - Links with [text](url)
    - Syntax highlighting with ```language
    - Strikethrough with ~~text~~

    Bold, italic, blockquote, code, link, strikethrough and <br> tags are
    replaced by a Markdown string built from the tag's text. Every tag that is
    left afterwards is replaced by its plain text.

    Args:
        html: The HTML to convert.

    Returns:
        The text of the document after all replacements.
    """
    soup: BeautifulSoup = BeautifulSoup(html, features="lxml")

    # Bold
    for bold in soup.find_all("b") + soup.find_all("strong"):
        bold.replace_with(f"**{bold.text}**")

    # Italic
    for italic in soup.find_all("i") + soup.find_all("em"):
        italic.replace_with(f"*{italic.text}*")

    # Blockquote
    for blockquote in soup.find_all("blockquote") + soup.find_all("q"):
        blockquote.replace_with(f">>> {blockquote.text}")

    # Code
    for code in soup.find_all("code") + soup.find_all("pre"):
        code.replace_with(f"`{code.text}`")

    # Links
    for link in soup.find_all("a") + soup.find_all("link"):
        href = link.get("href")
        if href:
            # Fall back to the URL itself when the link has no visible text.
            link.replace_with(f"[{link.text or href}]({href})")
        else:
            # Without a target there is nothing to link to. Keep the visible
            # text instead of emitting a broken "[text](None)".
            link.replace_with(link.text)

    # Strikethrough
    for strikethrough in soup.find_all("s") + soup.find_all("del") + soup.find_all("strike"):
        strikethrough.replace_with(f"~~{strikethrough.text}~~")

    # <br> tags
    for br in soup.find_all("br"):
        br.replace_with("\n")

    # Remove all other tags
    for tag in soup.find_all(True):
        tag.replace_with(tag.text)

    return soup.text
# Manual check: convert a small sample document and print the result when this
# module is run as a script.
if __name__ == "__main__":
    sample_html: str = """
<p><b>bold</b> <i>italic</i> <a href="https://example.com">link</a> <code>code</code> <s>strikethrough</s></p>
<blockquote>blockquote</blockquote>
<pre><code>pre code</code></pre>
<strong>strong</strong>
"""
    print(convert_html_to_md(sample_html))

View File

@ -119,7 +119,7 @@
{% raw %}
{{entry_content}}
{% endraw %}
</code>{{entry.content[0].value|discord_markdown|remove_image_tags}}
</code>{{entry.content[0].value|discord_markdown}}
</li>
<li>
<code>
@ -175,7 +175,7 @@
{% raw %}
{{entry_summary}}
{% endraw %}
</code>{{entry.summary|discord_markdown|remove_image_tags}}
</code>{{entry.summary|discord_markdown}}
</li>
<li>
<code>
@ -222,7 +222,7 @@
<code>
<pre>
{{feed.title -}}
{{- entry.content[0].value|discord_markdown|remove_image_tags -}}
{{- entry.content[0].value|discord_markdown -}}
</pre>
</code>
</li>