Create our own HTML to Markdown converter
This commit is contained in:
@ -1,7 +1,6 @@
|
||||
import urllib.parse
|
||||
from functools import lru_cache
|
||||
|
||||
import html2text
|
||||
from reader import Entry, Reader
|
||||
|
||||
from discord_rss_bot.filter.blacklist import has_black_tags, should_be_skipped
|
||||
@ -54,14 +53,3 @@ def entry_is_blacklisted(entry_to_check: Entry) -> bool:
|
||||
|
||||
"""
|
||||
return bool(has_black_tags(reader, entry_to_check.feed) and should_be_skipped(reader, entry_to_check))
|
||||
|
||||
|
||||
@lru_cache()
def convert_to_md(thing: str) -> str:
    """Convert an HTML string to Markdown without tables.

    Discord does not support tables, so the html2text converter is
    configured with ``ignore_tables`` before the text is handled.

    Args:
        thing: The HTML to convert.

    Returns:
        The converted text, or an empty string when ``thing`` is empty.
    """
    # Nothing to convert.
    if not thing:
        return ""

    converter: html2text.HTML2Text = html2text.HTML2Text()
    converter.ignore_tables = True
    return converter.handle(thing)
|
||||
|
@ -1,32 +1,43 @@
|
||||
import re
|
||||
from functools import lru_cache
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from reader import Entry, Feed, Reader, TagNotFoundError
|
||||
|
||||
from discord_rss_bot.custom_filters import convert_to_md
|
||||
from discord_rss_bot.markdown import convert_html_to_md
|
||||
from discord_rss_bot.settings import get_reader
|
||||
|
||||
|
||||
def get_images_from_entry(entry: Entry) -> list[tuple[str, str]]:
    """Get images from a entry.

    The HTML of the entry's first content item and of its summary is
    parsed with Beautiful Soup and every ``<img>`` that has a ``src`` is
    collected.

    Args:
        entry: The entry to get the images from.

    Returns:
        A list of ``(src, alt)`` tuples. Images found in the content come
        first, followed by images found in the summary. The list is empty
        when no images are found. ``alt`` falls back to "Link to image"
        when the tag has no alt text.
    """

    def extract_images(markup: str) -> list[tuple[str, str]]:
        """Return ``(src, alt)`` for every <img> in markup that has a src."""
        soup: BeautifulSoup = BeautifulSoup(markup, "html.parser")
        found: list[tuple[str, str]] = []
        for image in soup.find_all("img"):
            # Use .get() so an <img> without a src is skipped instead of
            # raising KeyError.
            image_src = image.get("src")
            if not image_src:
                continue
            image_alt: str = image.get("alt") or "Link to image"
            found.append((image_src, image_alt))
        return found

    images: list[tuple[str, str]] = []

    # Content images go first so images[0] keeps preferring the content
    # image over the summary image when both exist.
    if entry.content:
        images.extend(extract_images(entry.content[0].value))

    # Summary images are appended rather than overwritten, so a content
    # body without images no longer discards images from the summary.
    if entry.summary:
        images.extend(extract_images(entry.summary))

    return images
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def try_to_replace(custom_message: str, template: str, replace_with: str) -> str:
|
||||
"""Try to replace a tag in custom_message.
|
||||
|
||||
@ -45,19 +56,6 @@ def try_to_replace(custom_message: str, template: str, replace_with: str) -> str
|
||||
return custom_message
|
||||
|
||||
|
||||
@lru_cache()
def remove_image_tags(message: str) -> str:
    """Remove Markdown image tags (``![alt](url)``) from message.

    Args:
        message: The message to remove the tags from.

    Returns:
        Returns the message with the image tags removed.
    """
    # The quantifiers are non-greedy so every image is matched on its own.
    # A greedy ``.*`` would span from the first "![" to the last ")" on the
    # line and delete the text between two images as well.
    return re.sub(r"!\[.*?\]\(.*?\)", "", message)
|
||||
|
||||
|
||||
def replace_tags(feed: Feed, entry: Entry) -> str:
|
||||
"""Replace tags in custom_message.
|
||||
|
||||
@ -75,17 +73,15 @@ def replace_tags(feed: Feed, entry: Entry) -> str:
|
||||
content = ""
|
||||
if entry.summary:
|
||||
summary: str = entry.summary
|
||||
summary = convert_to_md(summary)
|
||||
summary = remove_image_tags(message=summary)
|
||||
summary = convert_html_to_md(summary)
|
||||
|
||||
if entry.content:
|
||||
for content_item in entry.content:
|
||||
content: str = content_item.value
|
||||
content = convert_to_md(content)
|
||||
content = remove_image_tags(message=content)
|
||||
content = convert_html_to_md(content)
|
||||
|
||||
if images := get_images_from_entry(entry=entry):
|
||||
first_image: str = images[0][1]
|
||||
first_image: str = images[0][0]
|
||||
else:
|
||||
first_image = ""
|
||||
|
||||
@ -123,10 +119,7 @@ def replace_tags(feed: Feed, entry: Entry) -> str:
|
||||
for template, replace_with in replacement.items():
|
||||
custom_message = try_to_replace(custom_message, template, replace_with)
|
||||
|
||||
# Replace \\n with newlines.
|
||||
custom_message_with_newlines = custom_message.replace("\\n", "\n")
|
||||
|
||||
return custom_message_with_newlines
|
||||
return custom_message.replace("\\n", "\n")
|
||||
|
||||
|
||||
def get_custom_message(custom_reader: Reader, feed: Feed) -> str:
|
||||
|
@ -12,11 +12,12 @@ from reader import Entry, EntryCounts, EntrySearchCounts, EntrySearchResult, Fee
|
||||
from starlette.responses import RedirectResponse
|
||||
|
||||
from discord_rss_bot import settings
|
||||
from discord_rss_bot.custom_filters import convert_to_md, encode_url, entry_is_blacklisted, entry_is_whitelisted
|
||||
from discord_rss_bot.custom_message import get_custom_message, get_images_from_entry, remove_image_tags
|
||||
from discord_rss_bot.custom_filters import encode_url, entry_is_blacklisted, entry_is_whitelisted
|
||||
from discord_rss_bot.custom_message import get_custom_message, get_images_from_entry, replace_tags
|
||||
from discord_rss_bot.feeds import get_entry_from_id, send_entry_to_discord, send_to_discord
|
||||
from discord_rss_bot.filter.blacklist import get_blacklist_content, get_blacklist_summary, get_blacklist_title
|
||||
from discord_rss_bot.filter.whitelist import get_whitelist_content, get_whitelist_summary, get_whitelist_title
|
||||
from discord_rss_bot.markdown import convert_html_to_md
|
||||
from discord_rss_bot.search import create_html_for_search_results
|
||||
from discord_rss_bot.settings import default_custom_message, get_reader, list_webhooks
|
||||
|
||||
@ -30,8 +31,7 @@ reader: Reader = get_reader()
|
||||
templates.env.filters["encode_url"] = encode_url
|
||||
templates.env.filters["entry_is_whitelisted"] = entry_is_whitelisted
|
||||
templates.env.filters["entry_is_blacklisted"] = entry_is_blacklisted
|
||||
templates.env.filters["discord_markdown"] = convert_to_md
|
||||
templates.env.filters["remove_image_tags"] = remove_image_tags
|
||||
templates.env.filters["discord_markdown"] = convert_html_to_md
|
||||
|
||||
|
||||
@app.post("/add_webhook")
|
||||
@ -429,19 +429,13 @@ def create_html_for_feed(entries: Iterable[Entry]) -> str:
|
||||
first_image = ""
|
||||
first_image_text = ""
|
||||
if images := get_images_from_entry(entry=entry):
|
||||
first_image: str = images[0][1]
|
||||
first_image_text: str = images[0][0]
|
||||
first_image: str = images[0][0]
|
||||
first_image_text: str = images[0][1]
|
||||
|
||||
# Get the text from the entry.
|
||||
text = "<div class='text-muted'>No content available.</div>"
|
||||
if entry.summary:
|
||||
summary: str = convert_to_md(entry.summary)
|
||||
summary = remove_image_tags(message=summary)
|
||||
text: str = f"<div class='text-muted'>{summary}</div>"
|
||||
elif entry.content:
|
||||
content: str = convert_to_md(entry.content[0].value)
|
||||
content = remove_image_tags(message=content)
|
||||
text = f"<div class='text-muted'>{content}</div>"
|
||||
text = replace_tags(entry.feed, entry)
|
||||
if not text:
|
||||
text = "<div class='text-muted'>No content available.</div>"
|
||||
|
||||
published = ""
|
||||
if entry.published:
|
||||
|
68
discord_rss_bot/markdown.py
Normal file
68
discord_rss_bot/markdown.py
Normal file
@ -0,0 +1,68 @@
|
||||
from functools import lru_cache
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
def convert_html_to_md(html: str) -> str:
    """Convert HTML to Markdown.

    Discord supports:
    - Bold with **text**
    - Italic with *text*
    - Blockquote with >>> text
    - Code with `text`
    - Fence code with ```text```
    - Links with [text](url)
    - Syntax highlighting with ```language
    - Strikethrough with ~~text~~

    Tags are converted from the innermost outwards, so formatting nested
    inside another formatted tag (for example a link inside bold text) is
    kept. Tags without a Markdown equivalent are replaced by their text.

    Args:
        html: The HTML to convert.

    Returns:
        The Markdown text, or an empty string when ``html`` is empty.
    """
    if not html:
        return ""

    soup: BeautifulSoup = BeautifulSoup(html, features="lxml")

    # Tag name -> (prefix, suffix) placed around the tag's text.
    wrappers: dict[str, tuple[str, str]] = {
        "b": ("**", "**"),
        "strong": ("**", "**"),
        "i": ("*", "*"),
        "em": ("*", "*"),
        "blockquote": (">>> ", ""),
        "q": (">>> ", ""),
        "code": ("`", "`"),
        "pre": ("`", "`"),
        "s": ("~~", "~~"),
        "del": ("~~", "~~"),
        "strike": ("~~", "~~"),
    }

    # find_all() returns tags in document order, where a descendant always
    # comes after its ancestors. Walking the list backwards converts children
    # first, so a parent's text already contains their Markdown.
    for tag in reversed(soup.find_all(True)):
        text: str = tag.text

        if tag.name in ("a", "link"):
            href = tag.get("href")
            # Without a href there is nothing to link to; keep only the text
            # instead of emitting "[text](None)".
            replacement = f"[{text or href}]({href})" if href else text
        elif tag.name == "br":
            replacement = "\n"
        elif tag.name in wrappers:
            prefix, suffix = wrappers[tag.name]
            replacement = f"{prefix}{text}{suffix}"
        else:
            # Any other tag is dropped and only its text is kept.
            replacement = text

        tag.replace_with(replacement)

    return soup.text
|
||||
|
||||
|
||||
# Test the function
|
||||
# Manual smoke test: convert a small HTML sample and print the result.
if __name__ == "__main__":
    sample_html: str = """
<p><b>bold</b> <i>italic</i> <a href="https://example.com">link</a> <code>code</code> <s>strikethrough</s></p>
<blockquote>blockquote</blockquote>
<pre><code>pre code</code></pre>
<strong>strong</strong>
"""
    markdown_text: str = convert_html_to_md(sample_html)
    print(markdown_text)
|
@ -119,7 +119,7 @@
|
||||
{% raw %}
|
||||
{{entry_content}}
|
||||
{% endraw %}
|
||||
</code>{{entry.content[0].value|discord_markdown|remove_image_tags}}
|
||||
</code>{{entry.content[0].value|discord_markdown}}
|
||||
</li>
|
||||
<li>
|
||||
<code>
|
||||
@ -175,7 +175,7 @@
|
||||
{% raw %}
|
||||
{{entry_summary}}
|
||||
{% endraw %}
|
||||
</code>{{entry.summary|discord_markdown|remove_image_tags}}
|
||||
</code>{{entry.summary|discord_markdown}}
|
||||
</li>
|
||||
<li>
|
||||
<code>
|
||||
@ -222,7 +222,7 @@
|
||||
<code>
|
||||
<pre>
|
||||
{{feed.title -}}
|
||||
{{- entry.content[0].value|discord_markdown|remove_image_tags -}}
|
||||
{{- entry.content[0].value|discord_markdown -}}
|
||||
</pre>
|
||||
</code>
|
||||
</li>
|
||||
|
Reference in New Issue
Block a user