HTML parser for RSS

This commit is contained in:
Eloi Torrents 2025-09-06 08:02:25 +02:00
parent 07fdf16686
commit 77499c700d

View File

@@ -2,6 +2,8 @@ from microblog import My_Html_Parser
# from html.parser import HTMLParser
from html import escape
import sys, traceback, dateutil.parser
from urllib.parse import urljoin
try:
import feedgenerator
except ImportError:
@@ -46,20 +48,6 @@ def enrich_msg(
trailing_punctuation="",
desc_len_limit=-1,
):
if not is_atom:
from urllib.parse import urljoin
words = line2words(lines, desc_len_limit, trailing_punctuation)
for i in range(len(words)):
token = words[i]
core = token.rstrip(trailing_punctuation)
suffix = token[len(core):]
if len(core) == 0 or "<" in core or ">" in core:
continue
if _is_image_token(core, accepted_images):
abs_url = urljoin(base_url, core)
anchor = f"<a href=\"{escape(abs_url)}\">{escape(abs_url)}</a>"
words[i] = anchor + suffix
return words
content = []
parser = My_Html_Parser([])
for line in lines:
@@ -76,6 +64,14 @@ def enrich_msg(
w = escape(word)
new_word = ("<a href=\"%s\">%s</a>") % (w, w)
words[i] = new_word
elif not is_atom:
core = word.rstrip(trailing_punctuation)
if _is_image_token(core, accepted_images):
suffix = word[len(core):]
abs_url = urljoin(base_url, core)
anchor = f"<a href=\"{escape(abs_url)}\">{escape(abs_url)}</a>"
new_word = anchor + suffix
words[i] = new_word
words.insert(0,"<p>")
words.append("</p>")
content.append(" ".join(words))