HTML parser for RSS

2025-09-06 08:02:25 +02:00 · 2025-09-06 08:02:25 +02:00 · 77499c700d
commit 77499c700d
parent 07fdf16686
1 changed file with 10 additions and 14 deletions
--- a/src/rss.py
+++ b/src/rss.py
@@ -2,6 +2,8 @@ from microblog import My_Html_Parser
 # from html.parser import HTMLParser
 from html import escape
 import sys, traceback, dateutil.parser
+from urllib.parse import urljoin
+
 try:
    import feedgenerator
 except ImportError:
@@ -46,20 +48,6 @@ def enrich_msg(
    trailing_punctuation="",
    desc_len_limit=-1,
 ):
-    if not is_atom:
-        from urllib.parse import urljoin
-        words = line2words(lines, desc_len_limit, trailing_punctuation)
-        for i in range(len(words)):
-            token = words[i]
-            core = token.rstrip(trailing_punctuation)
-            suffix = token[len(core):]
-            if len(core) == 0 or "<" in core or ">" in core:
-                continue
-            if _is_image_token(core, accepted_images):
-                abs_url = urljoin(base_url, core)
-                anchor = f"<a href=\"{escape(abs_url)}\">{escape(abs_url)}</a>"
-                words[i] = anchor + suffix
-        return words
    content = []
    parser = My_Html_Parser([])
    for line in lines:
@@ -76,6 +64,14 @@ def enrich_msg(
                    w = escape(word)
                    new_word = ("<a href=\"%s\">%s</a>") % (w, w)
                    words[i] = new_word
+                elif not is_atom:
+                    core = word.rstrip(trailing_punctuation)
+                    if _is_image_token(core, accepted_images):
+                        suffix = word[len(core):]
+                        abs_url = urljoin(base_url, core)
+                        anchor = f"<a href=\"{escape(abs_url)}\">{escape(abs_url)}</a>"
+                        new_word = anchor + suffix
+                        words[i] = new_word
            words.insert(0,"<p>")
            words.append("</p>")
            content.append(" ".join(words))