HTML parser for RSS

2025-09-06 08:02:25 +02:00 · 2025-09-06 08:02:25 +02:00 · 77499c700d
commit 77499c700d
parent 07fdf16686
1 changed file with 10 additions and 14 deletions
--- a/src/rss.py
+++ b/src/rss.py
@@ -2,6 +2,8 @@ from microblog import My_Html_Parser
 # from html.parser import HTMLParser
 from html import escape
 import sys, traceback, dateutil.parser
 from urllib.parse import urljoin
 try:
    import feedgenerator
 except ImportError:
@@ -46,20 +48,6 @@ def enrich_msg(
    trailing_punctuation="",
    desc_len_limit=-1,
 ):
    if not is_atom:
        from urllib.parse import urljoin
        words = line2words(lines, desc_len_limit, trailing_punctuation)
        for i in range(len(words)):
            token = words[i]
            core = token.rstrip(trailing_punctuation)
            suffix = token[len(core):]
            if len(core) == 0 or "<" in core or ">" in core:
                continue
            if _is_image_token(core, accepted_images):
                abs_url = urljoin(base_url, core)
                anchor = f"<a href=\"{escape(abs_url)}\">{escape(abs_url)}</a>"
                words[i] = anchor + suffix
        return words
    content = []
    parser = My_Html_Parser([])
    for line in lines:
@@ -76,6 +64,14 @@ def enrich_msg(
                    w = escape(word)
                    new_word = ("<a href=\"%s\">%s</a>") % (w, w)
                    words[i] = new_word
                elif not is_atom:
                    core = word.rstrip(trailing_punctuation)
                    if _is_image_token(core, accepted_images):
                        suffix = word[len(core):]
                        abs_url = urljoin(base_url, core)
                        anchor = f"<a href=\"{escape(abs_url)}\">{escape(abs_url)}</a>"
                        new_word = anchor + suffix
                        words[i] = new_word
            words.insert(0,"<p>")
            words.append("</p>")
            content.append(" ".join(words))