HTML parser for RSS
This commit is contained in:
parent
07fdf16686
commit
77499c700d
24
src/rss.py
24
src/rss.py
@@ -2,6 +2,8 @@ from microblog import My_Html_Parser
|
||||
# from html.parser import HTMLParser
|
||||
from html import escape
|
||||
import sys, traceback, dateutil.parser
|
||||
from urllib.parse import urljoin
|
||||
|
||||
try:
|
||||
import feedgenerator
|
||||
except ImportError:
|
||||
@@ -46,20 +48,6 @@ def enrich_msg(
|
||||
trailing_punctuation="",
|
||||
desc_len_limit=-1,
|
||||
):
|
||||
if not is_atom:
|
||||
from urllib.parse import urljoin
|
||||
words = line2words(lines, desc_len_limit, trailing_punctuation)
|
||||
for i in range(len(words)):
|
||||
token = words[i]
|
||||
core = token.rstrip(trailing_punctuation)
|
||||
suffix = token[len(core):]
|
||||
if len(core) == 0 or "<" in core or ">" in core:
|
||||
continue
|
||||
if _is_image_token(core, accepted_images):
|
||||
abs_url = urljoin(base_url, core)
|
||||
anchor = f"<a href=\"{escape(abs_url)}\">{escape(abs_url)}</a>"
|
||||
words[i] = anchor + suffix
|
||||
return words
|
||||
content = []
|
||||
parser = My_Html_Parser([])
|
||||
for line in lines:
|
||||
@@ -76,6 +64,14 @@ def enrich_msg(
|
||||
w = escape(word)
|
||||
new_word = ("<a href=\"%s\">%s</a>") % (w, w)
|
||||
words[i] = new_word
|
||||
elif not is_atom:
|
||||
core = word.rstrip(trailing_punctuation)
|
||||
if _is_image_token(core, accepted_images):
|
||||
suffix = word[len(core):]
|
||||
abs_url = urljoin(base_url, core)
|
||||
anchor = f"<a href=\"{escape(abs_url)}\">{escape(abs_url)}</a>"
|
||||
new_word = anchor + suffix
|
||||
words[i] = new_word
|
||||
words.insert(0,"<p>")
|
||||
words.append("</p>")
|
||||
content.append(" ".join(words))
|
||||
|
Loading…
x
Reference in New Issue
Block a user