From af589847e72ee3b8c706bc59d7db10d1b08f151d Mon Sep 17 00:00:00 2001
From: likho <licho@firemail.cc>
Date: Sat, 6 Jan 2024 23:30:03 -0800
Subject: [PATCH] squash merge detect-html

---
 README.md    | 24 +++++++++++---
 microblog.py | 89 +++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 83 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index f917af9..dc430c7 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Simple and stylish text-to-html microblog generator.
 * `make` (optional), method for invoking the script. 
 * `urllib` (optional), for uploading multiple files to neocities (`neouploader.py`).
 
-### Usage
+## Usage
 
 The following generates a sample page `result.html`.
 
@@ -30,10 +30,6 @@ This script generate a text file after operation.
 
 * `updatedfiles.txt`, a list of files updated by the script for use in automated uploads.
 
-## Configuration
-
-Settings are read from `settings.toml`. See `example/settings.toml`.
-
 ### Writing Content
 
 See `example/demo.txt`.
@@ -56,6 +52,24 @@ The content file is a plain text file of posts. Each post has two types of infor
 * the two last lines of the file must be empty
 * html can be placed in the message for embedded videos and rich text
 
+## Configuration
+
+Settings are read from `settings.toml`. See `example/settings.toml`.
+
+Configuration options as understood by the script are tentative and may change in the future.
+
+### A key may be missing from your settings file (KeyError)
+
+>I'm getting KeyError when I run the program
+
+>This script is throwing KeyError after I ran git pull
+
+In most cases, this means I added new configuration options. You can resolve this error by copying and pasting the missing keys from `example/settings.toml` to `settings.toml`.
+
+The following command shows differences between the files.
+
+    diff settings.toml example/settings.toml
+
 ## Anything else
 
 This is a script I wrote for personal use. The output can be seen on [https://likho.neocities.org/microblog/index.html](https://likho.neocities.org/microblog/index.html). I figure someone else may want to use it for their own personal websites, so it is published. 
diff --git a/microblog.py b/microblog.py
index 9b4c6dc..3b51bd7 100644
--- a/microblog.py
+++ b/microblog.py
@@ -54,6 +54,32 @@ def make_gallery(indices, w, conf=None):
     return tag
 
 # apply basic HTML formatting - only div class here is gallery
+from html.parser import HTMLParser
+class My_Html_Parser(HTMLParser):
+    """Track unclosed tags in `stack`; `completed_by` is the markup that last closed a top-level element."""
+    # void elements never get an end tag, so they must not wait on the stack
+    VOID_TAGS   = ("area", "base", "br", "col", "embed", "hr", "img", "input",
+                   "link", "meta", "source", "track", "wbr")
+    INLINE_TAGS = ("i", "em", "b", "strong", "u", "s", "a", "span") # never set completed_by
+
+    def __init__(self):
+        super().__init__()
+        self.stack        = []
+        self.completed_by = ""
+
+    def handle_starttag(self, tag, attrs):
+        if tag not in self.VOID_TAGS:
+            self.stack.append(tag)
+        elif not self.stack: # a top-level void tag completes itself
+            self.completed_by = "<%s" % tag
+
+    def handle_endtag(self, tag):
+        # remove the innermost open `tag`; a stray end tag is a no-op
+        for i in reversed(range(len(self.stack))):
+            if self.stack[i] == tag:
+                del self.stack[i]
+                if not self.stack and tag not in self.INLINE_TAGS:
+                    self.completed_by = "</%s>" % tag
+                break
 from html import escape
 def markup(message, config):
     def is_image(s, image_formats):
@@ -71,28 +97,18 @@ def markup(message, config):
             return True
         return False
 
-    result = 0
-    tagged = ""
-    # support multiple images (gallery style)
-    tags   = [] # list of strings
-    output = []
-    gallery = []
-    ptags = config["tag_paragraphs"]
-    sep = ""
-    if "line_separator" in config:
-        sep = config["line_separator"]
-    for line in message:
-        images = [] # list of integers
-        words  = line.split()
-        for i in range(len(words)):
-            word  = words[i]
+    def automarkup(list_of_words):
+        images = []
+        tags   = []
+        for i in range(len(list_of_words)):
+            word  = list_of_words[i]
             # don't help people click http
             if word.find("src=") == 0 or word.find("href=") == 0:
                 continue
             elif word.find("https://") != -1: 
                 w = escape(word)
                 new_word = ("<a href=\"%s\">%s</a>") % (w, w)
-                words[i] = new_word
+                list_of_words[i] = new_word
             elif word.find("#") != -1 and len(word) > 1:
                 # split by unicode blank character if present
                 # allows tagging such as #fanfic|tion
@@ -102,17 +118,40 @@ def markup(message, config):
                 new_word = "<span class=\"hashtag\">%s</span>" % (w[0])
                 if len(w) > 1:
                     new_word += w[1]
-                words[i] = new_word
+                list_of_words[i] = new_word
             elif is_image(word, config["accepted_images"]):
                 images.append(i)
-        if len(images) > 0: 
-            # function invokes pop() which modifies list 'words'
-            gc = config["gallery"] if "gallery" in config else None
-            gallery = make_gallery(images, words, gc)
-        if ptags and len(words) > 0:
-            words.insert(0,"<p>")
-            words.append("</p>")
-        output.append(" ".join(words))
+        return list_of_words, images, tags
+
+    tags   = [] # list of strings
+    output = []
+    gallery = []
+    ptags = config["tag_paragraphs"]
+    sep = ""
+    parser = My_Html_Parser()
+    if "line_separator" in config:
+        sep = config["line_separator"]
+    for line in message:
+        images = [] # list of integers
+        parser.feed(line)
+        if parser.stack == [] \
+        and (parser.completed_by == "" or parser.completed_by not in line):
+            words, images, t = automarkup(line.split())
+            tags += t
+            if len(images) > 0: 
+                # function invokes pop() which modifies list 'words'
+                gc = config["gallery"] if "gallery" in config else None
+                gallery = make_gallery(images, words, gc)
+            elif ptags and len(words) > 0:
+                words.insert(0,"<p>")
+                words.append("</p>")
+            output.append(" ".join(words))
+        elif "pre" in parser.stack \
+        and ("<pre>" not in line \
+        and "<code>" not in line and "</code>" not in line):
+            output.append(escape(line))
+        else: # <pre> is in the parser.stack
+            output.append(line.strip())
         # avoid paragraph with an image gallery
         if len(gallery) > 0:
             output.append("".join(gallery))