webweav.ing/rss/rss.go

131 lines
3.4 KiB
Go

package rss
import (
	"encoding/xml"
	"errors"
	"fmt"
	"io"
	"net/http"
	"regexp"
	"strings"
	"time"

	"golang.org/x/net/html"
)
// pageClient is the HTTP client used by fetchPage. Its timeout bounds the
// whole exchange, including reading the body, so an unresponsive server
// cannot block feed generation indefinitely.
var pageClient = &http.Client{Timeout: 30 * time.Second}

// fetchPage downloads url with an HTTP GET and returns the response body as a
// string.
//
// It returns an error if the request cannot be sent, if the server answers
// with a non-2xx status, or if the body cannot be read in full. On error the
// returned string is empty.
func fetchPage(url string) (string, error) {
	resp, err := pageClient.Get(url)
	if err != nil {
		return "", fmt.Errorf("Error sending Get request: %w", err)
	}
	defer resp.Body.Close()

	// An error page is not an article; report it instead of handing its
	// markup to the caller as if it were the requested content.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("Error fetching page: unexpected status %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		// A failed read may leave a truncated document; do not return it.
		return "", fmt.Errorf("Error reading response body: %w", err)
	}
	return string(body), nil
}
// parseTime interprets timestr against a fixed list of layouts from the time
// package, tried in the order listed, and returns a pointer to the first
// value that parses successfully.
//
// It returns a nil time and an error when no layout matches.
func parseTime(timestr string) (*time.Time, error) {
	layouts := [...]string{
		time.ANSIC,
		time.UnixDate,
		time.RubyDate,
		time.RFC822,
		time.RFC822Z,
		time.RFC850,
		time.RFC1123,
		time.RFC1123Z,
		time.RFC3339,
		time.RFC3339Nano,
		time.DateTime,
		time.DateOnly,
	}

	for i := 0; i < len(layouts); i++ {
		parsed, parseErr := time.Parse(layouts[i], timestr)
		if parseErr != nil {
			// This layout does not fit; move on to the next candidate.
			continue
		}
		return &parsed, nil
	}

	return nil, fmt.Errorf("Error parsing time: invalid format")
}
// parseArticle extracts the first <article> element, in document order, from
// an HTML document.
//
// It returns the article's inner HTML (the rendered children of the element,
// without the enclosing <article> tags) and, when the article contains a
// <time> element carrying a datetime attribute, the time parsed from that
// attribute. The time is nil when no such attribute is present.
//
// An error is returned if the HTML cannot be parsed, if the document has no
// <article> element, if the article cannot be rendered, or if a datetime
// attribute is present but cannot be parsed. In that last case the rendered
// article is still returned alongside the error, with a nil time.
func parseArticle(content string) (string, *time.Time, error) {
	doc, err := html.Parse(strings.NewReader(content))
	if err != nil {
		return "", nil, fmt.Errorf("Error parsing HTML: %w", err)
	}

	// find returns the first element named tag at or below n, or nil when
	// there is none. Returning the node (instead of assigning to an outer
	// variable) stops the walk at the first match.
	var find func(n *html.Node, tag string) *html.Node
	find = func(n *html.Node, tag string) *html.Node {
		if n.Type == html.ElementNode && n.Data == tag {
			return n
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			if found := find(c, tag); found != nil {
				return found
			}
		}
		return nil
	}

	articleNode := find(doc, "article")
	if articleNode == nil {
		return "", nil, fmt.Errorf("Error parsing HTML: no article element found")
	}

	// Render the children rather than the element itself: stripping a literal
	// "<article>" prefix would miss any opening tag that carries attributes.
	var builder strings.Builder
	for c := articleNode.FirstChild; c != nil; c = c.NextSibling {
		if err := html.Render(&builder, c); err != nil {
			return "", nil, fmt.Errorf("Error rendering article: %w", err)
		}
	}
	article := builder.String()

	timeNode := find(articleNode, "time")
	if timeNode == nil {
		return article, nil, nil
	}
	for _, attr := range timeNode.Attr {
		if attr.Key != "datetime" {
			continue
		}
		pagetime, err := parseTime(attr.Val)
		if err != nil {
			// The article itself is fine; let the caller decide how to date it.
			return article, nil, fmt.Errorf("Error parsing article time %q: %w", attr.Val, err)
		}
		return article, pagetime, nil
	}
	return article, nil, nil
}
// GenerateRss builds an RSS 2.0 feed document for a site.
//
// siteUrl, siteTitle and siteDesc fill the channel's <link>, <title> and
// <description>. Each URL in pageUrls is downloaded with fetchPage and handed
// to parseArticle; the result becomes one <item> whose <link> and <guid> are
// the page URL and whose <description> is the article HTML. The <pubDate> is
// the time reported by parseArticle, or the current time when none is
// available. Every inserted value is XML-escaped so the output stays
// well-formed.
//
// Generation is best-effort: a page that cannot be fetched, or that yields no
// article, is left out of the feed. The feed string is always returned. The
// error is nil when every page produced an item; otherwise it joins one error
// per skipped page, so the caller can report them while still using the feed.
func GenerateRss(siteUrl, siteTitle, siteDesc string, pageUrls ...string) (string, error) {
	const itemfmt = ` <item>
<title>Content Title</title>
<link>%s</link>
<guid>%s</guid>
<pubDate>%s</pubDate>
<description>%s</description>
</item>
`
	const feed = `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
<title>%s</title>
<link>%s</link>
<description>%s</description>
%s
</channel>
</rss>
`

	// escape returns s with XML-significant characters replaced by entities.
	escape := func(s string) string {
		var b strings.Builder
		// Writes to a strings.Builder never fail, so EscapeText cannot
		// return an error here.
		_ = xml.EscapeText(&b, []byte(s))
		return b.String()
	}

	var items strings.Builder
	var errs []error
	for _, u := range pageUrls {
		page, err := fetchPage(u)
		if err != nil {
			errs = append(errs, fmt.Errorf("Error processing page %q: %w", u, err))
			continue
		}

		article, atime, err := parseArticle(page)
		if err != nil && article == "" {
			errs = append(errs, fmt.Errorf("Error processing page %q: %w", u, err))
			continue
		}

		// Fall back to the generation time when the article carries no
		// usable timestamp.
		pubDate := time.Now()
		if atime != nil {
			pubDate = *atime
		}

		link := escape(u)
		// RFC1123Z uses a numeric zone offset, which is always a valid
		// RFC 822 zone; named abbreviations frequently are not.
		fmt.Fprintf(&items, itemfmt, link, link, pubDate.Format(time.RFC1123Z), escape(article))
	}

	out := fmt.Sprintf(feed, escape(siteTitle), escape(siteUrl), escape(siteDesc), items.String())
	return out, errors.Join(errs...)
}