// webweav.ing/feed/feed.go
package feed
import (
"errors"
"fmt"
"io"
"net/http"
"path"
"strings"
"time"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// feedfmtopen opens the RSS 2.0 document and channel; the fmt verbs are
// filled with the site name, site URL, and site description, in that order.
const feedfmtopen = `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
<title>%s</title>
<link>%s</link>
<description>%s</description>`
// feedfmtclose closes the channel and rss elements opened by feedfmtopen.
const feedfmtclose = `</channel>
</rss>`
// itemfmt is the template for one feed entry; the fmt verbs are filled with
// title, link, guid, pubDate, and description (raw article HTML), in order.
const itemfmt = `<item>
<title>%s</title>
<link>%s</link>
<guid>%s</guid>
<pubDate>%s</pubDate>
<description>
%s
</description>
</item>`
// FeedInfo holds everything needed to generate one RSS feed: site
// metadata, the pages to scrape, the collected items, and per-page
// error text keyed by page URL.
type FeedInfo struct {
SiteName string // feed <title>
SiteUrl string // feed <link>
SiteDesc string // feed <description>
PageUrls []string // pages to fetch and scan for <article> elements
Items []*FeedItem // items collected from all pages
Errors map[string]string // page URL -> error text ("" when the page had none)
}
// FeedItem is a single feed entry extracted from one <article> element.
type FeedItem struct {
Url string // item link (also used as guid)
Title string // heading text, or a formatted date when no heading is found
Author string // NOTE(review): never populated anywhere in this file — confirm intended use
PubTime time.Time // parsed from the article's <time datetime="..."> attribute
RawText string // rendered HTML of the whole <article> element
}
// SitePage is one fetched-and-parsed HTML page plus the per-article
// errors accumulated while extracting feed items from it.
type SitePage struct {
Url string // URL the HTML was fetched from
Title string // page <title> text, or the URL when none is found
Root *html.Node // parsed HTML document root
Errors []error // per-article parse errors collected by Parse
ErrStr string // Errors joined with newlines (set by Parse)
}
// fetchPage GETs the given URL and returns the response body as a string.
// It returns an error for transport failures, non-2xx responses, and
// body-read failures.
func fetchPage(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", fmt.Errorf("Error sending Get request: %w", err)
	}
	defer resp.Body.Close()
	// Bug fix: a non-success status used to be treated as a good page.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("unexpected status %s fetching %s", resp.Status, url)
	}
	body, err := io.ReadAll(resp.Body)
	// Bug fix: the ReadAll error was silently discarded, so a truncated
	// body could be returned with a nil error.
	if err != nil {
		return "", fmt.Errorf("Error reading response body: %w", err)
	}
	return string(body), nil
}
// parseTime attempts to parse timestr against a fixed list of common
// timestamp layouts, in order, returning the first successful result.
// When no layout matches, it returns the zero time and an error.
func parseTime(timestr string) (time.Time, error) {
	// Candidate layouts, tried in order; the first that parses wins.
	layouts := []string{
		time.ANSIC,
		time.UnixDate,
		time.RubyDate,
		time.RFC822,
		time.RFC822Z,
		time.RFC850,
		time.RFC1123,
		time.RFC1123Z,
		"2006-01-02 15:04:05 -0700 -0700",
		time.RFC3339,
		time.RFC3339Nano,
		time.DateTime,
		time.DateOnly,
		"2006-01-02 15:04",
		"2006-01-02T15:04:05",
		"2006-01-02T15:04",
	}
	for _, layout := range layouts {
		if parsed, perr := time.Parse(layout, timestr); perr == nil {
			return parsed, nil
		}
	}
	return time.Time{}, fmt.Errorf("%s is in an invalid format", timestr)
}
// getHtmlElement returns the first element node named tag found in a
// depth-first walk of doc, or an error when none exists.
//
// Bug fixes: the previous version kept walking after a match (so it
// returned the LAST matching element in document order, and always
// traversed the whole tree), and its error string contained literal
// "&lt;"/"&gt;" HTML entities instead of angle brackets.
func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
	var find func(*html.Node) *html.Node
	find = func(n *html.Node) *html.Node {
		if n.Type == html.ElementNode && n.Data == tag {
			return n
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			if found := find(c); found != nil {
				return found // short-circuit: stop at the first match
			}
		}
		return nil
	}
	element := find(doc)
	if element == nil {
		return nil, fmt.Errorf("no <%s> element found", tag)
	}
	return element, nil
}
// getAllElements returns every element node named tag found in a
// depth-first walk of doc, or an error when none exists. Matching
// elements are not descended into, so same-tag elements nested inside
// a match are not collected.
//
// Bug fix: the error string contained literal "&lt;"/"&gt;" HTML
// entities instead of angle brackets.
func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) {
	var collect func(*html.Node)
	elements := make([]*html.Node, 0)
	collect = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == tag {
			elements = append(elements, n)
			return // do not descend into a matched element
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			collect(c)
		}
	}
	collect(doc)
	if len(elements) == 0 {
		return nil, fmt.Errorf("no <%s> element found", tag)
	}
	return elements, nil
}
// getTitleAndUrl extracts a title and link from the highest-level heading
// (h1, then h2, then h3) inside article. The title is the heading's text
// (or the text of an anchor inside the heading); the URL is the href of
// that anchor, or of an anchor wrapping the heading. Returns empty strings
// when no heading (or no usable content) is found; the error is always nil.
//
// Bug fixes: the previous version dereferenced header.FirstChild and
// header.Parent without nil checks (panicking on an empty heading or a
// parentless node), and read the href from header.FirstChild instead of
// from the wrapping parent <a> in the heading-inside-anchor case.
func getTitleAndUrl(article *html.Node) (string, string, error) {
	var title string
	var url string
	var header *html.Node
	// Prefer the highest-level heading present in the article.
	for _, tag := range []string{"h1", "h2", "h3"} {
		if headers, err := getAllElements(article, tag); err == nil && len(headers) > 0 {
			header = headers[0]
			break
		}
	}
	if header == nil {
		return "", "", nil
	}
	child := header.FirstChild
	if child == nil {
		// Empty heading: nothing to extract (previously a nil-pointer panic).
		return "", "", nil
	}
	if child.Type != html.ElementNode {
		title = child.Data
	} else if child.DataAtom == atom.A {
		if child.FirstChild != nil {
			title = child.FirstChild.Data
		}
		for _, attr := range child.Attr {
			if attr.Key == "href" {
				url = attr.Val
			}
		}
	}
	// Heading wrapped in an anchor (<a><h2>...</h2></a>): take the href
	// from the parent anchor (previously read from FirstChild by mistake).
	if url == "" && header.Parent != nil && header.Parent.DataAtom == atom.A {
		for _, attr := range header.Parent.Attr {
			if attr.Key == "href" {
				url = attr.Val
			}
		}
	}
	return title, url, nil
}
// NewFeedItem builds a FeedItem from one <article> node found on the page
// at url. The article must contain a <time> element; its datetime attribute
// (when present) supplies the publication time. The title and item link
// come from the article's heading, falling back to the formatted date and
// the page URL respectively. Relative item links are joined onto url.
//
// Bug fixes: the html.Render error was ignored, and path.Join on a full
// URL collapsed the scheme's double slash ("https://x" -> "https:/x").
func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
	var articleBuilder strings.Builder
	if err := html.Render(&articleBuilder, article); err != nil {
		return nil, fmt.Errorf("Error rendering article: %w", err)
	}
	item := FeedItem{
		Url:     url,
		RawText: articleBuilder.String(),
	}
	etime, err := getHtmlElement(article, "time")
	if err != nil {
		return nil, err
	}
	// A <time> with no datetime attribute leaves pubTime at the zero value.
	var pubTime time.Time
	for _, attr := range etime.Attr {
		if attr.Key == "datetime" {
			pubTime, err = parseTime(attr.Val)
			if err != nil {
				return nil, fmt.Errorf("Error parsing time: %w", err)
			}
			break
		}
	}
	item.PubTime = pubTime
	title, itemurl, _ := getTitleAndUrl(article)
	if title == "" {
		title = pubTime.Format("Jan 02 2006")
	}
	if itemurl == "" {
		itemurl = url
	}
	if itemurl != "" && !strings.HasPrefix(itemurl, "http://") && !strings.HasPrefix(itemurl, "https://") {
		// path.Join cleans the path but collapses "//" after the scheme;
		// restore the first "://" so the result is a valid absolute URL.
		itemurl = strings.Replace(path.Join(url, itemurl), ":/", "://", 1)
	}
	item.Title = title
	item.Url = itemurl
	return &item, nil
}
// Parse extracts one FeedItem per <article> element on the page. Items
// that fail to parse are skipped and their errors accumulated in p.Errors
// (joined into p.ErrStr); an error is returned only when the page has no
// <article> elements at all.
//
// Bug fix: p.ErrStr was built via errors.New(...).Error(), a pointless
// round-trip through an error value; strings.Join is used directly.
func (p *SitePage) Parse() ([]*FeedItem, error) {
	items := make([]*FeedItem, 0)
	articles, err := getAllElements(p.Root, "article")
	if err != nil {
		return nil, errors.New("No article elements found")
	}
	for _, article := range articles {
		item, parseErr := NewFeedItem(p.Url, article)
		if parseErr != nil {
			// Record the failure but keep processing the other articles.
			p.Errors = append(p.Errors, parseErr)
			continue
		}
		items = append(items, item)
	}
	if len(p.Errors) > 0 {
		errorStrs := make([]string, 0, len(p.Errors))
		for _, perr := range p.Errors {
			errorStrs = append(errorStrs, perr.Error())
		}
		p.ErrStr = strings.Join(errorStrs, "\n")
	}
	return items, nil
}
// NewSitePage fetches and parses the HTML page at url. The page title is
// taken from the document's <title> element, falling back to the URL when
// the element is missing or empty.
//
// Bug fix: an empty <title></title> made nodeTitle.FirstChild nil and the
// previous version panicked dereferencing it.
func NewSitePage(url string) (*SitePage, error) {
	rawhtml, err := fetchPage(url)
	if err != nil {
		return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err)
	}
	nodeRoot, err := html.Parse(strings.NewReader(rawhtml))
	if err != nil {
		return nil, fmt.Errorf("Error parsing HTML: %w", err)
	}
	page := SitePage{
		Url:    url,
		Root:   nodeRoot,
		Errors: make([]error, 0),
	}
	nodeTitle, err := getHtmlElement(nodeRoot, "title")
	if err != nil || nodeTitle.FirstChild == nil {
		// Missing or empty <title>: fall back to the URL.
		page.Title = url
	} else {
		page.Title = nodeTitle.FirstChild.Data
	}
	return &page, nil
}
// NewFeedInfo builds a FeedInfo by fetching every page URL and collecting
// the feed items found on each. Per-page failures are recorded in
// info.Errors (keyed by URL) rather than aborting the whole feed; the
// returned error is always nil. The author parameter is currently unused.
//
// Bug fix: when NewSitePage failed, the loop still called page.Parse() on
// the nil page, guaranteeing a nil-pointer panic; it now skips to the
// next URL.
func NewFeedInfo(name, base_url, desc, author string, page_urls ...string) (*FeedInfo, error) {
	info := FeedInfo{
		SiteName: name,
		SiteUrl:  base_url,
		SiteDesc: desc,
		PageUrls: page_urls,
		Errors:   make(map[string]string, 10),
	}
	for _, url := range info.PageUrls {
		page, err := NewSitePage(url)
		if err != nil {
			info.Errors[url] = err.Error()
			continue // page is nil here; calling page.Parse() would panic
		}
		pageItems, err := page.Parse()
		if err != nil {
			info.Errors[url] = err.Error()
			continue
		}
		info.Items = append(info.Items, pageItems...)
		info.Errors[url] = page.ErrStr
	}
	return &info, nil
}
// format re-indents the generated XML line by line: it tracks tag nesting
// depth by scanning each line for opening/closing tag markers and prefixes
// the line with one space per depth level.
//
// NOTE(review): every line is passed through html.EscapeString, which
// escapes the XML structure itself ('<' becomes '&lt;'), so the output is
// escaped markup rather than raw RSS XML. Confirm this is intended (e.g.
// for embedding the feed in an HTML page) and not a bug.
func (info *FeedInfo) format(raw string) string {
var formatBuilder strings.Builder
depth := 0
oldDepth := 0
for _,line := range strings.Split(raw, "\n") {
tmp := strings.TrimSpace(line)
if tmp == "" {
// Blank lines are dropped entirely.
continue
}
oldDepth = depth
// Update nesting depth from this line's tags: "<x" opens (+1),
// "</" closes (-1), "/>" self-closes (-1); "<?" (XML declaration)
// is ignored. A self-closing tag nets out to zero.
for i,s := range line {
if i < len(line) - 1 {
t := line[i + 1]
if s == '<' && t != '?' && t != '/' {
depth += 1
}
if s == '<' && t == '/' {
depth -= 1
}
if s == '/' && t == '>' {
depth -= 1
}
}
}
// Indent one space per depth level; a line that just opened a new
// level is indented at the previous (shallower) level.
for i := 0; i < depth; i++ {
if (i == depth - 1 && oldDepth < depth) {
continue
}
formatBuilder.WriteString(" ")
}
formatBuilder.WriteString(html.EscapeString(tmp))
formatBuilder.WriteString("\n")
}
return formatBuilder.String()
}
// GenerateRSS renders the whole feed: the channel header filled with the
// site metadata, one itemfmt entry per collected item, the channel footer,
// and finally the result of info.format over the assembled document.
func (info *FeedInfo) GenerateRSS() string {
	var out strings.Builder
	header := fmt.Sprintf(feedfmtopen, info.SiteName, info.SiteUrl, info.SiteDesc)
	out.WriteString(header)
	out.WriteString("\n")
	for _, entry := range info.Items {
		// guid reuses the item link; pubDate uses an RFC 1123-style layout
		// with a non-padded day.
		rendered := fmt.Sprintf(
			itemfmt,
			entry.Title,
			entry.Url,
			entry.Url,
			entry.PubTime.Format("Mon, 2 Jan 2006 15:04:05 MST"),
			entry.RawText,
		)
		out.WriteString(rendered)
		out.WriteString("\n")
	}
	out.WriteString(feedfmtclose)
	return info.format(out.String())
}