webweav.ing/feed/feed.go

170 lines
4.0 KiB
Go

package feed
import (
"fmt"
"io"
"net/http"
"strings"
"time"
"golang.org/x/net/html"
)
// FeedBuilder is the interface for types that can generate a feed.
type FeedBuilder interface {
// GenerateFeed returns the feed as a string.
// NOTE(review): the output format (RSS, Atom, ...) is not visible here - confirm with the implementations.
GenerateFeed() string
}
// FeedInfo bundles site-level fields (name, URL, description) with a list of
// page URLs and an unexported error map.
type FeedInfo struct {
	// Site name, site URL and site description, declared in that order.
	SiteName, SiteUrl, SiteDesc string

	// PageUrls is a list of page URLs.
	// NOTE(review): presumably one feed item is built per entry - confirm with the code that consumes it.
	PageUrls []string

	// errors associates string keys with errors.
	// NOTE(review): presumably keyed by page URL - confirm against the code that fills it.
	errors map[string]error
}
// FeedItem describes a single page: its URL, some textual metadata, a
// publication time and the raw article text.
type FeedItem struct {
	// URL, title, author and escaped text of the item, declared in that order.
	// NOTE(review): only Url is populated by the code visible in this file.
	Url, Title, Author, EscapedText string

	// PubTime is the item's publication time.
	PubTime time.Time

	// RawText is the unescaped text of the item.
	RawText string
}
// fetchPage performs an HTTP GET on url and returns the complete response body
// as a string.
//
// An error is returned when the request cannot be sent or when the body cannot
// be read in full. The response status code is not inspected, so the body of a
// non-2xx response is returned like any other.
func fetchPage(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", fmt.Errorf("Error sending Get request: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		// Without this check a partially read body would be handed on as if
		// it were the whole page.
		return "", fmt.Errorf("Error reading response body: %w", err)
	}
	return string(body), nil
}
// parseTime interprets timestr as a timestamp by trying a fixed list of
// layouts in order and returning the result of the first one that parses.
//
// If no layout matches, the zero time.Time is returned together with an error.
func parseTime(timestr string) (time.Time, error) {
	// Candidate layouts, tried from top to bottom.
	layouts := [...]string{
		time.ANSIC,
		time.UnixDate,
		time.RubyDate,
		time.RFC822,
		time.RFC822Z,
		time.RFC850,
		time.RFC1123,
		time.RFC1123Z,
		// The numeric zone offset appears twice in this layout.
		// NOTE(review): presumably matches time.Time.String() output for
		// zones that have no abbreviation - confirm.
		"2006-01-02 15:04:05 -0700 -0700",
		time.RFC3339,
		time.RFC3339Nano,
		time.DateTime,
		time.DateOnly,
	}

	for i := 0; i < len(layouts); i++ {
		if parsed, parseErr := time.Parse(layouts[i], timestr); parseErr == nil {
			return parsed, nil
		}
	}
	return time.Time{}, fmt.Errorf("Error parsing time: invalid format")
}
// getHtmlElement searches the tree rooted at doc, depth first and in document
// order, and returns the first element node whose tag name equals tag.
//
// doc itself is a candidate. An error is returned when doc is nil or when the
// tree contains no matching element.
func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
	var find func(n *html.Node) *html.Node
	find = func(n *html.Node) *html.Node {
		if n == nil {
			return nil
		}
		if n.Type == html.ElementNode && n.Data == tag {
			return n
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			// Hand the hit straight back up the call chain. Merely returning
			// from the matching frame would let the walk carry on through
			// later siblings, and a later match would replace this one.
			if hit := find(c); hit != nil {
				return hit
			}
		}
		return nil
	}

	if element := find(doc); element != nil {
		return element, nil
	}
	return nil, fmt.Errorf("no <%s> element found", tag)
}
// ParseContent parses content as HTML and fills f from the <article> element
// that getHtmlElement locates in it.
//
// RawText is set to the rendered HTML of that <article>. PubTime is set from
// the "datetime" attribute of the <time> element found inside the article; if
// that element has no "datetime" attribute, PubTime is left untouched.
//
// An error is returned when the HTML cannot be parsed or rendered, when no
// <article> or <time> element is found, or when the "datetime" value matches
// none of the layouts known to parseTime. RawText may already have been set
// when a time-related error is returned.
func (f *FeedItem) ParseContent(content string) error {
	doc, err := html.Parse(strings.NewReader(content))
	if err != nil {
		return fmt.Errorf("Error parsing HTML: %w", err)
	}

	earticle, err := getHtmlElement(doc, "article")
	if err != nil {
		return err
	}

	var builder strings.Builder
	if err := html.Render(&builder, earticle); err != nil {
		return fmt.Errorf("Error rendering HTML: %w", err)
	}
	f.RawText = builder.String()

	etime, err := getHtmlElement(earticle, "time")
	if err != nil {
		return err
	}

	for _, attr := range etime.Attr {
		if attr.Key != "datetime" {
			continue
		}
		// Only the datetime attribute carries the timestamp; checking the
		// error and assigning PubTime here keeps unrelated attributes from
		// touching either.
		pubTime, err := parseTime(attr.Val)
		if err != nil {
			return fmt.Errorf("Error parsing time: %w", err)
		}
		f.PubTime = pubTime
		break
	}
	return nil
}
// NewFeedItem downloads the page at url and builds a FeedItem from it.
//
// The returned item has Url set to url plus whatever ParseContent extracts
// from the downloaded page. When the page cannot be fetched or parsed, a nil
// item and a non-nil error are returned.
func NewFeedItem(url string) (*FeedItem, error) {
	rawhtml, err := fetchPage(url)
	if err != nil {
		return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err)
	}

	item := &FeedItem{Url: url}
	// The result of ParseContent has to be captured here: testing the earlier
	// fetch error again would always see nil and let a half-filled item
	// through.
	if err := item.ParseContent(rawhtml); err != nil {
		return nil, fmt.Errorf("Could not parse feed item: %w", err)
	}
	return item, nil
}
// parseArticle parses content as HTML and returns the rendered <article>
// element together with the time taken from the "datetime" attribute of the
// <time> element inside it.
//
// If the HTML cannot be parsed or rendered, or contains no <article> element,
// an empty string, a nil time and an error are returned.
//
// If the article is found but its time is not (no <time> element, or a
// "datetime" value that parseTime rejects), an error is returned, but the
// rendered article is still returned alongside it, with a pointer to the zero
// time. A <time> element without a "datetime" attribute is not an error; the
// returned time is then the zero time.
func parseArticle(content string) (string, *time.Time, error) {
	var pagetime time.Time

	doc, err := html.Parse(strings.NewReader(content))
	if err != nil {
		return "", nil, fmt.Errorf("Error parsing HTML: %w", err)
	}

	// Without an <article> there is nothing to render; bail out instead of
	// passing a nil node on to html.Render.
	article, err := getHtmlElement(doc, "article")
	if err != nil {
		return "", nil, err
	}

	var builder strings.Builder
	if err := html.Render(&builder, article); err != nil {
		return "", nil, fmt.Errorf("Error rendering HTML: %w", err)
	}
	rendered := builder.String()

	etime, err := getHtmlElement(article, "time")
	if err != nil {
		return rendered, &pagetime, err
	}

	for _, attr := range etime.Attr {
		if attr.Key != "datetime" {
			continue
		}
		parsed, err := parseTime(attr.Val)
		if err != nil {
			// Report the failure but keep the article, as documented above.
			return rendered, &pagetime, fmt.Errorf("Error parsing time: %w", err)
		}
		pagetime = parsed
		break
	}
	return rendered, &pagetime, nil
}