170 lines
4.0 KiB
Go
170 lines
4.0 KiB
Go
|
package feed
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"io"
|
||
|
"net/http"
|
||
|
"strings"
|
||
|
"time"
|
||
|
|
||
|
"golang.org/x/net/html"
|
||
|
)
|
||
|
|
||
|
// FeedBuilder is satisfied by any type that exposes a GenerateFeed method.
type FeedBuilder interface {
	// GenerateFeed returns the feed as a single string. The output format is
	// decided by the implementation.
	GenerateFeed() string
}
|
||
|
|
||
|
// FeedInfo groups the site-level fields of a feed with the list of page URLs
// it refers to.
type FeedInfo struct {
	// Name, URL and description of the site.
	SiteName, SiteUrl, SiteDesc string
	// PageUrls lists page URLs associated with the feed.
	PageUrls []string
	// errors maps a string key to an error.
	// NOTE(review): presumably keyed by page URL — confirm against the code
	// that fills it in.
	errors map[string]error
}
|
||
|
|
||
|
// FeedItem describes a single page that is turned into a feed entry.
type FeedItem struct {
	// Url is the address of the page; Title, Author and EscapedText are plain
	// string fields describing the entry.
	Url, Title, Author, EscapedText string
	// PubTime is the publication time; ParseContent fills it from the
	// datetime attribute of the page's <time> element.
	PubTime time.Time
	// RawText is the rendered HTML of the page's <article> element, as set by
	// ParseContent.
	RawText string
}
|
||
|
|
||
|
// fetchPage issues an HTTP GET for url and returns the response body as a
// string.
//
// An error is returned when the request cannot be sent or when the body
// cannot be read in full. The HTTP status code is not inspected, so the body
// of a non-2xx response is returned like any other.
func fetchPage(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", fmt.Errorf("Error sending Get request: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		// A failed read can leave body truncated; report it instead of
		// handing partial HTML to the caller as if it were complete.
		return "", fmt.Errorf("Error reading response body: %w", err)
	}
	return string(body), nil
}
|
||
|
|
||
|
// parseTime converts timestr into a time.Time by trying a fixed list of
// layouts in order and returning the first successful parse.
//
// Leading and trailing whitespace in timestr is ignored. When no layout
// matches, the zero time.Time and an error are returned.
func parseTime(timestr string) (time.Time, error) {
	layouts := []string{
		time.ANSIC,
		time.UnixDate,
		time.RubyDate,
		time.RFC822,
		time.RFC822Z,
		time.RFC850,
		time.RFC1123,
		time.RFC1123Z,
		// Date and time followed by the numeric zone offset twice.
		// NOTE(review): looks like what time.Time.String prints for a zone
		// without an abbreviation — confirm.
		"2006-01-02 15:04:05 -0700 -0700",
		time.RFC3339,
		time.RFC3339Nano,
		time.DateTime,
		time.DateOnly,
	}

	// None of the layouts begins or ends with whitespace, so stray padding
	// around the value would otherwise make every attempt fail.
	value := strings.TrimSpace(timestr)
	for _, layout := range layouts {
		if parsed, err := time.Parse(layout, value); err == nil {
			return parsed, nil
		}
	}
	return time.Time{}, fmt.Errorf("Error parsing time: invalid format")
}
|
||
|
|
||
|
func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
|
||
|
var f func(*html.Node, string)
|
||
|
var element *html.Node
|
||
|
f = func(n *html.Node, s string) {
|
||
|
if n.Type == html.ElementNode && n.Data == s{
|
||
|
element = n
|
||
|
return
|
||
|
}
|
||
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
|
f(c, tag)
|
||
|
}
|
||
|
}
|
||
|
f(doc, tag)
|
||
|
if element == nil {
|
||
|
return nil, fmt.Errorf("no <%s> element found", tag)
|
||
|
}
|
||
|
return element, nil
|
||
|
}
|
||
|
|
||
|
func (f *FeedItem) ParseContent(content string) error {
|
||
|
doc, err := html.Parse(strings.NewReader(content))
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("Error parsing HTML: %w", err)
|
||
|
}
|
||
|
earticle, err := getHtmlElement(doc, "article")
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
var builder strings.Builder
|
||
|
html.Render(&builder, earticle)
|
||
|
f.RawText = builder.String()
|
||
|
|
||
|
etime, err := getHtmlElement(earticle, "time")
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
var pubTime time.Time
|
||
|
for _, d := range etime.Attr {
|
||
|
if d.Key == "datetime" {
|
||
|
pubTime, err = parseTime(d.Val)
|
||
|
}
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("Error parsing time: %w", err)
|
||
|
}
|
||
|
f.PubTime = pubTime
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func NewFeedItem(url string) (*FeedItem, error) {
|
||
|
rawhtml, err := fetchPage(url)
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err)
|
||
|
}
|
||
|
item := FeedItem{
|
||
|
Url: url,
|
||
|
}
|
||
|
item.ParseContent(rawhtml);
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("Could not parse feed item: %w", err)
|
||
|
}
|
||
|
return &item, nil
|
||
|
}
|
||
|
|
||
|
// parseArticle returns an error if it could not parse the HTML or if it could not parse a time
|
||
|
// if a time could not be parsed, the parsed html article will still be returned
|
||
|
func parseArticle(content string) (string, *time.Time, error) {
|
||
|
doc, err := html.Parse(strings.NewReader(content))
|
||
|
if err != nil {
|
||
|
return "", nil, fmt.Errorf("Error parsing HTML: %w", err)
|
||
|
}
|
||
|
var f func(*html.Node, string)
|
||
|
var element *html.Node
|
||
|
var pagetime time.Time
|
||
|
f = func(n *html.Node, tag string) {
|
||
|
if n.Type == html.ElementNode && n.Data == tag {
|
||
|
element = n
|
||
|
return
|
||
|
}
|
||
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
|
f(c, tag)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
f(doc, "article")
|
||
|
var builder strings.Builder
|
||
|
html.Render(&builder, element)
|
||
|
|
||
|
f(element, "time")
|
||
|
for _, d := range element.Attr {
|
||
|
if d.Key == "datetime" {
|
||
|
pagetime, err = parseTime(d.Val)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return builder.String(), &pagetime, nil
|
||
|
}
|
||
|
|