303 lines
7.6 KiB
Go
303 lines
7.6 KiB
Go
package feed
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// feedfmtopen is the head of the generated RSS 2.0 document: the XML
// declaration, the opening <rss>/<channel> tags, and the channel metadata.
// Format verbs fill in, in order: channel title, link, and description.
const feedfmtopen = `<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0">
<channel>
<title>%s</title>
<link>%s</link>
<description>%s</description>`

// feedfmtclose closes the <channel> and <rss> elements opened by feedfmtopen.
const feedfmtclose = `</channel>
</rss>`

// itemfmt is the template for a single <item>. Format verbs fill in, in
// order: title, link, guid, pubDate, and the description body. Note the link
// is also used as the guid by GenerateRSS.
const itemfmt = `<item>
<title>%s</title>
<link>%s</link>
<guid>%s</guid>
<pubDate>%s</pubDate>
<description>
%s
</description>
</item>`
|
|
|
|
// FeedInfo aggregates everything needed to emit one RSS feed: site-level
// metadata, the source page URLs, the collected items, and a per-URL record
// of errors encountered while fetching/parsing those pages.
type FeedInfo struct {
	SiteName string      // channel <title>
	SiteUrl  string      // channel <link>
	SiteDesc string      // channel <description>
	PageUrls []string    // pages scanned for <article> elements
	Items    []*FeedItem // items harvested from all pages
	Errors   map[string]string // page URL -> error text ("" when the page had none)
}
|
|
|
|
// FeedItem is a single RSS entry extracted from one <article> element.
type FeedItem struct {
	Url     string    // page the article was found on; used as both link and guid
	Author  string    // NOTE(review): never populated by this file — confirm intended use
	Title   string    // first <h1>/<h2>/<h3> text, or the formatted PubTime as fallback
	PubTime time.Time // parsed from the <time datetime="..."> attribute
	RawText string    // the article subtree rendered back to HTML
}
|
|
|
|
// SitePage is one fetched and parsed HTML page, plus any per-article errors
// accumulated while extracting feed items from it.
type SitePage struct {
	Url    string     // page URL the HTML was fetched from
	Title  string     // document <title> text, or the URL when unavailable
	Root   *html.Node // parsed HTML document root
	Errors []error    // per-article extraction failures collected by Parse
	ErrStr string     // newline-joined rendering of Errors, set by Parse
}
|
|
|
|
// fetchPage performs an HTTP GET on url and returns the response body as a
// string. An error is returned when the request fails, the server answers
// with a non-2xx status, or the body cannot be read.
func fetchPage(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", fmt.Errorf("Error sending Get request: %w", err)
	}
	defer resp.Body.Close()

	// A transport-level success can still be an application-level failure
	// (404, 500, ...); treat anything outside 2xx as an error so we never
	// feed an error page into the HTML parser.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("unexpected status %s fetching %s", resp.Status, url)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		// The original ignored this error and returned a truncated body.
		return "", fmt.Errorf("Error reading response body: %w", err)
	}
	return string(body), nil
}
|
|
|
|
func parseTime(timestr string) (time.Time, error) {
|
|
var formats = []string {
|
|
time.ANSIC,
|
|
time.UnixDate,
|
|
time.RubyDate,
|
|
time.RFC822,
|
|
time.RFC822Z,
|
|
time.RFC850,
|
|
time.RFC1123,
|
|
time.RFC1123Z,
|
|
"2006-01-02 15:04:05 -0700 -0700",
|
|
time.RFC3339,
|
|
time.RFC3339Nano,
|
|
time.DateTime,
|
|
time.DateOnly,
|
|
"2006-01-02 15:04",
|
|
"2006-01-02T15:04:05",
|
|
"2006-01-02T15:04",
|
|
}
|
|
var pagetime time.Time
|
|
var err error
|
|
for _, f := range formats {
|
|
pagetime, err = time.Parse(f, timestr)
|
|
if err == nil {
|
|
return pagetime, err
|
|
}
|
|
}
|
|
return pagetime, fmt.Errorf("%s is in an invalid format", timestr)
|
|
}
|
|
|
|
func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
|
|
var f func(*html.Node, string)
|
|
var element *html.Node
|
|
f = func(n *html.Node, s string) {
|
|
if n.Type == html.ElementNode && n.Data == s{
|
|
element = n
|
|
return
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
f(c, tag)
|
|
}
|
|
}
|
|
f(doc, tag)
|
|
if element == nil {
|
|
return nil, fmt.Errorf("no <%s> element found", tag)
|
|
}
|
|
return element, nil
|
|
}
|
|
|
|
func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) {
|
|
var f func(*html.Node, string)
|
|
elements := make([]*html.Node, 0)
|
|
f = func(n *html.Node, s string) {
|
|
if n.Type == html.ElementNode && n.Data == s{
|
|
elements = append(elements, n)
|
|
return
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
f(c, tag)
|
|
}
|
|
}
|
|
f(doc, tag)
|
|
if len(elements) == 0 {
|
|
return nil, fmt.Errorf("no <%s> element found", tag)
|
|
}
|
|
return elements, nil
|
|
}
|
|
|
|
func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
|
|
var articleBuilder strings.Builder
|
|
html.Render(&articleBuilder, article)
|
|
item := FeedItem{
|
|
Url: url,
|
|
RawText: articleBuilder.String(),
|
|
}
|
|
|
|
etime, err := getHtmlElement(article, "time")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var pubTime time.Time
|
|
for _, d := range etime.Attr {
|
|
if d.Key == "datetime" {
|
|
pubTime, err = parseTime(d.Val)
|
|
}
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Error parsing time: %w", err)
|
|
}
|
|
item.PubTime = pubTime
|
|
}
|
|
|
|
eh1, _ := getHtmlElement(article, "h1")
|
|
eh2, _ := getHtmlElement(article, "h2")
|
|
eh3, _ := getHtmlElement(article, "h3")
|
|
if eh1 != nil {
|
|
item.Title = eh1.FirstChild.Data
|
|
// TODO: handle <a>
|
|
} else if eh2 != nil {
|
|
item.Title = eh2.FirstChild.Data
|
|
} else if eh3 != nil {
|
|
item.Title = eh3.FirstChild.Data
|
|
} else {
|
|
item.Title = pubTime.Format("Jan 02 2006")
|
|
}
|
|
|
|
return &item, nil
|
|
}
|
|
|
|
func (p *SitePage) Parse() ([]*FeedItem, error) {
|
|
items := make([]*FeedItem, 0)
|
|
articles, err := getAllElements(p.Root, "article")
|
|
if err != nil {
|
|
return nil, errors.New("No article elements found")
|
|
}
|
|
for _, article := range articles {
|
|
item, parseErr := NewFeedItem(p.Url, article)
|
|
if parseErr != nil {
|
|
p.Errors = append(p.Errors, parseErr)
|
|
} else {
|
|
items = append(items, item)
|
|
}
|
|
}
|
|
if len(p.Errors) > 0 {
|
|
errorStrs := make([]string, 0)
|
|
for _, perr := range p.Errors {
|
|
errorStrs = append(errorStrs, perr.Error())
|
|
}
|
|
p.ErrStr = errors.New(strings.Join(errorStrs, "\n")).Error()
|
|
}
|
|
return items, nil
|
|
}
|
|
|
|
func NewSitePage(url string) (*SitePage, error) {
|
|
rawhtml, err := fetchPage(url)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err)
|
|
}
|
|
nodeRoot, err := html.Parse(strings.NewReader(rawhtml))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Error parsing HTML: %w", err)
|
|
}
|
|
page := SitePage{
|
|
Url: url,
|
|
Root: nodeRoot,
|
|
Errors: make([]error, 0),
|
|
}
|
|
nodeTitle, err := getHtmlElement(nodeRoot, "title")
|
|
if err != nil {
|
|
page.Title = url
|
|
} else {
|
|
page.Title = nodeTitle.FirstChild.Data
|
|
}
|
|
return &page, nil
|
|
}
|
|
|
|
func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*FeedInfo, error) {
|
|
info := FeedInfo{
|
|
SiteName: name,
|
|
SiteUrl: base_url,
|
|
SiteDesc: desc,
|
|
PageUrls: page_urls,
|
|
Errors: make(map[string]string, 10),
|
|
}
|
|
for _,url := range info.PageUrls {
|
|
page, err := NewSitePage(url)
|
|
if err != nil {
|
|
info.Errors[url] = err.Error()
|
|
}
|
|
pageItems, err := page.Parse()
|
|
if err != nil {
|
|
info.Errors[url] = err.Error()
|
|
} else {
|
|
info.Items = append(info.Items, pageItems...)
|
|
info.Errors[url] = page.ErrStr
|
|
}
|
|
}
|
|
return &info, nil
|
|
}
|
|
|
|
// format re-indents the raw XML document line by line. Nesting depth is
// tracked by scanning each line for adjacent character pairs: "<x" (any x
// other than '?' or '/') opens a level, while "</" and "/>" each close one.
// A line that opens a new level is indented at its parent's depth.
//
// NOTE(review): html.EscapeString is applied to EVERY line, including the
// RSS structural tags themselves (<rss>, <channel>, <item>, ...), turning
// them into &lt;rss&gt; etc. in the final output — confirm this
// whole-document escaping is really intended and not meant for item
// descriptions only.
func (info *FeedInfo) format(raw string) string {
	var formatBuilder strings.Builder
	depth := 0    // nesting depth after scanning the current line
	oldDepth := 0 // nesting depth before the current line
	for _,line := range strings.Split(raw, "\n") {
		tmp := strings.TrimSpace(line)
		if tmp == "" {
			// Blank lines are dropped from the output entirely.
			continue
		}
		oldDepth = depth
		// Scan adjacent character pairs to update the running depth.
		// The last character of the line is never the first of a pair.
		for i,s := range line {
			if i < len(line) - 1 {
				t := line[i + 1]
				if s == '<' && t != '?' && t != '/' {
					// Opening tag, e.g. "<title"; "<?xml" is excluded.
					depth += 1
				}
				if s == '<' && t == '/' {
					// Closing tag, e.g. "</title".
					depth -= 1
				}
				if s == '/' && t == '>' {
					// Self-closing tag, e.g. "<br/>".
					depth -= 1
				}
			}
		}
		// Emit one indent step per open level, skipping the last step when
		// this very line opened a new level (so it sits at parent depth).
		for i := 0; i < depth; i++ {
			if (i == depth - 1 && oldDepth < depth) {
				continue
			}
			formatBuilder.WriteString(" ")
		}
		formatBuilder.WriteString(html.EscapeString(tmp))
		formatBuilder.WriteString("\n")
	}
	return formatBuilder.String()
}
|
|
|
|
func (info *FeedInfo) GenerateRSS() string {
|
|
var outputBuilder strings.Builder
|
|
outputBuilder.WriteString(fmt.Sprintf(feedfmtopen, info.SiteName, info.SiteUrl, info.SiteDesc))
|
|
outputBuilder.WriteString("\n")
|
|
for _, item := range info.Items {
|
|
outputBuilder.WriteString(fmt.Sprintf(
|
|
itemfmt,
|
|
item.Title,
|
|
item.Url,
|
|
item.Url,
|
|
item.PubTime.Format("Mon, 2 Jan 2006 15:04:05 MST"),
|
|
item.RawText,
|
|
))
|
|
outputBuilder.WriteString("\n")
|
|
}
|
|
outputBuilder.WriteString(feedfmtclose)
|
|
return info.format(outputBuilder.String())
|
|
}
|