// Package feed fetches web pages and extracts the <article> markup and the
// publication time that feed items are built from.
package feed

import (
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"

	"golang.org/x/net/html"
)

// FeedBuilder is implemented by types that can produce a feed as a string.
type FeedBuilder interface {
	GenerateFeed() string
}

// FeedInfo holds site-level information for a feed.
//
// NOTE(review): none of these fields are read in this file; the descriptions
// below are inferred from the names and should be confirmed against callers.
type FeedInfo struct {
	SiteName string   // presumably the human-readable site title
	SiteUrl  string   // presumably the site's base URL
	SiteDesc string   // presumably a short site description
	PageUrls []string // presumably the pages to turn into feed items

	errors map[string]error // presumably per-page errors keyed by URL; verify
}

// FeedItem is a single entry extracted from a fetched page.
type FeedItem struct {
	Url         string    // URL the item was fetched from (set by NewFeedItem)
	Title       string    // not populated by the code in this file
	Author      string    // not populated by the code in this file
	EscapedText string    // not populated by the code in this file
	PubTime     time.Time // parsed from the datetime attribute of the article's <time> element
	RawText     string    // rendered HTML of the page's <article> element
}

// fetchPage downloads url and returns the response body as a string.
//
// It returns an error if the request fails, if the server answers with a
// non-2xx status, or if the body cannot be read completely.
func fetchPage(url string) (string, error) {
	// The default client never times out; bound the whole exchange so an
	// unresponsive server cannot block the caller forever. A client with a nil
	// Transport still uses http.DefaultTransport, so connections are pooled.
	const fetchTimeout = 30 * time.Second
	client := http.Client{Timeout: fetchTimeout}

	resp, err := client.Get(url)
	if err != nil {
		return "", fmt.Errorf("Error sending Get request: %w", err)
	}
	defer resp.Body.Close()

	// An error page must not be handed to the HTML parser as if it were content.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("unexpected HTTP status %q", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading response body: %w", err)
	}
	return string(body), nil
}

// parseTime parses timestr by trying a fixed list of layouts in order and
// returns the result of the first layout that matches.
//
// If no layout matches it returns the zero time and an error.
func parseTime(timestr string) (time.Time, error) {
	layouts := []string{
		time.ANSIC,
		time.UnixDate,
		time.RubyDate,
		time.RFC822,
		time.RFC822Z,
		time.RFC850,
		time.RFC1123,
		time.RFC1123Z,
		// NOTE(review): the repeated numeric offset looks deliberate: it is the
		// shape time.Time.String produces when the zone has no abbreviation.
		// Confirm before "correcting" it to "-0700 MST".
		"2006-01-02 15:04:05 -0700 -0700",
		time.RFC3339,
		time.RFC3339Nano,
		time.DateTime,
		time.DateOnly,
	}

	for _, layout := range layouts {
		if t, err := time.Parse(layout, timestr); err == nil {
			return t, nil
		}
	}
	return time.Time{}, fmt.Errorf("Error parsing time: invalid format")
}

// getHtmlElement returns the first element named tag in the tree rooted at
// doc, searching depth-first in document order. doc itself is considered.
//
// It returns an error if doc is nil or no such element exists.
func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
	var find func(n *html.Node) *html.Node
	find = func(n *html.Node) *html.Node {
		if n == nil {
			return nil
		}
		if n.Type == html.ElementNode && n.Data == tag {
			return n
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			// Propagate the first hit upwards so the search really stops there
			// instead of being overwritten by a later match.
			if match := find(c); match != nil {
				return match
			}
		}
		return nil
	}

	if element := find(doc); element != nil {
		return element, nil
	}
	return nil, fmt.Errorf("no <%s> element found", tag)
}

// ParseContent parses content as HTML and fills in f from it.
//
// RawText is set to the rendered <article> element. PubTime is set from the
// datetime attribute of the first <time> element inside that article; if the
// <time> element has no datetime attribute, PubTime is left unchanged and no
// error is reported.
//
// It returns an error if the HTML cannot be parsed, if the <article> or <time>
// element is missing, if the article cannot be rendered, or if the datetime
// value is not in a recognised format. RawText may already have been set when
// a time-related error is returned.
func (f *FeedItem) ParseContent(content string) error {
	doc, err := html.Parse(strings.NewReader(content))
	if err != nil {
		return fmt.Errorf("Error parsing HTML: %w", err)
	}

	article, err := getHtmlElement(doc, "article")
	if err != nil {
		return err
	}

	var rendered strings.Builder
	if err := html.Render(&rendered, article); err != nil {
		return fmt.Errorf("rendering <article>: %w", err)
	}
	f.RawText = rendered.String()

	timeEl, err := getHtmlElement(article, "time")
	if err != nil {
		return err
	}

	for _, attr := range timeEl.Attr {
		if attr.Key != "datetime" {
			continue
		}
		pubTime, err := parseTime(attr.Val)
		if err != nil {
			return fmt.Errorf("Error parsing time: %w", err)
		}
		f.PubTime = pubTime
		break
	}
	return nil
}

// NewFeedItem fetches url and builds a FeedItem from the page's content.
//
// It returns an error if the page cannot be fetched or its content cannot be
// parsed (see FeedItem.ParseContent).
func NewFeedItem(url string) (*FeedItem, error) {
	rawhtml, err := fetchPage(url)
	if err != nil {
		return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err)
	}

	item := &FeedItem{Url: url}
	if err := item.ParseContent(rawhtml); err != nil {
		return nil, fmt.Errorf("Could not parse feed item: %w", err)
	}
	return item, nil
}

// parseArticle parses content as HTML and returns the rendered <article>
// element together with the time found in the datetime attribute of the first
// <time> element inside it.
//
// If the HTML cannot be parsed, has no <article> element, or the article
// cannot be rendered, it returns "", nil and an error. If the <time> element
// is missing or its datetime value cannot be parsed, it returns an error but
// still returns the rendered article, along with a pointer to the zero time.
// A <time> element without a datetime attribute yields the zero time and no
// error.
func parseArticle(content string) (string, *time.Time, error) {
	doc, err := html.Parse(strings.NewReader(content))
	if err != nil {
		return "", nil, fmt.Errorf("Error parsing HTML: %w", err)
	}

	article, err := getHtmlElement(doc, "article")
	if err != nil {
		return "", nil, err
	}

	var rendered strings.Builder
	if err := html.Render(&rendered, article); err != nil {
		return "", nil, fmt.Errorf("rendering <article>: %w", err)
	}

	var pagetime time.Time
	timeEl, err := getHtmlElement(article, "time")
	if err != nil {
		return rendered.String(), &pagetime, err
	}

	for _, attr := range timeEl.Attr {
		if attr.Key != "datetime" {
			continue
		}
		parsed, err := parseTime(attr.Val)
		if err != nil {
			return rendered.String(), &pagetime, fmt.Errorf("Error parsing time: %w", err)
		}
		pagetime = parsed
		break
	}
	return rendered.String(), &pagetime, nil
}