// Package feed fetches HTML pages, extracts their <article> elements, and
// renders the collected items through a set of text templates.
package feed

import (
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"path"
	"strings"
	"time"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// Templates consumed by GenerateRSS.
//
// NOTE(review): as written these templates contain only whitespace and %s
// verbs, while FeedInfo.format counts '<' and '/>' to compute indentation.
// Presumably the surrounding markup was lost at some point — confirm against
// the intended feed layout before relying on the output.
const feedfmtopen = ` %s %s %s`
const feedfmtclose = ` `
const itemfmt = ` %s %s %s %s %s `

// timeFormats lists the layouts parseTime tries, in order.
var timeFormats = []string{
	time.ANSIC,
	time.UnixDate,
	time.RubyDate,
	time.RFC822,
	time.RFC822Z,
	time.RFC850,
	time.RFC1123,
	time.RFC1123Z,
	"2006-01-02 15:04:05 -0700 -0700",
	time.RFC3339,
	time.RFC3339Nano,
	time.DateTime,
	time.DateOnly,
	"2006-01-02 15:04",
	"2006-01-02T15:04:05",
	"2006-01-02T15:04",
}

// FeedInfo holds the site metadata passed to NewFeedInfo, the page URLs that
// were scraped, the items collected from them, and error text keyed by page
// URL.
type FeedInfo struct {
	SiteName string
	SiteUrl  string
	SiteDesc string
	PageUrls []string
	Items    []*FeedItem
	Errors   map[string]string
}

// FeedItem is a single entry built from one <article> element.
type FeedItem struct {
	Url     string
	Title   string
	Author  string
	PubTime time.Time
	RawText string // the article element rendered back to HTML
}

// SitePage is a fetched and parsed HTML page.
type SitePage struct {
	Url    string
	Title  string
	Root   *html.Node
	Errors []error // per-article errors accumulated by Parse
	ErrStr string  // Errors joined with newlines; set by Parse
}

// fetchPage performs an HTTP GET on url and returns the response body as a
// string. It returns an error if the request cannot be sent or the body
// cannot be read completely.
func fetchPage(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", fmt.Errorf("Error sending Get request: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		// A failed read would otherwise hand a truncated page to the parser.
		return "", fmt.Errorf("Error reading response body: %w", err)
	}
	return string(body), nil
}

// parseTime parses timestr using each layout in timeFormats in turn and
// returns the first successful result. If no layout matches it returns the
// zero time and an error naming the offending string.
func parseTime(timestr string) (time.Time, error) {
	for _, layout := range timeFormats {
		if parsed, err := time.Parse(layout, timestr); err == nil {
			return parsed, nil
		}
	}
	return time.Time{}, fmt.Errorf("%s is in an invalid format", timestr)
}

// findFirst returns the first element named tag in a depth-first, document
// order walk of n (n itself included), or nil when there is none.
func findFirst(n *html.Node, tag string) *html.Node {
	if n == nil {
		return nil
	}
	if n.Type == html.ElementNode && n.Data == tag {
		return n
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if found := findFirst(c, tag); found != nil {
			return found
		}
	}
	return nil
}

// attrValue returns the value of the first attribute of n whose key is key,
// and whether such an attribute exists.
func attrValue(n *html.Node, key string) (string, bool) {
	for _, a := range n.Attr {
		if a.Key == key {
			return a.Val, true
		}
	}
	return "", false
}

// getHtmlElement returns the first element named tag found in doc, searching
// depth-first in document order. It returns an error when doc is nil or
// contains no such element.
func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
	if element := findFirst(doc, tag); element != nil {
		return element, nil
	}
	return nil, fmt.Errorf("no <%s> element found", tag)
}

// getAllElements returns every element named tag under doc in document order.
// The walk does not descend into a matching element, so matches nested inside
// another match are not reported separately. It returns an error when nothing
// matches.
func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) {
	var elements []*html.Node

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n == nil {
			return
		}
		if n.Type == html.ElementNode && n.Data == tag {
			elements = append(elements, n)
			return
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	if len(elements) == 0 {
		return nil, fmt.Errorf("no <%s> element found", tag)
	}
	return elements, nil
}

// getTitleAndUrl extracts a title and link from the article's heading. The
// heading is the first <h1>, else the first <h2>, else the first <h3>.
//
// The title is the heading's first child when that is not an element, or the
// first non-element child of an <a> that is the heading's first child. The
// link is that anchor's href, or failing that the href of an <a> that
// directly wraps the heading. Either value may be empty. The returned error
// is always nil.
func getTitleAndUrl(article *html.Node) (string, string, error) {
	var header *html.Node
	for _, tag := range []string{"h1", "h2", "h3"} {
		if h := findFirst(article, tag); h != nil {
			header = h
			break
		}
	}
	if header == nil {
		return "", "", nil
	}

	var title, link string
	if first := header.FirstChild; first != nil {
		switch {
		case first.Type != html.ElementNode:
			title = first.Data
		case first.DataAtom == atom.A:
			// Only take the anchor's leading non-element child; an element's
			// Data is its tag name, not text.
			if text := first.FirstChild; text != nil && text.Type != html.ElementNode {
				title = text.Data
			}
			link, _ = attrValue(first, "href")
		}
	}

	// <a href="..."><h2>Title</h2></a>: the href lives on the parent anchor.
	if parent := header.Parent; link == "" && parent != nil && parent.DataAtom == atom.A {
		link, _ = attrValue(parent, "href")
	}
	return title, link, nil
}

// resolveURL resolves ref against base following URL reference resolution
// rules. If either string cannot be parsed as a URL it falls back to joining
// them as slash-separated paths.
func resolveURL(base, ref string) string {
	baseURL, err := url.Parse(base)
	if err != nil {
		return path.Join(base, ref)
	}
	refURL, err := url.Parse(ref)
	if err != nil {
		return path.Join(base, ref)
	}
	return baseURL.ResolveReference(refURL).String()
}

// NewFeedItem builds a FeedItem from an <article> element found on the page
// at url.
//
// RawText is the article rendered back to HTML. PubTime comes from the
// datetime attribute of the article's first <time> element; it stays the zero
// time if that attribute is absent. The title falls back to the formatted
// publication date, and the item URL falls back to the page URL. A link that
// is not an absolute http(s) URL is resolved against the page URL.
//
// It returns an error if article is nil, cannot be rendered, contains no
// <time> element, or carries a datetime value that parseTime rejects.
func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
	if article == nil {
		return nil, errors.New("Error rendering article: nil node")
	}

	var articleBuilder strings.Builder
	if err := html.Render(&articleBuilder, article); err != nil {
		return nil, fmt.Errorf("Error rendering article: %w", err)
	}

	etime, err := getHtmlElement(article, "time")
	if err != nil {
		return nil, err
	}

	var pubTime time.Time
	if raw, ok := attrValue(etime, "datetime"); ok {
		pubTime, err = parseTime(raw)
		if err != nil {
			return nil, fmt.Errorf("Error parsing time: %w", err)
		}
	}

	// getTitleAndUrl never returns a non-nil error.
	title, itemurl, _ := getTitleAndUrl(article)
	if title == "" {
		title = pubTime.Format("Jan 02 2006")
	}

	switch {
	case itemurl == "":
		itemurl = url
	case !strings.HasPrefix(itemurl, "http://") && !strings.HasPrefix(itemurl, "https://"):
		// path.Join would collapse the "//" after the scheme, so resolve the
		// reference as a URL instead.
		itemurl = resolveURL(url, itemurl)
	}

	return &FeedItem{
		Url:     itemurl,
		Title:   title,
		PubTime: pubTime,
		RawText: articleBuilder.String(),
	}, nil
}

// Parse converts every <article> element under p.Root into a FeedItem.
//
// Articles that fail to convert are skipped; their errors are appended to
// p.Errors, and p.ErrStr is set to all accumulated messages joined with
// newlines. Parse itself returns an error only when the page has no <article>
// element at all.
func (p *SitePage) Parse() ([]*FeedItem, error) {
	articles, err := getAllElements(p.Root, "article")
	if err != nil {
		return nil, errors.New("No article elements found")
	}

	items := make([]*FeedItem, 0, len(articles))
	for _, article := range articles {
		item, parseErr := NewFeedItem(p.Url, article)
		if parseErr != nil {
			p.Errors = append(p.Errors, parseErr)
			continue
		}
		items = append(items, item)
	}

	if len(p.Errors) > 0 {
		msgs := make([]string, 0, len(p.Errors))
		for _, perr := range p.Errors {
			msgs = append(msgs, perr.Error())
		}
		p.ErrStr = strings.Join(msgs, "\n")
	}
	return items, nil
}

// NewSitePage fetches url and parses the response as HTML. The page title is
// the first child of the first <title> element; when there is no <title> or
// it is empty, the title is url itself.
func NewSitePage(url string) (*SitePage, error) {
	rawhtml, err := fetchPage(url)
	if err != nil {
		return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err)
	}

	nodeRoot, err := html.Parse(strings.NewReader(rawhtml))
	if err != nil {
		return nil, fmt.Errorf("Error parsing HTML: %w", err)
	}

	page := SitePage{
		Url:    url,
		Title:  url,
		Root:   nodeRoot,
		Errors: make([]error, 0),
	}
	if nodeTitle := findFirst(nodeRoot, "title"); nodeTitle != nil && nodeTitle.FirstChild != nil {
		page.Title = nodeTitle.FirstChild.Data
	}
	return &page, nil
}

// NewFeedInfo fetches and parses each of page_urls and gathers the resulting
// items into a FeedInfo.
//
// A page that cannot be fetched or parsed is skipped and its error text is
// stored in Errors under the page URL. For a page that parses, Errors gets an
// entry only if some of its articles failed. The author argument is currently
// unused. The returned error is always nil.
func NewFeedInfo(name, base_url, desc, author string, page_urls ...string) (*FeedInfo, error) {
	info := FeedInfo{
		SiteName: name,
		SiteUrl:  base_url,
		SiteDesc: desc,
		PageUrls: page_urls,
		Errors:   make(map[string]string, 10),
	}

	for _, pageURL := range info.PageUrls {
		page, err := NewSitePage(pageURL)
		if err != nil {
			// page is nil here; calling Parse on it would panic.
			info.Errors[pageURL] = err.Error()
			continue
		}

		pageItems, err := page.Parse()
		if err != nil {
			info.Errors[pageURL] = err.Error()
			continue
		}

		info.Items = append(info.Items, pageItems...)
		if page.ErrStr != "" {
			info.Errors[pageURL] = page.ErrStr
		}
	}
	return &info, nil
}

// format re-indents raw line by line. Blank lines are dropped. A running
// depth is raised by each '<' not followed by '?' or '/', and lowered by each
// "</" and "/>". Every kept line is trimmed, prefixed with one space per
// depth level (one fewer when the line itself raised the depth), passed
// through html.EscapeString, and terminated with a newline.
//
// NOTE(review): the whole line, including any markup it contains, is escaped
// — confirm this is the intended output.
func (info *FeedInfo) format(raw string) string {
	var formatBuilder strings.Builder
	depth := 0

	for _, line := range strings.Split(raw, "\n") {
		tmp := strings.TrimSpace(line)
		if tmp == "" {
			continue
		}

		oldDepth := depth
		for i, s := range line {
			if i >= len(line)-1 {
				// No following byte to inspect.
				continue
			}
			t := line[i+1]
			switch {
			case s == '<' && t != '?' && t != '/':
				depth++
			case s == '<' && t == '/':
				depth--
			case s == '/' && t == '>':
				depth--
			}
		}

		// A line that opens a new level is indented at the level it started on.
		indent := depth
		if oldDepth < depth {
			indent--
		}
		if indent > 0 {
			formatBuilder.WriteString(strings.Repeat(" ", indent))
		}
		formatBuilder.WriteString(html.EscapeString(tmp))
		formatBuilder.WriteString("\n")
	}
	return formatBuilder.String()
}

// GenerateRSS renders the feed header, one entry per item, and the footer
// using the package templates, then returns the result passed through format.
// Each item's URL is supplied to the item template twice.
func (info *FeedInfo) GenerateRSS() string {
	var outputBuilder strings.Builder

	fmt.Fprintf(&outputBuilder, feedfmtopen, info.SiteName, info.SiteUrl, info.SiteDesc)
	outputBuilder.WriteString("\n")

	for _, item := range info.Items {
		fmt.Fprintf(
			&outputBuilder,
			itemfmt,
			item.Title,
			item.Url,
			item.Url,
			item.PubTime.Format("Mon, 2 Jan 2006 15:04:05 MST"),
			item.RawText,
		)
		outputBuilder.WriteString("\n")
	}

	outputBuilder.WriteString(feedfmtclose)
	return info.format(outputBuilder.String())
}