diff --git a/feed/feed.go b/feed/feed.go index e95fd89..23102f6 100644 --- a/feed/feed.go +++ b/feed/feed.go @@ -1,13 +1,16 @@ package feed import ( + "errors" "fmt" "io" "net/http" + "path" "strings" "time" "golang.org/x/net/html" + "golang.org/x/net/html/atom" ) const feedfmtopen = ` @@ -47,6 +50,14 @@ type FeedItem struct { RawText string } +type SitePage struct { + Url string + Title string + Root *html.Node + Errors []error + ErrStr string +} + func fetchPage(url string) (string, error) { resp, err := http.Get(url) if err != nil { @@ -85,7 +96,7 @@ func parseTime(timestr string) (time.Time, error) { return pagetime, err } } - return pagetime, fmt.Errorf("Error parsing time: invalid format") + return pagetime, fmt.Errorf("%s is in an invalid format", timestr) } func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) { @@ -107,29 +118,72 @@ func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) { return element, nil } -func (f *FeedItem) ParseContent(content string) error { - doc, err := html.Parse(strings.NewReader(content)) - if err != nil { - return fmt.Errorf("Error parsing HTML: %w", err) +func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) { + var f func(*html.Node, string) + elements := make([]*html.Node, 0) + f = func(n *html.Node, s string) { + if n.Type == html.ElementNode && n.Data == s{ + elements = append(elements, n) + return + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c, tag) + } } - earticle, err := getHtmlElement(doc, "article") - if err != nil { - return err - } - etitle, err := getHtmlElement(doc, "title") - if err != nil { - f.Title = "" - } else { - f.Title = etitle.FirstChild.Data + f(doc, tag) + if len(elements) == 0 { + return nil, fmt.Errorf("no <%s> element found", tag) } + return elements, nil +} +func getTitleAndUrl(article *html.Node) (string, string, error) { + var title string + var url string + var header *html.Node + h1s, _ := getAllElements(article, "h1") + h2s, _ := 
getAllElements(article, "h2") + h3s, _ := getAllElements(article, "h3") + if len(h1s) > 0 { + header = h1s[0] + } else if len(h2s) > 0 { + header = h2s[0] + } else if len(h3s) > 0 { + header = h3s[0] + } + if header == nil || header.FirstChild == nil { + return "", "", nil + } + if header.FirstChild.Type != html.ElementNode { + title = header.FirstChild.Data + } else if header.FirstChild.DataAtom == atom.A { + if c := header.FirstChild.FirstChild; c != nil { title = c.Data } + for _, d := range header.FirstChild.Attr { + if d.Key == "href" { + url = d.Val + } + } + } + if url == "" && header.Parent.DataAtom == atom.A { + for _, d := range header.Parent.Attr { + if d.Key == "href" { + url = d.Val + } + } + } + return title, url, nil +} + +func NewFeedItem(url string, article *html.Node) (*FeedItem, error) { var articleBuilder strings.Builder - html.Render(&articleBuilder, earticle) - f.RawText = articleBuilder.String() - - etime, err := getHtmlElement(earticle, "time") + html.Render(&articleBuilder, article) + item := FeedItem{ + Url: url, + RawText: articleBuilder.String(), + } + etime, err := getHtmlElement(article, "time") if err != nil { - return err + return nil, err } var pubTime time.Time for _, d := range etime.Attr { @@ -137,26 +191,70 @@ func (f *FeedItem) ParseContent(content string) error { pubTime, err = parseTime(d.Val) } if err != nil { - return fmt.Errorf("Error parsing time: %w", err) + return nil, fmt.Errorf("Error parsing time: %w", err) } - f.PubTime = pubTime + item.PubTime = pubTime } - return nil + title, itemurl, _ := getTitleAndUrl(article) + if title == "" { + title = pubTime.Format("Jan 02 2006") + } + if itemurl == "" { + itemurl = url + } + if itemurl != "" && !strings.HasPrefix(itemurl, "http://") && !strings.HasPrefix(itemurl, "https://") { + itemurl = strings.TrimSuffix(url, "/") + path.Join("/", itemurl) + } + item.Title = title + item.Url = itemurl + return &item, nil } -func NewFeedItem(url string) (*FeedItem, error) { +func (p *SitePage) Parse() ([]*FeedItem, error) { + items := make([]*FeedItem, 0) + 
articles, err := getAllElements(p.Root, "article") + if err != nil { + return nil, errors.New("No article elements found") + } + for _, article := range articles { + item, parseErr := NewFeedItem(p.Url, article) + if parseErr != nil { + p.Errors = append(p.Errors, parseErr) + } else { + items = append(items, item) + } + } + if len(p.Errors) > 0 { + errorStrs := make([]string, 0) + for _, perr := range p.Errors { + errorStrs = append(errorStrs, perr.Error()) + } + p.ErrStr = strings.Join(errorStrs, "\n") + } + return items, nil +} + +func NewSitePage(url string) (*SitePage, error) { rawhtml, err := fetchPage(url) if err != nil { return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err) } - item := FeedItem{ - Url: url, - } - err = item.ParseContent(rawhtml); + nodeRoot, err := html.Parse(strings.NewReader(rawhtml)) if err != nil { - return nil, fmt.Errorf("Could not parse feed item: %w", err) + return nil, fmt.Errorf("Error parsing HTML: %w", err) } - return &item, nil + page := SitePage{ + Url: url, + Root: nodeRoot, + Errors: make([]error, 0), + } + nodeTitle, err := getHtmlElement(nodeRoot, "title") + if err != nil { + page.Title = url + } else { + if c := nodeTitle.FirstChild; c != nil { page.Title = c.Data } else { page.Title = url } + } + return &page, nil } func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*FeedInfo, error) { @@ -168,11 +266,17 @@ Errors: make(map[string]string, 10), } for _,url := range info.PageUrls { - item, err := NewFeedItem(url) + page, err := NewSitePage(url) + if err != nil { + info.Errors[url] = err.Error() + continue + } + pageItems, err := page.Parse() if err != nil { info.Errors[url] = err.Error() } else { - info.Items = append(info.Items, item) + info.Items = append(info.Items, pageItems...) 
+ info.Errors[url] = page.ErrStr } } return &info, nil diff --git a/feed/feed_test.go b/feed/feed_test.go index a5cbf2e..6cb4fed 100644 --- a/feed/feed_test.go +++ b/feed/feed_test.go @@ -1,12 +1,15 @@ package feed_test import ( - "testing" - "time" - "git.32bit.cafe/yequari/webweav.ing/feed" + "strings" + "testing" + "time" + + "git.32bit.cafe/yequari/webweav.ing/feed" + "golang.org/x/net/html" ) -func TestTimeParsing(t *testing.T) { +func TestTimeParsingFormat(t *testing.T) { testDate, err := time.Parse("2006-Jan-02 15:04:05 -7", "2004-May-14 07:30:55 -7") if err != nil { t.Errorf("creating test date failed: %s", err) @@ -19,35 +22,38 @@ func TestTimeParsing(t *testing.T) { }{ { "YYYY-MM-DD", - `
hello world
`, + `
hello world
`, time.DateOnly, }, { "YYYY-MM-DD HH:MM", - `
hello world
`, + `
hello world
`, "2006-01-02 15:04", }, { "YYYY-MM-DD HH:MM:SS", - `
hello world
`, + `
hello world
`, "2006-01-02 15:04:05", }, { "YYYY-MM-DDTHH:MM:SS", - `
hello world
`, + `
hello world
`, "2006-01-02T15:04:05", }, { "YYYY-MM-DDTHH:MM", - `
hello world
`, + `
hello world
`, "2006-01-02T15:04", }, } for _, tt := range tests { t.Run(tt.name, func (t *testing.T) { - item := feed.FeedItem{} - err := item.ParseContent(tt.input) + html, err := html.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Errorf("error: %s", err) + } + item, err := feed.NewFeedItem("", html) if err != nil { t.Errorf("error: %s", err) } @@ -61,38 +67,231 @@ func TestTimeParsing(t *testing.T) { } } - -func TestParseFeedItem(t *testing.T) { +func TestArticleTimeParsing(t *testing.T) { testDate, err := time.Parse("2006-Jan-02", "2004-May-14") if err != nil { t.Errorf("creating test date failed: %s", err) } + testDate2, err := time.Parse("2006-Jan-02", "2004-May-07") + if err != nil { + t.Errorf("creating test date failed: %s", err) + } var tests = []struct { name string input string - want_time *time.Time - want_article string + want_time []*time.Time }{ { "article and time stripped out of basic HTML", "
hello world
", - &testDate, - "
hello world
", + []*time.Time{&testDate}, + }, + { + "multiple articles", + "
hello world
this is a second article
", + []*time.Time{&testDate, &testDate2}, }, } for _, tt := range tests { t.Run(tt.name, func (t *testing.T) { - item := feed.FeedItem{} - err := item.ParseContent(tt.input) + html, err := html.Parse(strings.NewReader(tt.input)) if err != nil { t.Errorf("error: %s", err) } - if item.RawText != tt.want_article { - t.Errorf("got %s, want %s", item.RawText, tt.want_article) + page := feed.SitePage{ + Url: "", + Title: "", + Root: html, + Errors: make([]error, 0), } - if tt.want_time != nil && !item.PubTime.Equal(*tt.want_time) { - t.Errorf("got %s, want %s", item.PubTime, *tt.want_time) + items, err := page.Parse() + if err != nil { + t.Errorf("error: %s", err) + } + for i, item := range items { + if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) { + t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i]) + } + } + }) + } +} + +func TestArticleUrls(t *testing.T) { + var tests = []struct { + name string + input string + want_url []string + }{ + { + "article without url", + "
hello world
", + []string{""}, + }, + { + "one article with one url", + "

Title

hello world
", + []string{"https://example.com"}, + }, + { + "multiple articles", + "

Title

hello world

Title

this is a second article
", + []string{"https://example.com/example1", "https://example.com/example2"}, + }, + { + "one article with multiple urls", + "

Title

click me

hello world

", + []string{"https://example.com"}, + }, + { + "one article with multiple h1 urls", + "

Title

click me

hello world

", + []string{"https://example.com"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func (t *testing.T) { + html, err := html.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Errorf("error: %s", err) + } + page := feed.SitePage{ + Url: "", + Title: "", + Root: html, + Errors: make([]error, 0), + } + items, err := page.Parse() + if err != nil { + t.Errorf("error: %s", err) + } + for i, item := range items { + if item.Url != tt.want_url[i] { + t.Errorf("got %s, want %s", item.Url, tt.want_url[i]) + } + } + }) + } +} + +func TestArticleTitles(t *testing.T) { + var tests = []struct { + name string + input string + want_title []string + }{ + { + "article and time stripped out of basic HTML", + "
hello world
", + []string{"May 14 2004"}, + }, + { + "multiple articles", + "
hello world
this is a second article
", + []string{"May 14 2004", "May 07 2004"}, + }, + { + "multiple articles with h1", + "

Hello

hello world
this is a second article
", + []string{"Hello", "World"}, + }, + { + "article with h1", + "

Hello

hello world
", + []string{"Hello"}, + }, + { + "article with h2", + "

Hello

hello world
", + []string{"Hello"}, + }, + { + "article with h3", + "

Hello

hello world
", + []string{"Hello"}, + }, + { + "article with h1 and h2", + "

Hello

World

hello world
", + []string{"Hello"}, + }, + { + "article with h2 and h3", + "

Hello

World

hello world
", + []string{"World"}, + }, + { + "article with multiple h1", + "

Hello

World

hello world
", + []string{"Hello"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func (t *testing.T) { + html, err := html.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Errorf("error: %s", err) + } + page := feed.SitePage{ + Url: "", + Title: "", + Root: html, + Errors: make([]error, 0), + } + items, err := page.Parse() + if err != nil { + t.Errorf("error: %s", err) + } + for i, item := range items { + if item.Title != tt.want_title[i] { + t.Errorf("got %s, want %s", item.Title, tt.want_title[i]) + } + } + }) + } +} + +func TestArticleParsing(t *testing.T) { + var tests = []struct { + name string + input string + want_article []string + }{ + { + "article and time stripped out of basic HTML", + "
hello world
", + []string{"
hello world
"}, + }, + { + "multiple articles", + "
hello world
this is a second article
", + []string{"
hello world
", "
this is a second article
"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func (t *testing.T) { + html, err := html.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Errorf("error: %s", err) + } + page := feed.SitePage{ + Url: "", + Title: "", + Root: html, + Errors: make([]error, 0), + } + items, err := page.Parse() + if err != nil { + t.Errorf("error: %s", err) + } + for i, item := range items { + if item.RawText != tt.want_article[i] { + t.Errorf("got %s, want %s", item.RawText, tt.want_article[i]) + } } }) }