From 0546e9ec7ef331ef2e7a90d0b6738f41b97c77d0 Mon Sep 17 00:00:00 2001 From: yequari Date: Fri, 15 Mar 2024 16:52:03 -0700 Subject: [PATCH] implement multiple article parsing --- feed/feed.go | 111 +++++++++++++++++++++++++++++++++------------- feed/feed_test.go | 70 ++++++++++++++++++++--------- 2 files changed, 129 insertions(+), 52 deletions(-) diff --git a/feed/feed.go b/feed/feed.go index e95fd89..a51d7d5 100644 --- a/feed/feed.go +++ b/feed/feed.go @@ -1,6 +1,7 @@ package feed import ( + "errors" "fmt" "io" "net/http" @@ -47,6 +48,13 @@ type FeedItem struct { RawText string } +type SitePage struct { + Url string + Title string + Root *html.Node + Errors []error +} + func fetchPage(url string) (string, error) { resp, err := http.Get(url) if err != nil { @@ -85,7 +93,7 @@ func parseTime(timestr string) (time.Time, error) { return pagetime, err } } - return pagetime, fmt.Errorf("Error parsing time: invalid format") + return pagetime, fmt.Errorf("%s is in an invalid format", timestr) } func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) { @@ -107,29 +115,36 @@ func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) { return element, nil } -func (f *FeedItem) ParseContent(content string) error { - doc, err := html.Parse(strings.NewReader(content)) - if err != nil { - return fmt.Errorf("Error parsing HTML: %w", err) +func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) { + var f func(*html.Node, string) + elements := make([]*html.Node, 0) + f = func(n *html.Node, s string) { + if n.Type == html.ElementNode && n.Data == s{ + elements = append(elements, n) + return + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c, tag) + } } - earticle, err := getHtmlElement(doc, "article") - if err != nil { - return err - } - etitle, err := getHtmlElement(doc, "title") - if err != nil { - f.Title = "" - } else { - f.Title = etitle.FirstChild.Data + f(doc, tag) + if len(elements) == 0 { + return nil, fmt.Errorf("no <%s> element found", tag) } + return elements, nil +} +func NewFeedItem(url string, article *html.Node) (*FeedItem, error) { var articleBuilder strings.Builder - html.Render(&articleBuilder, earticle) - f.RawText = articleBuilder.String() + html.Render(&articleBuilder, article) + item := FeedItem{ + Url: url, + RawText: articleBuilder.String(), + } - etime, err := getHtmlElement(earticle, "time") + etime, err := getHtmlElement(article, "time") if err != nil { - return err + return nil, err } var pubTime time.Time for _, d := range etime.Attr { @@ -137,26 +152,58 @@ func (f *FeedItem) ParseContent(content string) error { pubTime, err = parseTime(d.Val) } if err != nil { - return fmt.Errorf("Error parsing time: %w", err) + return nil, fmt.Errorf("Error parsing time: %w", err) } - f.PubTime = pubTime + item.PubTime = pubTime } - return nil + return &item, nil } -func NewFeedItem(url string) (*FeedItem, error) { +func (p *SitePage) Parse() ([]*FeedItem, error) { + items := make([]*FeedItem, 0) + articles, err := getAllElements(p.Root, "article") + if err != nil { + return nil, errors.New("No article elements found") + } + for _, article := range articles { + item, parseErr := NewFeedItem(p.Url, article) + if parseErr != nil { + p.Errors = append(p.Errors, parseErr) + } else { + items = append(items, item) + } + } + if len(p.Errors) > 0 { + errorStrs := make([]string, 0) + for _, perr := range p.Errors { + errorStrs = append(errorStrs, perr.Error()) + } + err = errors.New(strings.Join(errorStrs, "\n")) + } + return items, err +} + +func NewSitePage(url string) (*SitePage, error) { rawhtml, err := fetchPage(url) if err != nil { return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err) } - item := FeedItem{ - Url: url, - } - err = item.ParseContent(rawhtml); + nodeRoot, err := html.Parse(strings.NewReader(rawhtml)) if err != nil { - return nil, fmt.Errorf("Could not parse feed item: %w", err) + return nil, fmt.Errorf("Error parsing HTML: %w", err) } - return &item, nil + page := SitePage{ + Url: url, + Root: nodeRoot, + Errors: make([]error, 0), + } + nodeTitle, err := getHtmlElement(nodeRoot, "title") + if err != nil { + page.Title = url + } else { + page.Title = nodeTitle.FirstChild.Data + } + return &page, nil } func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*FeedInfo, error) { @@ -168,11 +215,15 @@ func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*Feed Errors: make(map[string]string, 10), } for _,url := range info.PageUrls { - item, err := NewFeedItem(url) + page, err := NewSitePage(url) + if err != nil { + info.Errors[url] = err.Error() + } + pageItems, err := page.Parse() if err != nil { info.Errors[url] = err.Error() } else { - info.Items = append(info.Items, item) + info.Items = append(info.Items, pageItems...) } } return &info, nil diff --git a/feed/feed_test.go b/feed/feed_test.go index a5cbf2e..5127518 100644 --- a/feed/feed_test.go +++ b/feed/feed_test.go @@ -1,9 +1,12 @@ package feed_test import ( - "testing" - "time" - "git.32bit.cafe/yequari/webweav.ing/feed" + "strings" + "testing" + "time" + + "git.32bit.cafe/yequari/webweav.ing/feed" + "golang.org/x/net/html" ) func TestTimeParsing(t *testing.T) { @@ -19,35 +22,38 @@ func TestTimeParsing(t *testing.T) { }{ { "YYYY-MM-DD", - `
hello world
`, + `
hello world
`, time.DateOnly, }, { "YYYY-MM-DD HH:MM", - `
hello world
`, + `
hello world
`, "2006-01-02 15:04", }, { "YYYY-MM-DD HH:MM:SS", - `
hello world
`, + `
hello world
`, "2006-01-02 15:04:05", }, { "YYYY-MM-DDTHH:MM:SS", - `
hello world
`, + `
hello world
`, "2006-01-02T15:04:05", }, { "YYYY-MM-DDTHH:MM", - `
hello world
`, + `
hello world
`, "2006-01-02T15:04", }, } for _, tt := range tests { t.Run(tt.name, func (t *testing.T) { - item := feed.FeedItem{} - err := item.ParseContent(tt.input) + html, err := html.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Errorf("error: %s", err) + } + item, err := feed.NewFeedItem("", html) if err != nil { t.Errorf("error: %s", err) } @@ -61,38 +67,58 @@ func TestTimeParsing(t *testing.T) { } } - -func TestParseFeedItem(t *testing.T) { +func TestArticleParsing(t *testing.T) { testDate, err := time.Parse("2006-Jan-02", "2004-May-14") if err != nil { t.Errorf("creating test date failed: %s", err) } + testDate2, err := time.Parse("2006-Jan-02", "2004-May-07") + if err != nil { + t.Errorf("creating test date failed: %s", err) + } var tests = []struct { name string input string - want_time *time.Time - want_article string + want_time []*time.Time + want_article []string }{ { "article and time stripped out of basic HTML", "
hello world
", - &testDate, - "
hello world
", + []*time.Time{&testDate}, + []string{"
hello world
"}, + }, + { + "multiple articles", + "
hello world
this is a second article
", + []*time.Time{&testDate, &testDate2}, + []string{"
hello world
", "
this is a second article
"}, }, } for _, tt := range tests { t.Run(tt.name, func (t *testing.T) { - item := feed.FeedItem{} - err := item.ParseContent(tt.input) + html, err := html.Parse(strings.NewReader(tt.input)) if err != nil { t.Errorf("error: %s", err) } - if item.RawText != tt.want_article { - t.Errorf("got %s, want %s", item.RawText, tt.want_article) + page := feed.SitePage{ + Url: "", + Title: "", + Root: html, + Errors: make([]error, 0), } - if tt.want_time != nil && !item.PubTime.Equal(*tt.want_time) { - t.Errorf("got %s, want %s", item.PubTime, *tt.want_time) + items, err := page.Parse() + if err != nil { + t.Errorf("error: %s", err) + } + for i, item := range items { + if item.RawText != tt.want_article[i] { + t.Errorf("got %s, want %s", item.RawText, tt.want_article[i]) + } + if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) { + t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i]) + } } }) }