From 282820cc0210dd630c7d433995fedb30aaf570f5 Mon Sep 17 00:00:00 2001 From: yequari Date: Wed, 10 Apr 2024 20:49:54 -0700 Subject: [PATCH] parse out titles and urls from heading tags --- feed/feed.go | 65 +++++++++++++---- feed/feed_test.go | 175 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 200 insertions(+), 40 deletions(-) diff --git a/feed/feed.go b/feed/feed.go index ed46939..f10f2e0 100644 --- a/feed/feed.go +++ b/feed/feed.go @@ -5,10 +5,12 @@ import ( "fmt" "io" "net/http" + "path" "strings" "time" "golang.org/x/net/html" + "golang.org/x/net/html/atom" ) const feedfmtopen = ` @@ -135,6 +137,43 @@ func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) { return elements, nil } +func getTitleAndUrl(article *html.Node) (string, string, error) { + var title string + var url string + var header *html.Node + h1, _ := getHtmlElement(article, "h1") + h2, _ := getHtmlElement(article, "h2") + h3, _ := getHtmlElement(article, "h3") + if h1 != nil { + header = h1 + } else if h2 != nil { + header = h2 + } else if h3 != nil { + header = h3 + } + if header == nil { + return "", "", nil + } + if header.FirstChild.Type != html.ElementNode { + title = header.FirstChild.Data + } else if header.FirstChild.DataAtom == atom.A { + title = header.FirstChild.FirstChild.Data + for _, d := range header.FirstChild.Attr { + if d.Key == "href" { + url = d.Val + } + } + } + if url == "" && header.Parent.DataAtom == atom.A { + for _, d := range header.FirstChild.Attr { + if d.Key == "href" { + url = d.Val + } + } + } + return title, url, nil +} + func NewFeedItem(url string, article *html.Node) (*FeedItem, error) { var articleBuilder strings.Builder html.Render(&articleBuilder, article) @@ -142,7 +181,6 @@ func NewFeedItem(url string, article *html.Node) (*FeedItem, error) { Url: url, RawText: articleBuilder.String(), } - etime, err := getHtmlElement(article, "time") if err != nil { return nil, err @@ -157,21 +195,18 @@ func NewFeedItem(url string, article *html.Node) (*FeedItem, error) { } item.PubTime = pubTime } - - eh1, _ := getHtmlElement(article, "h1") - eh2, _ := getHtmlElement(article, "h2") - eh3, _ := getHtmlElement(article, "h3") - if eh1 != nil { - item.Title = eh1.FirstChild.Data - // TODO: handle - } else if eh2 != nil { - item.Title = eh2.FirstChild.Data - } else if eh3 != nil { - item.Title = eh3.FirstChild.Data - } else { - item.Title = pubTime.Format("Jan 02 2006") + title, itemurl, _ := getTitleAndUrl(article) + if title == "" { + title = pubTime.Format("Jan 02 2006") } - + if itemurl == "" { + itemurl = url + } + if itemurl != "" && !strings.HasPrefix(itemurl, "http://") && !strings.HasPrefix(itemurl, "https://") { + itemurl = path.Join(url, itemurl) + } + item.Title = title + item.Url = itemurl return &item, nil } diff --git a/feed/feed_test.go b/feed/feed_test.go index 275d691..6cb4fed 100644 --- a/feed/feed_test.go +++ b/feed/feed_test.go @@ -9,7 +9,7 @@ import ( "golang.org/x/net/html" ) -func TestTimeParsing(t *testing.T) { +func TestTimeParsingFormat(t *testing.T) { testDate, err := time.Parse("2006-Jan-02 15:04:05 -7", "2004-May-14 07:30:55 -7") if err != nil { t.Errorf("creating test date failed: %s", err) @@ -67,7 +67,7 @@ func TestTimeParsing(t *testing.T) { } } -func TestArticleParsing(t *testing.T) { +func TestArticleTimeParsing(t *testing.T) { testDate, err := time.Parse("2006-Jan-02", "2004-May-14") if err != nil { t.Errorf("creating test date failed: %s", err) @@ -80,67 +80,198 @@ func TestArticleParsing(t *testing.T) { name string input string want_time []*time.Time - want_article []string - want_title []string }{ { "article and time stripped out of basic HTML", "
hello world
", []*time.Time{&testDate}, - []string{"
hello world
"}, - []string{"May 14 2004"}, }, { "multiple articles", "
hello world
this is a second article
", []*time.Time{&testDate, &testDate2}, - []string{"
hello world
", "
this is a second article
"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func (t *testing.T) { + html, err := html.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Errorf("error: %s", err) + } + page := feed.SitePage{ + Url: "", + Title: "", + Root: html, + Errors: make([]error, 0), + } + items, err := page.Parse() + if err != nil { + t.Errorf("error: %s", err) + } + for i, item := range items { + if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) { + t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i]) + } + } + }) + } +} + +func TestArticleUrls(t *testing.T) { + var tests = []struct { + name string + input string + want_url []string + }{ + { + "article without url", + "
hello world
", + []string{""}, + }, + { + "one article with one url", + "

Title

hello world
", + []string{"https://example.com"}, + }, + { + "multiple articles", + "

Title

hello world

Title

this is a second article
", + []string{"https://example.com/example1", "https://example.com/example2"}, + }, + { + "one article with multiple urls", + "

Title

click me

hello world

", + []string{"https://example.com"}, + }, + { + "one article with multiple h1 urls", + "

Title

click me

hello world

", + []string{"https://example.com"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func (t *testing.T) { + html, err := html.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Errorf("error: %s", err) + } + page := feed.SitePage{ + Url: "", + Title: "", + Root: html, + Errors: make([]error, 0), + } + items, err := page.Parse() + if err != nil { + t.Errorf("error: %s", err) + } + for i, item := range items { + if item.Url != tt.want_url[i] { + t.Errorf("got %s, want %s", item.Url, tt.want_url[i]) + } + } + }) + } +} + +func TestArticleTitles(t *testing.T) { + var tests = []struct { + name string + input string + want_title []string + }{ + { + "article and time stripped out of basic HTML", + "
hello world
", + []string{"May 14 2004"}, + }, + { + "multiple articles", + "
hello world
this is a second article
", []string{"May 14 2004", "May 07 2004"}, }, + { + "multiple articles with h1", + "

Hello

hello world
this is a second article
", + []string{"Hello", "World"}, + }, { "article with h1", "

Hello

hello world
", - []*time.Time{&testDate}, - []string{"

Hello

hello world
"}, []string{"Hello"}, }, { "article with h2", "

Hello

hello world
", - []*time.Time{&testDate}, - []string{"

Hello

hello world
"}, []string{"Hello"}, }, { "article with h3", "

Hello

hello world
", - []*time.Time{&testDate}, - []string{"

Hello

hello world
"}, []string{"Hello"}, }, { "article with h1 and h2", "

Hello

World

hello world
", - []*time.Time{&testDate}, - []string{"

Hello

World

hello world
"}, []string{"Hello"}, }, { "article with h2 and h3", "

Hello

World

hello world
", - []*time.Time{&testDate}, - []string{"

Hello

World

hello world
"}, []string{"World"}, }, { "article with multiple h1", "

Hello

World

hello world
", - []*time.Time{&testDate}, - []string{"

Hello

World

hello world
"}, []string{"Hello"}, }, } + for _, tt := range tests { + t.Run(tt.name, func (t *testing.T) { + html, err := html.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Errorf("error: %s", err) + } + page := feed.SitePage{ + Url: "", + Title: "", + Root: html, + Errors: make([]error, 0), + } + items, err := page.Parse() + if err != nil { + t.Errorf("error: %s", err) + } + for i, item := range items { + if item.Title != tt.want_title[i] { + t.Errorf("got %s, want %s", item.Title, tt.want_title[i]) + } + } + }) + } +} + +func TestArticleParsing(t *testing.T) { + var tests = []struct { + name string + input string + want_article []string + }{ + { + "article and time stripped out of basic HTML", + "
hello world
", + []string{"
hello world
"}, + }, + { + "multiple articles", + "
hello world
this is a second article
", + []string{"
hello world
", "
this is a second article
"}, + }, + } + for _, tt := range tests { t.Run(tt.name, func (t *testing.T) { html, err := html.Parse(strings.NewReader(tt.input)) @@ -161,12 +292,6 @@ func TestArticleParsing(t *testing.T) { if item.RawText != tt.want_article[i] { t.Errorf("got %s, want %s", item.RawText, tt.want_article[i]) } - if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) { - t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i]) - } - if item.Title != tt.want_title[i] { - t.Errorf("got %s, want %s", item.Title, tt.want_title[i]) - } } }) }