From 0546e9ec7ef331ef2e7a90d0b6738f41b97c77d0 Mon Sep 17 00:00:00 2001 From: yequari Date: Fri, 15 Mar 2024 16:52:03 -0700 Subject: [PATCH 1/5] implement multiple article parsing --- feed/feed.go | 111 +++++++++++++++++++++++++++++++++------------- feed/feed_test.go | 70 ++++++++++++++++++++--------- 2 files changed, 129 insertions(+), 52 deletions(-) diff --git a/feed/feed.go b/feed/feed.go index e95fd89..a51d7d5 100644 --- a/feed/feed.go +++ b/feed/feed.go @@ -1,6 +1,7 @@ package feed import ( + "errors" "fmt" "io" "net/http" @@ -47,6 +48,13 @@ type FeedItem struct { RawText string } +type SitePage struct { + Url string + Title string + Root *html.Node + Errors []error +} + func fetchPage(url string) (string, error) { resp, err := http.Get(url) if err != nil { @@ -85,7 +93,7 @@ func parseTime(timestr string) (time.Time, error) { return pagetime, err } } - return pagetime, fmt.Errorf("Error parsing time: invalid format") + return pagetime, fmt.Errorf("%s is in an invalid format", timestr) } func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) { @@ -107,29 +115,36 @@ func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) { return element, nil } -func (f *FeedItem) ParseContent(content string) error { - doc, err := html.Parse(strings.NewReader(content)) - if err != nil { - return fmt.Errorf("Error parsing HTML: %w", err) +func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) { + var f func(*html.Node, string) + elements := make([]*html.Node, 0) + f = func(n *html.Node, s string) { + if n.Type == html.ElementNode && n.Data == s{ + elements = append(elements, n) + return + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c, tag) + } } - earticle, err := getHtmlElement(doc, "article") - if err != nil { - return err - } - etitle, err := getHtmlElement(doc, "title") - if err != nil { - f.Title = "" - } else { - f.Title = etitle.FirstChild.Data + f(doc, tag) + if len(elements) == 0 { + return nil, fmt.Errorf("no <%s> element found", tag) } + return elements, nil +} +func NewFeedItem(url string, article *html.Node) (*FeedItem, error) { var articleBuilder strings.Builder - html.Render(&articleBuilder, earticle) - f.RawText = articleBuilder.String() + html.Render(&articleBuilder, article) + item := FeedItem{ + Url: url, + RawText: articleBuilder.String(), + } - etime, err := getHtmlElement(earticle, "time") + etime, err := getHtmlElement(article, "time") if err != nil { - return err + return nil, err } var pubTime time.Time for _, d := range etime.Attr { @@ -137,26 +152,58 @@ func (f *FeedItem) ParseContent(content string) error { pubTime, err = parseTime(d.Val) } if err != nil { - return fmt.Errorf("Error parsing time: %w", err) + return nil, fmt.Errorf("Error parsing time: %w", err) } - f.PubTime = pubTime + item.PubTime = pubTime } - return nil + return &item, nil } -func NewFeedItem(url string) (*FeedItem, error) { +func (p *SitePage) Parse() ([]*FeedItem, error) { + items := make([]*FeedItem, 0) + articles, err := getAllElements(p.Root, "article") + if err != nil { + return nil, errors.New("No article elements found") + } + for _, article := range articles { + item, parseErr := NewFeedItem(p.Url, article) + if parseErr != nil { + p.Errors = append(p.Errors, parseErr) + } else { + items = append(items, item) + } + } + if len(p.Errors) > 0 { + errorStrs := make([]string, 0) + for _, perr := range p.Errors { + errorStrs = append(errorStrs, perr.Error()) + } + err = errors.New(strings.Join(errorStrs, "\n")) + } + return items, err +} + +func NewSitePage(url string) (*SitePage, error) { rawhtml, err := fetchPage(url) if err != nil { return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err) } - item := FeedItem{ - Url: url, - } - err = item.ParseContent(rawhtml); + nodeRoot, err := html.Parse(strings.NewReader(rawhtml)) if err != nil { - return nil, fmt.Errorf("Could not parse feed item: %w", err) + return nil, fmt.Errorf("Error parsing HTML: %w", err) } - return &item, nil + page := SitePage{ + Url: url, + Root: nodeRoot, + Errors: make([]error, 0), + } + nodeTitle, err := getHtmlElement(nodeRoot, "title") + if err != nil { + page.Title = url + } else { + page.Title = nodeTitle.FirstChild.Data + } + return &page, nil } func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*FeedInfo, error) { @@ -168,11 +215,15 @@ func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*Feed Errors: make(map[string]string, 10), } for _,url := range info.PageUrls { - item, err := NewFeedItem(url) + page, err := NewSitePage(url) + if err != nil { + info.Errors[url] = err.Error() + } + pageItems, err := page.Parse() if err != nil { info.Errors[url] = err.Error() } else { - info.Items = append(info.Items, item) + info.Items = append(info.Items, pageItems...) } } return &info, nil diff --git a/feed/feed_test.go b/feed/feed_test.go index a5cbf2e..5127518 100644 --- a/feed/feed_test.go +++ b/feed/feed_test.go @@ -1,9 +1,12 @@ package feed_test import ( - "testing" - "time" - "git.32bit.cafe/yequari/webweav.ing/feed" + "strings" + "testing" + "time" + + "git.32bit.cafe/yequari/webweav.ing/feed" + "golang.org/x/net/html" ) func TestTimeParsing(t *testing.T) { @@ -19,35 +22,38 @@ func TestTimeParsing(t *testing.T) { }{ { "YYYY-MM-DD", - `
hello world
`, + `
hello world
`, time.DateOnly, }, { "YYYY-MM-DD HH:MM", - `
hello world
`, + `
hello world
`, "2006-01-02 15:04", }, { "YYYY-MM-DD HH:MM:SS", - `
hello world
`, + `
hello world
`, "2006-01-02 15:04:05", }, { "YYYY-MM-DDTHH:MM:SS", - `
hello world
`, + `
hello world
`, "2006-01-02T15:04:05", }, { "YYYY-MM-DDTHH:MM", - `
hello world
`, + `
hello world
`, "2006-01-02T15:04", }, } for _, tt := range tests { t.Run(tt.name, func (t *testing.T) { - item := feed.FeedItem{} - err := item.ParseContent(tt.input) + html, err := html.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Errorf("error: %s", err) + } + item, err := feed.NewFeedItem("", html) if err != nil { t.Errorf("error: %s", err) } @@ -61,38 +67,58 @@ func TestTimeParsing(t *testing.T) { } } - -func TestParseFeedItem(t *testing.T) { +func TestArticleParsing(t *testing.T) { testDate, err := time.Parse("2006-Jan-02", "2004-May-14") if err != nil { t.Errorf("creating test date failed: %s", err) } + testDate2, err := time.Parse("2006-Jan-02", "2004-May-07") + if err != nil { + t.Errorf("creating test date failed: %s", err) + } var tests = []struct { name string input string - want_time *time.Time - want_article string + want_time []*time.Time + want_article []string }{ { "article and time stripped out of basic HTML", "
hello world
", - &testDate, - "
hello world
", + []*time.Time{&testDate}, + []string{"
hello world
"}, + }, + { + "multiple articles", + "
hello world
this is a second article
", + []*time.Time{&testDate, &testDate2}, + []string{"
hello world
", "
this is a second article
"}, }, } for _, tt := range tests { t.Run(tt.name, func (t *testing.T) { - item := feed.FeedItem{} - err := item.ParseContent(tt.input) + html, err := html.Parse(strings.NewReader(tt.input)) if err != nil { t.Errorf("error: %s", err) } - if item.RawText != tt.want_article { - t.Errorf("got %s, want %s", item.RawText, tt.want_article) + page := feed.SitePage{ + Url: "", + Title: "", + Root: html, + Errors: make([]error, 0), } - if tt.want_time != nil && !item.PubTime.Equal(*tt.want_time) { - t.Errorf("got %s, want %s", item.PubTime, *tt.want_time) + items, err := page.Parse() + if err != nil { + t.Errorf("error: %s", err) + } + for i, item := range items { + if item.RawText != tt.want_article[i] { + t.Errorf("got %s, want %s", item.RawText, tt.want_article[i]) + } + if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) { + t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i]) + } } }) } From 3ea57fe25cfef8e78571c280901be3cbe32ca31d Mon Sep 17 00:00:00 2001 From: yequari Date: Tue, 2 Apr 2024 17:24:51 -0700 Subject: [PATCH 2/5] implement basic multiple articles --- feed/feed.go | 7 +++++-- feed/feed_test.go | 5 +++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/feed/feed.go b/feed/feed.go index a51d7d5..ba23e9c 100644 --- a/feed/feed.go +++ b/feed/feed.go @@ -53,6 +53,7 @@ type SitePage struct { Title string Root *html.Node Errors []error + ErrStr string } func fetchPage(url string) (string, error) { @@ -84,6 +85,7 @@ func parseTime(timestr string) (time.Time, error) { "2006-01-02 15:04", "2006-01-02T15:04:05", "2006-01-02T15:04", + // "2006-02-01 15:04", } var pagetime time.Time var err error @@ -178,9 +180,9 @@ func (p *SitePage) Parse() ([]*FeedItem, error) { for _, perr := range p.Errors { errorStrs = append(errorStrs, perr.Error()) } - err = errors.New(strings.Join(errorStrs, "\n")) + p.ErrStr = errors.New(strings.Join(errorStrs, "\n")).Error() } - return items, err + return items, nil } func NewSitePage(url string) (*SitePage, error) { @@ -224,6 +226,7 @@ func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*Feed info.Errors[url] = err.Error() } else { info.Items = append(info.Items, pageItems...) + info.Errors[url] = page.ErrStr } } return &info, nil diff --git a/feed/feed_test.go b/feed/feed_test.go index 5127518..5ae555b 100644 --- a/feed/feed_test.go +++ b/feed/feed_test.go @@ -45,6 +45,11 @@ func TestTimeParsing(t *testing.T) { `
hello world
`, "2006-01-02T15:04", }, + { + "YYYY-DD-MM HH:MM", + `
hello world
`, + "2006-02-01 15:04", + }, } for _, tt := range tests { From 684ee15a9557cd4cb405eb1a3729fc49d8f6fad8 Mon Sep 17 00:00:00 2001 From: yequari Date: Tue, 9 Apr 2024 18:43:27 -0700 Subject: [PATCH 3/5] parse headers --- feed/feed.go | 16 +++++++++++++- feed/feed_test.go | 53 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/feed/feed.go b/feed/feed.go index ba23e9c..ed46939 100644 --- a/feed/feed.go +++ b/feed/feed.go @@ -85,7 +85,6 @@ func parseTime(timestr string) (time.Time, error) { "2006-01-02 15:04", "2006-01-02T15:04:05", "2006-01-02T15:04", - // "2006-02-01 15:04", } var pagetime time.Time var err error @@ -158,6 +157,21 @@ func NewFeedItem(url string, article *html.Node) (*FeedItem, error) { } item.PubTime = pubTime } + + eh1, _ := getHtmlElement(article, "h1") + eh2, _ := getHtmlElement(article, "h2") + eh3, _ := getHtmlElement(article, "h3") + if eh1 != nil { + item.Title = eh1.FirstChild.Data + // TODO: handle + } else if eh2 != nil { + item.Title = eh2.FirstChild.Data + } else if eh3 != nil { + item.Title = eh3.FirstChild.Data + } else { + item.Title = pubTime.Format("Jan 02 2006") + } + return &item, nil } diff --git a/feed/feed_test.go b/feed/feed_test.go index 5ae555b..275d691 100644 --- a/feed/feed_test.go +++ b/feed/feed_test.go @@ -45,11 +45,6 @@ func TestTimeParsing(t *testing.T) { `
hello world
`, "2006-01-02T15:04", }, - { - "YYYY-DD-MM HH:MM", - `
hello world
`, - "2006-02-01 15:04", - }, } for _, tt := range tests { @@ -86,18 +81,63 @@ func TestArticleParsing(t *testing.T) { input string want_time []*time.Time want_article []string + want_title []string }{ { "article and time stripped out of basic HTML", "
hello world
", []*time.Time{&testDate}, []string{"
hello world
"}, + []string{"May 14 2004"}, }, { "multiple articles", "
hello world
this is a second article
", []*time.Time{&testDate, &testDate2}, []string{"
hello world
", "
this is a second article
"}, + []string{"May 14 2004", "May 07 2004"}, + }, + { + "article with h1", + "

Hello

hello world
", + []*time.Time{&testDate}, + []string{"

Hello

hello world
"}, + []string{"Hello"}, + }, + { + "article with h2", + "

Hello

hello world
", + []*time.Time{&testDate}, + []string{"

Hello

hello world
"}, + []string{"Hello"}, + }, + { + "article with h3", + "

Hello

hello world
", + []*time.Time{&testDate}, + []string{"

Hello

hello world
"}, + []string{"Hello"}, + }, + { + "article with h1 and h2", + "

Hello

World

hello world
", + []*time.Time{&testDate}, + []string{"

Hello

World

hello world
"}, + []string{"Hello"}, + }, + { + "article with h2 and h3", + "

Hello

World

hello world
", + []*time.Time{&testDate}, + []string{"

Hello

World

hello world
"}, + []string{"World"}, + }, + { + "article with multiple h1", + "

Hello

World

hello world
", + []*time.Time{&testDate}, + []string{"

Hello

World

hello world
"}, + []string{"Hello"}, }, } @@ -124,6 +164,9 @@ func TestArticleParsing(t *testing.T) { if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) { t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i]) } + if item.Title != tt.want_title[i] { + t.Errorf("got %s, want %s", item.Title, tt.want_title[i]) + } } }) } From d165de5d865c9fed296613ba7e91430352534c16 Mon Sep 17 00:00:00 2001 From: yequari Date: Wed, 10 Apr 2024 20:49:54 -0700 Subject: [PATCH 4/5] parse out titles and urls from heading tags --- feed/feed.go | 65 +++++++++++++---- feed/feed_test.go | 175 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 200 insertions(+), 40 deletions(-) diff --git a/feed/feed.go b/feed/feed.go index ed46939..f10f2e0 100644 --- a/feed/feed.go +++ b/feed/feed.go @@ -5,10 +5,12 @@ import ( "fmt" "io" "net/http" + "path" "strings" "time" "golang.org/x/net/html" + "golang.org/x/net/html/atom" ) const feedfmtopen = ` @@ -135,6 +137,43 @@ func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) { return elements, nil } +func getTitleAndUrl(article *html.Node) (string, string, error) { + var title string + var url string + var header *html.Node + h1, _ := getHtmlElement(article, "h1") + h2, _ := getHtmlElement(article, "h2") + h3, _ := getHtmlElement(article, "h3") + if h1 != nil { + header = h1 + } else if h2 != nil { + header = h2 + } else if h3 != nil { + header = h3 + } + if header == nil { + return "", "", nil + } + if header.FirstChild.Type != html.ElementNode { + title = header.FirstChild.Data + } else if header.FirstChild.DataAtom == atom.A { + title = header.FirstChild.FirstChild.Data + for _, d := range header.FirstChild.Attr { + if d.Key == "href" { + url = d.Val + } + } + } + if url == "" && header.Parent.DataAtom == atom.A { + for _, d := range header.FirstChild.Attr { + if d.Key == "href" { + url = d.Val + } + } + } + return title, url, nil +} + func NewFeedItem(url string, article *html.Node) (*FeedItem, error) { var articleBuilder strings.Builder html.Render(&articleBuilder, article) @@ -142,7 +181,6 @@ func NewFeedItem(url string, article *html.Node) (*FeedItem, error) { Url: url, RawText: articleBuilder.String(), } - etime, err := getHtmlElement(article, "time") if err != nil { return nil, err @@ -157,21 +195,18 @@ func NewFeedItem(url string, article *html.Node) (*FeedItem, error) { } item.PubTime = pubTime } - - eh1, _ := getHtmlElement(article, "h1") - eh2, _ := getHtmlElement(article, "h2") - eh3, _ := getHtmlElement(article, "h3") - if eh1 != nil { - item.Title = eh1.FirstChild.Data - // TODO: handle
- } else if eh2 != nil { - item.Title = eh2.FirstChild.Data - } else if eh3 != nil { - item.Title = eh3.FirstChild.Data - } else { - item.Title = pubTime.Format("Jan 02 2006") + title, itemurl, _ := getTitleAndUrl(article) + if title == "" { + title = pubTime.Format("Jan 02 2006") } - + if itemurl == "" { + itemurl = url + } + if itemurl != "" && !strings.HasPrefix(itemurl, "http://") && !strings.HasPrefix(itemurl, "https://") { + itemurl = path.Join(url, itemurl) + } + item.Title = title + item.Url = itemurl return &item, nil } diff --git a/feed/feed_test.go b/feed/feed_test.go index 275d691..6cb4fed 100644 --- a/feed/feed_test.go +++ b/feed/feed_test.go @@ -9,7 +9,7 @@ import ( "golang.org/x/net/html" ) -func TestTimeParsing(t *testing.T) { +func TestTimeParsingFormat(t *testing.T) { testDate, err := time.Parse("2006-Jan-02 15:04:05 -7", "2004-May-14 07:30:55 -7") if err != nil { t.Errorf("creating test date failed: %s", err) @@ -67,7 +67,7 @@ func TestTimeParsing(t *testing.T) { } } -func TestArticleParsing(t *testing.T) { +func TestArticleTimeParsing(t *testing.T) { testDate, err := time.Parse("2006-Jan-02", "2004-May-14") if err != nil { t.Errorf("creating test date failed: %s", err) @@ -80,67 +80,198 @@ func TestArticleParsing(t *testing.T) { name string input string want_time []*time.Time - want_article []string - want_title []string }{ { "article and time stripped out of basic HTML", "
hello world
", []*time.Time{&testDate}, - []string{"
hello world
"}, - []string{"May 14 2004"}, }, { "multiple articles", "
hello world
this is a second article
", []*time.Time{&testDate, &testDate2}, - []string{"
hello world
", "
this is a second article
"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func (t *testing.T) { + html, err := html.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Errorf("error: %s", err) + } + page := feed.SitePage{ + Url: "", + Title: "", + Root: html, + Errors: make([]error, 0), + } + items, err := page.Parse() + if err != nil { + t.Errorf("error: %s", err) + } + for i, item := range items { + if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) { + t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i]) + } + } + }) + } +} + +func TestArticleUrls(t *testing.T) { + var tests = []struct { + name string + input string + want_url []string + }{ + { + "article without url", + "
hello world
", + []string{""}, + }, + { + "one article with one url", + "

Title

hello world
", + []string{"https://example.com"}, + }, + { + "multiple articles", + "

Title

hello world

Title

this is a second article
", + []string{"https://example.com/example1", "https://example.com/example2"}, + }, + { + "one article with multiple urls", + "", + []string{"https://example.com"}, + }, + { + "one article with multiple h1 urls", + "", + []string{"https://example.com"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func (t *testing.T) { + html, err := html.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Errorf("error: %s", err) + } + page := feed.SitePage{ + Url: "", + Title: "", + Root: html, + Errors: make([]error, 0), + } + items, err := page.Parse() + if err != nil { + t.Errorf("error: %s", err) + } + for i, item := range items { + if item.Url != tt.want_url[i] { + t.Errorf("got %s, want %s", item.Url, tt.want_url[i]) + } + } + }) + } +} + +func TestArticleTitles(t *testing.T) { + var tests = []struct { + name string + input string + want_title []string + }{ + { + "article and time stripped out of basic HTML", + "
hello world
", + []string{"May 14 2004"}, + }, + { + "multiple articles", + "
hello world
this is a second article
", []string{"May 14 2004", "May 07 2004"}, }, + { + "multiple articles with h1", + "

Hello

hello world
this is a second article
", + []string{"Hello", "World"}, + }, { "article with h1", "

Hello

hello world
", - []*time.Time{&testDate}, - []string{"

Hello

hello world
"}, []string{"Hello"}, }, { "article with h2", "

Hello

hello world
", - []*time.Time{&testDate}, - []string{"

Hello

hello world
"}, []string{"Hello"}, }, { "article with h3", "

Hello

hello world
", - []*time.Time{&testDate}, - []string{"

Hello

hello world
"}, []string{"Hello"}, }, { "article with h1 and h2", "

Hello

World

hello world
", - []*time.Time{&testDate}, - []string{"

Hello

World

hello world
"}, []string{"Hello"}, }, { "article with h2 and h3", "

Hello

World

hello world
", - []*time.Time{&testDate}, - []string{"

Hello

World

hello world
"}, []string{"World"}, }, { "article with multiple h1", "

Hello

World

hello world
", - []*time.Time{&testDate}, - []string{"

Hello

World

hello world
"}, []string{"Hello"}, }, } + for _, tt := range tests { + t.Run(tt.name, func (t *testing.T) { + html, err := html.Parse(strings.NewReader(tt.input)) + if err != nil { + t.Errorf("error: %s", err) + } + page := feed.SitePage{ + Url: "", + Title: "", + Root: html, + Errors: make([]error, 0), + } + items, err := page.Parse() + if err != nil { + t.Errorf("error: %s", err) + } + for i, item := range items { + if item.Title != tt.want_title[i] { + t.Errorf("got %s, want %s", item.Title, tt.want_title[i]) + } + } + }) + } +} + +func TestArticleParsing(t *testing.T) { + var tests = []struct { + name string + input string + want_article []string + }{ + { + "article and time stripped out of basic HTML", + "
hello world
", + []string{"
hello world
"}, + }, + { + "multiple articles", + "
hello world
this is a second article
", + []string{"
hello world
", "
this is a second article
"}, + }, + } + for _, tt := range tests { t.Run(tt.name, func (t *testing.T) { html, err := html.Parse(strings.NewReader(tt.input)) @@ -161,12 +292,6 @@ func TestArticleParsing(t *testing.T) { if item.RawText != tt.want_article[i] { t.Errorf("got %s, want %s", item.RawText, tt.want_article[i]) } - if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) { - t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i]) - } - if item.Title != tt.want_title[i] { - t.Errorf("got %s, want %s", item.Title, tt.want_title[i]) - } } }) } From c9e922dd97456727645c354231e38c4277882057 Mon Sep 17 00:00:00 2001 From: yequari Date: Thu, 18 Apr 2024 11:45:00 -0700 Subject: [PATCH 5/5] get first header element --- feed/feed.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/feed/feed.go b/feed/feed.go index f10f2e0..23102f6 100644 --- a/feed/feed.go +++ b/feed/feed.go @@ -141,15 +141,15 @@ func getTitleAndUrl(article *html.Node) (string, string, error) { var title string var url string var header *html.Node - h1, _ := getHtmlElement(article, "h1") - h2, _ := getHtmlElement(article, "h2") - h3, _ := getHtmlElement(article, "h3") - if h1 != nil { - header = h1 - } else if h2 != nil { - header = h2 - } else if h3 != nil { - header = h3 + h1s, _ := getAllElements(article, "h1") + h2s, _ := getAllElements(article, "h2") + h3s, _ := getAllElements(article, "h3") + if len(h1s) > 0 { + header = h1s[0] + } else if len(h2s) > 0 { + header = h2s[0] + } else if len(h3s) > 0 { + header = h3s[0] } if header == nil { return "", "", nil