Implement Parsing of Multiple Articles on one Page #5

Merged
yequari merged 5 commits from multiple-articles-2 into master 2024-04-18 18:55:50 +00:00
2 changed files with 63 additions and 6 deletions
Showing only changes of commit 684ee15a95 - Show all commits

View File

@ -85,7 +85,6 @@ func parseTime(timestr string) (time.Time, error) {
"2006-01-02 15:04",
"2006-01-02T15:04:05",
"2006-01-02T15:04",
// "2006-02-01 15:04",
}
var pagetime time.Time
var err error
@ -158,6 +157,21 @@ func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
}
item.PubTime = pubTime
}
eh1, _ := getHtmlElement(article, "h1")
eh2, _ := getHtmlElement(article, "h2")
eh3, _ := getHtmlElement(article, "h3")
if eh1 != nil {
item.Title = eh1.FirstChild.Data
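// TODO: handle <a>: when the heading wraps its text in a link, FirstChild is
// presumably the <a> element, so FirstChild.Data is not the title text (verify against html.Node).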
} else if eh2 != nil {
item.Title = eh2.FirstChild.Data
} else if eh3 != nil {
item.Title = eh3.FirstChild.Data
} else {
item.Title = pubTime.Format("Jan 02 2006")
}
return &item, nil
}

View File

@ -45,11 +45,6 @@ func TestTimeParsing(t *testing.T) {
`<article><time datetime="2004-05-14T07:30">May 14 2004</time>hello world</article>`,
"2006-01-02T15:04",
},
{
"YYYY-DD-MM HH:MM",
`<article><time datetime="2004-14-05 07:30">May 14 2004</time>hello world</article>`,
"2006-02-01 15:04",
},
}
for _, tt := range tests {
@ -86,18 +81,63 @@ func TestArticleParsing(t *testing.T) {
input string
want_time []*time.Time
want_article []string
want_title []string
}{
{
"article and time stripped out of basic HTML",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>"},
[]string{"May 14 2004"},
},
{
"multiple articles",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
[]*time.Time{&testDate, &testDate2},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>", "<article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article>"},
[]string{"May 14 2004", "May 07 2004"},
},
{
"article with h1",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article>"},
[]string{"Hello"},
},
{
"article with h2",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h2>Hello</h2>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h2>Hello</h2>hello world</article>"},
[]string{"Hello"},
},
{
"article with h3",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3>hello world</article>"},
[]string{"Hello"},
},
{
"article with h1 and h2",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h2>World</h2>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h2>World</h2>hello world</article>"},
[]string{"Hello"},
},
{
"article with h2 and h3",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3><h2>World</h2>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3><h2>World</h2>hello world</article>"},
[]string{"World"},
},
{
"article with multiple h1",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h1>World</h1>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h1>World</h1>hello world</article>"},
[]string{"Hello"},
},
}
@ -124,6 +164,9 @@ func TestArticleParsing(t *testing.T) {
if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) {
t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i])
}
if item.Title != tt.want_title[i] {
t.Errorf("got %s, want %s", item.Title, tt.want_title[i])
}
}
})
}