Implement Parsing of Multiple Articles on one Page #5
16
feed/feed.go
16
feed/feed.go
|
@ -85,7 +85,6 @@ func parseTime(timestr string) (time.Time, error) {
|
||||||
"2006-01-02 15:04",
|
"2006-01-02 15:04",
|
||||||
"2006-01-02T15:04:05",
|
"2006-01-02T15:04:05",
|
||||||
"2006-01-02T15:04",
|
"2006-01-02T15:04",
|
||||||
// "2006-02-01 15:04",
|
|
||||||
}
|
}
|
||||||
var pagetime time.Time
|
var pagetime time.Time
|
||||||
var err error
|
var err error
|
||||||
|
@ -158,6 +157,21 @@ func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
|
||||||
}
|
}
|
||||||
item.PubTime = pubTime
|
item.PubTime = pubTime
|
||||||
}
|
}
|
||||||
|
|
||||||
|
eh1, _ := getHtmlElement(article, "h1")
|
||||||
|
eh2, _ := getHtmlElement(article, "h2")
|
||||||
|
eh3, _ := getHtmlElement(article, "h3")
|
||||||
|
if eh1 != nil {
|
||||||
|
item.Title = eh1.FirstChild.Data
|
||||||
|
// TODO: handle <a>
|
||||||
|
} else if eh2 != nil {
|
||||||
|
item.Title = eh2.FirstChild.Data
|
||||||
|
} else if eh3 != nil {
|
||||||
|
item.Title = eh3.FirstChild.Data
|
||||||
|
} else {
|
||||||
|
item.Title = pubTime.Format("Jan 02 2006")
|
||||||
|
}
|
||||||
|
|
||||||
return &item, nil
|
return &item, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -45,11 +45,6 @@ func TestTimeParsing(t *testing.T) {
|
||||||
`<article><time datetime="2004-05-14T07:30">May 14 2004</time>hello world</article>`,
|
`<article><time datetime="2004-05-14T07:30">May 14 2004</time>hello world</article>`,
|
||||||
"2006-01-02T15:04",
|
"2006-01-02T15:04",
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"YYYY-DD-MM HH:MM",
|
|
||||||
`<article><time datetime="2004-14-05 07:30">May 14 2004</time>hello world</article>`,
|
|
||||||
"2006-02-01 15:04",
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
|
@ -86,18 +81,63 @@ func TestArticleParsing(t *testing.T) {
|
||||||
input string
|
input string
|
||||||
want_time []*time.Time
|
want_time []*time.Time
|
||||||
want_article []string
|
want_article []string
|
||||||
|
want_title []string
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
"article and time stripped out of basic HTML",
|
"article and time stripped out of basic HTML",
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
|
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
|
||||||
[]*time.Time{&testDate},
|
[]*time.Time{&testDate},
|
||||||
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>"},
|
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>"},
|
||||||
|
[]string{"May 14 2004"},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"multiple articles",
|
"multiple articles",
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
|
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
|
||||||
[]*time.Time{&testDate, &testDate2},
|
[]*time.Time{&testDate, &testDate2},
|
||||||
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>", "<article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article>"},
|
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>", "<article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article>"},
|
||||||
|
[]string{"May 14 2004", "May 07 2004"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"article with h1",
|
||||||
|
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article></body></html>",
|
||||||
|
[]*time.Time{&testDate},
|
||||||
|
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article>"},
|
||||||
|
[]string{"Hello"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"article with h2",
|
||||||
|
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h2>Hello</h2>hello world</article></body></html>",
|
||||||
|
[]*time.Time{&testDate},
|
||||||
|
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h2>Hello</h2>hello world</article>"},
|
||||||
|
[]string{"Hello"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"article with h3",
|
||||||
|
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3>hello world</article></body></html>",
|
||||||
|
[]*time.Time{&testDate},
|
||||||
|
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3>hello world</article>"},
|
||||||
|
[]string{"Hello"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"article with h1 and h2",
|
||||||
|
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h2>World</h2>hello world</article></body></html>",
|
||||||
|
[]*time.Time{&testDate},
|
||||||
|
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h2>World</h2>hello world</article>"},
|
||||||
|
[]string{"Hello"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"article with h2 and h3",
|
||||||
|
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3><h2>World</h2>hello world</article></body></html>",
|
||||||
|
[]*time.Time{&testDate},
|
||||||
|
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3><h2>World</h2>hello world</article>"},
|
||||||
|
[]string{"World"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"article with multiple h1",
|
||||||
|
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h1>World</h1>hello world</article></body></html>",
|
||||||
|
[]*time.Time{&testDate},
|
||||||
|
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h1>World</h1>hello world</article>"},
|
||||||
|
[]string{"Hello"},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -124,6 +164,9 @@ func TestArticleParsing(t *testing.T) {
|
||||||
if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) {
|
if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) {
|
||||||
t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i])
|
t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i])
|
||||||
}
|
}
|
||||||
|
if item.Title != tt.want_title[i] {
|
||||||
|
t.Errorf("got %s, want %s", item.Title, tt.want_title[i])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue