Compare commits
No commits in common. "master" and "time-parsing" have entirely different histories.
master
...
time-parsi
169
feed/feed.go
169
feed/feed.go
|
@ -1,16 +1,13 @@
|
||||||
package feed
|
package feed
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
"path"
|
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
"golang.org/x/net/html/atom"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const feedfmtopen = `<?xml version="1.0" encoding="utf-8"?>
|
const feedfmtopen = `<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
@ -50,14 +47,6 @@ type FeedItem struct {
|
||||||
RawText string
|
RawText string
|
||||||
}
|
}
|
||||||
|
|
||||||
type SitePage struct {
|
|
||||||
Url string
|
|
||||||
Title string
|
|
||||||
Root *html.Node
|
|
||||||
Errors []error
|
|
||||||
ErrStr string
|
|
||||||
}
|
|
||||||
|
|
||||||
func fetchPage(url string) (string, error) {
|
func fetchPage(url string) (string, error) {
|
||||||
resp, err := http.Get(url)
|
resp, err := http.Get(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -96,7 +85,7 @@ func parseTime(timestr string) (time.Time, error) {
|
||||||
return pagetime, err
|
return pagetime, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return pagetime, fmt.Errorf("%s is in an invalid format", timestr)
|
return pagetime, fmt.Errorf("Error parsing time: invalid format")
|
||||||
}
|
}
|
||||||
|
|
||||||
func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
|
func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
|
||||||
|
@ -118,72 +107,29 @@ func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
|
||||||
return element, nil
|
return element, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) {
|
func (f *FeedItem) ParseContent(content string) error {
|
||||||
var f func(*html.Node, string)
|
doc, err := html.Parse(strings.NewReader(content))
|
||||||
elements := make([]*html.Node, 0)
|
|
||||||
f = func(n *html.Node, s string) {
|
|
||||||
if n.Type == html.ElementNode && n.Data == s{
|
|
||||||
elements = append(elements, n)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
||||||
f(c, tag)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
f(doc, tag)
|
|
||||||
if len(elements) == 0 {
|
|
||||||
return nil, fmt.Errorf("no <%s> element found", tag)
|
|
||||||
}
|
|
||||||
return elements, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func getTitleAndUrl(article *html.Node) (string, string, error) {
|
|
||||||
var title string
|
|
||||||
var url string
|
|
||||||
var header *html.Node
|
|
||||||
h1s, _ := getAllElements(article, "h1")
|
|
||||||
h2s, _ := getAllElements(article, "h2")
|
|
||||||
h3s, _ := getAllElements(article, "h3")
|
|
||||||
if len(h1s) > 0 {
|
|
||||||
header = h1s[0]
|
|
||||||
} else if len(h2s) > 0 {
|
|
||||||
header = h2s[0]
|
|
||||||
} else if len(h3s) > 0 {
|
|
||||||
header = h3s[0]
|
|
||||||
}
|
|
||||||
if header == nil {
|
|
||||||
return "", "", nil
|
|
||||||
}
|
|
||||||
if header.FirstChild.Type != html.ElementNode {
|
|
||||||
title = header.FirstChild.Data
|
|
||||||
} else if header.FirstChild.DataAtom == atom.A {
|
|
||||||
title = header.FirstChild.FirstChild.Data
|
|
||||||
for _, d := range header.FirstChild.Attr {
|
|
||||||
if d.Key == "href" {
|
|
||||||
url = d.Val
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if url == "" && header.Parent.DataAtom == atom.A {
|
|
||||||
for _, d := range header.FirstChild.Attr {
|
|
||||||
if d.Key == "href" {
|
|
||||||
url = d.Val
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return title, url, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
|
|
||||||
var articleBuilder strings.Builder
|
|
||||||
html.Render(&articleBuilder, article)
|
|
||||||
item := FeedItem{
|
|
||||||
Url: url,
|
|
||||||
RawText: articleBuilder.String(),
|
|
||||||
}
|
|
||||||
etime, err := getHtmlElement(article, "time")
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return fmt.Errorf("Error parsing HTML: %w", err)
|
||||||
|
}
|
||||||
|
earticle, err := getHtmlElement(doc, "article")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
etitle, err := getHtmlElement(doc, "title")
|
||||||
|
if err != nil {
|
||||||
|
f.Title = ""
|
||||||
|
} else {
|
||||||
|
f.Title = etitle.FirstChild.Data
|
||||||
|
}
|
||||||
|
|
||||||
|
var articleBuilder strings.Builder
|
||||||
|
html.Render(&articleBuilder, earticle)
|
||||||
|
f.RawText = articleBuilder.String()
|
||||||
|
|
||||||
|
etime, err := getHtmlElement(earticle, "time")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
var pubTime time.Time
|
var pubTime time.Time
|
||||||
for _, d := range etime.Attr {
|
for _, d := range etime.Attr {
|
||||||
|
@ -191,70 +137,26 @@ func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
|
||||||
pubTime, err = parseTime(d.Val)
|
pubTime, err = parseTime(d.Val)
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("Error parsing time: %w", err)
|
return fmt.Errorf("Error parsing time: %w", err)
|
||||||
}
|
}
|
||||||
item.PubTime = pubTime
|
f.PubTime = pubTime
|
||||||
}
|
}
|
||||||
title, itemurl, _ := getTitleAndUrl(article)
|
return nil
|
||||||
if title == "" {
|
|
||||||
title = pubTime.Format("Jan 02 2006")
|
|
||||||
}
|
|
||||||
if itemurl == "" {
|
|
||||||
itemurl = url
|
|
||||||
}
|
|
||||||
if itemurl != "" && !strings.HasPrefix(itemurl, "http://") && !strings.HasPrefix(itemurl, "https://") {
|
|
||||||
itemurl = path.Join(url, itemurl)
|
|
||||||
}
|
|
||||||
item.Title = title
|
|
||||||
item.Url = itemurl
|
|
||||||
return &item, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *SitePage) Parse() ([]*FeedItem, error) {
|
func NewFeedItem(url string) (*FeedItem, error) {
|
||||||
items := make([]*FeedItem, 0)
|
|
||||||
articles, err := getAllElements(p.Root, "article")
|
|
||||||
if err != nil {
|
|
||||||
return nil, errors.New("No article elements found")
|
|
||||||
}
|
|
||||||
for _, article := range articles {
|
|
||||||
item, parseErr := NewFeedItem(p.Url, article)
|
|
||||||
if parseErr != nil {
|
|
||||||
p.Errors = append(p.Errors, parseErr)
|
|
||||||
} else {
|
|
||||||
items = append(items, item)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if len(p.Errors) > 0 {
|
|
||||||
errorStrs := make([]string, 0)
|
|
||||||
for _, perr := range p.Errors {
|
|
||||||
errorStrs = append(errorStrs, perr.Error())
|
|
||||||
}
|
|
||||||
p.ErrStr = errors.New(strings.Join(errorStrs, "\n")).Error()
|
|
||||||
}
|
|
||||||
return items, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewSitePage(url string) (*SitePage, error) {
|
|
||||||
rawhtml, err := fetchPage(url)
|
rawhtml, err := fetchPage(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err)
|
return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err)
|
||||||
}
|
}
|
||||||
nodeRoot, err := html.Parse(strings.NewReader(rawhtml))
|
item := FeedItem{
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("Error parsing HTML: %w", err)
|
|
||||||
}
|
|
||||||
page := SitePage{
|
|
||||||
Url: url,
|
Url: url,
|
||||||
Root: nodeRoot,
|
|
||||||
Errors: make([]error, 0),
|
|
||||||
}
|
}
|
||||||
nodeTitle, err := getHtmlElement(nodeRoot, "title")
|
err = item.ParseContent(rawhtml);
|
||||||
if err != nil {
|
if err != nil {
|
||||||
page.Title = url
|
return nil, fmt.Errorf("Could not parse feed item: %w", err)
|
||||||
} else {
|
|
||||||
page.Title = nodeTitle.FirstChild.Data
|
|
||||||
}
|
}
|
||||||
return &page, nil
|
return &item, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*FeedInfo, error) {
|
func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*FeedInfo, error) {
|
||||||
|
@ -266,16 +168,11 @@ func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*Feed
|
||||||
Errors: make(map[string]string, 10),
|
Errors: make(map[string]string, 10),
|
||||||
}
|
}
|
||||||
for _,url := range info.PageUrls {
|
for _,url := range info.PageUrls {
|
||||||
page, err := NewSitePage(url)
|
item, err := NewFeedItem(url)
|
||||||
if err != nil {
|
|
||||||
info.Errors[url] = err.Error()
|
|
||||||
}
|
|
||||||
pageItems, err := page.Parse()
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
info.Errors[url] = err.Error()
|
info.Errors[url] = err.Error()
|
||||||
} else {
|
} else {
|
||||||
info.Items = append(info.Items, pageItems...)
|
info.Items = append(info.Items, item)
|
||||||
info.Errors[url] = page.ErrStr
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return &info, nil
|
return &info, nil
|
||||||
|
|
|
@ -1,15 +1,12 @@
|
||||||
package feed_test
|
package feed_test
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"strings"
|
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"git.32bit.cafe/yequari/webweav.ing/feed"
|
"git.32bit.cafe/yequari/webweav.ing/feed"
|
||||||
"golang.org/x/net/html"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestTimeParsingFormat(t *testing.T) {
|
func TestTimeParsing(t *testing.T) {
|
||||||
testDate, err := time.Parse("2006-Jan-02 15:04:05 -7", "2004-May-14 07:30:55 -7")
|
testDate, err := time.Parse("2006-Jan-02 15:04:05 -7", "2004-May-14 07:30:55 -7")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Errorf("creating test date failed: %s", err)
|
t.Errorf("creating test date failed: %s", err)
|
||||||
|
@ -22,38 +19,35 @@ func TestTimeParsingFormat(t *testing.T) {
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
"YYYY-MM-DD",
|
"YYYY-MM-DD",
|
||||||
`<article><time datetime="2004-05-14">May 14 2004</time>hello world</article>`,
|
`<html><head></head><body><article><time datetime="2004-05-14">May 14 2004</time>hello world</article></body></html>`,
|
||||||
time.DateOnly,
|
time.DateOnly,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"YYYY-MM-DD HH:MM",
|
"YYYY-MM-DD HH:MM",
|
||||||
`<article><time datetime="2004-05-14 07:30">May 14 2004</time>hello world</article>`,
|
`<html><head></head><body><article><time datetime="2004-05-14 07:30">May 14 2004</time>hello world</article></body></html>`,
|
||||||
"2006-01-02 15:04",
|
"2006-01-02 15:04",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"YYYY-MM-DD HH:MM:SS",
|
"YYYY-MM-DD HH:MM:SS",
|
||||||
`<article><time datetime="2004-05-14 07:30:55">May 14 2004</time>hello world</article>`,
|
`<html><head></head><body><article><time datetime="2004-05-14 07:30:55">May 14 2004</time>hello world</article></body></html>`,
|
||||||
"2006-01-02 15:04:05",
|
"2006-01-02 15:04:05",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"YYYY-MM-DDTHH:MM:SS",
|
"YYYY-MM-DDTHH:MM:SS",
|
||||||
`<article><time datetime="2004-05-14T07:30:55">May 14 2004</time>hello world</article>`,
|
`<html><head></head><body><article><time datetime="2004-05-14T07:30:55">May 14 2004</time>hello world</article></body></html>`,
|
||||||
"2006-01-02T15:04:05",
|
"2006-01-02T15:04:05",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"YYYY-MM-DDTHH:MM",
|
"YYYY-MM-DDTHH:MM",
|
||||||
`<article><time datetime="2004-05-14T07:30">May 14 2004</time>hello world</article>`,
|
`<html><head></head><body><article><time datetime="2004-05-14T07:30">May 14 2004</time>hello world</article></body></html>`,
|
||||||
"2006-01-02T15:04",
|
"2006-01-02T15:04",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
t.Run(tt.name, func (t *testing.T) {
|
t.Run(tt.name, func (t *testing.T) {
|
||||||
html, err := html.Parse(strings.NewReader(tt.input))
|
item := feed.FeedItem{}
|
||||||
if err != nil {
|
err := item.ParseContent(tt.input)
|
||||||
t.Errorf("error: %s", err)
|
|
||||||
}
|
|
||||||
item, err := feed.NewFeedItem("", html)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Errorf("error: %s", err)
|
t.Errorf("error: %s", err)
|
||||||
}
|
}
|
||||||
|
@ -67,231 +61,38 @@ func TestTimeParsingFormat(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestArticleTimeParsing(t *testing.T) {
|
|
||||||
|
func TestParseFeedItem(t *testing.T) {
|
||||||
testDate, err := time.Parse("2006-Jan-02", "2004-May-14")
|
testDate, err := time.Parse("2006-Jan-02", "2004-May-14")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Errorf("creating test date failed: %s", err)
|
t.Errorf("creating test date failed: %s", err)
|
||||||
}
|
}
|
||||||
testDate2, err := time.Parse("2006-Jan-02", "2004-May-07")
|
|
||||||
if err != nil {
|
|
||||||
t.Errorf("creating test date failed: %s", err)
|
|
||||||
}
|
|
||||||
var tests = []struct {
|
var tests = []struct {
|
||||||
name string
|
name string
|
||||||
input string
|
input string
|
||||||
want_time []*time.Time
|
want_time *time.Time
|
||||||
|
want_article string
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
"article and time stripped out of basic HTML",
|
"article and time stripped out of basic HTML",
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
|
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
|
||||||
[]*time.Time{&testDate},
|
&testDate,
|
||||||
},
|
"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>",
|
||||||
{
|
|
||||||
"multiple articles",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
|
|
||||||
[]*time.Time{&testDate, &testDate2},
|
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
t.Run(tt.name, func (t *testing.T) {
|
t.Run(tt.name, func (t *testing.T) {
|
||||||
html, err := html.Parse(strings.NewReader(tt.input))
|
item := feed.FeedItem{}
|
||||||
|
err := item.ParseContent(tt.input)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Errorf("error: %s", err)
|
t.Errorf("error: %s", err)
|
||||||
}
|
}
|
||||||
page := feed.SitePage{
|
if item.RawText != tt.want_article {
|
||||||
Url: "",
|
t.Errorf("got %s, want %s", item.RawText, tt.want_article)
|
||||||
Title: "",
|
|
||||||
Root: html,
|
|
||||||
Errors: make([]error, 0),
|
|
||||||
}
|
|
||||||
items, err := page.Parse()
|
|
||||||
if err != nil {
|
|
||||||
t.Errorf("error: %s", err)
|
|
||||||
}
|
|
||||||
for i, item := range items {
|
|
||||||
if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) {
|
|
||||||
t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestArticleUrls(t *testing.T) {
|
|
||||||
var tests = []struct {
|
|
||||||
name string
|
|
||||||
input string
|
|
||||||
want_url []string
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
"article without url",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
|
|
||||||
[]string{""},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"one article with one url",
|
|
||||||
"<html><head></head><body><article><h1><a href=\"https://example.com\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
|
|
||||||
[]string{"https://example.com"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"multiple articles",
|
|
||||||
"<html><head></head><body><article><h1><a href=\"https://example.com/example1\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><h1><a href=\"https://example.com/example2\">Title</a></h1><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
|
|
||||||
[]string{"https://example.com/example1", "https://example.com/example2"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"one article with multiple urls",
|
|
||||||
"<html><head></head><body><article><h1><a href=\"https://example.com\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time><a href=\"https://google.com\">click me</a><p>hello world</p></article></body></html>",
|
|
||||||
[]string{"https://example.com"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"one article with multiple h1 urls",
|
|
||||||
"<html><head></head><body><article><h1><a href=\"https://example.com\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time><h1><a href=\"https://google.com\">click me</a></h1><p>hello world</p></article></body></html>",
|
|
||||||
[]string{"https://example.com"},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func (t *testing.T) {
|
|
||||||
html, err := html.Parse(strings.NewReader(tt.input))
|
|
||||||
if err != nil {
|
|
||||||
t.Errorf("error: %s", err)
|
|
||||||
}
|
|
||||||
page := feed.SitePage{
|
|
||||||
Url: "",
|
|
||||||
Title: "",
|
|
||||||
Root: html,
|
|
||||||
Errors: make([]error, 0),
|
|
||||||
}
|
|
||||||
items, err := page.Parse()
|
|
||||||
if err != nil {
|
|
||||||
t.Errorf("error: %s", err)
|
|
||||||
}
|
|
||||||
for i, item := range items {
|
|
||||||
if item.Url != tt.want_url[i] {
|
|
||||||
t.Errorf("got %s, want %s", item.Url, tt.want_url[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestArticleTitles(t *testing.T) {
|
|
||||||
var tests = []struct {
|
|
||||||
name string
|
|
||||||
input string
|
|
||||||
want_title []string
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
"article and time stripped out of basic HTML",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
|
|
||||||
[]string{"May 14 2004"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"multiple articles",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
|
|
||||||
[]string{"May 14 2004", "May 07 2004"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"multiple articles with h1",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article><article><time datetime=\"2004-05-07\"><h1>World</h1>May 7 2004</time>this is a second article</article></body></html>",
|
|
||||||
[]string{"Hello", "World"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"article with h1",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article></body></html>",
|
|
||||||
[]string{"Hello"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"article with h2",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h2>Hello</h2>hello world</article></body></html>",
|
|
||||||
[]string{"Hello"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"article with h3",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3>hello world</article></body></html>",
|
|
||||||
[]string{"Hello"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"article with h1 and h2",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h2>World</h2>hello world</article></body></html>",
|
|
||||||
[]string{"Hello"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"article with h2 and h3",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3><h2>World</h2>hello world</article></body></html>",
|
|
||||||
[]string{"World"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"article with multiple h1",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h1>World</h1>hello world</article></body></html>",
|
|
||||||
[]string{"Hello"},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func (t *testing.T) {
|
|
||||||
html, err := html.Parse(strings.NewReader(tt.input))
|
|
||||||
if err != nil {
|
|
||||||
t.Errorf("error: %s", err)
|
|
||||||
}
|
|
||||||
page := feed.SitePage{
|
|
||||||
Url: "",
|
|
||||||
Title: "",
|
|
||||||
Root: html,
|
|
||||||
Errors: make([]error, 0),
|
|
||||||
}
|
|
||||||
items, err := page.Parse()
|
|
||||||
if err != nil {
|
|
||||||
t.Errorf("error: %s", err)
|
|
||||||
}
|
|
||||||
for i, item := range items {
|
|
||||||
if item.Title != tt.want_title[i] {
|
|
||||||
t.Errorf("got %s, want %s", item.Title, tt.want_title[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestArticleParsing(t *testing.T) {
|
|
||||||
var tests = []struct {
|
|
||||||
name string
|
|
||||||
input string
|
|
||||||
want_article []string
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
"article and time stripped out of basic HTML",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
|
|
||||||
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>"},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"multiple articles",
|
|
||||||
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
|
|
||||||
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>", "<article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article>"},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func (t *testing.T) {
|
|
||||||
html, err := html.Parse(strings.NewReader(tt.input))
|
|
||||||
if err != nil {
|
|
||||||
t.Errorf("error: %s", err)
|
|
||||||
}
|
|
||||||
page := feed.SitePage{
|
|
||||||
Url: "",
|
|
||||||
Title: "",
|
|
||||||
Root: html,
|
|
||||||
Errors: make([]error, 0),
|
|
||||||
}
|
|
||||||
items, err := page.Parse()
|
|
||||||
if err != nil {
|
|
||||||
t.Errorf("error: %s", err)
|
|
||||||
}
|
|
||||||
for i, item := range items {
|
|
||||||
if item.RawText != tt.want_article[i] {
|
|
||||||
t.Errorf("got %s, want %s", item.RawText, tt.want_article[i])
|
|
||||||
}
|
}
|
||||||
|
if tt.want_time != nil && !item.PubTime.Equal(*tt.want_time) {
|
||||||
|
t.Errorf("got %s, want %s", item.PubTime, *tt.want_time)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue