parse out titles and urls from heading tags
This commit is contained in:
		
							parent
							
								
									684ee15a95
								
							
						
					
					
						commit
						d165de5d86
					
				
							
								
								
									
										65
									
								
								feed/feed.go
									
									
									
									
									
								
							
							
						
						
									
										65
									
								
								feed/feed.go
									
									
									
									
									
								
							@ -5,10 +5,12 @@ import (
 | 
				
			|||||||
	"fmt"
 | 
						"fmt"
 | 
				
			||||||
	"io"
 | 
						"io"
 | 
				
			||||||
	"net/http"
 | 
						"net/http"
 | 
				
			||||||
 | 
						"path"
 | 
				
			||||||
	"strings"
 | 
						"strings"
 | 
				
			||||||
	"time"
 | 
						"time"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	"golang.org/x/net/html"
 | 
						"golang.org/x/net/html"
 | 
				
			||||||
 | 
						"golang.org/x/net/html/atom"
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
const feedfmtopen = `<?xml version="1.0" encoding="utf-8"?>
 | 
					const feedfmtopen = `<?xml version="1.0" encoding="utf-8"?>
 | 
				
			||||||
@ -135,6 +137,43 @@ func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) {
 | 
				
			|||||||
    return elements, nil
 | 
					    return elements, nil
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func getTitleAndUrl(article *html.Node) (string, string, error) {
 | 
				
			||||||
 | 
					    var title string
 | 
				
			||||||
 | 
					    var url string
 | 
				
			||||||
 | 
					    var header *html.Node
 | 
				
			||||||
 | 
					    h1, _ := getHtmlElement(article, "h1")
 | 
				
			||||||
 | 
					    h2, _ := getHtmlElement(article, "h2")
 | 
				
			||||||
 | 
					    h3, _ := getHtmlElement(article, "h3")
 | 
				
			||||||
 | 
					    if h1 != nil {
 | 
				
			||||||
 | 
					        header = h1
 | 
				
			||||||
 | 
					    } else if h2 != nil {
 | 
				
			||||||
 | 
					        header = h2
 | 
				
			||||||
 | 
					    } else if h3 != nil {
 | 
				
			||||||
 | 
					        header = h3
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    if header == nil {
 | 
				
			||||||
 | 
					        return "", "", nil
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    if header.FirstChild.Type != html.ElementNode {
 | 
				
			||||||
 | 
					        title = header.FirstChild.Data
 | 
				
			||||||
 | 
					    } else if header.FirstChild.DataAtom == atom.A {
 | 
				
			||||||
 | 
					        title = header.FirstChild.FirstChild.Data
 | 
				
			||||||
 | 
					        for _, d := range header.FirstChild.Attr {
 | 
				
			||||||
 | 
					            if d.Key == "href" {
 | 
				
			||||||
 | 
					                url = d.Val
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    } 
 | 
				
			||||||
 | 
					    if url == "" && header.Parent.DataAtom == atom.A {
 | 
				
			||||||
 | 
					        for _, d := range header.FirstChild.Attr {
 | 
				
			||||||
 | 
					            if d.Key == "href" {
 | 
				
			||||||
 | 
					                url = d.Val
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    return title, url, nil
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
 | 
					func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
 | 
				
			||||||
    var articleBuilder strings.Builder
 | 
					    var articleBuilder strings.Builder
 | 
				
			||||||
    html.Render(&articleBuilder, article)
 | 
					    html.Render(&articleBuilder, article)
 | 
				
			||||||
@ -142,7 +181,6 @@ func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
 | 
				
			|||||||
        Url: url,
 | 
					        Url: url,
 | 
				
			||||||
        RawText: articleBuilder.String(),
 | 
					        RawText: articleBuilder.String(),
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					 | 
				
			||||||
    etime, err := getHtmlElement(article, "time")
 | 
					    etime, err := getHtmlElement(article, "time")
 | 
				
			||||||
    if err != nil {
 | 
					    if err != nil {
 | 
				
			||||||
        return nil, err
 | 
					        return nil, err
 | 
				
			||||||
@ -157,21 +195,18 @@ func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
 | 
				
			|||||||
        }
 | 
					        }
 | 
				
			||||||
        item.PubTime = pubTime
 | 
					        item.PubTime = pubTime
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					    title, itemurl, _ := getTitleAndUrl(article)
 | 
				
			||||||
    eh1, _ := getHtmlElement(article, "h1")
 | 
					    if title == "" {
 | 
				
			||||||
    eh2, _ := getHtmlElement(article, "h2")
 | 
					        title = pubTime.Format("Jan 02 2006")
 | 
				
			||||||
    eh3, _ := getHtmlElement(article, "h3")
 | 
					 | 
				
			||||||
    if eh1 != nil {
 | 
					 | 
				
			||||||
        item.Title = eh1.FirstChild.Data
 | 
					 | 
				
			||||||
        // TODO: handle <a>
 | 
					 | 
				
			||||||
    } else if eh2 != nil {
 | 
					 | 
				
			||||||
        item.Title = eh2.FirstChild.Data
 | 
					 | 
				
			||||||
    } else if eh3 != nil {
 | 
					 | 
				
			||||||
        item.Title = eh3.FirstChild.Data
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
        item.Title = pubTime.Format("Jan 02 2006")
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					    if itemurl == "" {
 | 
				
			||||||
 | 
					        itemurl = url
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    if itemurl != "" && !strings.HasPrefix(itemurl, "http://") && !strings.HasPrefix(itemurl, "https://") {
 | 
				
			||||||
 | 
					        itemurl = path.Join(url, itemurl)
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    item.Title = title
 | 
				
			||||||
 | 
					    item.Url = itemurl
 | 
				
			||||||
    return &item, nil 
 | 
					    return &item, nil 
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -9,7 +9,7 @@ import (
 | 
				
			|||||||
	"golang.org/x/net/html"
 | 
						"golang.org/x/net/html"
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func TestTimeParsing(t *testing.T) {
 | 
					func TestTimeParsingFormat(t *testing.T) {
 | 
				
			||||||
    testDate, err := time.Parse("2006-Jan-02 15:04:05 -7", "2004-May-14 07:30:55 -7")
 | 
					    testDate, err := time.Parse("2006-Jan-02 15:04:05 -7", "2004-May-14 07:30:55 -7")
 | 
				
			||||||
    if err != nil {
 | 
					    if err != nil {
 | 
				
			||||||
        t.Errorf("creating test date failed: %s", err)
 | 
					        t.Errorf("creating test date failed: %s", err)
 | 
				
			||||||
@ -67,7 +67,7 @@ func TestTimeParsing(t *testing.T) {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func TestArticleParsing(t *testing.T) {
 | 
					func TestArticleTimeParsing(t *testing.T) {
 | 
				
			||||||
    testDate, err := time.Parse("2006-Jan-02", "2004-May-14") 
 | 
					    testDate, err := time.Parse("2006-Jan-02", "2004-May-14") 
 | 
				
			||||||
    if err != nil {
 | 
					    if err != nil {
 | 
				
			||||||
        t.Errorf("creating test date failed: %s", err)
 | 
					        t.Errorf("creating test date failed: %s", err)
 | 
				
			||||||
@ -80,67 +80,198 @@ func TestArticleParsing(t *testing.T) {
 | 
				
			|||||||
        name string
 | 
					        name string
 | 
				
			||||||
        input string
 | 
					        input string
 | 
				
			||||||
        want_time []*time.Time
 | 
					        want_time []*time.Time
 | 
				
			||||||
        want_article []string
 | 
					 | 
				
			||||||
        want_title []string
 | 
					 | 
				
			||||||
    }{
 | 
					    }{
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "article and time stripped out of basic HTML",
 | 
					                "article and time stripped out of basic HTML",
 | 
				
			||||||
                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>", 
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>", 
 | 
				
			||||||
                []*time.Time{&testDate}, 
 | 
					                []*time.Time{&testDate}, 
 | 
				
			||||||
                []string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>"},
 | 
					 | 
				
			||||||
                []string{"May 14 2004"},
 | 
					 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "multiple articles",
 | 
					                "multiple articles",
 | 
				
			||||||
                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>", 
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>", 
 | 
				
			||||||
                []*time.Time{&testDate, &testDate2},
 | 
					                []*time.Time{&testDate, &testDate2},
 | 
				
			||||||
                []string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>", "<article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article>"},
 | 
					            },
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for _, tt := range tests {
 | 
				
			||||||
 | 
					        t.Run(tt.name, func (t *testing.T) {
 | 
				
			||||||
 | 
					            html, err := html.Parse(strings.NewReader(tt.input))
 | 
				
			||||||
 | 
					            if err != nil {
 | 
				
			||||||
 | 
					                t.Errorf("error: %s", err)
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            page := feed.SitePage{
 | 
				
			||||||
 | 
					                Url: "",
 | 
				
			||||||
 | 
					                Title: "",
 | 
				
			||||||
 | 
					                Root: html, 
 | 
				
			||||||
 | 
					                Errors: make([]error, 0),
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            items, err := page.Parse()
 | 
				
			||||||
 | 
					            if err != nil {
 | 
				
			||||||
 | 
					                t.Errorf("error: %s", err)
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            for i, item := range items {
 | 
				
			||||||
 | 
					                if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) {
 | 
				
			||||||
 | 
					                    t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i])
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					        })
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func TestArticleUrls(t *testing.T) {
 | 
				
			||||||
 | 
					    var tests = []struct {
 | 
				
			||||||
 | 
					        name string
 | 
				
			||||||
 | 
					        input string
 | 
				
			||||||
 | 
					        want_url []string
 | 
				
			||||||
 | 
					    }{
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "article without url",
 | 
				
			||||||
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>", 
 | 
				
			||||||
 | 
					                []string{""},
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "one article with one url",
 | 
				
			||||||
 | 
					                "<html><head></head><body><article><h1><a href=\"https://example.com\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>", 
 | 
				
			||||||
 | 
					                []string{"https://example.com"},
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "multiple articles",
 | 
				
			||||||
 | 
					                "<html><head></head><body><article><h1><a href=\"https://example.com/example1\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><h1><a href=\"https://example.com/example2\">Title</a></h1><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>", 
 | 
				
			||||||
 | 
					                []string{"https://example.com/example1", "https://example.com/example2"},
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "one article with multiple urls",
 | 
				
			||||||
 | 
					                "<html><head></head><body><article><h1><a href=\"https://example.com\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time><a href=\"https://google.com\">click me</a><p>hello world</p></article></body></html>", 
 | 
				
			||||||
 | 
					                []string{"https://example.com"},
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "one article with multiple h1 urls",
 | 
				
			||||||
 | 
					                "<html><head></head><body><article><h1><a href=\"https://example.com\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time><h1><a href=\"https://google.com\">click me</a></h1><p>hello world</p></article></body></html>", 
 | 
				
			||||||
 | 
					                []string{"https://example.com"},
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for _, tt := range tests {
 | 
				
			||||||
 | 
					        t.Run(tt.name, func (t *testing.T) {
 | 
				
			||||||
 | 
					            html, err := html.Parse(strings.NewReader(tt.input))
 | 
				
			||||||
 | 
					            if err != nil {
 | 
				
			||||||
 | 
					                t.Errorf("error: %s", err)
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            page := feed.SitePage{
 | 
				
			||||||
 | 
					                Url: "",
 | 
				
			||||||
 | 
					                Title: "",
 | 
				
			||||||
 | 
					                Root: html, 
 | 
				
			||||||
 | 
					                Errors: make([]error, 0),
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            items, err := page.Parse()
 | 
				
			||||||
 | 
					            if err != nil {
 | 
				
			||||||
 | 
					                t.Errorf("error: %s", err)
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            for i, item := range items {
 | 
				
			||||||
 | 
					                if item.Url != tt.want_url[i] {
 | 
				
			||||||
 | 
					                    t.Errorf("got %s, want %s", item.Url, tt.want_url[i])
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					        })
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func TestArticleTitles(t *testing.T) {
 | 
				
			||||||
 | 
					    var tests = []struct {
 | 
				
			||||||
 | 
					        name string
 | 
				
			||||||
 | 
					        input string
 | 
				
			||||||
 | 
					        want_title []string
 | 
				
			||||||
 | 
					    }{
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "article and time stripped out of basic HTML",
 | 
				
			||||||
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>", 
 | 
				
			||||||
 | 
					                []string{"May 14 2004"},
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "multiple articles",
 | 
				
			||||||
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>", 
 | 
				
			||||||
                []string{"May 14 2004", "May 07 2004"},
 | 
					                []string{"May 14 2004", "May 07 2004"},
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "multiple articles with h1",
 | 
				
			||||||
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article><article><time datetime=\"2004-05-07\"><h1>World</h1>May 7 2004</time>this is a second article</article></body></html>", 
 | 
				
			||||||
 | 
					                []string{"Hello", "World"},
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "article with h1",
 | 
					                "article with h1",
 | 
				
			||||||
                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article></body></html>", 
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article></body></html>", 
 | 
				
			||||||
                []*time.Time{&testDate}, 
 | 
					 | 
				
			||||||
                []string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article>"},
 | 
					 | 
				
			||||||
                []string{"Hello"},
 | 
					                []string{"Hello"},
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "article with h2",
 | 
					                "article with h2",
 | 
				
			||||||
                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h2>Hello</h2>hello world</article></body></html>", 
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h2>Hello</h2>hello world</article></body></html>", 
 | 
				
			||||||
                []*time.Time{&testDate}, 
 | 
					 | 
				
			||||||
                []string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h2>Hello</h2>hello world</article>"},
 | 
					 | 
				
			||||||
                []string{"Hello"},
 | 
					                []string{"Hello"},
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "article with h3",
 | 
					                "article with h3",
 | 
				
			||||||
                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3>hello world</article></body></html>", 
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3>hello world</article></body></html>", 
 | 
				
			||||||
                []*time.Time{&testDate}, 
 | 
					 | 
				
			||||||
                []string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3>hello world</article>"},
 | 
					 | 
				
			||||||
                []string{"Hello"},
 | 
					                []string{"Hello"},
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "article with h1 and h2",
 | 
					                "article with h1 and h2",
 | 
				
			||||||
                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h2>World</h2>hello world</article></body></html>", 
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h2>World</h2>hello world</article></body></html>", 
 | 
				
			||||||
                []*time.Time{&testDate}, 
 | 
					 | 
				
			||||||
                []string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h2>World</h2>hello world</article>"},
 | 
					 | 
				
			||||||
                []string{"Hello"},
 | 
					                []string{"Hello"},
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "article with h2 and h3",
 | 
					                "article with h2 and h3",
 | 
				
			||||||
                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3><h2>World</h2>hello world</article></body></html>", 
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3><h2>World</h2>hello world</article></body></html>", 
 | 
				
			||||||
                []*time.Time{&testDate}, 
 | 
					 | 
				
			||||||
                []string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3><h2>World</h2>hello world</article>"},
 | 
					 | 
				
			||||||
                []string{"World"},
 | 
					                []string{"World"},
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "article with multiple h1",
 | 
					                "article with multiple h1",
 | 
				
			||||||
                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h1>World</h1>hello world</article></body></html>", 
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h1>World</h1>hello world</article></body></html>", 
 | 
				
			||||||
                []*time.Time{&testDate}, 
 | 
					 | 
				
			||||||
                []string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h1>World</h1>hello world</article>"},
 | 
					 | 
				
			||||||
                []string{"Hello"},
 | 
					                []string{"Hello"},
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for _, tt := range tests {
 | 
				
			||||||
 | 
					        t.Run(tt.name, func (t *testing.T) {
 | 
				
			||||||
 | 
					            html, err := html.Parse(strings.NewReader(tt.input))
 | 
				
			||||||
 | 
					            if err != nil {
 | 
				
			||||||
 | 
					                t.Errorf("error: %s", err)
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            page := feed.SitePage{
 | 
				
			||||||
 | 
					                Url: "",
 | 
				
			||||||
 | 
					                Title: "",
 | 
				
			||||||
 | 
					                Root: html, 
 | 
				
			||||||
 | 
					                Errors: make([]error, 0),
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            items, err := page.Parse()
 | 
				
			||||||
 | 
					            if err != nil {
 | 
				
			||||||
 | 
					                t.Errorf("error: %s", err)
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            for i, item := range items {
 | 
				
			||||||
 | 
					                if item.Title != tt.want_title[i] {
 | 
				
			||||||
 | 
					                    t.Errorf("got %s, want %s", item.Title, tt.want_title[i])
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					        })
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func TestArticleParsing(t *testing.T) {
 | 
				
			||||||
 | 
					    var tests = []struct {
 | 
				
			||||||
 | 
					        name string
 | 
				
			||||||
 | 
					        input string
 | 
				
			||||||
 | 
					        want_article []string
 | 
				
			||||||
 | 
					    }{
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "article and time stripped out of basic HTML",
 | 
				
			||||||
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>", 
 | 
				
			||||||
 | 
					                []string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>"},
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "multiple articles",
 | 
				
			||||||
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>", 
 | 
				
			||||||
 | 
					                []string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>", "<article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article>"},
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for _, tt := range tests {
 | 
					    for _, tt := range tests {
 | 
				
			||||||
        t.Run(tt.name, func (t *testing.T) {
 | 
					        t.Run(tt.name, func (t *testing.T) {
 | 
				
			||||||
            html, err := html.Parse(strings.NewReader(tt.input))
 | 
					            html, err := html.Parse(strings.NewReader(tt.input))
 | 
				
			||||||
@ -161,12 +292,6 @@ func TestArticleParsing(t *testing.T) {
 | 
				
			|||||||
                if item.RawText != tt.want_article[i] {
 | 
					                if item.RawText != tt.want_article[i] {
 | 
				
			||||||
                    t.Errorf("got %s, want %s", item.RawText, tt.want_article[i])
 | 
					                    t.Errorf("got %s, want %s", item.RawText, tt.want_article[i])
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
                if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) {
 | 
					 | 
				
			||||||
                    t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i])
 | 
					 | 
				
			||||||
                }
 | 
					 | 
				
			||||||
                if item.Title != tt.want_title[i] {
 | 
					 | 
				
			||||||
                    t.Errorf("got %s, want %s", item.Title, tt.want_title[i])
 | 
					 | 
				
			||||||
                }
 | 
					 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
        })
 | 
					        })
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user