implement multiple article parsing
This commit is contained in:
		
							parent
							
								
									d4041140d2
								
							
						
					
					
						commit
						b778f07a33
					
				
							
								
								
									
										109
									
								
								feed/feed.go
									
									
									
									
									
								
							
							
						
						
									
										109
									
								
								feed/feed.go
									
									
									
									
									
								
							@ -1,6 +1,7 @@
 | 
				
			|||||||
package feed
 | 
					package feed
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import (
 | 
					import (
 | 
				
			||||||
 | 
						"errors"
 | 
				
			||||||
	"fmt"
 | 
						"fmt"
 | 
				
			||||||
	"io"
 | 
						"io"
 | 
				
			||||||
	"net/http"
 | 
						"net/http"
 | 
				
			||||||
@ -47,6 +48,13 @@ type FeedItem struct {
 | 
				
			|||||||
    RawText     string
 | 
					    RawText     string
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					type SitePage struct {
 | 
				
			||||||
 | 
					    Url     string
 | 
				
			||||||
 | 
					    Title   string
 | 
				
			||||||
 | 
					    Root    *html.Node
 | 
				
			||||||
 | 
					    Errors  []error
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func fetchPage(url string) (string, error) {
 | 
					func fetchPage(url string) (string, error) {
 | 
				
			||||||
    resp, err := http.Get(url)
 | 
					    resp, err := http.Get(url)
 | 
				
			||||||
    if err != nil {
 | 
					    if err != nil {
 | 
				
			||||||
@ -85,7 +93,7 @@ func parseTime(timestr string) (time.Time, error) {
 | 
				
			|||||||
            return pagetime, err
 | 
					            return pagetime, err
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return pagetime, fmt.Errorf("Error parsing time: invalid format")
 | 
					    return pagetime, fmt.Errorf("%s is in an invalid format", timestr)
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
 | 
					func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
 | 
				
			||||||
@ -107,29 +115,36 @@ func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
 | 
				
			|||||||
    return element, nil
 | 
					    return element, nil
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func (f *FeedItem) ParseContent(content string) error {
 | 
					func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) {
 | 
				
			||||||
    doc, err := html.Parse(strings.NewReader(content))
 | 
					    var f func(*html.Node, string)
 | 
				
			||||||
    if err != nil {
 | 
					    elements := make([]*html.Node, 0)
 | 
				
			||||||
        return fmt.Errorf("Error parsing HTML: %w", err)
 | 
					    f = func(n *html.Node, s string) {
 | 
				
			||||||
 | 
					        if n.Type == html.ElementNode && n.Data == s{
 | 
				
			||||||
 | 
					            elements = append(elements, n)
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    earticle, err := getHtmlElement(doc, "article")
 | 
					        for c := n.FirstChild; c != nil; c = c.NextSibling {
 | 
				
			||||||
    if err != nil {
 | 
					            f(c, tag)
 | 
				
			||||||
        return err
 | 
					 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    etitle, err := getHtmlElement(doc, "title")
 | 
					    }
 | 
				
			||||||
    if err != nil {
 | 
					    f(doc, tag)
 | 
				
			||||||
        f.Title = ""
 | 
					    if len(elements) == 0 {
 | 
				
			||||||
    } else {
 | 
					        return nil, fmt.Errorf("no <%s> element found", tag)
 | 
				
			||||||
        f.Title = etitle.FirstChild.Data
 | 
					    }
 | 
				
			||||||
 | 
					    return elements, nil
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
 | 
				
			||||||
    var articleBuilder strings.Builder
 | 
					    var articleBuilder strings.Builder
 | 
				
			||||||
    html.Render(&articleBuilder, earticle)
 | 
					    html.Render(&articleBuilder, article)
 | 
				
			||||||
    f.RawText = articleBuilder.String()
 | 
					    item := FeedItem{
 | 
				
			||||||
 | 
					        Url: url,
 | 
				
			||||||
 | 
					        RawText: articleBuilder.String(),
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    etime, err := getHtmlElement(earticle, "time")
 | 
					    etime, err := getHtmlElement(article, "time")
 | 
				
			||||||
    if err != nil {
 | 
					    if err != nil {
 | 
				
			||||||
        return err
 | 
					        return nil, err
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    var pubTime time.Time
 | 
					    var pubTime time.Time
 | 
				
			||||||
    for _, d := range etime.Attr {
 | 
					    for _, d := range etime.Attr {
 | 
				
			||||||
@ -137,26 +152,58 @@ func (f *FeedItem) ParseContent(content string) error {
 | 
				
			|||||||
            pubTime, err = parseTime(d.Val)
 | 
					            pubTime, err = parseTime(d.Val)
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
        if err != nil {
 | 
					        if err != nil {
 | 
				
			||||||
            return  fmt.Errorf("Error parsing time: %w", err)
 | 
					            return nil, fmt.Errorf("Error parsing time: %w", err)
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
        f.PubTime = pubTime
 | 
					        item.PubTime = pubTime
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return nil 
 | 
					    return &item, nil 
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func NewFeedItem(url string) (*FeedItem, error) {
 | 
					func (p *SitePage) Parse() ([]*FeedItem, error) {
 | 
				
			||||||
 | 
					    items := make([]*FeedItem, 0)
 | 
				
			||||||
 | 
					    articles, err := getAllElements(p.Root, "article")
 | 
				
			||||||
 | 
					    if err != nil {
 | 
				
			||||||
 | 
					        return nil, errors.New("No article elements found")
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    for _, article := range articles {
 | 
				
			||||||
 | 
					        item, parseErr := NewFeedItem(p.Url, article)
 | 
				
			||||||
 | 
					        if parseErr != nil {
 | 
				
			||||||
 | 
					            p.Errors = append(p.Errors, parseErr)
 | 
				
			||||||
 | 
					        } else {
 | 
				
			||||||
 | 
					            items = append(items, item)
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    if len(p.Errors) > 0 {
 | 
				
			||||||
 | 
					        errorStrs := make([]string, 0)
 | 
				
			||||||
 | 
					        for _, perr := range p.Errors {
 | 
				
			||||||
 | 
					            errorStrs = append(errorStrs, perr.Error())
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					        err = errors.New(strings.Join(errorStrs, "\n"))
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    return items, err
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func NewSitePage(url string) (*SitePage, error) {
 | 
				
			||||||
    rawhtml, err := fetchPage(url)
 | 
					    rawhtml, err := fetchPage(url)
 | 
				
			||||||
    if err != nil {
 | 
					    if err != nil {
 | 
				
			||||||
        return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err)
 | 
					        return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err)
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    item := FeedItem{
 | 
					    nodeRoot, err := html.Parse(strings.NewReader(rawhtml))
 | 
				
			||||||
        Url: url,
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    err = item.ParseContent(rawhtml);
 | 
					 | 
				
			||||||
    if err != nil {
 | 
					    if err != nil {
 | 
				
			||||||
        return nil, fmt.Errorf("Could not parse feed item: %w", err)
 | 
					        return nil, fmt.Errorf("Error parsing HTML: %w", err)
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return &item, nil
 | 
					    page := SitePage{
 | 
				
			||||||
 | 
					        Url: url,
 | 
				
			||||||
 | 
					        Root: nodeRoot,
 | 
				
			||||||
 | 
					        Errors: make([]error, 0),
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    nodeTitle, err := getHtmlElement(nodeRoot, "title")
 | 
				
			||||||
 | 
					    if err != nil {
 | 
				
			||||||
 | 
					        page.Title = url
 | 
				
			||||||
 | 
					    } else {
 | 
				
			||||||
 | 
					        page.Title = nodeTitle.FirstChild.Data
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    return &page, nil
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*FeedInfo, error) {
 | 
					func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*FeedInfo, error) {
 | 
				
			||||||
@ -168,11 +215,15 @@ func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*Feed
 | 
				
			|||||||
        Errors: make(map[string]string, 10),
 | 
					        Errors: make(map[string]string, 10),
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    for _,url := range info.PageUrls {
 | 
					    for _,url := range info.PageUrls {
 | 
				
			||||||
        item, err := NewFeedItem(url)
 | 
					        page, err := NewSitePage(url)
 | 
				
			||||||
 | 
					        if err != nil {
 | 
				
			||||||
 | 
					            info.Errors[url] = err.Error()
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					        pageItems, err := page.Parse()
 | 
				
			||||||
        if err != nil {
 | 
					        if err != nil {
 | 
				
			||||||
            info.Errors[url] = err.Error()
 | 
					            info.Errors[url] = err.Error()
 | 
				
			||||||
        } else {
 | 
					        } else {
 | 
				
			||||||
            info.Items = append(info.Items, item)
 | 
					            info.Items = append(info.Items, pageItems...)
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return &info, nil
 | 
					    return &info, nil
 | 
				
			||||||
 | 
				
			|||||||
@ -1,9 +1,12 @@
 | 
				
			|||||||
package feed_test
 | 
					package feed_test
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import (
 | 
					import (
 | 
				
			||||||
 | 
						"strings"
 | 
				
			||||||
	"testing"
 | 
						"testing"
 | 
				
			||||||
	"time"
 | 
						"time"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	"git.32bit.cafe/yequari/webweav.ing/feed"
 | 
						"git.32bit.cafe/yequari/webweav.ing/feed"
 | 
				
			||||||
 | 
						"golang.org/x/net/html"
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func TestTimeParsing(t *testing.T) {
 | 
					func TestTimeParsing(t *testing.T) {
 | 
				
			||||||
@ -19,35 +22,38 @@ func TestTimeParsing(t *testing.T) {
 | 
				
			|||||||
    }{
 | 
					    }{
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "YYYY-MM-DD",
 | 
					                "YYYY-MM-DD",
 | 
				
			||||||
                `<html><head></head><body><article><time datetime="2004-05-14">May 14 2004</time>hello world</article></body></html>`,
 | 
					                `<article><time datetime="2004-05-14">May 14 2004</time>hello world</article>`,
 | 
				
			||||||
                time.DateOnly,
 | 
					                time.DateOnly,
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "YYYY-MM-DD HH:MM",
 | 
					                "YYYY-MM-DD HH:MM",
 | 
				
			||||||
                `<html><head></head><body><article><time datetime="2004-05-14 07:30">May 14 2004</time>hello world</article></body></html>`,
 | 
					                `<article><time datetime="2004-05-14 07:30">May 14 2004</time>hello world</article>`,
 | 
				
			||||||
                "2006-01-02 15:04",
 | 
					                "2006-01-02 15:04",
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "YYYY-MM-DD HH:MM:SS",
 | 
					                "YYYY-MM-DD HH:MM:SS",
 | 
				
			||||||
                `<html><head></head><body><article><time datetime="2004-05-14 07:30:55">May 14 2004</time>hello world</article></body></html>`,
 | 
					                `<article><time datetime="2004-05-14 07:30:55">May 14 2004</time>hello world</article>`,
 | 
				
			||||||
                "2006-01-02 15:04:05",
 | 
					                "2006-01-02 15:04:05",
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "YYYY-MM-DDTHH:MM:SS",
 | 
					                "YYYY-MM-DDTHH:MM:SS",
 | 
				
			||||||
                `<html><head></head><body><article><time datetime="2004-05-14T07:30:55">May 14 2004</time>hello world</article></body></html>`,
 | 
					                `<article><time datetime="2004-05-14T07:30:55">May 14 2004</time>hello world</article>`,
 | 
				
			||||||
                "2006-01-02T15:04:05",
 | 
					                "2006-01-02T15:04:05",
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "YYYY-MM-DDTHH:MM",
 | 
					                "YYYY-MM-DDTHH:MM",
 | 
				
			||||||
                `<html><head></head><body><article><time datetime="2004-05-14T07:30">May 14 2004</time>hello world</article></body></html>`,
 | 
					                `<article><time datetime="2004-05-14T07:30">May 14 2004</time>hello world</article>`,
 | 
				
			||||||
                "2006-01-02T15:04",
 | 
					                "2006-01-02T15:04",
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for _, tt := range tests {
 | 
					    for _, tt := range tests {
 | 
				
			||||||
        t.Run(tt.name, func (t *testing.T) {
 | 
					        t.Run(tt.name, func (t *testing.T) {
 | 
				
			||||||
            item := feed.FeedItem{}
 | 
					            html, err := html.Parse(strings.NewReader(tt.input))
 | 
				
			||||||
            err := item.ParseContent(tt.input)
 | 
					            if err != nil {
 | 
				
			||||||
 | 
					                t.Errorf("error: %s", err)
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            item, err := feed.NewFeedItem("", html)
 | 
				
			||||||
            if err != nil {
 | 
					            if err != nil {
 | 
				
			||||||
                t.Errorf("error: %s", err)
 | 
					                t.Errorf("error: %s", err)
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
@ -61,38 +67,58 @@ func TestTimeParsing(t *testing.T) {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					func TestArticleParsing(t *testing.T) {
 | 
				
			||||||
func TestParseFeedItem(t *testing.T) {
 | 
					 | 
				
			||||||
    testDate, err := time.Parse("2006-Jan-02", "2004-May-14") 
 | 
					    testDate, err := time.Parse("2006-Jan-02", "2004-May-14") 
 | 
				
			||||||
    if err != nil {
 | 
					    if err != nil {
 | 
				
			||||||
        t.Errorf("creating test date failed: %s", err)
 | 
					        t.Errorf("creating test date failed: %s", err)
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					    testDate2, err := time.Parse("2006-Jan-02", "2004-May-07") 
 | 
				
			||||||
 | 
					    if err != nil {
 | 
				
			||||||
 | 
					        t.Errorf("creating test date failed: %s", err)
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
    var tests = []struct {
 | 
					    var tests = []struct {
 | 
				
			||||||
        name string
 | 
					        name string
 | 
				
			||||||
        input string
 | 
					        input string
 | 
				
			||||||
        want_time *time.Time
 | 
					        want_time []*time.Time
 | 
				
			||||||
        want_article string
 | 
					        want_article []string
 | 
				
			||||||
    }{
 | 
					    }{
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
                "article and time stripped out of basic HTML",
 | 
					                "article and time stripped out of basic HTML",
 | 
				
			||||||
                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>", 
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>", 
 | 
				
			||||||
                &testDate, 
 | 
					                []*time.Time{&testDate}, 
 | 
				
			||||||
                "<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>",
 | 
					                []string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>"},
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            {
 | 
				
			||||||
 | 
					                "multiple articles",
 | 
				
			||||||
 | 
					                "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>", 
 | 
				
			||||||
 | 
					                []*time.Time{&testDate, &testDate2},
 | 
				
			||||||
 | 
					                []string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>", "<article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article>"},
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for _, tt := range tests {
 | 
					    for _, tt := range tests {
 | 
				
			||||||
        t.Run(tt.name, func (t *testing.T) {
 | 
					        t.Run(tt.name, func (t *testing.T) {
 | 
				
			||||||
            item := feed.FeedItem{}
 | 
					            html, err := html.Parse(strings.NewReader(tt.input))
 | 
				
			||||||
            err := item.ParseContent(tt.input)
 | 
					 | 
				
			||||||
            if err != nil {
 | 
					            if err != nil {
 | 
				
			||||||
                t.Errorf("error: %s", err)
 | 
					                t.Errorf("error: %s", err)
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
            if item.RawText != tt.want_article {
 | 
					            page := feed.SitePage{
 | 
				
			||||||
                t.Errorf("got %s, want %s", item.RawText, tt.want_article)
 | 
					                Url: "",
 | 
				
			||||||
 | 
					                Title: "",
 | 
				
			||||||
 | 
					                Root: html, 
 | 
				
			||||||
 | 
					                Errors: make([]error, 0),
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            items, err := page.Parse()
 | 
				
			||||||
 | 
					            if err != nil {
 | 
				
			||||||
 | 
					                t.Errorf("error: %s", err)
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					            for i, item := range items {
 | 
				
			||||||
 | 
					                if item.RawText != tt.want_article[i] {
 | 
				
			||||||
 | 
					                    t.Errorf("got %s, want %s", item.RawText, tt.want_article[i])
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					                if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) {
 | 
				
			||||||
 | 
					                    t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i])
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
            if tt.want_time != nil && !item.PubTime.Equal(*tt.want_time) {
 | 
					 | 
				
			||||||
                t.Errorf("got %s, want %s", item.PubTime, *tt.want_time)
 | 
					 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
        })
 | 
					        })
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user