Compare commits

...

5 Commits

Author SHA1 Message Date
yequari c9e922dd97 get first header element 2024-04-18 11:45:00 -07:00
yequari d165de5d86 parse out titles and urls from heading tags 2024-04-10 20:49:54 -07:00
yequari 684ee15a95 parse headers 2024-04-09 18:43:27 -07:00
yequari 3ea57fe25c implement basic multiple articles 2024-04-02 17:24:51 -07:00
yequari 0546e9ec7e implement multiple article parsing 2024-03-15 16:52:03 -07:00
2 changed files with 356 additions and 54 deletions

View File

@ -1,13 +1,16 @@
package feed package feed
import ( import (
"errors"
"fmt" "fmt"
"io" "io"
"net/http" "net/http"
"path"
"strings" "strings"
"time" "time"
"golang.org/x/net/html" "golang.org/x/net/html"
"golang.org/x/net/html/atom"
) )
const feedfmtopen = `<?xml version="1.0" encoding="utf-8"?> const feedfmtopen = `<?xml version="1.0" encoding="utf-8"?>
@ -47,6 +50,14 @@ type FeedItem struct {
RawText string RawText string
} }
// SitePage is a single HTML page together with its parsed document tree,
// from which Parse extracts one FeedItem per <article> element.
type SitePage struct {
// Url is the page address; NewSitePage stores the URL it fetched, and Parse
// hands it to NewFeedItem for every article on the page.
Url string
// Title is filled by NewSitePage with the text of the page's <title>
// element, or with Url when no <title> element is found.
Title string
// Root is the root node of the parsed document that Parse searches.
Root *html.Node
// Errors collects the error of every article that Parse could not turn into
// a FeedItem.
Errors []error
// ErrStr is set by Parse to the messages in Errors joined with newlines; it
// stays empty when no article failed.
ErrStr string
}
func fetchPage(url string) (string, error) { func fetchPage(url string) (string, error) {
resp, err := http.Get(url) resp, err := http.Get(url)
if err != nil { if err != nil {
@ -85,7 +96,7 @@ func parseTime(timestr string) (time.Time, error) {
return pagetime, err return pagetime, err
} }
} }
return pagetime, fmt.Errorf("Error parsing time: invalid format") return pagetime, fmt.Errorf("%s is in an invalid format", timestr)
} }
func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) { func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
@ -107,29 +118,72 @@ func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) {
return element, nil return element, nil
} }
func (f *FeedItem) ParseContent(content string) error { func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) {
doc, err := html.Parse(strings.NewReader(content)) var f func(*html.Node, string)
if err != nil { elements := make([]*html.Node, 0)
return fmt.Errorf("Error parsing HTML: %w", err) f = func(n *html.Node, s string) {
if n.Type == html.ElementNode && n.Data == s{
elements = append(elements, n)
return
} }
earticle, err := getHtmlElement(doc, "article") for c := n.FirstChild; c != nil; c = c.NextSibling {
if err != nil { f(c, tag)
return err
} }
etitle, err := getHtmlElement(doc, "title") }
if err != nil { f(doc, tag)
f.Title = "" if len(elements) == 0 {
} else { return nil, fmt.Errorf("no <%s> element found", tag)
f.Title = etitle.FirstChild.Data }
return elements, nil
} }
// getTitleAndUrl derives a title and a link for an article from its heading.
//
// The first <h1> found under article is used; when there is none the first
// <h2> is used, and failing that the first <h3>. If the heading's first child
// is not an element (normally a text node), that node's Data becomes the
// title. If the first child is an <a> element, the title is the Data of the
// anchor's first child and the url is the anchor's href attribute. When no
// url was found that way and the heading itself is wrapped in an <a>, the
// href of that wrapping anchor is used.
//
// Anything that cannot be determined is returned as an empty string. The
// returned error is always nil.
func getTitleAndUrl(article *html.Node) (string, string, error) {
	var header *html.Node
	// Prefer the highest-ranking heading level that is present; lower levels
	// are only searched when the higher ones are missing.
	for _, tag := range []string{"h1", "h2", "h3"} {
		// The error only signals "no such element", which is not a failure here.
		if found, _ := getAllElements(article, tag); len(found) > 0 {
			header = found[0]
			break
		}
	}
	if header == nil {
		return "", "", nil
	}

	// href returns the value of n's href attribute, or "" when it has none.
	href := func(n *html.Node) string {
		val := ""
		for _, a := range n.Attr {
			if a.Key == "href" {
				val = a.Val
			}
		}
		return val
	}

	var title, url string
	// An empty heading (<h1></h1>) has no first child, so guard the access.
	if first := header.FirstChild; first != nil {
		switch {
		case first.Type != html.ElementNode:
			title = first.Data
		case first.DataAtom == atom.A:
			// <h1><a href="...">Title</a></h1>; the anchor itself may be empty.
			if first.FirstChild != nil {
				title = first.FirstChild.Data
			}
			url = href(first)
		}
	}

	// <a href="..."><h1>Title</h1></a>: the link lives on the wrapping anchor,
	// so its attributes are the ones to read, not those of the heading's child.
	if url == "" && header.Parent != nil && header.Parent.DataAtom == atom.A {
		url = href(header.Parent)
	}
	return title, url, nil
}
func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
var articleBuilder strings.Builder var articleBuilder strings.Builder
html.Render(&articleBuilder, earticle) html.Render(&articleBuilder, article)
f.RawText = articleBuilder.String() item := FeedItem{
Url: url,
etime, err := getHtmlElement(earticle, "time") RawText: articleBuilder.String(),
}
etime, err := getHtmlElement(article, "time")
if err != nil { if err != nil {
return err return nil, err
} }
var pubTime time.Time var pubTime time.Time
for _, d := range etime.Attr { for _, d := range etime.Attr {
@ -137,26 +191,70 @@ func (f *FeedItem) ParseContent(content string) error {
pubTime, err = parseTime(d.Val) pubTime, err = parseTime(d.Val)
} }
if err != nil { if err != nil {
return fmt.Errorf("Error parsing time: %w", err) return nil, fmt.Errorf("Error parsing time: %w", err)
} }
f.PubTime = pubTime item.PubTime = pubTime
} }
return nil title, itemurl, _ := getTitleAndUrl(article)
if title == "" {
title = pubTime.Format("Jan 02 2006")
}
if itemurl == "" {
itemurl = url
}
if itemurl != "" && !strings.HasPrefix(itemurl, "http://") && !strings.HasPrefix(itemurl, "https://") {
itemurl = path.Join(url, itemurl)
}
item.Title = title
item.Url = itemurl
return &item, nil
} }
func NewFeedItem(url string) (*FeedItem, error) { func (p *SitePage) Parse() ([]*FeedItem, error) {
items := make([]*FeedItem, 0)
articles, err := getAllElements(p.Root, "article")
if err != nil {
return nil, errors.New("No article elements found")
}
for _, article := range articles {
item, parseErr := NewFeedItem(p.Url, article)
if parseErr != nil {
p.Errors = append(p.Errors, parseErr)
} else {
items = append(items, item)
}
}
if len(p.Errors) > 0 {
errorStrs := make([]string, 0)
for _, perr := range p.Errors {
errorStrs = append(errorStrs, perr.Error())
}
p.ErrStr = errors.New(strings.Join(errorStrs, "\n")).Error()
}
return items, nil
}
func NewSitePage(url string) (*SitePage, error) {
rawhtml, err := fetchPage(url) rawhtml, err := fetchPage(url)
if err != nil { if err != nil {
return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err) return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err)
} }
item := FeedItem{ nodeRoot, err := html.Parse(strings.NewReader(rawhtml))
Url: url,
}
err = item.ParseContent(rawhtml);
if err != nil { if err != nil {
return nil, fmt.Errorf("Could not parse feed item: %w", err) return nil, fmt.Errorf("Error parsing HTML: %w", err)
} }
return &item, nil page := SitePage{
Url: url,
Root: nodeRoot,
Errors: make([]error, 0),
}
nodeTitle, err := getHtmlElement(nodeRoot, "title")
if err != nil {
page.Title = url
} else {
page.Title = nodeTitle.FirstChild.Data
}
return &page, nil
} }
func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*FeedInfo, error) { func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*FeedInfo, error) {
@ -168,11 +266,16 @@ func NewFeedInfo(name, base_url, desc, author string, page_urls...string) (*Feed
Errors: make(map[string]string, 10), Errors: make(map[string]string, 10),
} }
for _,url := range info.PageUrls { for _,url := range info.PageUrls {
item, err := NewFeedItem(url) page, err := NewSitePage(url)
if err != nil {
info.Errors[url] = err.Error()
}
pageItems, err := page.Parse()
if err != nil { if err != nil {
info.Errors[url] = err.Error() info.Errors[url] = err.Error()
} else { } else {
info.Items = append(info.Items, item) info.Items = append(info.Items, pageItems...)
info.Errors[url] = page.ErrStr
} }
} }
return &info, nil return &info, nil

View File

@ -1,12 +1,15 @@
package feed_test package feed_test
import ( import (
"strings"
"testing" "testing"
"time" "time"
"git.32bit.cafe/yequari/webweav.ing/feed" "git.32bit.cafe/yequari/webweav.ing/feed"
"golang.org/x/net/html"
) )
func TestTimeParsing(t *testing.T) { func TestTimeParsingFormat(t *testing.T) {
testDate, err := time.Parse("2006-Jan-02 15:04:05 -7", "2004-May-14 07:30:55 -7") testDate, err := time.Parse("2006-Jan-02 15:04:05 -7", "2004-May-14 07:30:55 -7")
if err != nil { if err != nil {
t.Errorf("creating test date failed: %s", err) t.Errorf("creating test date failed: %s", err)
@ -19,35 +22,38 @@ func TestTimeParsing(t *testing.T) {
}{ }{
{ {
"YYYY-MM-DD", "YYYY-MM-DD",
`<html><head></head><body><article><time datetime="2004-05-14">May 14 2004</time>hello world</article></body></html>`, `<article><time datetime="2004-05-14">May 14 2004</time>hello world</article>`,
time.DateOnly, time.DateOnly,
}, },
{ {
"YYYY-MM-DD HH:MM", "YYYY-MM-DD HH:MM",
`<html><head></head><body><article><time datetime="2004-05-14 07:30">May 14 2004</time>hello world</article></body></html>`, `<article><time datetime="2004-05-14 07:30">May 14 2004</time>hello world</article>`,
"2006-01-02 15:04", "2006-01-02 15:04",
}, },
{ {
"YYYY-MM-DD HH:MM:SS", "YYYY-MM-DD HH:MM:SS",
`<html><head></head><body><article><time datetime="2004-05-14 07:30:55">May 14 2004</time>hello world</article></body></html>`, `<article><time datetime="2004-05-14 07:30:55">May 14 2004</time>hello world</article>`,
"2006-01-02 15:04:05", "2006-01-02 15:04:05",
}, },
{ {
"YYYY-MM-DDTHH:MM:SS", "YYYY-MM-DDTHH:MM:SS",
`<html><head></head><body><article><time datetime="2004-05-14T07:30:55">May 14 2004</time>hello world</article></body></html>`, `<article><time datetime="2004-05-14T07:30:55">May 14 2004</time>hello world</article>`,
"2006-01-02T15:04:05", "2006-01-02T15:04:05",
}, },
{ {
"YYYY-MM-DDTHH:MM", "YYYY-MM-DDTHH:MM",
`<html><head></head><body><article><time datetime="2004-05-14T07:30">May 14 2004</time>hello world</article></body></html>`, `<article><time datetime="2004-05-14T07:30">May 14 2004</time>hello world</article>`,
"2006-01-02T15:04", "2006-01-02T15:04",
}, },
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func (t *testing.T) { t.Run(tt.name, func (t *testing.T) {
item := feed.FeedItem{} html, err := html.Parse(strings.NewReader(tt.input))
err := item.ParseContent(tt.input) if err != nil {
t.Errorf("error: %s", err)
}
item, err := feed.NewFeedItem("", html)
if err != nil { if err != nil {
t.Errorf("error: %s", err) t.Errorf("error: %s", err)
} }
@ -61,38 +67,231 @@ func TestTimeParsing(t *testing.T) {
} }
} }
func TestArticleTimeParsing(t *testing.T) {
func TestParseFeedItem(t *testing.T) {
testDate, err := time.Parse("2006-Jan-02", "2004-May-14") testDate, err := time.Parse("2006-Jan-02", "2004-May-14")
if err != nil { if err != nil {
t.Errorf("creating test date failed: %s", err) t.Errorf("creating test date failed: %s", err)
} }
testDate2, err := time.Parse("2006-Jan-02", "2004-May-07")
if err != nil {
t.Errorf("creating test date failed: %s", err)
}
var tests = []struct { var tests = []struct {
name string name string
input string input string
want_time *time.Time want_time []*time.Time
want_article string
}{ }{
{ {
"article and time stripped out of basic HTML", "article and time stripped out of basic HTML",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>", "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
&testDate, []*time.Time{&testDate},
"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>", },
{
"multiple articles",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
[]*time.Time{&testDate, &testDate2},
}, },
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func (t *testing.T) { t.Run(tt.name, func (t *testing.T) {
item := feed.FeedItem{} html, err := html.Parse(strings.NewReader(tt.input))
err := item.ParseContent(tt.input)
if err != nil { if err != nil {
t.Errorf("error: %s", err) t.Errorf("error: %s", err)
} }
if item.RawText != tt.want_article { page := feed.SitePage{
t.Errorf("got %s, want %s", item.RawText, tt.want_article) Url: "",
Title: "",
Root: html,
Errors: make([]error, 0),
}
items, err := page.Parse()
if err != nil {
t.Errorf("error: %s", err)
}
for i, item := range items {
if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) {
t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i])
}
}
})
}
}
// TestArticleUrls checks that SitePage.Parse gives every article the link
// taken from its heading anchor, and leaves the URL equal to the page URL
// (empty in these cases) when the article has no linked heading.
func TestArticleUrls(t *testing.T) {
	tests := []struct {
		name    string
		input   string
		wantURL []string
	}{
		{
			"article without url",
			"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
			[]string{""},
		},
		{
			"one article with one url",
			"<html><head></head><body><article><h1><a href=\"https://example.com\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
			[]string{"https://example.com"},
		},
		{
			"multiple articles",
			"<html><head></head><body><article><h1><a href=\"https://example.com/example1\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><h1><a href=\"https://example.com/example2\">Title</a></h1><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
			[]string{"https://example.com/example1", "https://example.com/example2"},
		},
		{
			"one article with multiple urls",
			"<html><head></head><body><article><h1><a href=\"https://example.com\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time><a href=\"https://google.com\">click me</a><p>hello world</p></article></body></html>",
			[]string{"https://example.com"},
		},
		{
			"one article with multiple h1 urls",
			"<html><head></head><body><article><h1><a href=\"https://example.com\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time><h1><a href=\"https://google.com\">click me</a></h1><p>hello world</p></article></body></html>",
			[]string{"https://example.com"},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			root, err := html.Parse(strings.NewReader(tt.input))
			if err != nil {
				// Without a document tree there is nothing left to check.
				t.Fatalf("error: %s", err)
			}
			page := feed.SitePage{
				Url:    "",
				Title:  "",
				Root:   root,
				Errors: make([]error, 0),
			}
			items, err := page.Parse()
			if err != nil {
				t.Fatalf("error: %s", err)
			}
			// Catch dropped or extra articles and keep the indexing below in range.
			if len(items) != len(tt.wantURL) {
				t.Fatalf("got %d items, want %d", len(items), len(tt.wantURL))
			}
			for i, item := range items {
				if item.Url != tt.wantURL[i] {
					t.Errorf("got %s, want %s", item.Url, tt.wantURL[i])
				}
			}
		})
	}
}
// TestArticleTitles checks the title SitePage.Parse gives each article: the
// text of its first <h1>, else <h2>, else <h3>, and the publication date
// formatted as "Jan 02 2006" when the article has no heading.
func TestArticleTitles(t *testing.T) {
	tests := []struct {
		name      string
		input     string
		wantTitle []string
	}{
		{
			"article and time stripped out of basic HTML",
			"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
			[]string{"May 14 2004"},
		},
		{
			"multiple articles",
			"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
			[]string{"May 14 2004", "May 07 2004"},
		},
		{
			"multiple articles with h1",
			"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article><article><time datetime=\"2004-05-07\"><h1>World</h1>May 7 2004</time>this is a second article</article></body></html>",
			[]string{"Hello", "World"},
		},
		{
			"article with h1",
			"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article></body></html>",
			[]string{"Hello"},
		},
		{
			"article with h2",
			"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h2>Hello</h2>hello world</article></body></html>",
			[]string{"Hello"},
		},
		{
			"article with h3",
			"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3>hello world</article></body></html>",
			[]string{"Hello"},
		},
		{
			"article with h1 and h2",
			"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h2>World</h2>hello world</article></body></html>",
			[]string{"Hello"},
		},
		{
			"article with h2 and h3",
			"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3><h2>World</h2>hello world</article></body></html>",
			[]string{"World"},
		},
		{
			"article with multiple h1",
			"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h1>World</h1>hello world</article></body></html>",
			[]string{"Hello"},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			root, err := html.Parse(strings.NewReader(tt.input))
			if err != nil {
				// Without a document tree there is nothing left to check.
				t.Fatalf("error: %s", err)
			}
			page := feed.SitePage{
				Url:    "",
				Title:  "",
				Root:   root,
				Errors: make([]error, 0),
			}
			items, err := page.Parse()
			if err != nil {
				t.Fatalf("error: %s", err)
			}
			// Catch dropped or extra articles and keep the indexing below in range.
			if len(items) != len(tt.wantTitle) {
				t.Fatalf("got %d items, want %d", len(items), len(tt.wantTitle))
			}
			for i, item := range items {
				if item.Title != tt.wantTitle[i] {
					t.Errorf("got %s, want %s", item.Title, tt.wantTitle[i])
				}
			}
		})
	}
}
func TestArticleParsing(t *testing.T) {
var tests = []struct {
name string
input string
want_article []string
}{
{
"article and time stripped out of basic HTML",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>"},
},
{
"multiple articles",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>", "<article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article>"},
},
}
for _, tt := range tests {
t.Run(tt.name, func (t *testing.T) {
html, err := html.Parse(strings.NewReader(tt.input))
if err != nil {
t.Errorf("error: %s", err)
}
page := feed.SitePage{
Url: "",
Title: "",
Root: html,
Errors: make([]error, 0),
}
items, err := page.Parse()
if err != nil {
t.Errorf("error: %s", err)
}
for i, item := range items {
if item.RawText != tt.want_article[i] {
t.Errorf("got %s, want %s", item.RawText, tt.want_article[i])
} }
if tt.want_time != nil && !item.PubTime.Equal(*tt.want_time) {
t.Errorf("got %s, want %s", item.PubTime, *tt.want_time)
} }
}) })
} }