parse out titles and urls from heading tags

This commit is contained in:
yequari 2024-04-10 20:49:54 -07:00
parent 598f970489
commit 282820cc02
2 changed files with 200 additions and 40 deletions

View File

@ -5,10 +5,12 @@ import (
"fmt" "fmt"
"io" "io"
"net/http" "net/http"
"path"
"strings" "strings"
"time" "time"
"golang.org/x/net/html" "golang.org/x/net/html"
"golang.org/x/net/html/atom"
) )
const feedfmtopen = `<?xml version="1.0" encoding="utf-8"?> const feedfmtopen = `<?xml version="1.0" encoding="utf-8"?>
@ -135,6 +137,43 @@ func getAllElements(doc *html.Node, tag string) ([]*html.Node, error) {
return elements, nil return elements, nil
} }
// getTitleAndUrl extracts a title and a link target from the first heading
// found in article.
//
// Headings are looked up with getHtmlElement in the order h1, h2, h3 and the
// first one that is found is used. Lookup errors are deliberately ignored: a
// missing heading is not a failure here, it just yields empty results so the
// caller can fall back to its own defaults.
//
// The title is taken from the heading's first child:
//   - if that child is not an element node, its Data is the title;
//   - if that child is an <a> element, the Data of the anchor's first child
//     is the title and the anchor's href attribute is the url;
//   - any other element leaves the title empty.
//
// If no url was found that way and the heading itself is wrapped in an <a>
// element, that parent's href attribute is used as the url.
//
// The returned error is always nil.
func getTitleAndUrl(article *html.Node) (string, string, error) {
	var header *html.Node
	for _, tag := range []string{"h1", "h2", "h3"} {
		if h, _ := getHtmlElement(article, tag); h != nil {
			header = h
			break
		}
	}
	if header == nil {
		return "", "", nil
	}

	// href returns the value of n's href attribute, or "" if it has none.
	// There is no early break, so the last href wins if several are present.
	href := func(n *html.Node) string {
		var val string
		for _, a := range n.Attr {
			if a.Key == "href" {
				val = a.Val
			}
		}
		return val
	}

	var title, url string
	// An empty heading (<h1></h1>) has no first child; leave the title empty
	// instead of dereferencing nil.
	if first := header.FirstChild; first != nil {
		switch {
		case first.Type != html.ElementNode:
			title = first.Data
		case first.DataAtom == atom.A:
			// An empty anchor (<a href="..."></a>) has no child to read a
			// title from, but its href is still usable.
			if first.FirstChild != nil {
				title = first.FirstChild.Data
			}
			url = href(first)
		}
	}

	// Heading wrapped in a link: <a href="..."><h1>Title</h1></a>. The href
	// lives on the parent anchor, not on the heading's first child.
	if url == "" && header.Parent != nil && header.Parent.DataAtom == atom.A {
		url = href(header.Parent)
	}
	return title, url, nil
}
func NewFeedItem(url string, article *html.Node) (*FeedItem, error) { func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
var articleBuilder strings.Builder var articleBuilder strings.Builder
html.Render(&articleBuilder, article) html.Render(&articleBuilder, article)
@ -142,7 +181,6 @@ func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
Url: url, Url: url,
RawText: articleBuilder.String(), RawText: articleBuilder.String(),
} }
etime, err := getHtmlElement(article, "time") etime, err := getHtmlElement(article, "time")
if err != nil { if err != nil {
return nil, err return nil, err
@ -157,21 +195,18 @@ func NewFeedItem(url string, article *html.Node) (*FeedItem, error) {
} }
item.PubTime = pubTime item.PubTime = pubTime
} }
title, itemurl, _ := getTitleAndUrl(article)
eh1, _ := getHtmlElement(article, "h1") if title == "" {
eh2, _ := getHtmlElement(article, "h2") title = pubTime.Format("Jan 02 2006")
eh3, _ := getHtmlElement(article, "h3")
if eh1 != nil {
item.Title = eh1.FirstChild.Data
// TODO: handle <a>
} else if eh2 != nil {
item.Title = eh2.FirstChild.Data
} else if eh3 != nil {
item.Title = eh3.FirstChild.Data
} else {
item.Title = pubTime.Format("Jan 02 2006")
} }
if itemurl == "" {
itemurl = url
}
if itemurl != "" && !strings.HasPrefix(itemurl, "http://") && !strings.HasPrefix(itemurl, "https://") {
itemurl = path.Join(url, itemurl)
}
item.Title = title
item.Url = itemurl
return &item, nil return &item, nil
} }

View File

@ -9,7 +9,7 @@ import (
"golang.org/x/net/html" "golang.org/x/net/html"
) )
func TestTimeParsing(t *testing.T) { func TestTimeParsingFormat(t *testing.T) {
testDate, err := time.Parse("2006-Jan-02 15:04:05 -7", "2004-May-14 07:30:55 -7") testDate, err := time.Parse("2006-Jan-02 15:04:05 -7", "2004-May-14 07:30:55 -7")
if err != nil { if err != nil {
t.Errorf("creating test date failed: %s", err) t.Errorf("creating test date failed: %s", err)
@ -67,7 +67,7 @@ func TestTimeParsing(t *testing.T) {
} }
} }
func TestArticleParsing(t *testing.T) { func TestArticleTimeParsing(t *testing.T) {
testDate, err := time.Parse("2006-Jan-02", "2004-May-14") testDate, err := time.Parse("2006-Jan-02", "2004-May-14")
if err != nil { if err != nil {
t.Errorf("creating test date failed: %s", err) t.Errorf("creating test date failed: %s", err)
@ -80,67 +80,198 @@ func TestArticleParsing(t *testing.T) {
name string name string
input string input string
want_time []*time.Time want_time []*time.Time
want_article []string
want_title []string
}{ }{
{ {
"article and time stripped out of basic HTML", "article and time stripped out of basic HTML",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>", "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
[]*time.Time{&testDate}, []*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>"},
[]string{"May 14 2004"},
}, },
{ {
"multiple articles", "multiple articles",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>", "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
[]*time.Time{&testDate, &testDate2}, []*time.Time{&testDate, &testDate2},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>", "<article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article>"}, },
}
for _, tt := range tests {
t.Run(tt.name, func (t *testing.T) {
html, err := html.Parse(strings.NewReader(tt.input))
if err != nil {
t.Errorf("error: %s", err)
}
page := feed.SitePage{
Url: "",
Title: "",
Root: html,
Errors: make([]error, 0),
}
items, err := page.Parse()
if err != nil {
t.Errorf("error: %s", err)
}
for i, item := range items {
if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) {
t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i])
}
}
})
}
}
// TestArticleUrls parses small HTML documents through feed.SitePage.Parse and
// checks the Url of each resulting item against the expected value, in order.
// The page itself is given an empty Url, so an article without a linked
// heading is expected to end up with an empty item Url.
func TestArticleUrls(t *testing.T) {
	var tests = []struct {
		name     string
		input    string
		want_url []string
	}{
		{
			"article without url",
			"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
			[]string{""},
		},
		{
			"one article with one url",
			"<html><head></head><body><article><h1><a href=\"https://example.com\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
			[]string{"https://example.com"},
		},
		{
			"multiple articles",
			"<html><head></head><body><article><h1><a href=\"https://example.com/example1\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><h1><a href=\"https://example.com/example2\">Title</a></h1><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
			[]string{"https://example.com/example1", "https://example.com/example2"},
		},
		{
			"one article with multiple urls",
			"<html><head></head><body><article><h1><a href=\"https://example.com\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time><a href=\"https://google.com\">click me</a><p>hello world</p></article></body></html>",
			[]string{"https://example.com"},
		},
		{
			"one article with multiple h1 urls",
			"<html><head></head><body><article><h1><a href=\"https://example.com\">Title</a></h1><time datetime=\"2004-05-14\">May 14 2004</time><h1><a href=\"https://google.com\">click me</a></h1><p>hello world</p></article></body></html>",
			[]string{"https://example.com"},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Named root rather than html so the imported html package is
			// not shadowed for the rest of the closure.
			root, err := html.Parse(strings.NewReader(tt.input))
			if err != nil {
				// Nothing below is meaningful without a parsed document.
				t.Fatalf("error: %s", err)
			}
			page := feed.SitePage{
				Url:    "",
				Title:  "",
				Root:   root,
				Errors: make([]error, 0),
			}
			items, err := page.Parse()
			if err != nil {
				t.Fatalf("error: %s", err)
			}
			// Without this check, too few items would pass silently and too
			// many would panic on the want_url index below.
			if len(items) != len(tt.want_url) {
				t.Fatalf("got %d items, want %d", len(items), len(tt.want_url))
			}
			for i, item := range items {
				if item.Url != tt.want_url[i] {
					t.Errorf("got %s, want %s", item.Url, tt.want_url[i])
				}
			}
		})
	}
}
func TestArticleTitles(t *testing.T) {
var tests = []struct {
name string
input string
want_title []string
}{
{
"article and time stripped out of basic HTML",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
[]string{"May 14 2004"},
},
{
"multiple articles",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
[]string{"May 14 2004", "May 07 2004"}, []string{"May 14 2004", "May 07 2004"},
}, },
{
"multiple articles with h1",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article><article><time datetime=\"2004-05-07\"><h1>World</h1>May 7 2004</time>this is a second article</article></body></html>",
[]string{"Hello", "World"},
},
{ {
"article with h1", "article with h1",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article></body></html>", "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1>hello world</article>"},
[]string{"Hello"}, []string{"Hello"},
}, },
{ {
"article with h2", "article with h2",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h2>Hello</h2>hello world</article></body></html>", "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h2>Hello</h2>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h2>Hello</h2>hello world</article>"},
[]string{"Hello"}, []string{"Hello"},
}, },
{ {
"article with h3", "article with h3",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3>hello world</article></body></html>", "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3>hello world</article>"},
[]string{"Hello"}, []string{"Hello"},
}, },
{ {
"article with h1 and h2", "article with h1 and h2",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h2>World</h2>hello world</article></body></html>", "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h2>World</h2>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h2>World</h2>hello world</article>"},
[]string{"Hello"}, []string{"Hello"},
}, },
{ {
"article with h2 and h3", "article with h2 and h3",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3><h2>World</h2>hello world</article></body></html>", "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3><h2>World</h2>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h3>Hello</h3><h2>World</h2>hello world</article>"},
[]string{"World"}, []string{"World"},
}, },
{ {
"article with multiple h1", "article with multiple h1",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h1>World</h1>hello world</article></body></html>", "<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h1>World</h1>hello world</article></body></html>",
[]*time.Time{&testDate},
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time><h1>Hello</h1><h1>World</h1>hello world</article>"},
[]string{"Hello"}, []string{"Hello"},
}, },
} }
for _, tt := range tests {
t.Run(tt.name, func (t *testing.T) {
html, err := html.Parse(strings.NewReader(tt.input))
if err != nil {
t.Errorf("error: %s", err)
}
page := feed.SitePage{
Url: "",
Title: "",
Root: html,
Errors: make([]error, 0),
}
items, err := page.Parse()
if err != nil {
t.Errorf("error: %s", err)
}
for i, item := range items {
if item.Title != tt.want_title[i] {
t.Errorf("got %s, want %s", item.Title, tt.want_title[i])
}
}
})
}
}
func TestArticleParsing(t *testing.T) {
var tests = []struct {
name string
input string
want_article []string
}{
{
"article and time stripped out of basic HTML",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article></body></html>",
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>"},
},
{
"multiple articles",
"<html><head></head><body><article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article><article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article></body></html>",
[]string{"<article><time datetime=\"2004-05-14\">May 14 2004</time>hello world</article>", "<article><time datetime=\"2004-05-07\">May 7 2004</time>this is a second article</article>"},
},
}
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func (t *testing.T) { t.Run(tt.name, func (t *testing.T) {
html, err := html.Parse(strings.NewReader(tt.input)) html, err := html.Parse(strings.NewReader(tt.input))
@ -161,12 +292,6 @@ func TestArticleParsing(t *testing.T) {
if item.RawText != tt.want_article[i] { if item.RawText != tt.want_article[i] {
t.Errorf("got %s, want %s", item.RawText, tt.want_article[i]) t.Errorf("got %s, want %s", item.RawText, tt.want_article[i])
} }
if tt.want_time[i] != nil && !item.PubTime.Equal(*tt.want_time[i]) {
t.Errorf("got %s, want %s", item.PubTime, *tt.want_time[i])
}
if item.Title != tt.want_title[i] {
t.Errorf("got %s, want %s", item.Title, tt.want_title[i])
}
} }
}) })
} }