diff --git a/cmd/web/handlers.go b/cmd/web/handlers.go index c562a27..37f8c1f 100644 --- a/cmd/web/handlers.go +++ b/cmd/web/handlers.go @@ -1,11 +1,12 @@ package main import ( + "fmt" "html" "net/http" "strings" - "git.32bit.cafe/yequari/rss-gen/rss" + "git.32bit.cafe/yequari/rss-gen/feed" ) func (app *application) home(w http.ResponseWriter, r *http.Request) { @@ -28,8 +29,9 @@ func (app *application) generateRss(w http.ResponseWriter, r *http.Request) { pages[i] = strings.TrimSpace(pages[i]) } - feed, err := rss.GenerateRss(siteUrl, siteName, siteDesc, pages...) + feed, err := feed.GenerateRss(siteUrl, siteName, siteDesc, pages...) if err != nil { + w.Write([]byte(fmt.Sprintf("

Error generating feed: %s

", err.Error()))) app.infoLog.Printf("Error generating feed: %s\n", err.Error()) } for _, line := range strings.Split(feed, "\n") { diff --git a/rss/export_test.go b/feed/export_test.go similarity index 60% rename from rss/export_test.go rename to feed/export_test.go index 67c68f8..9975a53 100644 --- a/rss/export_test.go +++ b/feed/export_test.go @@ -1,4 +1,5 @@ -package rss +package feed var ParseArticle = parseArticle var FetchPage = fetchPage +var ParseTime = parseTime diff --git a/feed/feed.go b/feed/feed.go new file mode 100644 index 0000000..c625f59 --- /dev/null +++ b/feed/feed.go @@ -0,0 +1,169 @@ +package feed + +import ( + "fmt" + "io" + "net/http" + "strings" + "time" + + "golang.org/x/net/html" +) + +type FeedBuilder interface { + GenerateFeed() string +} + +type FeedInfo struct { + SiteName string + SiteUrl string + SiteDesc string + PageUrls []string + errors map[string]error +} + +type FeedItem struct { + Url string + Title string + Author string + EscapedText string + PubTime time.Time + RawText string +} + +func fetchPage(url string) (string, error) { + resp, err := http.Get(url) + if err != nil { + return "", fmt.Errorf("Error sending Get request: %w", err) + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + + return string(body), nil +} + +func parseTime(timestr string) (time.Time, error) { + var formats = []string { + time.ANSIC, + time.UnixDate, + time.RubyDate, + time.RFC822, + time.RFC822Z, + time.RFC850, + time.RFC1123, + time.RFC1123Z, + "2006-01-02 15:04:05 -0700 -0700", + time.RFC3339, + time.RFC3339Nano, + time.DateTime, + time.DateOnly, + } + var pagetime time.Time + var err error + for _, f := range formats { + pagetime, err = time.Parse(f, timestr) + if err == nil { + return pagetime, err + } + } + return pagetime, fmt.Errorf("Error parsing time: invalid format") +} + +func getHtmlElement(doc *html.Node, tag string) (*html.Node, error) { + var f func(*html.Node, string) + var element *html.Node + f = func(n *html.Node, s string) { + if n.Type == html.ElementNode && n.Data == s{ + element = n + return + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c, tag) + } + } + f(doc, tag) + if element == nil { + return nil, fmt.Errorf("no <%s> element found", tag) + } + return element, nil +} + +func (f *FeedItem) ParseContent(content string) error { + doc, err := html.Parse(strings.NewReader(content)) + if err != nil { + return fmt.Errorf("Error parsing HTML: %w", err) + } + earticle, err := getHtmlElement(doc, "article") + if err != nil { + return err + } + var builder strings.Builder + html.Render(&builder, earticle) + f.RawText = builder.String() + + etime, err := getHtmlElement(earticle, "time") + if err != nil { + return err + } + var pubTime time.Time + for _, d := range etime.Attr { + if d.Key == "datetime" { + pubTime, err = parseTime(d.Val) + } + if err != nil { + return fmt.Errorf("Error parsing time: %w", err) + } + f.PubTime = pubTime + } + return nil +} + +func NewFeedItem(url string) (*FeedItem, error) { + rawhtml, err := fetchPage(url) + if err != nil { + return nil, fmt.Errorf("Could not fetch page '%s': %w", url, err) + } + item := FeedItem{ + Url: url, + } + item.ParseContent(rawhtml); + if err != nil { + return nil, fmt.Errorf("Could not parse feed item: %w", err) + } + return &item, nil +} + +// parseArticle returns an error if it could not parse the HTML or if it could not parse a time +// if a time could not be parsed, the parsed html article will still be returned +func parseArticle(content string) (string, *time.Time, error) { + doc, err := html.Parse(strings.NewReader(content)) + if err != nil { + return "", nil, fmt.Errorf("Error parsing HTML: %w", err) + } + var f func(*html.Node, string) + var element *html.Node + var pagetime time.Time + f = func(n *html.Node, tag string) { + if n.Type == html.ElementNode && n.Data == tag { + element = n + return + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c, tag) + } + } + + f(doc, "article") + var builder strings.Builder + html.Render(&builder, element) + + f(element, "time") + for _, d := range element.Attr { + if d.Key == "datetime" { + pagetime, err = parseTime(d.Val) + } + } + + return builder.String(), &pagetime, nil +} + diff --git a/rss/rss_test.go b/feed/feed_test.go similarity index 50% rename from rss/rss_test.go rename to feed/feed_test.go index 367406f..d3de717 100644 --- a/rss/rss_test.go +++ b/feed/feed_test.go @@ -1,8 +1,8 @@ -package rss_test +package feed_test import "testing" import "time" -import "git.32bit.cafe/yequari/rss-gen/rss" +import "git.32bit.cafe/yequari/rss-gen/feed" func TestArticleParse(t *testing.T) { testDate, err := time.Parse("2006-Jan-02", "2004-May-14") @@ -31,7 +31,7 @@ func TestArticleParse(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func (t *testing.T) { - article, articleTime, err := rss.ParseArticle(tt.input) + article, articleTime, err := feed.ParseArticle(tt.input) if err != nil { t.Errorf("error: %s", err) } @@ -44,3 +44,39 @@ func TestArticleParse(t *testing.T) { }) } } + +func TestParseFeedItem(t *testing.T) { + testDate, err := time.Parse("2006-Jan-02", "2004-May-14") + if err != nil { + t.Errorf("creating test date failed: %s", err) + } + var tests = []struct { + name string + input string + want_time *time.Time + want_article string + }{ + { + "article and time stripped out of basic HTML", + "
hello world
", + &testDate, + "
hello world
", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func (t *testing.T) { + item := feed.FeedItem{} + err := item.ParseContent(tt.input) + if err != nil { + t.Errorf("error: %s", err) + } + if item.RawText != tt.want_article { + t.Errorf("got %s, want %s", item.RawText, tt.want_article) + } + if tt.want_time != nil && !item.PubTime.Equal(*tt.want_time) { + t.Errorf("got %s, want %s", item.PubTime, *tt.want_time) + } + }) + } +} diff --git a/feed/rss.go b/feed/rss.go new file mode 100644 index 0000000..a088669 --- /dev/null +++ b/feed/rss.go @@ -0,0 +1,64 @@ +package feed + +import ( + "fmt" + "strings" + "time" + + "golang.org/x/net/html" +) + +const feedfmt = ` + + + %s + %s + %s%s + + ` + +const itemfmt = ` + + Content Title + %s + %s + %s + + %s + + ` + +type RSSBuilder struct { + Info FeedInfo + Items []FeedItem +} + +func GenerateRss(siteUrl, siteTitle, siteDesc string, pageUrls ...string) (string, error) { + var items strings.Builder + var errs strings.Builder + var err error + + for _, u := range pageUrls { + var formattedArticle strings.Builder + var err error + page, err := fetchPage(u) + if err != nil { + continue + } + article, atime, err := parseArticle(page) + if err != nil && article == "" { + errs.WriteString(fmt.Sprintf("error parsing article %s: %s", u, err.Error())) + continue + } + for _, line := range strings.Split(article, "\n") { + formattedArticle.WriteString(fmt.Sprintf("\t\t%s\n", html.EscapeString(line))) + } + if atime != nil { + items.WriteString(fmt.Sprintf(itemfmt, u, u, atime.Format("Mon, 2 Jan 2006 15:04:05 MST"), formattedArticle.String())) + } else { + items.WriteString(fmt.Sprintf(itemfmt, u, u, time.Now().Format("Mon, 2 Jan 2006 15:04:05 MST"), formattedArticle.String())) + } + } + + return fmt.Sprintf(feedfmt, siteTitle, siteUrl, siteDesc, items.String()), err +} diff --git a/rss/rss.go b/rss/rss.go deleted file mode 100644 index 987b50c..0000000 --- a/rss/rss.go +++ /dev/null @@ -1,121 +0,0 @@ -package rss - -import ( - "fmt" - "io" - "net/http" - "strings" - "time" - - "golang.org/x/net/html" -) -const feedfmt = ` - - - %s - %s - %s - %s - - ` - -const itemfmt = ` - Content Title - %s - %s - %s - - ` - -func fetchPage(url string) (string, error) { - resp, err := http.Get(url) - if err != nil { - return "", fmt.Errorf("Error sending Get request: %w", err) - } - defer resp.Body.Close() - body, err := io.ReadAll(resp.Body) - - return string(body), nil -} - -func parseTime(timestr string) (*time.Time, error) { - var formats = []string { - time.ANSIC, - time.UnixDate, - time.RubyDate, - time.RFC822, - time.RFC822Z, - time.RFC850, - time.RFC1123, - time.RFC1123Z, - time.RFC3339, - time.RFC3339Nano, - time.DateTime, - time.DateOnly, - } - for _, f := range formats { - pagetime, err := time.Parse(f, timestr) - if err == nil { - return &pagetime, nil - } - } - return nil, fmt.Errorf("Error parsing time: invalid format") -} - -// parseArticle returns an error if it could not parse the HTML or if it could not parse a time -// if a time could not be parsed, the parsed html article will still be returned -func parseArticle(content string) (string, *time.Time, error) { - doc, err := html.Parse(strings.NewReader(content)) - if err != nil { - return "", nil, fmt.Errorf("Error parsing HTML: %w", err) - } - var f func(*html.Node, string) - var element *html.Node - var pagetime *time.Time - f = func(n *html.Node, tag string) { - if n.Type == html.ElementNode && n.Data == tag { - element = n - return - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - f(c, tag) - } - } - - f(doc, "article") - var builder strings.Builder - html.Render(&builder, element) - - f(element, "time") - for _, d := range element.Attr { - if d.Key == "datetime" { - pagetime, err = parseTime(d.Val) - } - } - - return builder.String(), pagetime, nil -} - -func GenerateRss(siteUrl, siteTitle, siteDesc string, pageUrls ...string) (string, error) { - var items strings.Builder - var err error - - for _, u := range pageUrls { - var err error - page, err := fetchPage(u) - if err != nil { - continue - } - article, atime, err := parseArticle(page) - if err != nil && article == "" { - continue - } - if atime != nil { - items.WriteString(fmt.Sprintf(itemfmt, u, u, atime.Format("Mon, 2 Jan 2006 15:04:05 MST"), article)) - } else { - items.WriteString(fmt.Sprintf(itemfmt, u, u, time.Now().Format("Mon, 2 Jan 2006 15:04:05 MST"), article)) - } - } - - return fmt.Sprintf(feedfmt, siteTitle, siteUrl, siteDesc, items.String()), err -}