diff --git a/README.md b/README.md index e10980c..a36f197 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,38 @@ # xml-feed-generator -an RSS/atom feed generator for my personal site :cat: - ---- - -## disclaimer -**this script is not meant to be used as is (yet).** - -expect to do *a lot* of modification to fit your needs, as this was originally tailored for my site structure and blog markup. +an RSS/atom feed generator for my personal site :cat: tested to work with PHP version 8.2. --- ## how it works 1. match files in a specificied directory. 2. load the DOM of each file. -3. parse values retrieved from the following strings/HTML elements (prioritizing [h-entry markup](https://microformats.org/wiki/h-entry)), as children of a feed ``: - - `` (or `

`) as `` - - the `datetime` attribute of `<time>` (or the file creation date) as `<updated>` - - `<element class="p-summary">` as `<summary>` - - `<element class="e-content">` (or `<article>`) as `<content>` - - `/path/to/blog/entry` as `<id>` and `<link>` -4. output all of the above into a new file named **articles.xml**. +3. parse the following elements as children of `<entry>`, in order of priority: + +| original HTML | feed output | +|-------------------------------|---------------------------------| +| 1. `class="p-name"` | `<title>` | +| 2. `<title>` in `<head>` | | +| 3. the XML feed title | | +| | | +| 1. `datetime` attribute of `class=dt-updated` | `<updated>` | +| 2. `datetime` attribute of the first `<time>` element | | +| 3. file modification date, retrieved from the server | | +| | | +| 1. `datetime` attribute of `class=dt-published` | `<published>` | +| 2. `datetime` attribute of the first `<time>` element | | +| 3. file creation date, retrieved from the server | | +| | | +| 1. `class="p-summary"` | `<summary>` | +| 2. `<meta property="description">` | | +| | | +| 1. `class="e-content"` | `<content>` | +| 2. `<article>` | | +| | | +| 1. /path/to/blog/entry | `<id>` and `<link rel="alternate">` | + + + +4. output all of the above into a new file named **articles.xml** (default name, but can be changed). ## ways to use - configure a cron job on your web server to run automatically every now and then. diff --git a/feed_generator.php b/feed_generator.php index bfdfa8f..4d8aeda 100644 --- a/feed_generator.php +++ b/feed_generator.php @@ -6,13 +6,16 @@ // the timezone referenced by the system for automatic timestamping. // suported timezones: https://www.php.net/manual/en/timezones.php $timezone = 'Asia/Jakarta'; + $max_entries = 10; - // FEED METADATA ////////////////////////////////////////////////////////////////////// + // FEED METADATA // // certain characters must be escaped as HTML entities - note that XML only accepts five of them. 
- // reference: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references + // reference for character entities: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references $feed_title = 'jasmine's b(rain)log | jasm1nii.xyz'; $feed_subtitle = 'blog articles by jasmine'; + // location of the blog index page (or if unavailable, your main page). $blog_url = 'https://jasm1nii.xyz/blog/articles'; + // permalink to the XML feed on your site. $feed_url = 'https://jasm1nii.xyz/blog/articles/articles.xml'; $author_name = 'jasmine'; $author_email = 'contact@jasm1nii.xyz'; @@ -20,23 +23,32 @@ $feed_icon = 'https://jasm1nii.xyz/assets/media/itchio-textless-white.svg'; $feed_logo = 'https://jasm1nii.xyz/assets/media/main/07042023-me_compressed.webp'; - // PATH TO FETCH PAGES FROM /////////////////////////////////////////////////////////// + // PATH TO FETCH PAGES FROM // // __DIR__ is the directory where *this script* is located. // in my case, i first need to go up two directories to get to the site root. $site_root = dirname(__DIR__, 2); // once i'm there, i specify the parent directory where i keep all of my blog pages. + // because the values of $blog_root and $blog_entries will be used for generating entry links, forward slashes are a *must*. $blog_root = $site_root.'/blog/articles'; - // then i specify a pattern that matches the path of each individual page. - // my setup is /YYYY/MM/DD/entry.html + // then, specify a pattern that matches the path of each individual page. + // for example, this will match /YYYY/MM/DD/entry.html $blog_entries = $blog_root.'/*/*/*/*.html'; - // ------------------------------------------------------------------------------------ + // ENTRY METADATA // + // depending on your site setup, this might not be the same as $blog_url. + // the generator will append $blog_root to the URL specified below. 
+ $blog_directory_url = 'https://jasm1nii.xyz/blog/articles'; + + // NAME OF FEED FILE + $file = 'articles.xml'; + + // -------------------------------------------- // create beginning of feed template. - // reference for required elements: https://validator.w3.org/feed/docs/atom.html ob_start(); date_default_timezone_set($timezone); + // reference for required elements: https://validator.w3.org/feed/docs/atom.html echo '<?xml version="1.0" encoding="utf-8"?>' .'<feed xmlns="http://www.w3.org/2005/Atom">' // optionally specify feed generator for debugging purposes. @@ -96,21 +108,23 @@ $title = $x->query("//*[@class='" . $title_class . "']"); if ($title->length > 0) { echo '<title>'. $title[0]->nodeValue . ''; - } elseif (!empty($title)) { - $title = $article_dom->getElementsByTagName('h2'); + } elseif ($title->length == 0) { + $title = $article_dom->getElementsByTagName('title'); echo ''.$title[0]->nodeValue.''; } else { echo $feed_title; } // id - echo 'https://jasm1nii.xyz/blog/articles/' . ltrim($article, $blog_root) . ''; + echo '' . $blog_directory_url . '/' . ltrim($article, $blog_root) . ''; // alternate link - echo ''; + echo ''; - $updated = $article_dom->getElementsByTagName('time'); - if (!empty($updated)) { + // date updated + $updated_class = 'dt-updated'; + $updated = $x->query("//*[@class='" . $updated_class . "']"); + if ($updated->length > 0) { $timestamp = $updated[0]->getAttribute('datetime'); if (strlen($timestamp) == 10) { echo '' . $timestamp . 'T00:00:00' . date('P'). ''; @@ -118,9 +132,43 @@ elseif (strlen($timestamp) == 25 || strlen($timestamp) == 20) { echo '' . $timestamp .''; } - } else { - $article_created = filectime($article); + } + if ($updated->length == 0) { + $updated = $article_dom->getElementsByTagName('time'); + $timestamp = $updated[0]->getAttribute('datetime'); + if (strlen($timestamp) == 10) { + echo '' . $timestamp . 'T00:00:00' . date('P'). 
''; + } elseif (strlen($timestamp) == 25 || strlen($timestamp) == 20) { + echo '' . $timestamp .''; + } else { + $article_created = filemtime($article); echo '' . date(DATE_ATOM, $article_created) . ''; + } + } + + // date published + $published_class = 'dt-published'; + $published = $x->query("//*[@class='" . $published_class . "']"); + if ($published->length > 0) { + $timestamp = $published[0]->getAttribute('datetime'); + if (strlen($timestamp) == 10) { + echo '' . $timestamp . 'T00:00:00' . date('P'). ''; + } + elseif (strlen($timestamp) == 25 || strlen($timestamp) == 20) { + echo '' . $timestamp .''; + } + } + if ($published->length == 0) { + $published = $article_dom->getElementsByTagName('time'); + $timestamp = $published[0]->getAttribute('datetime'); + if (strlen($timestamp) == 10) { + echo '' . $timestamp . 'T00:00:00' . date('P'). ''; + } elseif (strlen($timestamp) == 25 || strlen($timestamp) == 20) { + echo '' . $timestamp .''; + } else { + $article_created = filectime($article); + echo '' . date(DATE_ATOM, $article_created) . ''; + } } // summary @@ -130,6 +178,11 @@ echo ''; echo $summary->item(0)->nodeValue; echo ''; + } elseif($summary->length == 0) { + $summary = get_meta_tags($article)['description']; + echo ''; + echo $summary; + echo ''; } else { echo '' . 'A summary of this content is not available.' . ''; } @@ -139,22 +192,24 @@ $content = $x->query("//*[@class='" . $content_class . "']"); if ($content->length > 0) { // strip line breaks and output a maximum of 500 characters. - echo '' . preg_replace('/\s\s+/', ' ',(substr($content->item(0)->nodeValue,0,500))) . '... (<a href="https://jasm1nii.xyz/blog/articles/' . ltrim($article, $blog_root) . '">read more</a>)' . ''; + echo '' . preg_replace('/\s\s+/', ' ',(substr($content->item(0)->nodeValue,0,500))) . '... (read more on the original page)' . ''; } elseif (!empty($content)) { $content = $article_dom->getElementsByTagName('article'); - echo '' . 
preg_replace('/\s\s+/', ' ',(substr($content->item(0)->nodeValue,0,500))) . '... (<a href="https://jasm1nii.xyz/blog/articles/' . ltrim($article, $blog_root) . '">read more</a>)' . ''; + echo '' . preg_replace('/\s\s+/', ' ',(substr($content->item(0)->nodeValue,0,500))) . '... (read more on the original page)' . ''; } else { - echo '' . 'Content could not be parsed for previewing - view the original article on the website.' . ''; + echo '' . 'Content could not be parsed as a preview - view the original article on the website.' . ''; } echo ''; - // add no more than 10 entries. - if(++$i > 9) break; + if(++$i > ($max_entries-1)) break; } echo ''; $xml_str = ob_get_contents(); ob_end_clean(); - file_put_contents($blog_root.'/articles.xml', $xml_str); + file_put_contents($blog_root . DIRECTORY_SEPARATOR . $file, $xml_str); + + echo strtoupper(date("h:i:sa")) . ' - Feed successfully generated in ' . realpath($blog_root) . DIRECTORY_SEPARATOR . $file; + echo '
Validate your feed at https://validator.w3.org/feed/'; ?> \ No newline at end of file