update readme, support more elements
This commit is contained in:
parent
eee6415633
commit
d785d9e142
43
README.md
43
README.md
|
@ -1,25 +1,38 @@
|
|||
# xml-feed-generator
|
||||
an RSS/atom feed generator for my personal site :cat:
|
||||
|
||||
---
|
||||
|
||||
## disclaimer
|
||||
**this script is not meant to be used as is (yet).**
|
||||
|
||||
expect to do *a lot* of modification to fit your needs, as this was originally tailored for my site structure and blog markup.
|
||||
an RSS/atom feed generator for my personal site :cat: tested to work with PHP version 8.2.
|
||||
|
||||
---
|
||||
|
||||
## how it works
|
||||
1. match files in a specified directory.
|
||||
2. load the DOM of each file.
|
||||
3. parse values retrieved from the following strings/HTML elements (prioritizing [h-entry markup](https://microformats.org/wiki/h-entry)), as children of a feed `<entry>`:
|
||||
- `<element class=p-name>` (or `<h2>`) as `<title>`
|
||||
- the `datetime` attribute of `<time>` (or the file creation date) as `<updated>`
|
||||
- `<element class="p-summary">` as `<summary>`
|
||||
- `<element class="e-content">` (or `<article>`) as `<content>`
|
||||
- `/path/to/blog/entry` as `<id>` and `<link>`
|
||||
4. output all of the above into a new file named **articles.xml**.
|
||||
3. parse the following elements as children of `<entry>`, in order of priority:
|
||||
|
||||
| original HTML | feed output |
|
||||
|-------------------------------|---------------------------------|
|
||||
| 1. `class="p-name"` | `<title>` |
|
||||
| 2. `<title>` in `<head>` | |
|
||||
| 3. the XML feed title | |
|
||||
| | |
|
||||
| 1. `datetime` attribute of `class=dt-updated` | `<updated>` |
|
||||
| 2. `datetime` attribute of the first `<time>` element | |
|
||||
| 3. file modification date, retrieved from the server | |
|
||||
| | |
|
||||
| 1. `datetime` attribute of `class=dt-published` | `<published>` |
|
||||
| 2. `datetime` attribute of the first `<time>` element | |
|
||||
| 3. file creation date as reported by `filectime()` (on Unix this is the inode change time, not a true creation time), retrieved from the server | |
|
||||
| | |
|
||||
| 1. `class="p-summary"` | `<summary>` |
|
||||
| 2. `<meta name="description">` | |
|
||||
| | |
|
||||
| 1. `class="e-content"` | `<content>` |
|
||||
| 2. `<article>` | |
|
||||
| | |
|
||||
| 1. /path/to/blog/entry | `<id>` and `<link rel="alternate">` |
|
||||
|
||||
|
||||
|
||||
4. output all of the above into a new file named **articles.xml** (default name, but can be changed).
|
||||
|
||||
## ways to use
|
||||
- configure a cron job on your web server to run automatically every now and then.
|
||||
|
|
|
@ -6,13 +6,16 @@
|
|||
// the timezone referenced by the system for automatic timestamping.
|
||||
// supported timezones: https://www.php.net/manual/en/timezones.php
|
||||
$timezone = 'Asia/Jakarta';
|
||||
$max_entries = 10;
|
||||
|
||||
// FEED METADATA //////////////////////////////////////////////////////////////////////
|
||||
// FEED METADATA //
|
||||
// certain characters must be escaped as HTML entities - note that XML only accepts five of them.
|
||||
// reference: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
|
||||
// reference for character entities: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
|
||||
$feed_title = 'jasmine&apos;s b(rain)log | jasm1nii.xyz';
|
||||
$feed_subtitle = 'blog articles by jasmine';
|
||||
// location of the blog index page (or if unavailable, your main page).
|
||||
$blog_url = 'https://jasm1nii.xyz/blog/articles';
|
||||
// permalink to the XML feed on your site.
|
||||
$feed_url = 'https://jasm1nii.xyz/blog/articles/articles.xml';
|
||||
$author_name = 'jasmine';
|
||||
$author_email = 'contact@jasm1nii.xyz';
|
||||
|
@ -20,23 +23,32 @@
|
|||
$feed_icon = 'https://jasm1nii.xyz/assets/media/itchio-textless-white.svg';
|
||||
$feed_logo = 'https://jasm1nii.xyz/assets/media/main/07042023-me_compressed.webp';
|
||||
|
||||
// PATH TO FETCH PAGES FROM ///////////////////////////////////////////////////////////
|
||||
// PATH TO FETCH PAGES FROM //
|
||||
// __DIR__ is the directory where *this script* is located.
|
||||
// in my case, i first need to go up two directories to get to the site root.
|
||||
$site_root = dirname(__DIR__, 2);
|
||||
// once i'm there, i specify the parent directory where i keep all of my blog pages.
|
||||
// because the values of $blog_root and $blog_entries will be used for generating entry links, forward slashes are a *must*.
|
||||
$blog_root = $site_root.'/blog/articles';
|
||||
// then i specify a pattern that matches the path of each individual page.
|
||||
// my setup is /YYYY/MM/DD/entry.html
|
||||
// then, specify a pattern that matches the path of each individual page.
|
||||
// for example, this will match /YYYY/MM/DD/entry.html
|
||||
$blog_entries = $blog_root.'/*/*/*/*.html';
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
// ENTRY METADATA //
|
||||
// depending on your site setup, this might not be the same as $blog_url.
|
||||
// the generator will append each entry's path (with $blog_root stripped) to the URL specified below.
|
||||
$blog_directory_url = 'https://jasm1nii.xyz/blog/articles';
|
||||
|
||||
// NAME OF FEED FILE
|
||||
$file = 'articles.xml';
|
||||
|
||||
// --------------------------------------------
|
||||
|
||||
// create beginning of feed template.
|
||||
// reference for required elements: https://validator.w3.org/feed/docs/atom.html
|
||||
ob_start();
|
||||
date_default_timezone_set($timezone);
|
||||
|
||||
// reference for required elements: https://validator.w3.org/feed/docs/atom.html
|
||||
echo '<?xml version="1.0" encoding="utf-8"?>'
|
||||
.'<feed xmlns="http://www.w3.org/2005/Atom">'
|
||||
// optionally specify feed generator for debugging purposes.
|
||||
|
@ -96,21 +108,23 @@
|
|||
$title = $x->query("//*[@class='" . $title_class . "']");
|
||||
if ($title->length > 0) {
|
||||
echo '<title>'. $title[0]->nodeValue . '</title>';
|
||||
} elseif (!empty($title)) {
|
||||
$title = $article_dom->getElementsByTagName('h2');
|
||||
} elseif ($title->length == 0) {
|
||||
$title = $article_dom->getElementsByTagName('title');
|
||||
echo '<title>'.$title[0]->nodeValue.'</title>';
|
||||
} else {
|
||||
echo $feed_title;
|
||||
}
|
||||
|
||||
// id
|
||||
echo '<id>https://jasm1nii.xyz/blog/articles/' . ltrim($article, $blog_root) . '</id>';
|
||||
echo '<id>' . $blog_directory_url . '/' . ltrim($article, $blog_root) . '</id>';
|
||||
|
||||
// alternate link
|
||||
echo '<link rel="alternate" type="text/html" href="https://jasm1nii.xyz/blog/articles/' . ltrim($article, $blog_root) . '"/>';
|
||||
echo '<link rel="alternate" type="text/html" href="' . $blog_directory_url . '/' . ltrim($article, $blog_root) . '"/>';
|
||||
|
||||
$updated = $article_dom->getElementsByTagName('time');
|
||||
if (!empty($updated)) {
|
||||
// date updated
|
||||
$updated_class = 'dt-updated';
|
||||
$updated = $x->query("//*[@class='" . $updated_class . "']");
|
||||
if ($updated->length > 0) {
|
||||
$timestamp = $updated[0]->getAttribute('datetime');
|
||||
if (strlen($timestamp) == 10) {
|
||||
echo '<updated>' . $timestamp . 'T00:00:00' . date('P'). '</updated>';
|
||||
|
@ -118,9 +132,43 @@
|
|||
elseif (strlen($timestamp) == 25 || strlen($timestamp) == 20) {
|
||||
echo '<updated>' . $timestamp .'</updated>';
|
||||
}
|
||||
}
|
||||
if ($updated->length == 0) {
|
||||
$updated = $article_dom->getElementsByTagName('time');
|
||||
$timestamp = $updated[0]->getAttribute('datetime');
|
||||
if (strlen($timestamp) == 10) {
|
||||
echo '<updated>' . $timestamp . 'T00:00:00' . date('P'). '</updated>';
|
||||
} elseif (strlen($timestamp) == 25 || strlen($timestamp) == 20) {
|
||||
echo '<updated>' . $timestamp .'</updated>';
|
||||
} else {
|
||||
$article_created = filemtime($article);
|
||||
echo '<updated>' . date(DATE_ATOM, $article_created) . '</updated>';
|
||||
}
|
||||
}
|
||||
|
||||
// date published
|
||||
$published_class = 'dt-published';
|
||||
$published = $x->query("//*[@class='" . $published_class . "']");
|
||||
if ($published->length > 0) {
|
||||
$timestamp = $published[0]->getAttribute('datetime');
|
||||
if (strlen($timestamp) == 10) {
|
||||
echo '<published>' . $timestamp . 'T00:00:00' . date('P'). '</published>';
|
||||
}
|
||||
elseif (strlen($timestamp) == 25 || strlen($timestamp) == 20) {
|
||||
echo '<published>' . $timestamp .'</published>';
|
||||
}
|
||||
}
|
||||
if ($published->length == 0) {
|
||||
$published = $article_dom->getElementsByTagName('time');
|
||||
$timestamp = $published[0]->getAttribute('datetime');
|
||||
if (strlen($timestamp) == 10) {
|
||||
echo '<published>' . $timestamp . 'T00:00:00' . date('P'). '</published>';
|
||||
} elseif (strlen($timestamp) == 25 || strlen($timestamp) == 20) {
|
||||
echo '<published>' . $timestamp .'</published>';
|
||||
} else {
|
||||
$article_created = filectime($article);
|
||||
echo '<updated>' . date(DATE_ATOM, $article_created) . '</updated>';
|
||||
echo '<published>' . date(DATE_ATOM, $article_created) . '</published>';
|
||||
}
|
||||
}
|
||||
|
||||
// summary
|
||||
|
@ -130,6 +178,11 @@
|
|||
echo '<summary type="html">';
|
||||
echo $summary->item(0)->nodeValue;
|
||||
echo '</summary>';
|
||||
} elseif($summary->length == 0) {
|
||||
$summary = get_meta_tags($article)['description'];
|
||||
echo '<summary type="html">';
|
||||
echo $summary;
|
||||
echo '</summary>';
|
||||
} else {
|
||||
echo '<summary type="html">' . 'A summary of this content is not available.' . '</summary>';
|
||||
}
|
||||
|
@ -139,22 +192,24 @@
|
|||
$content = $x->query("//*[@class='" . $content_class . "']");
|
||||
if ($content->length > 0) {
|
||||
// strip line breaks and output a maximum of 500 characters.
|
||||
echo '<content type="html">' . preg_replace('/\s\s+/', ' ',(substr($content->item(0)->nodeValue,0,500))) . '... (<a href="https://jasm1nii.xyz/blog/articles/' . ltrim($article, $blog_root) . '">read more</a>)' . '</content>';
|
||||
echo '<content type="html">' . preg_replace('/\s\s+/', ' ',(substr($content->item(0)->nodeValue,0,500))) . '... (read more on the original page)' . '</content>';
|
||||
} elseif (!empty($content)) {
|
||||
$content = $article_dom->getElementsByTagName('article');
|
||||
echo '<content type="html">' . preg_replace('/\s\s+/', ' ',(substr($content->item(0)->nodeValue,0,500))) . '... (<a href="https://jasm1nii.xyz/blog/articles/' . ltrim($article, $blog_root) . '">read more</a>)' . '</content>';
|
||||
echo '<content type="html">' . preg_replace('/\s\s+/', ' ',(substr($content->item(0)->nodeValue,0,500))) . '... (read more on the original page)' . '</content>';
|
||||
} else {
|
||||
echo '<content type="html">' . 'Content could not be parsed for previewing - view the original article on the website.' . '</content>';
|
||||
echo '<content type="html">' . 'Content could not be parsed as a preview - view the original article on the website.' . '</content>';
|
||||
}
|
||||
|
||||
echo '</entry>';
|
||||
|
||||
// add no more than $max_entries entries.
|
||||
if(++$i > 9) break;
|
||||
if(++$i > ($max_entries-1)) break;
|
||||
}
|
||||
echo '</feed>';
|
||||
|
||||
$xml_str = ob_get_contents();
|
||||
ob_end_clean();
|
||||
file_put_contents($blog_root.'/articles.xml', $xml_str);
|
||||
file_put_contents($blog_root . DIRECTORY_SEPARATOR . $file, $xml_str);
|
||||
|
||||
echo strtoupper(date("h:i:sa")) . ' - Feed successfully generated in ' . realpath($blog_root) . DIRECTORY_SEPARATOR . $file;
|
||||
echo '<br/>Validate your feed at https://validator.w3.org/feed/';
|
||||
?>
|
Loading…
Reference in New Issue