xml-feed-generator/feed_generator.php

160 lines
7.6 KiB
PHP
Raw Normal View History

2023-10-17 14:13:20 +00:00
<?php
// work in progress!!
2023-10-18 04:28:45 +00:00
// GENERAL SETTINGS -------------------------------------------------------------------
// timezone applied to every timestamp that this script generates on its own.
// supported timezones: https://www.php.net/manual/en/timezones.php
$timezone = 'Asia/Jakarta';
// FEED METADATA //////////////////////////////////////////////////////////////////////
// these values are written into the feed exactly as typed, so special characters have to be
// entered as entities - and XML only defines five of them (&amp; &lt; &gt; &quot; &apos;).
// reference: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
$feed_title = 'jasmine&apos;s b(rain)log | jasm1nii.xyz';
$feed_subtitle = 'blog articles by jasmine';
// public address of the blog index, and of the feed file itself.
$blog_url = 'https://jasm1nii.xyz/blog/articles';
$feed_url = 'https://jasm1nii.xyz/blog/articles/articles.xml';
// author details.
$author_name = 'jasmine';
$author_email = 'contact@jasm1nii.xyz';
$author_homepage = 'https://jasm1nii.xyz/';
// feed artwork.
$feed_icon = 'https://jasm1nii.xyz/assets/media/itchio-textless-white.svg';
$feed_logo = 'https://jasm1nii.xyz/assets/media/main/07042023-me_compressed.webp';
// PATH TO FETCH PAGES FROM ///////////////////////////////////////////////////////////
// __DIR__ is the directory that contains *this script*.
// the site root is two directories above it in my setup.
$site_root = dirname(__DIR__, 2);
// parent directory (inside the site root) that holds every blog page.
$blog_root = "{$site_root}/blog/articles";
// glob pattern matching each individual page.
// my setup is /YYYY/MM/DD/entry.html
$blog_entries = "{$blog_root}/*/*/*/*.html";
// ------------------------------------------------------------------------------------
2023-10-17 14:13:20 +00:00
// open the feed document.
// everything echoed from here on is captured in an output buffer rather than printed.
// reference for required elements: https://validator.w3.org/feed/docs/atom.html
ob_start();
date_default_timezone_set($timezone);
// the generator element is optional; it carries the time of this run for debugging purposes.
$generated_at = strtoupper(date("h:i:sa"));
$utc_offset = date('P');
$feed_head = [
    '<?xml version="1.0" encoding="utf-8"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<generator uri="https://github.com/jasm1nii/xml-feed-generator" version="1.1">PHP feed generator by jasm1nii.xyz | Last modified by system at ',
    $generated_at,
    ' (GMT',
    $utc_offset,
    ')</generator>',
    '<title>', $feed_title, '</title>',
    '<subtitle>', $feed_subtitle, '</subtitle>',
    '<id>', $blog_url, '</id>',
    '<link rel="self" href="', $feed_url, '" type="application/atom+xml"/>',
    '<link rel="alternate" href="', $blog_url, '" type="text/html"/>',
];
echo implode('', $feed_head);
2023-10-17 14:13:20 +00:00
// libxml reports HTML 5 elements such as <time> and <article> as parse errors, although it still parses them.
// keep those errors in libxml's internal buffer instead of printing a warning for each one.
libxml_use_internal_errors(true);
// match feed update time with the newest entry.
// glob() sorts its matches alphabetically, so with a /YYYY/MM/DD/ layout the last match is the newest article.
// it returns false on error, which is treated the same as "no articles".
$article_list = glob($blog_entries) ?: [];
$first_article = array_pop($article_list);
$feed_datetime = null;
if ($first_article !== null) {
    $first_article_content = file_get_contents($first_article);
    // loadHTML() cannot parse an empty string, so unreadable or empty files go straight to the fallback below.
    if ($first_article_content !== false && $first_article_content !== '') {
        $first_article_dom = new DOMDocument;
        $first_article_dom->loadHTML($first_article_content);
        libxml_clear_errors();
        $feed_updated = $first_article_dom->getElementsByTagName('time');
        // a DOMNodeList is an object, so empty() never reports it as empty - its length tells whether <time> exists.
        if ($feed_updated->length > 0) {
            $feed_time_value = trim($feed_updated->item(0)->getAttribute('datetime'));
            if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $feed_time_value) === 1) {
                // date only: complete it with midnight and the configured timezone's current UTC offset.
                $feed_datetime = $feed_time_value . 'T00:00:00' . date('P');
            } elseif (preg_match('/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})$/', $feed_time_value) === 1) {
                // already a complete RFC 3339 timestamp.
                $feed_datetime = $feed_time_value;
            }
        }
    }
    // if no usable RFC 3339 timestamp is found, use the file's change time as reported by filectime().
    if ($feed_datetime === null) {
        $first_article_created = filectime($first_article);
        if ($first_article_created !== false) {
            $feed_datetime = date(DATE_ATOM, $first_article_created);
        }
    }
}
// <updated> is a required feed element, so the current time is the last resort.
echo '<updated>' . ($feed_datetime !== null ? $feed_datetime : date(DATE_ATOM)) . '</updated>';
// remaining feed-level elements: the author block, then the icon and logo.
// the configured values are inserted exactly as they were entered in the settings.
echo sprintf(
    '<author><name>%s</name><email>%s</email><uri>%s</uri></author>',
    $author_name,
    $author_email,
    $author_homepage
);
echo sprintf('<icon>%s</icon><logo>%s</logo>', $feed_icon, $feed_logo);
2023-10-17 14:13:20 +00:00
// output entries.
// each matched page becomes one <entry>, newest first, up to $max_entries of them.

// escapes plain text for use as XML character data or inside an attribute value.
// DOM node values come back decoded, so a bare "&" or "<" would otherwise break the feed.
$escape_xml = static function ($text) {
    return htmlspecialchars($text, ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE, 'UTF-8');
};
// escapes plain text for elements declared as type="html": once so that it is valid HTML,
// and once more because that HTML is itself stored as XML character data.
$escape_html_in_xml = static function ($text) use ($escape_xml) {
    return $escape_xml(htmlspecialchars($text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE, 'UTF-8'));
};
// collapses every run of whitespace (line breaks included) into a single space.
$squash_whitespace = static function ($text) {
    $squashed = preg_replace('/\s+/u', ' ', $text);
    return trim($squashed !== null ? $squashed : $text);
};
// returns the first node matching an XPath expression, or null when nothing matches.
$first_match = static function (DOMXPath $xpath, $expression) {
    $nodes = $xpath->query($expression);
    return ($nodes !== false && $nodes->length > 0) ? $nodes->item(0) : null;
};
// builds an XPath expression matching any element that carries the given class,
// including elements whose class attribute lists several classes.
$by_class = static function ($class) {
    return "//*[contains(concat(' ', normalize-space(@class), ' '), ' " . $class . " ')]";
};
// turns the value of a <time datetime="..."> attribute into an RFC 3339 timestamp.
// returns null when the value is neither a plain date nor a complete timestamp.
$to_atom_timestamp = static function ($value) {
    $value = trim($value);
    if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $value) === 1) {
        // date only: complete it with midnight and the configured timezone's current UTC offset.
        return $value . 'T00:00:00' . date('P');
    }
    if (preg_match('/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})$/', $value) === 1) {
        return $value;
    }
    return null;
};

$title_class = 'p-name';
$summary_class = 'p-summary';
$content_class = 'e-content';
// add no more than 10 entries.
$max_entries = 10;
$i = 0;
// glob() returns false on error; treat that like an empty list.
foreach (array_reverse(glob($blog_entries) ?: []) as $article) {
    $article_content = file_get_contents($article);
    // loadHTML() cannot parse an empty string, so unreadable or empty pages are skipped.
    if ($article_content === false || $article_content === '') {
        continue;
    }
    $article_dom = new DOMDocument;
    $article_dom->loadHTML($article_content);
    libxml_clear_errors();
    $x = new DOMXPath($article_dom);

    // path of the page relative to the blog root. ltrim() with $blog_root as its second argument
    // would strip a *set of characters*, not a prefix, so the prefix is cut off by length instead.
    $relative_path = ltrim(substr($article, strlen($blog_root)), '/');
    // percent-encode each path segment; the result needs no further XML escaping.
    $entry_url = rtrim($blog_url, '/') . '/' . implode('/', array_map('rawurlencode', explode('/', $relative_path)));
    // HTML link appended to every content preview, entity-escaped because it sits inside XML.
    $read_more = '... (&lt;a href="' . $entry_url . '"&gt;read more&lt;/a&gt;)';

    echo '<entry>';

    // title: the element marked with the title class, else the first <h2>, else the feed title.
    $title_node = $first_match($x, $by_class($title_class));
    if ($title_node === null) {
        $title_node = $first_match($x, '//h2');
    }
    $title_text = $title_node !== null ? $squash_whitespace($title_node->nodeValue) : '';
    if ($title_text !== '') {
        echo '<title>' . $escape_xml($title_text) . '</title>';
    } else {
        // $feed_title is written as configured, like in the feed header.
        echo '<title>' . $feed_title . '</title>';
    }

    // id
    echo '<id>' . $entry_url . '</id>';
    // alternate link
    echo '<link rel="alternate" type="text/html" href="' . $entry_url . '"/>';

    // updated: the first <time datetime="..."> of the page, else the file's change time, else now.
    $time_node = $first_match($x, '//time');
    $timestamp = $time_node !== null ? $to_atom_timestamp($time_node->getAttribute('datetime')) : null;
    if ($timestamp === null) {
        $article_created = filectime($article);
        $timestamp = $article_created !== false ? date(DATE_ATOM, $article_created) : date(DATE_ATOM);
    }
    echo '<updated>' . $timestamp . '</updated>';

    // summary
    $summary_node = $first_match($x, $by_class($summary_class));
    if ($summary_node !== null) {
        echo '<summary type="html">' . $escape_html_in_xml($summary_node->nodeValue) . '</summary>';
    } else {
        echo '<summary type="html">' . 'A summary of this content is not available.' . '</summary>';
    }

    // content: the element marked with the content class, else the first <article>.
    $content_node = $first_match($x, $by_class($content_class));
    if ($content_node === null) {
        $content_node = $first_match($x, '//article');
    }
    if ($content_node !== null) {
        // strip line breaks and output a maximum of 500 characters.
        $preview = $squash_whitespace($content_node->nodeValue);
        // cut by characters where mbstring is available; if a byte-wise cut splits a character,
        // ENT_SUBSTITUTE in the escaping step replaces the broken sequence.
        $preview = function_exists('mb_substr') ? mb_substr($preview, 0, 500, 'UTF-8') : substr($preview, 0, 500);
        echo '<content type="html">' . $escape_html_in_xml($preview) . $read_more . '</content>';
    } else {
        echo '<content type="html">' . 'Content could not be parsed for previewing - view the original article on the website.' . '</content>';
    }

    echo '</entry>';
    if (++$i >= $max_entries) {
        break;
    }
}
// close the feed document.
echo '</feed>';
// take everything captured since ob_start() and discard the buffer without printing it.
$xml_str = ob_get_clean();
// save the finished feed next to the blog pages.
file_put_contents("{$blog_root}/articles.xml", $xml_str);
2023-10-17 14:13:20 +00:00
?>