<?php
// work in progress!!

// GENERAL SETTINGS -------------------------------------------------------------------

// the timezone referenced by the system for automatic timestamping.
// supported timezones: https://www.php.net/manual/en/timezones.php
$timezone = 'Asia/Jakarta';

// maximum number of entries written to the feed.
$max_entries = 10;

// FEED METADATA //

// these values are written into the XML as-is, so certain characters must be escaped
// as entities - note that XML only accepts five of them (&amp; &lt; &gt; &quot; &apos;).
// reference for character entities: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references

// the apostrophe is written as &apos; - a bare ' would end the single-quoted PHP string.
$feed_title = 'jasmine&apos;s b(rain)log | jasm1nii.xyz';
$feed_subtitle = 'blog articles by jasmine';

// location of the blog index page (or if unavailable, your main page).
$blog_url = 'https://jasm1nii.xyz/blog/articles';

// permalink to the XML feed on your site.
$feed_url = 'https://jasm1nii.xyz/blog/articles/articles.xml';

$author_name = 'jasmine';
$author_email = 'contact@jasm1nii.xyz';
$author_homepage = 'https://jasm1nii.xyz/';

$feed_icon = 'https://jasm1nii.xyz/assets/media/itchio-textless-white.svg';
$feed_logo = 'https://jasm1nii.xyz/assets/media/main/07042023-me_compressed.webp';

// PATH TO FETCH PAGES FROM //

// __DIR__ is the directory where *this script* is located.
// in my case, i first need to go up two directories to get to the site root.
$site_root = dirname(__DIR__, 2);

// once i'm there, i specify the parent directory where i keep all of my blog pages.
// because the values of $blog_root and $blog_entries will be used for generating entry links, forward slashes are a *must*.
$blog_root = $site_root . '/blog/articles';

// then, specify a pattern that matches the path of each individual page.
// for example, this will match /YYYY/MM/DD/entry.html
$blog_entries = $blog_root . '/*/*/*/*.html';

// ENTRY METADATA //

// depending on your site setup, this might not be the same as $blog_url.
// the generator appends each entry's path (relative to $blog_root) to the URL specified below.
$blog_directory_url = 'https://jasm1nii.xyz/blog/articles';

// NAME OF FEED FILE
// the feed is written to $blog_root under this file name.
$file = 'articles.xml';

// --------------------------------------------
// HELPERS ------------------------------------------------------------------------------

/**
 * escape plain text so it can be placed inside an XML element or attribute.
 * invalid byte sequences are substituted instead of discarding the whole string.
 *
 * @param mixed $text value to escape (cast to string).
 * @return string
 */
function feed_xml_escape($text)
{
    return htmlspecialchars((string) $text, ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE, 'UTF-8');
}

/**
 * parse an HTML file into a DOM tree.
 *
 * @param string $path path of the HTML file.
 * @return DOMDocument|null null when the file is unreadable, empty, or cannot be parsed.
 */
function feed_load_html($path)
{
    if (!is_readable($path)) {
        return null;
    }
    $html = file_get_contents($path);
    if ($html === false || trim($html) === '') {
        return null;
    }
    $dom = new DOMDocument;
    $loaded = $dom->loadHTML($html);
    // discard the parse warnings collected through libxml_use_internal_errors().
    libxml_clear_errors();
    return $loaded ? $dom : null;
}

/**
 * find the first element that carries the given class name.
 * the class attribute is matched token-wise, so class="p-name big" also matches 'p-name'.
 *
 * @param DOMXPath $xpath query object bound to the article.
 * @param string $class class name to look for (must not contain quotes).
 * @return DOMNode|null
 */
function feed_first_by_class(DOMXPath $xpath, $class)
{
    $nodes = $xpath->query("//*[contains(concat(' ', normalize-space(@class), ' '), ' " . $class . " ')]");
    if ($nodes === false || $nodes->length === 0) {
        return null;
    }
    return $nodes->item(0);
}

/**
 * trimmed text content of a node, or '' when there is no node.
 *
 * @param DOMNode|null $node
 * @return string
 */
function feed_node_text($node)
{
    return ($node instanceof DOMNode) ? trim($node->nodeValue) : '';
}

/**
 * normalise the value of a `datetime` attribute to an RFC 3339 timestamp.
 * a bare date (YYYY-MM-DD) is treated as midnight in the configured timezone.
 *
 * @param mixed $value raw attribute value.
 * @return string the timestamp, or '' when the value is not a recognised date/time.
 */
function feed_rfc3339($value)
{
    $value = trim((string) $value);
    if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $value) === 1) {
        $midnight = strtotime($value . ' 00:00:00');
        return ($midnight === false) ? '' : $value . 'T00:00:00' . date('P', $midnight);
    }
    if (preg_match('/^(\d{4}-\d{2}-\d{2})[Tt ](\d{2}:\d{2}:\d{2}(?:\.\d+)?)([Zz]|[+-]\d{2}:\d{2})$/', $value, $m) === 1) {
        // Atom expects an uppercase "T" separator and an uppercase "Z".
        return $m[1] . 'T' . $m[2] . strtoupper($m[3]);
    }
    return '';
}

/**
 * RFC 3339 timestamp taken from a node's `datetime` attribute.
 *
 * @param DOMNode|null $node
 * @return string '' when the node is missing or has no usable datetime.
 */
function feed_datetime_of($node)
{
    return ($node instanceof DOMElement) ? feed_rfc3339($node->getAttribute('datetime')) : '';
}

/**
 * pick the timestamp for an entry. tried in order:
 * 1. the first element with the given class, 2. the first <time> element, 3. the file time.
 *
 * @param DOMXPath $xpath query object bound to the article.
 * @param DOMDocument $dom parsed article.
 * @param string $class microformat class, e.g. 'dt-updated'.
 * @param int|false $file_time file timestamp used as the last resort.
 * @return string RFC 3339 timestamp (never empty).
 */
function feed_entry_timestamp(DOMXPath $xpath, DOMDocument $dom, $class, $file_time)
{
    $stamp = feed_datetime_of(feed_first_by_class($xpath, $class));
    if ($stamp === '') {
        $stamp = feed_datetime_of($dom->getElementsByTagName('time')->item(0));
    }
    if ($stamp === '') {
        $stamp = date(DATE_ATOM, ($file_time !== false) ? $file_time : time());
    }
    return $stamp;
}

/**
 * path of an entry relative to the blog root, with forward slashes.
 * the root is removed as a *prefix*; ltrim() would treat it as a set of characters.
 *
 * @param string $path absolute path of the entry.
 * @param string $root blog root directory.
 * @return string
 */
function feed_relative_path($path, $root)
{
    $path = str_replace('\\', '/', $path);
    $root = rtrim(str_replace('\\', '/', $root), '/');
    if ($root !== '' && strpos($path, $root . '/') === 0) {
        return substr($path, strlen($root) + 1);
    }
    return ltrim($path, '/');
}

/**
 * collapse whitespace (including line breaks) and keep at most $limit characters.
 *
 * @param string $text plain text.
 * @param int $limit maximum number of characters.
 * @return string
 */
function feed_preview($text, $limit = 500)
{
    $text = trim(preg_replace('/\s+/', ' ', (string) $text));
    // prefer the multibyte-aware cut so a UTF-8 character is never split in half.
    return function_exists('mb_substr')
        ? mb_substr($text, 0, $limit, 'UTF-8')
        : substr($text, 0, $limit);
}

// FEED TEMPLATE ------------------------------------------------------------------------

// create beginning of feed template.
ob_start();
date_default_timezone_set($timezone);

// reference for required elements: https://validator.w3.org/feed/docs/atom.html
echo '<?xml version="1.0" encoding="utf-8"?>'
    . '<feed xmlns="http://www.w3.org/2005/Atom">'
    // optionally specify feed generator for debugging purposes.
    . '<generator uri="https://github.com/jasm1nii/xml-feed-generator" version="1.1">PHP feed generator by jasm1nii.xyz | Last modified by system at ' . strtoupper(date('h:i:sa')) . ' (GMT' . date('P') . ')</generator>'
    . '<title>' . $feed_title . '</title>'
    . '<subtitle>' . $feed_subtitle . '</subtitle>'
    . '<id>' . $blog_url . '</id>'
    . '<link rel="self" href="' . $feed_url . '" type="application/atom+xml"/>'
    . '<link rel="alternate" href="' . $blog_url . '" type="text/html"/>';

// collect libxml parse warnings internally instead of printing them into the feed;
// HTML 5 elements are otherwise reported as invalid tags.
libxml_use_internal_errors(true);

// list the entries once, newest first (glob() returns the paths in ascending order).
$article_list = glob($blog_entries);
$article_list = ($article_list === false) ? [] : array_reverse($article_list);

// match feed update time with the newest entry.
if (count($article_list) > 0) {
    $first_article = $article_list[0];
    $first_article_dom = feed_load_html($first_article);
    $feed_updated = ($first_article_dom !== null)
        ? feed_datetime_of($first_article_dom->getElementsByTagName('time')->item(0))
        : '';
    // if no RFC 3339 timestamp is found, use the file's change time.
    if ($feed_updated === '') {
        $first_article_created = filectime($first_article);
        $feed_updated = date(DATE_ATOM, ($first_article_created !== false) ? $first_article_created : time());
    }
} else {
    // <updated> is a required element, so an empty feed is stamped with the current time.
    $feed_updated = date(DATE_ATOM);
}
echo '<updated>' . $feed_updated . '</updated>';

// rest of the template.
echo '<author>'
    . '<name>' . $author_name . '</name>'
    . '<email>' . $author_email . '</email>'
    . '<uri>' . $author_homepage . '</uri>'
    . '</author>'
    . '<icon>' . $feed_icon . '</icon>'
    . '<logo>' . $feed_logo . '</logo>';

// output entries.
$i = 0;
foreach ($article_list as $article) {
    if ($i >= $max_entries) {
        break;
    }

    $article_dom = feed_load_html($article);
    if ($article_dom === null) {
        // unreadable or unparsable page: skip it rather than emit a broken entry.
        continue;
    }
    $x = new DOMXPath($article_dom);

    echo '<entry>';

    // title: element with class "p-name", then the page <title>, then the feed title.
    $title = feed_node_text(feed_first_by_class($x, 'p-name'));
    if ($title === '') {
        $title = feed_node_text($article_dom->getElementsByTagName('title')->item(0));
    }
    // $feed_title is already entity-escaped in the settings, so it is not escaped again.
    echo '<title>' . (($title !== '') ? feed_xml_escape($title) : $feed_title) . '</title>';

    // id + alternate link: both are the public URL of the entry.
    $entry_url = rtrim($blog_directory_url, '/') . '/' . feed_xml_escape(feed_relative_path($article, $blog_root));
    echo '<id>' . $entry_url . '</id>';
    echo '<link rel="alternate" type="text/html" href="' . $entry_url . '"/>';

    // date updated (falls back to the file modification time).
    echo '<updated>' . feed_entry_timestamp($x, $article_dom, 'dt-updated', filemtime($article)) . '</updated>';

    // date published (falls back to the file change time).
    echo '<published>' . feed_entry_timestamp($x, $article_dom, 'dt-published', filectime($article)) . '</published>';

    // summary: element with class "p-summary", then <meta name="description">, then a placeholder.
    $summary = feed_node_text(feed_first_by_class($x, 'p-summary'));
    if ($summary === '') {
        $meta = get_meta_tags($article);
        $summary = (is_array($meta) && isset($meta['description'])) ? trim($meta['description']) : '';
    }
    if ($summary === '') {
        $summary = 'A summary of this content is not available.';
    }
    echo '<summary type="html">' . feed_xml_escape($summary) . '</summary>';

    // content: element with class "e-content", then the first <article> element.
    $content = feed_first_by_class($x, 'e-content');
    if ($content === null) {
        $content = $article_dom->getElementsByTagName('article')->item(0);
    }
    // strip line breaks and output a maximum of 500 characters.
    $preview = feed_preview(feed_node_text($content), 500);
    if ($preview !== '') {
        echo '<content type="html">' . feed_xml_escape($preview) . '... (read more on the original page)' . '</content>';
    } else {
        echo '<content type="html">' . 'Content could not be parsed as a preview - view the original article on the website.' . '</content>';
    }

    echo '</entry>';
    $i++;
}
echo '</feed>';

// take the buffered XML out of the output buffer and stop buffering,
// so that the status message below is printed instead of being captured.
$xml_str = ob_get_clean();

// the feed file is written into the blog root.
$feed_path = $blog_root . DIRECTORY_SEPARATOR . $file;

if ($xml_str === false || file_put_contents($feed_path, $xml_str) === false) {
    // report the failure instead of claiming success when nothing was written.
    echo strtoupper(date('h:i:sa')) . ' - Feed could not be written to ' . $feed_path;
} else {
    echo strtoupper(date('h:i:sa')) . ' - Feed successfully generated in ' . realpath($blog_root) . DIRECTORY_SEPARATOR . $file;
    echo '<br/>Validate your feed at https://validator.w3.org/feed/';
}