xml-feed-generator/feed_generator.php

<?php
    // work in progress!!

    // GENERAL SETTINGS -------------------------------------------------------------------

    // the timezone referenced by the system for automatic timestamping.
    // supported timezones: https://www.php.net/manual/en/timezones.php
    $timezone = 'Asia/Jakarta';

    // maximum number of entries written to the feed (newest first).
    $max_entries = 10;

    // maximum number of characters of article text used for each content preview.
    $preview_length = 500;

    // FEED METADATA //////////////////////////////////////////////////////////////////////
    // the values below are written into the feed exactly as typed (the script does not escape them),
    // so certain characters must be escaped as entities - note that XML only accepts five of them.
    // reference: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
    $feed_title = 'jasmine&apos;s b(rain)log | jasm1nii.xyz';
    $feed_subtitle = 'blog articles by jasmine';
    $blog_url = 'https://jasm1nii.xyz/blog/articles';
    $feed_url = 'https://jasm1nii.xyz/blog/articles/articles.xml';
    $author_name = 'jasmine';
    $author_email = 'contact@jasm1nii.xyz';
    $author_homepage = 'https://jasm1nii.xyz/';
    $feed_icon = 'https://jasm1nii.xyz/assets/media/itchio-textless-white.svg';
    $feed_logo = 'https://jasm1nii.xyz/assets/media/main/07042023-me_compressed.webp';

    // PATH TO FETCH PAGES FROM ///////////////////////////////////////////////////////////
    // __DIR__ is the directory where *this script* is located.
    // in my case, i first need to go up two directories to get to the site root.
    $site_root = dirname(__DIR__, 2);
    // once i'm there, i specify the parent directory where i keep all of my blog pages.
    $blog_root = $site_root.'/blog/articles';
    // then i specify a pattern that matches the path of each individual page.
    // my setup is /YYYY/MM/DD/entry.html
    $blog_entries = $blog_root.'/*/*/*/*.html';

    // HELPERS ----------------------------------------------------------------------------

    /**
     * Escape a string for use as XML element text or as an XML attribute value.
     */
    function feed_xml_escape(string $text): string
    {
        return htmlspecialchars($text, ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE, 'UTF-8');
    }

    /**
     * Escape plain text so it can be embedded in an HTML fragment.
     * Atom elements with type="html" carry HTML that is then XML-escaped once more.
     */
    function feed_html_escape(string $text): string
    {
        return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }

    /**
     * Return $path relative to the directory $root.
     * ltrim() cannot be used for this: its second argument is a set of characters, not a prefix,
     * so it also eats the start of the relative path whenever those characters appear in $root.
     */
    function feed_relative_path(string $path, string $root): string
    {
        $prefix = rtrim($root, '/') . '/';
        if (strncmp($path, $prefix, strlen($prefix)) === 0) {
            return (string) substr($path, strlen($prefix));
        }
        // not inside $root: fall back to the path without its leading slash.
        return ltrim($path, '/');
    }

    /**
     * Turn the value of a <time datetime="..."> attribute into an Atom (RFC 3339) timestamp.
     * Complete timestamps are returned unchanged; other values that start with a full
     * YYYY-MM-DD date (date only, space instead of "T", ...) are parsed in the configured timezone.
     *
     * @return string|null the timestamp, or null when the value cannot be used.
     */
    function feed_normalise_timestamp(string $datetime): ?string
    {
        $datetime = trim($datetime);
        $rfc3339 = '/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})$/';
        if (preg_match($rfc3339, $datetime) === 1) {
            return $datetime;
        }
        // require a full date so that values such as "2023" are not misread as a time of day.
        if (preg_match('/^\d{4}-\d{2}-\d{2}/', $datetime) !== 1) {
            return null;
        }
        $parsed = strtotime($datetime);
        return $parsed === false ? null : date(DATE_ATOM, $parsed);
    }

    /**
     * Timestamp for an article: the first <time datetime> in the page if it is usable,
     * otherwise the file's change time as reported by filectime().
     */
    function feed_article_timestamp(DOMDocument $dom, string $path): string
    {
        $times = $dom->getElementsByTagName('time');
        // DOMNodeList is an object, so empty() is never true for it - check ->length instead.
        if ($times->length > 0) {
            $stamp = feed_normalise_timestamp($times->item(0)->getAttribute('datetime'));
            if ($stamp !== null) {
                return $stamp;
            }
        }
        $changed = filectime($path);
        return date(DATE_ATOM, $changed === false ? time() : $changed);
    }

    /**
     * Read and parse one article.
     * NOTE(review): loadHTML() decides the text encoding from the page itself - confirm that
     * every article declares its charset, otherwise non-ASCII text may be misread.
     *
     * @return DOMDocument|null the parsed page, or null if the file is unreadable or empty.
     */
    function feed_load_article(string $path): ?DOMDocument
    {
        $html = file_get_contents($path);
        if ($html === false || trim($html) === '') {
            return null;
        }
        $dom = new DOMDocument;
        $loaded = $dom->loadHTML($html);
        // discard the parser messages collected by libxml_use_internal_errors(true).
        libxml_clear_errors();
        return $loaded === false ? null : $dom;
    }

    /**
     * First element carrying the given class (also when it has other classes as well),
     * or else the first element named $fallback_tag, or else null.
     */
    function feed_first_node(DOMXPath $xpath, string $class, string $fallback_tag = ''): ?DOMNode
    {
        $matches = $xpath->query(
            "//*[contains(concat(' ', normalize-space(@class), ' '), ' " . $class . " ')]"
        );
        if ($matches !== false && $matches->length > 0) {
            return $matches->item(0);
        }
        if ($fallback_tag !== '') {
            $fallback = $xpath->document->getElementsByTagName($fallback_tag);
            if ($fallback->length > 0) {
                return $fallback->item(0);
            }
        }
        return null;
    }

    /**
     * Replace every run of whitespace (including line breaks) with one space and trim the ends.
     */
    function feed_collapse_whitespace(string $text): string
    {
        $collapsed = preg_replace('/\s+/u', ' ', $text);
        return trim($collapsed === null ? $text : $collapsed);
    }

    /**
     * Build the HTML preview of an article: at most $limit characters of its text,
     * followed by a "read more" link to $url.
     */
    function feed_preview_html(string $text, string $url, int $limit): string
    {
        $text = feed_collapse_whitespace($text);
        // cut by characters rather than bytes so a multi-byte character is never split.
        $short = function_exists('mb_substr')
            ? mb_substr($text, 0, $limit, 'UTF-8')
            : substr($text, 0, $limit);
        return feed_html_escape((string) $short)
            . '... (<a href="' . feed_html_escape($url) . '">read more</a>)';
    }

    // ------------------------------------------------------------------------------------

    date_default_timezone_set($timezone);

    // collect libxml's parser messages instead of printing them: the HTML parser reports
    // HTML 5 elements it does not know as errors, even though the page is still parsed.
    libxml_use_internal_errors(true);

    // newest article first. glob() sorts alphabetically, which is chronological for /YYYY/MM/DD/.
    $article_list = glob($blog_entries);
    if ($article_list === false) {
        $article_list = [];
    }
    $article_list = array_reverse($article_list);

    // build the entries.
    $entry_base = rtrim($blog_url, '/') . '/';
    $entries_xml = '';
    $feed_updated = null;
    $entry_count = 0;

    foreach ($article_list as $article) {
        $article_dom = feed_load_article($article);
        if ($article_dom === null) {
            // unreadable or empty page: leave it out instead of aborting the whole feed.
            continue;
        }
        $x = new DOMXPath($article_dom);

        $entry_url = $entry_base . feed_relative_path($article, $blog_root);
        $entry_updated = feed_article_timestamp($article_dom, $article);

        // match feed update time with the newest entry.
        if ($feed_updated === null) {
            $feed_updated = $entry_updated;
        }

        // title: class "p-name", then the first <h2>, then the feed title (already escaped).
        $title_node = feed_first_node($x, 'p-name', 'h2');
        $entry_title = $title_node !== null
            ? feed_xml_escape(feed_collapse_whitespace((string) $title_node->nodeValue))
            : $feed_title;

        // summary
        $summary_node = feed_first_node($x, 'p-summary');
        $summary_html = $summary_node !== null
            ? feed_html_escape(trim((string) $summary_node->nodeValue))
            : 'A summary of this content is not available.';

        // content: class "e-content", then the first <article>.
        $content_node = feed_first_node($x, 'e-content', 'article');
        $content_html = $content_node !== null
            ? feed_preview_html((string) $content_node->nodeValue, $entry_url, $preview_length)
            : 'Content could not be parsed for previewing - view the original article on the website.';

        $entries_xml .= '<entry>'
            . '<title>' . $entry_title . '</title>'
            . '<id>' . feed_xml_escape($entry_url) . '</id>'
            . '<link rel="alternate" type="text/html" href="' . feed_xml_escape($entry_url) . '"/>'
            . '<updated>' . $entry_updated . '</updated>'
            . '<summary type="html">' . feed_xml_escape($summary_html) . '</summary>'
            . '<content type="html">' . feed_xml_escape($content_html) . '</content>'
            . '</entry>';

        // add no more than $max_entries entries.
        if (++$entry_count >= $max_entries) {
            break;
        }
    }

    // <updated> is required by Atom, so use the current time when there are no articles yet.
    if ($feed_updated === null) {
        $feed_updated = date(DATE_ATOM);
    }

    // assemble the feed.
    // reference for required elements: https://validator.w3.org/feed/docs/atom.html
    $xml_str = '<?xml version="1.0" encoding="utf-8"?>'
        . '<feed xmlns="http://www.w3.org/2005/Atom">'
        // optionally specify feed generator for debugging purposes.
        . '<generator uri="https://github.com/jasm1nii/xml-feed-generator" version="1.1">PHP feed generator by jasm1nii.xyz | Last modified by system at ' . strtoupper(date("h:i:sa")) . ' (GMT' . date('P') . ')</generator>'
        . '<title>' . $feed_title . '</title>'
        . '<subtitle>' . $feed_subtitle . '</subtitle>'
        . '<id>' . $blog_url . '</id>'
        . '<link rel="self" href="' . $feed_url . '" type="application/atom+xml"/>'
        . '<link rel="alternate" href="' . $blog_url . '" type="text/html"/>'
        . '<updated>' . $feed_updated . '</updated>'
        . '<author>'
        . '<name>' . $author_name . '</name>'
        . '<email>' . $author_email . '</email>'
        . '<uri>' . $author_homepage . '</uri>'
        . '</author>'
        . '<icon>' . $feed_icon . '</icon>'
        . '<logo>' . $feed_logo . '</logo>'
        . $entries_xml
        . '</feed>';

    // write the feed next to the articles, and say so when that is not possible.
    if (!is_dir($blog_root)) {
        error_log('feed generator: blog directory not found: ' . $blog_root);
    } elseif (file_put_contents($blog_root . '/articles.xml', $xml_str) === false) {
        error_log('feed generator: could not write ' . $blog_root . '/articles.xml');
    }
?>
Add files via upload 2023-10-17 21:13:20 +07:00			`<?php`
			`// work in progress!!`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00
			`// GENERAL SETTINGS -------------------------------------------------------------------`

			`// the timezone referenced by the system for automatic timestamping.`
			`// suported timezones: https://www.php.net/manual/en/timezones.php`
			`$timezone = 'Asia/Jakarta';`

			`// FEED METADATA //////////////////////////////////////////////////////////////////////`
			`// certain characters must be escaped as HTML entities - note that XML only accepts five of them.`
			`// reference: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references`
			`$feed_title = 'jasmine's b(rain)log \| jasm1nii.xyz';`
			`$feed_subtitle = 'blog articles by jasmine';`
			`$blog_url = 'https://jasm1nii.xyz/blog/articles';`
			`$feed_url = 'https://jasm1nii.xyz/blog/articles/articles.xml';`
			`$author_name = 'jasmine';`
			`$author_email = 'contact@jasm1nii.xyz';`
			`$author_homepage = 'https://jasm1nii.xyz/';`
			`$feed_icon = 'https://jasm1nii.xyz/assets/media/itchio-textless-white.svg';`
			`$feed_logo = 'https://jasm1nii.xyz/assets/media/main/07042023-me_compressed.webp';`

			`// PATH TO FETCH PAGES FROM ///////////////////////////////////////////////////////////`
			`// __DIR__ is the directory where this script is located.`
			`// in my case, i first need to go up two directories to get to the site root.`
			`$site_root = dirname(__DIR__, 2);`
			`// once i'm there, i specify the parent directory where i keep all of my blog pages.`
			`$blog_root = $site_root.'/blog/articles';`
			`// then i specify a pattern that matches the path of each individual page.`
			`// my setup is /YYYY/MM/DD/entry.html`
			`$blog_entries = $blog_root.'////.html';`

			`// ------------------------------------------------------------------------------------`
Add files via upload 2023-10-17 21:13:20 +07:00
			`// create beginning of feed template.`
			`// reference for required elements: https://validator.w3.org/feed/docs/atom.html`
			`ob_start();`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`date_default_timezone_set($timezone);`
Add files via upload 2023-10-17 21:13:20 +07:00
			`echo '<?xml version="1.0" encoding="utf-8"?>'`
			`.'<feed xmlns="http://www.w3.org/2005/Atom">'`
			`// optionally specify feed generator for debugging purposes.`
remove unnecessary loops 2023-10-18 13:44:40 +07:00			`.'<generator uri="https://github.com/jasm1nii/xml-feed-generator" version="1.1">PHP feed generator by jasm1nii.xyz \| Last modified by system at ' . strtoupper(date("h:i:sa")) . ' (GMT' . date('P') . ')</generator>'`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`.'<title>' . $feed_title . '</title>'`
			`.'<subtitle>' . $feed_subtitle . '</subtitle>'`
			`.'<id>' . $blog_url . '</id>'`
			`.'<link rel="self" href="'. $feed_url .'" type="application/atom+xml"/>'`
			`.'<link rel="alternate" href="' . $blog_url .'" type="text/html"/>';`
Add files via upload 2023-10-17 21:13:20 +07:00
			`// force libxml to parse all HTML elements, including HTML 5. by default, the extension can only read valid HTML 4.`
			`libxml_use_internal_errors(true);`

			`// match feed update time with the newest entry.`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`$article_list = glob($blog_entries);`
Add files via upload 2023-10-17 21:13:20 +07:00			`$first_article = array_pop($article_list);`
			`$first_article_content = file_get_contents($first_article);`
			`$first_article_dom = new DOMDocument;`
			`$first_article_dom->loadHTML($first_article_content);`
			`$feed_updated = $first_article_dom->getElementsByTagName('time');`
remove unnecessary loops 2023-10-18 13:44:40 +07:00			`if (!empty($feed_updated)) {`
			`$feed_datetime = $feed_updated[0]->getAttribute('datetime');`
			`if (strlen($feed_datetime) == 10) {`
			`echo '<updated>' . $feed_datetime . 'T00:00:00' . date('P') .'</updated>';`
			`}`
			`elseif (strlen($feed_datetime) == 25 \|\| strlen($feed_datetime) == 20) {`
			`echo '<updated>' . $feed_datetime .'</updated>';`
			`}`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`// if no RFC 3339 timestamp is found, use the file creation date.`
remove unnecessary loops 2023-10-18 13:44:40 +07:00			`} else {`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`$first_article_created = filectime($first_article);`
			`echo '<updated>' . date(DATE_ATOM, $first_article_created) . '</updated>';`
Add files via upload 2023-10-17 21:13:20 +07:00			`}`

			`// rest of the template.`
			`echo '<author>'`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`.'<name>' . $author_name . '</name>'`
			`.'<email>' . $author_email . '</email>'`
			`.'<uri>' . $author_homepage . '</uri>'`
Add files via upload 2023-10-17 21:13:20 +07:00			`.'</author>'`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`.'<icon>' . $feed_icon . '</icon>'`
			`.'<logo>' . $feed_logo . '</logo>';`
Add files via upload 2023-10-17 21:13:20 +07:00
			`// output entries.`
			`$i = 0;`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`foreach (array_reverse(glob($blog_entries)) as $article) {`
Add files via upload 2023-10-17 21:13:20 +07:00			`$article_content = file_get_contents($article);`
			`$article_dom = new DOMDocument;`
			`$article_dom->loadHTML($article_content);`

			`echo '<entry>';`

remove unnecessary loops 2023-10-18 13:44:40 +07:00			`$x = new DOMXPath($article_dom);`

Add files via upload 2023-10-17 21:13:20 +07:00			`// title`
remove unnecessary loops 2023-10-18 13:44:40 +07:00			`$title_class = 'p-name';`
			`$title = $x->query("//*[@class='" . $title_class . "']");`
			`if ($title->length > 0) {`
			`echo '<title>'. $title[0]->nodeValue . '</title>';`
			`} elseif (!empty($title)) {`
			`$title = $article_dom->getElementsByTagName('h2');`
			`echo '<title>'.$title[0]->nodeValue.'</title>';`
			`} else {`
			`echo $feed_title;`
Add files via upload 2023-10-17 21:13:20 +07:00			`}`

			`// id`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`echo '<id>https://jasm1nii.xyz/blog/articles/' . ltrim($article, $blog_root) . '</id>';`
Add files via upload 2023-10-17 21:13:20 +07:00
			`// alternate link`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`echo '<link rel="alternate" type="text/html" href="https://jasm1nii.xyz/blog/articles/' . ltrim($article, $blog_root) . '"/>';`
Add files via upload 2023-10-17 21:13:20 +07:00
			`$updated = $article_dom->getElementsByTagName('time');`
remove unnecessary loops 2023-10-18 13:44:40 +07:00			`if (!empty($updated)) {`
			`$timestamp = $updated[0]->getAttribute('datetime');`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`if (strlen($timestamp) == 10) {`
			`echo '<updated>' . $timestamp . 'T00:00:00' . date('P'). '</updated>';`
			`}`
			`elseif (strlen($timestamp) == 25 \|\| strlen($timestamp) == 20) {`
			`echo '<updated>' . $timestamp .'</updated>';`
			`}`
remove unnecessary loops 2023-10-18 13:44:40 +07:00			`} else {`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`$article_created = filectime($article);`
			`echo '<updated>' . date(DATE_ATOM, $article_created) . '</updated>';`
Add files via upload 2023-10-17 21:13:20 +07:00			`}`

			`// summary`
			`$summary_class = 'p-summary';`
			`$summary = $x->query("//*[@class='" . $summary_class . "']");`
			`if ($summary->length > 0) {`
			`echo '<summary type="html">';`
			`echo $summary->item(0)->nodeValue;`
			`echo '</summary>';`
remove unnecessary loops 2023-10-18 13:44:40 +07:00			`} else {`
			`echo '<summary type="html">' . 'A summary of this content is not available.' . '</summary>';`
Add files via upload 2023-10-17 21:13:20 +07:00			`}`

			`// content`
			`$content_class = 'e-content';`
			`$content = $x->query("//*[@class='" . $content_class . "']");`
			`if ($content->length > 0) {`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`// strip line breaks and output a maximum of 500 characters.`
			`echo '<content type="html">' . preg_replace('/\s\s+/', ' ',(substr($content->item(0)->nodeValue,0,500))) . '... (<a href="https://jasm1nii.xyz/blog/articles/' . ltrim($article, $blog_root) . '">read more</a>)' . '</content>';`
remove unnecessary loops 2023-10-18 13:44:40 +07:00			`} elseif (!empty($content)) {`
			`$content = $article_dom->getElementsByTagName('article');`
			`echo '<content type="html">' . preg_replace('/\s\s+/', ' ',(substr($content->item(0)->nodeValue,0,500))) . '... (<a href="https://jasm1nii.xyz/blog/articles/' . ltrim($article, $blog_root) . '">read more</a>)' . '</content>';`
Add files via upload 2023-10-17 21:13:20 +07:00			`} else {`
remove unnecessary loops 2023-10-18 13:44:40 +07:00			`echo '<content type="html">' . 'Content could not be parsed for previewing - view the original article on the website.' . '</content>';`
Add files via upload 2023-10-17 21:13:20 +07:00			`}`

			`echo '</entry>';`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00
			`// add no more than 10 entries.`
Add files via upload 2023-10-17 21:13:20 +07:00			`if(++$i > 9) break;`
			`}`
			`echo '</feed>';`

			`$xml_str = ob_get_contents();`
			`ob_end_clean();`
add variables, fallback for timestamps 2023-10-18 11:28:45 +07:00			`file_put_contents($blog_root.'/articles.xml', $xml_str);`
Add files via upload 2023-10-17 21:13:20 +07:00			`?>`