Current File : /home/pacjaorg/public_html/km/administrator/components/com_finder/src/Indexer/Parser/Html.php |
<?php
/**
* @package Joomla.Administrator
* @subpackage com_finder
*
* @copyright (C) 2011 Open Source Matters, Inc. <https://www.joomla.org>
* @license GNU General Public License version 2 or later; see LICENSE.txt
*/
namespace Joomla\Component\Finder\Administrator\Indexer\Parser;
use Joomla\Component\Finder\Administrator\Indexer\Parser;
// phpcs:disable PSR1.Files.SideEffects
\defined('_JEXEC') or die;
// phpcs:enable PSR1.Files.SideEffects
/**
* HTML Parser class for the Finder indexer package.
*
* @since 2.5
*/
class Html extends Parser
{
/**
* Method to parse input and extract the plain text. Because this method is
* called from both inside and outside the indexer, it needs to be able to
* batch out its parsing functionality to deal with the inefficiencies of
* regular expressions. We will parse recursively in 2KB chunks.
*
* @param string $input The input to parse.
*
* @return string The plain text input.
*
* @since 2.5
*/
public function parse($input)
{
// Strip invalid UTF-8 characters.
$oldSetting = ini_get('mbstring.substitute_character');
ini_set('mbstring.substitute_character', 'none');
$input = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
ini_set('mbstring.substitute_character', $oldSetting);
// Remove anything between <head> and </head> tags. Do this first
// because there might be <script> or <style> tags nested inside.
$input = $this->removeBlocks($input, '<head>', '</head>');
// Convert <style> and <noscript> tags to <script> tags
// so we can remove them efficiently.
$search = [
'<style', '</style',
'<noscript', '</noscript',
];
$replace = [
'<script', '</script',
'<script', '</script',
];
$input = str_replace($search, $replace, $input);
// Strip all script blocks.
$input = $this->removeBlocks($input, '<script', '</script>');
// Decode HTML entities.
$input = html_entity_decode($input, ENT_QUOTES, 'UTF-8');
// Convert entities equivalent to spaces to actual spaces.
$input = str_replace([' ', ' '], ' ', $input);
// Add a space before both the OPEN and CLOSE tags of BLOCK and LINE BREAKING elements,
// e.g. 'all<h1><em>m</em>obile List</h1>' will become 'all mobile List'
$input = preg_replace('/(<|<\/)(' .
'address|article|aside|blockquote|br|canvas|dd|div|dl|dt|' .
'fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|header|hgroup|hr|li|' .
'main|nav|noscript|ol|output|p|pre|section|table|tfoot|ul|video' .
')\b/i', ' $1$2', $input);
// Strip HTML tags.
$input = strip_tags($input);
return parent::parse($input);
}
/**
* Method to process HTML input and extract the plain text.
*
* @param string $input The input to process.
*
* @return string The plain text input.
*
* @since 2.5
*/
protected function process($input)
{
// Replace any amount of white space with a single space.
return preg_replace('#\s+#u', ' ', $input);
}
/**
* Method to remove blocks of text between a start and an end tag.
* Each block removed is effectively replaced by a single space.
*
* Note: The start tag and the end tag must be different.
* Note: Blocks must not be nested.
* Note: This method will function correctly with multi-byte strings.
*
* @param string $input String to be processed.
* @param string $startTag String representing the start tag.
* @param string $endTag String representing the end tag.
*
* @return string with blocks removed.
*
* @since 3.4
*/
private function removeBlocks($input, $startTag, $endTag)
{
$return = '';
$offset = 0;
$startTagLength = strlen($startTag);
$endTagLength = strlen($endTag);
// Find the first start tag.
$start = stripos($input, $startTag);
// If no start tags were found, return the string unchanged.
if ($start === false) {
return $input;
}
// Look for all blocks defined by the start and end tags.
while ($start !== false) {
// Accumulate the substring up to the start tag.
$return .= substr($input, $offset, $start - $offset) . ' ';
// Look for an end tag corresponding to the start tag.
$end = stripos($input, $endTag, $start + $startTagLength);
// If no corresponding end tag, leave the string alone.
if ($end === false) {
// Fix the offset so part of the string is not duplicated.
$offset = $start;
break;
}
// Advance the start position.
$offset = $end + $endTagLength;
// Look for the next start tag and loop.
$start = stripos($input, $startTag, $offset);
}
// Add in the final substring after the last end tag.
$return .= substr($input, $offset);
return $return;
}
}