PortableInfobox/includes/services/Sanitizers/NodeSanitizer.php

268 lines
7.5 KiB
PHP
Raw Normal View History

2015-12-23 14:54:03 +00:00
<?php
namespace PortableInfobox\Sanitizers;
use PortableInfobox\Helpers\HtmlHelper;
use MediaWiki\Logger\LoggerFactory;
2015-12-23 14:54:03 +00:00
abstract class NodeSanitizer implements NodeTypeSanitizerInterface {
2016-01-04 11:09:50 +00:00
// Tag name of the synthetic root element: input text is wrapped in it before
// parsing and every XPath expression built by this class is anchored below it.
private $rootNodeTag = 'body';
2016-01-04 11:09:50 +00:00
/**
 * Sanitizer configuration
 * Can be overridden by child classes
 */
// Explicitly allowed tags, e.g. 'a'. Each entry is used both as an XPath
// fragment (getAllNodesXPath) and compared against DOMNode::nodeName
// (isNodeAllowedByTag).
2016-01-04 12:09:39 +00:00
protected $allowedTags = [ ];
// Valid 'internal' node names (per libxml convention, e.g. a, #text), compared
// against DOMNode::nodeName in isNodeAllowedByTag.
2016-01-04 11:09:50 +00:00
protected $validNodeNames = [ '#text' ];
// Parents whose serialized children get padded with a space on each side.
// NOTE: generateHTML compares these entries against the parent's nodeName, so
// despite the property name they are matched as plain node names, not XPath.
protected $selectorsWrappingTextToPad = [ ];
// XPath fragments describing root nodes of known features that should not be sanitized
2016-01-04 11:09:50 +00:00
protected $selectorsWrappingAllowedFeatures = [ ];
// XPath fragments describing nodes that are removed from the DOM entirely
// before any other processing (see cleanUpDOM)
2016-01-04 11:09:50 +00:00
protected $selectorsForFullRemoval = [ ];
2015-12-28 09:16:33 +00:00
2015-12-23 14:54:03 +00:00
/**
 * Sanitizes the text of a single infobox element (for instance a title or a label).
 *
 * The text is wrapped in the root tag, parsed into a DOM, reduced to the
 * markup this sanitizer permits and trimmed. When the outcome differs from
 * the input, the difference is logged to the 'PortableInfobox' channel.
 *
 * @param string $elementText text of the element to sanitize
 * @return string the sanitized text, or the input itself when nothing changed
 */
protected function sanitizeElementData( $elementText ) {
	$wrappedMarkup = $this->prepareValidXML( $elementText );
	$document = HtmlHelper::createDOMDocumentFromText( $wrappedMarkup );
	$sanitizedText = trim( $this->cleanUpDOM( $document ) );

	// Nothing was stripped: hand the input back untouched and skip logging
	if ( $sanitizedText === $elementText ) {
		return $elementText;
	}

	$logContext = [
		'originalText' => $elementText,
		'trimmedText' => $sanitizedText
	];
	LoggerFactory::getInstance( 'PortableInfobox' )->info(
		'Stripping HTML tags from infobox element: [ originalText: "{originalText}", trimmedText: "{trimmedText}" ]',
		$logContext
	);

	return $sanitizedText;
}
2015-12-28 09:16:33 +00:00
2016-01-04 12:09:39 +00:00
/**
 * Wraps the given text in the sanitizer's root element so that it can be
 * parsed as a single document fragment.
 *
 * Only the opening and closing root tags are added; no XML declaration or
 * encoding header is emitted by this method.
 *
 * @param string $elementText text to wrap
 * @return string the text enclosed in the root element
 */
protected function prepareValidXML( $elementText ) {
	$openingTag = \Xml::openElement( $this->rootNodeTag );
	$closingTag = \Xml::closeElement( $this->rootNodeTag );

	return $openingTag . $elementText . $closingTag;
}
2015-12-28 09:16:33 +00:00
/**
 * Reduces a parsed document to the markup this sanitizer lets through.
 *
 * Steps, in order: drop every node matching $selectorsForFullRemoval,
 * collect the nodes that are to be kept, serialize them and collapse
 * whitespace runs in the result.
 *
 * @param \DOMDocument $dom document to clean up (modified in place)
 * @return string sanitized, whitespace-normalized markup
 */
protected function cleanUpDOM( $dom ) {
	$xpath = new \DOMXPath( $dom );

	// Unwanted subtrees go first so they can never be picked up below
	$this->removeNodesBySelector( $xpath, $this->selectorsForFullRemoval );

	$keptNodes = $this->extractNeededNodes( $xpath );
	$markup = $this->generateHTML( $keptNodes, $dom );

	return $this->normalizeWhitespace( $markup );
}
/**
 * Produces sanitized HTML markup from an array of DOM nodes.
 *
 * Every node is serialized with DOMDocument::saveHTML(). Text nodes are
 * additionally passed through htmlspecialchars(). Nodes whose parent's node
 * name is listed in $selectorsWrappingTextToPad are surrounded by single
 * spaces. The pieces are concatenated without a separator.
 *
 * @param \DOMNode[] $nodes nodes to serialize, in output order
 * @param \DOMDocument $dom document the nodes belong to
 * @return string
 */
protected function generateHTML( $nodes, $dom ) {
	$result = [ ];

	foreach ( $nodes as $node ) {
		$outputHtml = $dom->saveHTML( $node );

		if ( $node->nodeName === '#text' ) {
			// As the input text is already escaped, we make sure that our output will be escaped too
			$outputHtml = htmlspecialchars( $outputHtml, ENT_QUOTES );
		}

		$parent = $node->parentNode;
		if ( $parent && in_array( $parent->nodeName, $this->selectorsWrappingTextToPad, true ) ) {
			// Pad the already-escaped markup. Padding the raw serialization
			// here would silently discard the escaping applied above.
			$outputHtml = sprintf( ' %s ', $outputHtml );
		}

		$result[] = $outputHtml;
	}

	return implode( '', $result );
}
/**
 * Replaces every run of whitespace with a single space.
 * Transparent from a non-preformatted HTML point of view.
 *
 * Uses mb_ereg_replace(); the former alias mbereg_replace() was deprecated
 * in PHP 7.3 and removed in PHP 8.0.
 *
 * @param string $text
 * @return string
 */
protected function normalizeWhitespace( $text ) {
	return mb_ereg_replace( '\s+', ' ', $text );
}
2016-01-04 11:09:50 +00:00
/**
 * Builds the XPath union that selects every node this sanitizer may keep.
 *
 * The union consists of, in this order: one path per allowed feature
 * selector, one path per allowed tag and finally a path matching all text
 * nodes. Every path is anchored below the root element.
 *
 * @return string
 */
protected function getAllNodesXPath() {
	$root = $this->rootNodeTag;

	// array_values() keeps array_merge() from collapsing entries that share a string key
	$selectors = array_merge(
		array_values( $this->selectorsWrappingAllowedFeatures ),
		array_values( $this->allowedTags )
	);
	$selectors[] = 'text()';

	$paths = array_map(
		function ( $selector ) use ( $root ) {
			return sprintf( '//%s//%s', $root, $selector );
		},
		$selectors
	);

	return implode( ' | ', $paths );
}
/**
 * Tells whether a node has to be dropped because of its state.
 * Currently this only applies to anchors ('a') whose node value is empty.
 *
 * @param \DOMNode|null $DOMnode
 * @return bool
 */
protected function shouldNodeBeRemoved( $DOMnode ) {
	if ( !$DOMnode ) {
		return false;
	}

	$isAnchor = $DOMnode->nodeName === 'a';

	return $isAnchor && $DOMnode->nodeValue === '';
}
2015-12-28 09:16:33 +00:00
/**
 * Tells whether the node's name is explicitly permitted, i.e. listed in
 * either $validNodeNames or $allowedTags (strict string comparison).
 *
 * @param \DOMNode $node
 * @return bool
 */
protected function isNodeAllowedByTag( $node ) {
	$permittedNames = array_merge( $this->validNodeNames, $this->allowedTags );

	return in_array( $node->nodeName, $permittedNames, true );
}
/**
 * Detaches every node matched by the given selectors from its parent,
 * removing it (and its subtree) from the document.
 *
 * Each selector is evaluated as an XPath fragment below the root element.
 *
 * @param \DOMXPath $xpath
 * @param array $selectorsToRemove XPath fragments of the nodes to remove
 */
protected function removeNodesBySelector( $xpath, $selectorsToRemove = [ ] ) {
	foreach ( $selectorsToRemove as $selector ) {
		$query = sprintf( '//%s//%s', $this->rootNodeTag, $selector );
		$matches = $xpath->query( $query );

		foreach ( $matches as $doomedNode ) {
			$doomedNode->parentNode->removeChild( $doomedNode );
		}
	}
}
/**
 * Collects the nodes whose contents are allowed by the current sanitizer's
 * configuration.
 *
 * All candidates selected by getAllNodesXPath() are visited in query order.
 * A candidate is kept when shouldNodeBeProcessed() accepts it; kept nodes
 * that isAllowedFeatureNode() reports as features are remembered so that
 * later candidates located below them can be skipped.
 *
 * @param \DOMXPath $xpath
 * @return \DOMNode[] nodes to be serialized, in query order
 */
protected function extractNeededNodes( $xpath ) {
	$keptNodes = [ ];
	$processedFeatureNodes = [ ];

	$candidates = $xpath->query( $this->getAllNodesXPath() );

	foreach ( $candidates as $candidate ) {
		if ( !$this->shouldNodeBeProcessed( $candidate, $keptNodes, $processedFeatureNodes, $xpath ) ) {
			continue;
		}

		$keptNodes[] = $candidate;

		// Store the information that a given feature node was processed
		if ( $this->isAllowedFeatureNode( $candidate, $xpath ) ) {
			$processedFeatureNodes[] = $candidate;
		}
	}

	return $keptNodes;
}
/**
 * Checks if the given node is itself a wrapping tag for an allowed feature,
 * i.e. whether it is matched by one of $selectorsWrappingAllowedFeatures.
 *
 * The selectors are evaluated as absolute paths below the root element. An
 * absolute XPath expression ignores the context node passed to
 * DOMXPath::query(), so the node is compared against the matches explicitly
 * instead of being handed over as context. Otherwise every node would be
 * reported as a feature as soon as the document contains any feature.
 *
 * @param \DOMNode $node
 * @param \DOMXPath $xpath
 * @return bool
 */
protected function isAllowedFeatureNode( $node, $xpath ) {
	foreach ( $this->selectorsWrappingAllowedFeatures as $selector ) {
		$matches = $xpath->query( sprintf( '//%s//%s', $this->rootNodeTag, $selector ) );

		// query() yields false for a malformed expression; treat that as "no match"
		if ( !$matches ) {
			continue;
		}

		foreach ( $matches as $match ) {
			if ( $match->isSameNode( $node ) ) {
				return true;
			}
		}
	}

	return false;
}
2016-01-04 11:09:50 +00:00
/**
 * Used for skipping processing on subnodes of feature nodes (e.g. a citation).
 *
 * Walks up the ancestor chain of the node and reports whether any ancestor
 * is (by identity) one of the already processed feature nodes. The node
 * itself is not considered.
 *
 * @param \DOMNode $node
 * @param \DOMNode[] $featureNodes feature nodes processed so far
 * @return bool
 */
protected function isDescendantOfProcessedFeatureNode( $node, $featureNodes ) {
	for ( $ancestor = $node->parentNode; $ancestor; $ancestor = $ancestor->parentNode ) {
		if ( in_array( $ancestor, $featureNodes, true ) ) {
			return true;
		}
	}

	return false;
}
2016-01-04 12:09:39 +00:00
/**
 * Tells whether the node is the only child of a node that has already been
 * selected for output.
 *
 * @param \DOMNode $node
 * @param \DOMNode[] $nodes nodes selected so far
 * @return bool
 */
protected function isChildOfProcessedTagNode( $node, $nodes ) {
	$parent = $node->parentNode;

	if ( !$parent || !in_array( $parent, $nodes, true ) ) {
		return false;
	}

	return $parent->childNodes->length === 1;
}
/**
 * Decides whether a node goes into the output. A node qualifies when
 * - it is allowed, either by its tag or by matching a feature selector, and
 * - it is not excluded because of its state (e.g. an empty anchor), and
 * - it is not the single child of an already selected node, and
 * - it is not located below an already processed feature node.
 *
 * @param \DOMNode $node
 * @param \DOMNode[] $simpleTagNodes nodes selected so far
 * @param \DOMNode[] $featureNodes feature nodes processed so far
 * @param \DOMXPath $xpath
 * @return bool
 */
protected function shouldNodeBeProcessed( $node, $simpleTagNodes, $featureNodes, $xpath ) {
	// The second argument is not declared by isNodeAllowedByTag() in this class
	// and is therefore ignored by it.
	$isAllowed = $this->isNodeAllowedByTag( $node, $this->allowedTags )
		|| $this->isAllowedFeatureNode( $node, $xpath );
	if ( !$isAllowed ) {
		return false;
	}

	if ( $this->shouldNodeBeRemoved( $node ) ) {
		return false;
	}

	if ( $this->isChildOfProcessedTagNode( $node, $simpleTagNodes ) ) {
		return false;
	}

	return !$this->isDescendantOfProcessedFeatureNode( $node, $featureNodes );
}
2015-12-23 14:54:03 +00:00
}