2015-12-23 14:54:03 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
use Wikia\Logger\WikiaLogger;
|
|
|
|
|
2015-12-29 07:10:24 +00:00
|
|
|
abstract class NodeSanitizer implements NodeTypeSanitizerInterface {
|
2016-01-04 11:09:50 +00:00
|
|
|
/** @var string Tag name of the root element: input text is wrapped in it before parsing and all XPath queries of this class are anchored below it */
private $rootNodeTag = 'body';
|
2015-12-30 10:00:10 +00:00
|
|
|
|
2016-01-04 11:09:50 +00:00
|
|
|
/*
 * Sanitizer configuration.
 * Every property below is protected so that child classes can override the defaults.
 */

/** @var string[] Node names accepted by isNodeAllowedByTag() in addition to $validNodeNames */
protected $allowedTags = [];

/** @var string[] Node names that are always accepted by isNodeAllowedByTag(); by default only text nodes */
protected $validNodeNames = [ '#text' ];

/** @var string[] XPath fragments (appended after "//<root>//") used by isAllowedFeatureNode() */
protected $selectorsWrappingAllowedFeatures = [];

/** @var string[] XPath fragments (appended after "//<root>//") whose matches cleanUpDOM() removes entirely */
protected $selectorsForFullRemoval = [];
|
2015-12-28 09:16:33 +00:00
|
|
|
|
2015-12-23 14:54:03 +00:00
|
|
|
/**
 * Sanitize a single text element, i.e. title or label.
 *
 * The text is wrapped in the root node, parsed as HTML, cleaned up by cleanUpDOM()
 * and trimmed. When the outcome differs from the input, the change is logged and
 * the sanitized text is returned; otherwise the input is returned untouched.
 *
 * @param $elementText
 * @return string
 */
protected function sanitizeElementData( $elementText ) {
	$document = new \DOMDocument();
	$document->loadHTML( $this->wrapTextInRootNode( $elementText ) );

	$sanitizedText = trim( $this->cleanUpDOM( $document ) );
	// Drop whatever the HTML parser left in the libxml error buffer
	libxml_clear_errors();

	// Nothing was changed by the sanitizer: hand the input back without logging
	if ( $sanitizedText === $elementText ) {
		return $elementText;
	}

	$logContext = [
		'originalText' => $elementText,
		'trimmedText' => $sanitizedText
	];
	WikiaLogger::instance()->info( 'Stripping HTML tags from infobox element', $logContext );

	return $sanitizedText;
}
|
2015-12-28 09:16:33 +00:00
|
|
|
|
2016-01-04 12:09:39 +00:00
|
|
|
/**
 * Surrounds the given text with opening and closing tags of the root node.
 *
 * @param $elementText
 * @return string
 */
protected function wrapTextInRootNode( $elementText ) {
	$tag = $this->rootNodeTag;

	return \Xml::openElement( $tag ) . $elementText . \Xml::closeElement( $tag );
}
|
|
|
|
|
2015-12-28 09:16:33 +00:00
|
|
|
/**
 * Removes nodes that do not need to remain in the resulting output.
 * With the default configuration only text nodes are left.
 *
 * Steps: drop everything matched by $selectorsForFullRemoval, pick the nodes
 * worth keeping, serialize them and collapse whitespace.
 *
 * @param $dom DOMDocument
 * @return string
 */
protected function cleanUpDOM( $dom ) {
	$xpath = new \DOMXPath( $dom );

	// Must happen before extraction so that removed subtrees are never considered
	$this->removeNodesBySelector( $xpath, $this->selectorsForFullRemoval );

	$keptNodes = $this->extractNeededNodes( $xpath );
	$markup = $this->generateHTML( $keptNodes, $dom );

	return $this->normalizeWhitespace( $markup );
}
|
|
|
|
|
|
|
|
/**
 * Produces sanitized HTML markup from DOMNode array.
 *
 * Every node is serialized with DOMDocument::saveHTML(); the pieces are joined
 * with a single space.
 *
 * @param $nodes array of DOMNode
 * @param $dom DOMDocument
 * @return string
 */
protected function generateHTML( $nodes, $dom ) {
	$fragments = [];

	foreach ( $nodes as $currentNode ) {
		$markup = $dom->saveHTML( $currentNode );

		if ( $currentNode->nodeName === '#text' ) {
			// The input text is already escaped, so text output is escaped as well
			$fragments[] = htmlspecialchars( $markup, ENT_QUOTES );
		} else {
			$fragments[] = $markup;
		}
	}

	return implode( ' ', $fragments );
}
|
|
|
|
|
|
|
|
/**
 * Replaces multiple whitespaces with single ones.
 * Transparent from non-preformatted HTML point of view.
 *
 * Uses the multibyte-aware regex engine so that multibyte text is not split
 * in the middle of a character.
 *
 * @param $text string
 * @return string
 */
protected function normalizeWhitespace( $text ) {
	// mbereg_replace() was only a deprecated alias (PHP 7.3) and no longer exists in PHP 8;
	// mb_ereg_replace() is the canonical name and behaves identically.
	return mb_ereg_replace( '\s+', ' ', $text );
}
|
2015-12-30 10:00:10 +00:00
|
|
|
|
2016-01-04 11:09:50 +00:00
|
|
|
/**
 * Returns xpath string covering all legal tag and text nodes concerning the sanitizer:
 * the element children of the root node plus every text node below it.
 *
 * @return string
 */
protected function getAllNodesXPath() {
	$root = $this->rootNodeTag;

	return sprintf( '//%s/* | //%s//text()', $root, $root );
}
|
|
|
|
|
|
|
|
/**
 * Tells whether the node is an anchor ("a") element whose node value is an empty string.
 *
 * @param $DOMnode DOMNode
 * @return bool
 */
protected function shouldNodeBeRemoved( $DOMnode ) {
	if ( !$DOMnode ) {
		return false;
	}

	$isAnchor = $DOMnode->nodeName === 'a';

	return $isAnchor && $DOMnode->nodeValue === '';
}
|
2015-12-28 09:16:33 +00:00
|
|
|
|
2015-12-30 10:00:10 +00:00
|
|
|
/**
 * Checks whether the node's name is explicitly allowed, i.e. listed in either
 * $validNodeNames or $allowedTags.
 *
 * @param $node DOMNode
 * @return bool
 */
protected function isNodeAllowedByTag( $node ) {
	$permittedNames = array_merge( $this->validNodeNames, $this->allowedTags );

	return in_array( $node->nodeName, $permittedNames, true );
}
|
|
|
|
|
|
|
|
/**
 * Removes nodes matched by the given selectors completely, together with their subtrees.
 *
 * @param $xpath DOMXPath
 * @param $selectorsToRemove array XPath fragments appended after "//<root>//"
 */
protected function removeNodesBySelector( $xpath, $selectorsToRemove = [] ) {
	foreach ( $selectorsToRemove as $fragment ) {
		$expression = sprintf( '//%s//%s', $this->rootNodeTag, $fragment );
		$matches = $xpath->query( $expression );

		foreach ( $matches as $match ) {
			$match->parentNode->removeChild( $match );
		}
	}
}
|
|
|
|
|
|
|
|
/**
 * Returns nodes that have contents allowed by current sanitizer's config.
 *
 * Walks over the nodes selected by getAllNodesXPath() and keeps those accepted by
 * shouldNodeBeProcessed(). Kept nodes that are also feature nodes are tracked
 * separately and passed to subsequent shouldNodeBeProcessed() calls.
 *
 * @param $xpath DOMXPath
 * @return array of DOMNode
 */
protected function extractNeededNodes( $xpath ) {
	$keptNodes = [];
	$processedFeatureNodes = [];

	$candidates = $xpath->query( $this->getAllNodesXPath() );
	foreach ( $candidates as $candidate ) {
		if ( !$this->shouldNodeBeProcessed( $candidate, $keptNodes, $processedFeatureNodes, $xpath ) ) {
			continue;
		}

		$keptNodes[] = $candidate;

		// Store the information that a given feature node was processed
		if ( $this->isAllowedFeatureNode( $candidate, $xpath ) ) {
			$processedFeatureNodes[] = $candidate;
		}
	}

	return $keptNodes;
}
|
|
|
|
|
|
|
|
/**
 * Checks if current node is a wrapping tag for a feature, i.e. whether the node
 * itself is matched by one of $selectorsWrappingAllowedFeatures below the root node.
 *
 * @param $node DOMNode
 * @param $xpath DOMXPath
 * @return bool
 */
protected function isAllowedFeatureNode( $node, $xpath ) {
	foreach ( $this->selectorsWrappingAllowedFeatures as $selector ) {
		/*
		 * The expression is absolute (it starts with "//"), so it is always evaluated
		 * from the document root and a context node passed to query() has no effect.
		 * Checking only the result length would therefore report every node as a
		 * feature node as soon as any feature exists in the document. Compare the
		 * matches with the inspected node instead.
		 */
		$matches = $xpath->query( sprintf( '//%s//%s', $this->rootNodeTag, $selector ) );
		if ( !$matches ) {
			// query() returns false for a malformed expression; nothing can match
			continue;
		}

		foreach ( $matches as $match ) {
			if ( $match->isSameNode( $node ) ) {
				return true;
			}
		}
	}

	return false;
}
|
|
|
|
|
2016-01-04 11:09:50 +00:00
|
|
|
/**
 * Used for skipping processing on subnodes of feature nodes (i.e. citation).
 *
 * Climbs the ancestor chain of the node and reports whether any ancestor is one
 * of the given feature nodes (compared by object identity). The node itself is
 * not taken into account.
 *
 * @param $node DOMNode
 * @param $featureNodes array of DOMNode
 * @return bool
 */
protected function isDescendantOfProcessedFeatureNode( $node, $featureNodes ) {
	for ( $ancestor = $node->parentNode; $ancestor; $ancestor = $ancestor->parentNode ) {
		if ( in_array( $ancestor, $featureNodes, true ) ) {
			return true;
		}
	}

	return false;
}
|
2016-01-04 12:09:39 +00:00
|
|
|
|
|
|
|
/**
 * Tells whether the node is the only child of a node that was already collected.
 *
 * @param $node DOMNode
 * @param $nodes array of DOMNode collected so far
 * @return bool
 */
protected function isChildOfProcessedTagNode( $node, $nodes ) {
	$parent = $node->parentNode;

	// No parent, or the parent was not collected (identity comparison)
	if ( !$parent || !in_array( $parent, $nodes, true ) ) {
		return false;
	}

	return $parent->childNodes->length === 1;
}
|
|
|
|
|
|
|
|
/**
 * Returns whether a node should be fully processed based on multiple factors
 * captured in submethods; the key two criteria are:
 * - is it an allowed node (by tag or a selector)
 * - is it excluded from processing because of its state (i.e. empty) or already processed ancestor nodes
 *
 * The checks run in the order listed and stop at the first one that decides the outcome.
 *
 * @param $node DOMNode
 * @param $simpleTagNodes array of DOMNode
 * @param $featureNodes array of DOMNode
 * @param $xpath DOMXPath
 * @return bool
 */
protected function shouldNodeBeProcessed( $node, $simpleTagNodes, $featureNodes, $xpath ) {
	// NOTE(review): isNodeAllowedByTag() as declared in this class takes a single
	// parameter, so the second argument is ignored here - confirm against overrides.
	$isAllowed = $this->isNodeAllowedByTag( $node, $this->allowedTags )
		|| $this->isAllowedFeatureNode( $node, $xpath );
	if ( !$isAllowed ) {
		return false;
	}

	if ( $this->shouldNodeBeRemoved( $node ) ) {
		return false;
	}

	if ( $this->isChildOfProcessedTagNode( $node, $simpleTagNodes ) ) {
		return false;
	}

	return !$this->isDescendantOfProcessedFeatureNode( $node, $featureNodes );
}
|
|
|
|
|
|
|
|
|
2015-12-23 14:54:03 +00:00
|
|
|
}
|