diff --git a/services/PortableInfoboxRenderService.class.php b/services/PortableInfoboxRenderService.class.php index 9b27ed3..f24b748 100644 --- a/services/PortableInfoboxRenderService.class.php +++ b/services/PortableInfoboxRenderService.class.php @@ -173,7 +173,6 @@ class PortableInfoboxRenderService extends WikiaService { for ( $i = 0; $i < count($data); $i++ ) { $data[$i][ 'context' ] = self::MEDIA_CONTEXT_INFOBOX; $data[$i] = $helper->extendImageData( $data[$i] ); - $data[$i] = SanitizerBuilder::createFromType( $type )->sanitize( $data[$i] ); if ( !!$data[$i] ) { $images[] = $data[$i]; @@ -198,6 +197,9 @@ class PortableInfoboxRenderService extends WikiaService { $templateName = $type; } + /** + * Currently, based on business decision, sanitization happens ONLY on Mercury + */ if ( $helper->isWikiaMobile() ) { $data = SanitizerBuilder::createFromType( $type )->sanitize( $data ); } diff --git a/services/Sanitizers/NodeHeroImageSanitizer.php b/services/Sanitizers/NodeHeroImageSanitizer.php index 6fad8d5..bf77a80 100644 --- a/services/Sanitizers/NodeHeroImageSanitizer.php +++ b/services/Sanitizers/NodeHeroImageSanitizer.php @@ -1,6 +1,11 @@ sanitizeElementData( $data[ 'title' ][ 'value' ] ); } + if ( !empty( $data[ 'image' ][ 'caption' ] ) ) { + $data[ 'image' ]['caption'] = $this->sanitizeElementData( $data[ 'image' ]['caption'] ); + } return $data; } diff --git a/services/Sanitizers/NodeImageSanitizer.php b/services/Sanitizers/NodeImageSanitizer.php index 14737b5..caadd61 100644 --- a/services/Sanitizers/NodeImageSanitizer.php +++ b/services/Sanitizers/NodeImageSanitizer.php @@ -2,6 +2,7 @@ class NodeImageSanitizer extends NodeSanitizer { protected $allowedTags = [ 'a' ]; + protected $selectorsWrappingTextToPad = [ 'li' ]; protected $selectorsWrappingAllowedFeatures = [ 'sup[@class="reference"]' ]; protected $selectorsForFullRemoval = [ 'script', 'span[@itemprop="duration"]' ]; diff --git a/services/Sanitizers/NodeSanitizer.php b/services/Sanitizers/NodeSanitizer.php index 83ecd59..a0b3c57 100644 --- a/services/Sanitizers/NodeSanitizer.php +++ b/services/Sanitizers/NodeSanitizer.php @@ -9,9 +9,15 @@ abstract class NodeSanitizer implements NodeTypeSanitizerInterface { * Sanitizer configuration * Can be overridden by child classes */ + // these are selectors for explicitly allowed tags, like 'a' protected $allowedTags = [ ]; + // these are valid 'internal' node names (per libxml convention, i.e. a, #text, etc) protected $validNodeNames = [ '#text' ]; + // these are selectors that describe nodes containing text that should be padded with whitespace + protected $selectorsWrappingTextToPad = [ ]; + // these are selectors that describe root nodes of known features that should not be sanitized protected $selectorsWrappingAllowedFeatures = [ ]; + // these are selectors that describe nodes for full removal protected $selectorsForFullRemoval = [ ]; /** @@ -20,8 +26,8 @@ abstract class NodeSanitizer implements NodeTypeSanitizerInterface { * @param $elementText * @return string */ - protected function sanitizeElementData( $elementText ) { - $dom = new \DOMDocument( ); + protected function sanitizeElementData( $elementText ) { + $dom = new \DOMDocument(); $dom->loadHTML( $this->prepareValidXML( $elementText ) ); $elementTextAfterTrim = trim( $this->cleanUpDOM( $dom ) ); @@ -65,6 +71,7 @@ abstract class NodeSanitizer implements NodeTypeSanitizerInterface { protected function cleanUpDOM( $dom ) { $xpath = new \DOMXPath( $dom ); $this->removeNodesBySelector( $xpath, $this->selectorsForFullRemoval ); + $nodes = $this->extractNeededNodes( $xpath ); return $this->normalizeWhitespace( $this->generateHTML( $nodes, $dom ) ); @@ -78,15 +85,22 @@ abstract class NodeSanitizer implements NodeTypeSanitizerInterface { * @return string */ protected function generateHTML( $nodes, $dom ) { - $result = []; + $result = [ ]; foreach ( $nodes as $node ) { - /* - * store the result; As the input text is already escaped, we make sure that - * our output will be escaped too - */ - $result[] = ( $node->nodeName === '#text' ) ? htmlspecialchars( $dom->saveHTML( $node ), ENT_QUOTES ) : $dom->saveHTML( $node ); + $outputHtml = $rawHtml = $dom->saveHTML( $node ); + if ( $node->nodeName === '#text' ) { + // As the input text is already escaped, we make sure that our output will be escaped too + $outputHtml = htmlspecialchars( $rawHtml, ENT_QUOTES ); + } + if ( $node->parentNode && in_array( $node->parentNode->nodeName, $this->selectorsWrappingTextToPad ) ) { + $outputHtml = sprintf( ' %s ', $rawHtml ); + } + + $result[] = $outputHtml; + + } - return implode( ' ', $result ); + return implode( '', $result ); } /** @@ -105,7 +119,16 @@ abstract class NodeSanitizer implements NodeTypeSanitizerInterface { * @return string */ protected function getAllNodesXPath() { - return sprintf('//%s/* | //%s//text()', $this->rootNodeTag, $this->rootNodeTag); + $xpathExpressions = [ ]; + foreach ( $this->selectorsWrappingAllowedFeatures as $selector ) { + $xpathExpressions [] = sprintf( '//%s//%s', $this->rootNodeTag, $selector ); + } + foreach ( $this->allowedTags as $selector ) { + $xpathExpressions [] = sprintf( '//%s//%s', $this->rootNodeTag, $selector ); + } + $xpathExpressions [] = sprintf( '//%s//text()', $this->rootNodeTag ); + + return implode( ' | ', $xpathExpressions ); } /** @@ -135,7 +158,7 @@ abstract class NodeSanitizer implements NodeTypeSanitizerInterface { * @param $xpath DOMXPath * @param $selectorsToRemove array */ - protected function removeNodesBySelector( $xpath, $selectorsToRemove = [] ) { + protected function removeNodesBySelector( $xpath, $selectorsToRemove = [ ] ) { foreach ( $selectorsToRemove as $selector ) { $nodesToRemove = $xpath->query( sprintf( '//%s//%s', $this->rootNodeTag, $selector ) ); foreach ( $nodesToRemove as $node ) { diff --git a/tests/PortableInfoboxRenderServiceTest.php b/tests/PortableInfoboxRenderServiceTest.php index 5fef5a9..1a52b35 100644 --- a/tests/PortableInfoboxRenderServiceTest.php +++ b/tests/PortableInfoboxRenderServiceTest.php @@ -572,7 +572,7 @@ class PortableInfoboxRenderServiceTest extends WikiaBaseTest { 'output' => '