Add Parsoid support for syntaxhighlight

* Added Parsoid config, and refactored code slightly to
  add native Parsoid handlers for parser tags exposed
  by this extension.
* Enabled parsoid mode testing on the test file.
* Added html/parsoid sections on a few tests.
* Marked the rest of the tests as wt2html and wt2wt only, since
  html2wt and html2html will fail without an html/parsoid section
  and there is no real benefit to adding one to every test.
* Added a couple tests to the known failures list:
  - One is because of T299103.
  - The other is because Parsoid always emits attributes in the
    form <tag ... foo="bar" ...> instead of just <tag ... foo ...>.
    Since Parsoid needs to accept the latter format, which is present
    on wikis, I added an html/parsoid section for this test and
    added the resulting failures to the known failures list.

Bug: T272939
Change-Id: Ie30aa6b082d4fc43c73296ff2ed6cb8c3873f48f
This commit is contained in:
Subramanya Sastry 2022-07-22 17:14:03 -05:00 committed by Arlo Breault
parent 3bee59df01
commit 0eef7add67
5 changed files with 182 additions and 34 deletions

View file

@ -102,6 +102,9 @@
],
"SoftwareInfo": "SyntaxHighlight::onSoftwareInfo"
},
"ParsoidModules": [
"MediaWiki\\SyntaxHighlight\\ParsoidExt"
],
"attributes": {
"SyntaxHighlight": {
"Models": {}

39
includes/ParsoidExt.php Normal file
View file

@ -0,0 +1,39 @@
<?php
declare( strict_types = 1 );
namespace MediaWiki\SyntaxHighlight;
use Wikimedia\Parsoid\Ext\ExtensionModule;
/**
 * Parsoid extension module for SyntaxHighlight.
 *
 * Declares the <source> and <syntaxhighlight> extension tags and routes
 * both of them to the SyntaxHighlight tag handler class.
 */
class ParsoidExt implements ExtensionModule {
	/** @inheritDoc */
	public function getConfig(): array {
		// Both tags share the same options:
		// Strip nowiki markers from #tag parser-function arguments.
		// This will be used to resolve T299103.
		// This is primarily a b/c flag in Parsoid.
		$tagOptions = [ 'stripNowiki' => true ];

		// The two tag names are handled identically, so build their
		// configuration entries from a single template.
		$tags = [];
		foreach ( [ 'source', 'syntaxhighlight' ] as $tagName ) {
			$tags[] = [
				'name' => $tagName,
				'handler' => SyntaxHighlight::class,
				'options' => $tagOptions,
			];
		}

		return [
			'name' => 'SyntaxHighlight',
			'tags' => $tags,
		];
	}
}

View file

@ -34,7 +34,11 @@ use TextContent;
use Title;
use WANObjectCache;
class SyntaxHighlight {
use Wikimedia\Parsoid\DOM\DocumentFragment;
use Wikimedia\Parsoid\Ext\ExtensionTagHandler;
use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
class SyntaxHighlight extends ExtensionTagHandler {
/** @var int The maximum number of lines that may be selected for highlighting. */
private const HIGHLIGHT_MAX_LINES = 1000;
@ -116,6 +120,45 @@ class SyntaxHighlight {
return self::parserHook( $text, $args, $parser );
}
/**
 * Get the ResourceLoader style modules needed to display highlighted output.
 *
 * Keeps the module name in a single place so that every code path that
 * registers the highlighting CSS passes the same list to addModuleStyles().
 *
 * @return array List of style module names (currently only 'ext.pygments')
 */
private static function getModuleStyles(): array {
return [ 'ext.pygments' ];
}
/**
 * Highlight the body of a <source>/<syntaxhighlight> tag.
 *
 * Normalizes the input text, maps the deprecated "enclose" attribute onto
 * "inline", runs the highlighter and collects the tracking categories that
 * apply. The categories are only returned, not registered; the caller
 * decides what to do with them.
 *
 * @param string $text Raw tag content
 * @param array $args Tag attributes
 * @param ?Parser $parser Parser passed through to highlight(), if any
 * @return array Array with keys 'html' (the value of the highlight() status)
 *   and 'cats' (list of tracking category message keys)
 * @throws MWException
 */
private static function processContent( string $text, array $args, ?Parser $parser = null ): array {
	$categories = [];

	// Convert deprecated attributes: enclose="none" is the old spelling of
	// "inline"; any use of "enclose" is tracked and the attribute dropped.
	$enclose = $args['enclose'] ?? null;
	if ( $enclose !== null ) {
		if ( $enclose === 'none' ) {
			$args['inline'] = true;
		}
		unset( $args['enclose'] );
		$categories[] = 'syntaxhighlight-enclose-category';
	}

	// Drop trailing whitespace and leading linefeeds only; leading spaces
	// on the first line are significant and must be kept.
	$code = preg_replace( '/^\n+/', '', rtrim( $text ) );

	$status = self::highlight( $code, $args['lang'] ?? '', $args, $parser );
	if ( !$status->isGood() ) {
		$categories[] = 'syntaxhighlight-error-category';
	}

	return [
		'html' => $status->getValue(),
		'cats' => $categories,
	];
}
/**
* Parser hook for both <source> and <syntaxhighlight> logic
*
@ -129,32 +172,30 @@ class SyntaxHighlight {
// Replace strip markers (For e.g. {{#tag:syntaxhighlight|<nowiki>...}})
$out = $parser->getStripState()->unstripNoWiki( $text );
// Don't trim leading spaces away, just the linefeeds
$out = preg_replace( '/^\n+/', '', rtrim( $out ) );
// Convert deprecated attributes
if ( isset( $args['enclose'] ) ) {
if ( $args['enclose'] === 'none' ) {
$args['inline'] = true;
}
unset( $args['enclose'] );
$parser->addTrackingCategory( 'syntaxhighlight-enclose-category' );
$result = self::processContent( $out, $args, $parser );
foreach ( $result['cats'] as $cat ) {
$parser->addTrackingCategory( $cat );
}
$lexer = $args['lang'] ?? '';
$result = self::highlight( $out, $lexer, $args, $parser );
if ( !$result->isGood() ) {
$parser->addTrackingCategory( 'syntaxhighlight-error-category' );
}
$out = $result->getValue();
// Register CSS
// TODO: Consider moving to a separate method so that public method
// highlight() can be used without needing to know the module name.
$parser->getOutput()->addModuleStyles( [ 'ext.pygments' ] );
$parser->getOutput()->addModuleStyles( self::getModuleStyles() );
return $out;
return $result['html'];
}
/**
 * Parsoid entry point for the <source> and <syntaxhighlight> tags.
 *
 * Highlights the tag content and hands the resulting HTML to Parsoid
 * to be turned into a DOM fragment.
 *
 * @inheritDoc
 */
public function sourceToDom(
	ParsoidExtensionAPI $extApi, string $text, array $extArgs
): ?DocumentFragment {
	$args = $extApi->extArgsToArray( $extArgs );
	$processed = self::processContent( $text, $args );

	// FIXME: There is no API method in Parsoid to add tracking categories
	// So, $processed['cats'] is being ignored

	// Register CSS
	$styles = self::getModuleStyles();
	$extApi->addModuleStyles( $styles );

	$html = $processed['html'];
	return $extApi->htmlToDom( $html );
}
/**
@ -529,7 +570,7 @@ class SyntaxHighlight {
}
$out = $status->getValue();
$parserOutput->addModuleStyles( [ 'ext.pygments' ] );
$parserOutput->addModuleStyles( self::getModuleStyles() );
$parserOutput->addModules( [ 'ext.pygments.linenumbers' ] );
$parserOutput->setText( $out );
@ -566,7 +607,7 @@ class SyntaxHighlight {
$out = '<pre' . $encodedAttrs . '>' . substr( $out, strlen( $m[0] ) );
}
$output = $context->getOutput();
$output->addModuleStyles( 'ext.pygments' );
$output->addModuleStyles( self::getModuleStyles() );
$output->addHTML( '<div dir="ltr">' . $out . '</div>' );
// Inform MediaWiki that we have parsed this page and it shouldn't mess with it.

View file

@ -0,0 +1,26 @@
{
"Enclose with nowiki": {
"wt2html": "<p data-parsoid='{\"dsr\":[0,69,0,0]}'><code class=\"mw-highlight mw-highlight-lang-text mw-content-ltr\" dir=\"ltr\" typeof=\"mw:Extension/syntaxhighlight mw:Transclusion\" about=\"#mwt2\" data-parsoid='{\"pi\":[[{\"k\":\"1\"},{\"k\":\"lang\",\"named\":true},{\"k\":\"inline\",\"named\":true}]],\"dsr\":[0,69,null,null]}' data-mw='{\"parts\":[{\"template\":{\"target\":{\"wt\":\"#tag:syntaxhighlight\",\"function\":\"tag\"},\"params\":{\"1\":{\"wt\":\"&lt;nowiki>foo&lt;/nowiki>\"},\"lang\":{\"wt\":\"\\\"text\\\"\"},\"inline\":{\"wt\":\"none\"}},\"i\":0}}]}'>&lt;nowiki>foo&lt;/nowiki></code></p>",
"html2html": "<p data-parsoid='{\"dsr\":[0,69,0,0]}'><code class=\"mw-highlight mw-highlight-lang-text mw-content-ltr\" dir=\"ltr\" typeof=\"mw:Extension/syntaxhighlight mw:Transclusion\" about=\"#mwt2\" data-parsoid='{\"pi\":[[{\"k\":\"1\"},{\"k\":\"lang\",\"named\":true},{\"k\":\"inline\",\"named\":true}]],\"dsr\":[0,69,null,null]}' data-mw='{\"parts\":[{\"template\":{\"target\":{\"wt\":\"#tag:syntaxhighlight\",\"function\":\"tag\"},\"params\":{\"1\":{\"wt\":\"&lt;nowiki>foo&lt;/nowiki>\"},\"lang\":{\"wt\":\"\\\"text\\\"\"},\"inline\":{\"wt\":\"none\"}},\"i\":0}}]}'>&lt;nowiki>foo&lt;/nowiki></code></p>"
},
"Inline attribute (inline code)": {
"wt2wt": "Text <source lang=\"javascript\" inline=\"\">var a;</source>.",
"selser [1]": "Text <source lang=\"javascript\" inline>var a;</source>.",
"selser [[4,0,4]]": "121y1t4<source lang=\"javascript\" inline>var a;</source>1ibg90y",
"selser [[0,0,2]]": "Text <source lang=\"javascript\" inline>var a;</source>yeuqlm.",
"selser [2]": "brhsc5\n\nText <source lang=\"javascript\" inline>var a;</source>.",
"selser [[0,0,4]]": "Text <source lang=\"javascript\" inline>var a;</source>4x62b6",
"selser [[2,0,0]]": "1mjitw8Text <source lang=\"javascript\" inline>var a;</source>.",
"selser [[0,0,3]]": "Text <source lang=\"javascript\" inline>var a;</source>",
"selser [[2,0,4]]": "uuwnfvText <source lang=\"javascript\" inline>var a;</source>qjkvx1",
"selser [[3,0,3]]": "<source lang=\"javascript\" inline>var a;</source>",
"selser [[4,0,0]]": "uouej0<source lang=\"javascript\" inline>var a;</source>.",
"selser [[3,0,0]]": "<source lang=\"javascript\" inline>var a;</source>.",
"selser [[4,0,3]]": "1wcoh54<source lang=\"javascript\" inline>var a;</source>",
"selser [[2,0,2]]": "11th7ozText <source lang=\"javascript\" inline>var a;</source>1c8ff9m.",
"selser [[2,0,3]]": "1cyv6rText <source lang=\"javascript\" inline>var a;</source>",
"selser [[4,0,2]]": "g6ytvp<source lang=\"javascript\" inline>var a;</source>10gwvyi.",
"selser [[3,0,2]]": "<source lang=\"javascript\" inline>var a;</source>kjtjv6.",
"selser [[3,0,4]]": "<source lang=\"javascript\" inline>var a;</source>zs6oqy"
}
}

View file

@ -1,4 +1,8 @@
!! Version 2
!! options
parsoid-compatible
version=2
!! end
# Force the test runner to ensure the extension is loaded
!! hooks
source
@ -10,12 +14,16 @@ Non-existent language
<source lang="doesnotexist">
foobar
</source>
!! html
!! html/php
<div class="mw-highlight mw-content-ltr" dir="ltr"><pre>foobar</pre></div>
!! html/parsoid
<div class="mw-highlight mw-content-ltr" dir="ltr" about="#mwt1" typeof="mw:Extension/source" data-mw='{"name":"source","attrs":{"lang":"doesnotexist"},"body":{"extsrc":"\nfoobar\n"}}'><pre>foobar</pre></div>
!! end
!! test
No language specified
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
!! wikitext
<source>
foo
@ -26,6 +34,8 @@ foo
!! test
No language specified (no wellformed xml)
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
!! config
!! wikitext
<source>
@ -37,6 +47,8 @@ bar
!! test
XSS is escaped
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
!! wikitext
<source lang="doesnotexist">
<script>alert("pwnd")</script>
@ -61,8 +73,10 @@ SRC=&amp;#106;&amp;#97;&amp;#118;&amp;#97;&amp;#115;&amp;#99;&amp;#114;&amp;#105
!! test
XSS is escaped (inline)
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
!! wikitext
<source lang="doesnotexist" inline>
<source lang="doesnotexist" inline="">
<script>alert("pwnd")</script>
<IMG SRC=`javascript:alert("RSnake says, 'XSS'")`>
<IMG
@ -79,6 +93,8 @@ SRC=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#1
!! test
Default behaviour (inner is pre)
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
!! wikitext
<source lang="javascript">
var a;
@ -90,11 +106,13 @@ var a;
!! test
Multiline <source/> in lists
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
!! wikitext
* <source>a
*<source>a
b</source>
* foo <source>a
*foo <source>a
b</source>
!! html
<ul><li><div class="mw-highlight mw-content-ltr" dir="ltr"><pre>a
@ -105,6 +123,8 @@ b</pre></div></li></ul>
!! test
Custom attributes
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
!! wikitext
<source lang="javascript" id="foo" class="bar" dir="rtl" style="font-size: larger;">var a;</source>
!! html
@ -112,17 +132,25 @@ Custom attributes
</pre></div>
!! end
# The html/parsoid section verifies that Parsoid can handle attributes
# that are not in key=value XML-like syntax.
!! test
Inline attribute (inline code)
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
!! wikitext
Text <source lang="javascript" inline>var a;</source>.
!! html
!! html/php
<p>Text <code class="mw-highlight mw-highlight-lang-javascript mw-content-ltr" dir="ltr"><span class="kd">var</span> <span class="nx">a</span><span class="p">;</span></code>.
</p>
!! html/parsoid
<p>Text <code class="mw-highlight mw-highlight-lang-javascript mw-content-ltr" dir="ltr" about="#mwt1" typeof="mw:Extension/source" data-mw='{"name":"source","attrs":{"lang":"javascript","inline":""},"body":{"extsrc":"var a;"}}'><span class="kd">var</span> <span class="nx">a</span><span class="p">;</span></code>.</p>
!! end
!! test
Enclose none (inline code)
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
!! wikitext
Text <source lang="javascript" enclose="none">var a;</source>.
!! html
@ -134,13 +162,17 @@ Text <source lang="javascript" enclose="none">var a;</source>.
Enclose with nowiki
!! wikitext
{{#tag:syntaxhighlight|<nowiki>foo</nowiki>|lang="text"|inline=none}}
!! html
!! html/php
<p><code class="mw-highlight mw-highlight-lang-text mw-content-ltr" dir="ltr">foo</code>
</p>
!! html/parsoid
<p><code class="mw-highlight mw-highlight-lang-text mw-content-ltr" dir="ltr" about="#mwt1" typeof="mw:Extension/syntaxhighlight mw:Transclusion" data-mw='{"parts":[{"template":{"target":{"wt":"#tag:syntaxhighlight","function":"tag"},"params":{"1":{"wt":"&lt;nowiki>foo&lt;/nowiki>"},"lang":{"wt":"\"text\""},"inline":{"wt":"none"}},"i":0}}]}'>foo</code></p>
!! end
!! test
No code
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
!! wikitext
<source lang="CSharp"></source>
!! html
@ -149,6 +181,8 @@ No code
!! test
Just whitespace
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
!! wikitext
<source lang="CSharp"> </source>
!! html
@ -157,6 +191,8 @@ Just whitespace
!! test
tabs plus tidy (T32930, T59826)
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
!! wikitext
<syntaxhighlight lang="javascript" enclose="pre" highlight="2-3">
function doSomething() {
@ -175,19 +211,22 @@ function doSomething() {
!! test
deprecated source tag adds tracking category
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
cat
!! wikitext
<source lang="python">print('Hi')</source>
!! html
!! html/php
cat=Pages_using_deprecated_source_tags sort=
!! end
!! test
deprecated enclose option adds tracking category
!! options
parsoid={ "modes": ["wt2html","wt2wt"], "normalizePhp": true }
cat
!! wikitext
<syntaxhighlight enclose="none" lang="python">print('Hi')</syntaxhighlight>
!! html
!! html/php
cat=Pages_using_deprecated_enclose_attributes sort=
!! end