2021-09-03 02:30:32 +00:00
|
|
|
<?php
|
|
|
|
/**
 * Copyright (C) 2021 Kunal Mehta <legoktm@debian.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
|
|
|
|
|
|
|
|
namespace MediaWiki\SyntaxHighlight;
|
|
|
|
|
|
|
|
use MediaWiki\MediaWikiServices;
|
|
|
|
use Shellbox\Command\BoxedCommand;
|
Pygmentize: Treat Shellbox network loss like non-zero exit code
Prior to the shellbox migration, if during the parsing of a page,
pygmentize failed (i.e. non-zero exit from its local shell command,
pretty much the only way a php shell exec could fail), then
SyntaxHighlight would fallback to outputting a preformatted plain
`<pre>`.
The logic still exists in the code, and is still triggered for cases
where the command reached shellbox and its result was "successfully"
communicated to MediaWiki (HTTP 200), with the boxed result reporting
the non-zero exit code on the shellbox server.
However, the more likely scenario in the new setup is that the command
times out or never reaches the server in the first place, in which
case we don't get any shell exit code. Instead, we get a Shellbox
exception since the result is unknowable.
Instead of fatalling the entire pageview with a PHP exception and
HTTP 500 from MW, use the same graceful fallback.
Bug: T292663
Change-Id: Icaa8c34ff97ad8a99d044beab529ef943071269c
2023-06-03 13:16:19 +00:00
|
|
|
use Shellbox\ShellboxError;
|
2021-09-03 02:30:32 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Wrapper around the `pygmentize` command
|
|
|
|
*/
|
|
|
|
class Pygmentize {
|
|
|
|
|
|
|
|
/**
 * Report whether the bundled copy of pygmentize is in use.
 *
 * That is the case exactly when $wgPygmentizePath is strictly false.
 */
public static function useBundled(): bool {
	global $wgPygmentizePath;

	$noCustomPath = ( $wgPygmentizePath === false );

	return $noCustomPath;
}
|
|
|
|
|
|
|
|
/**
 * Resolve the pygmentize executable to run.
 *
 * A truthy $wgPygmentizePath is used as-is; any falsy value selects the
 * copy shipped next to this extension.
 */
private static function getPath(): string {
	global $wgPygmentizePath;

	if ( $wgPygmentizePath ) {
		return $wgPygmentizePath;
	}

	// Nothing configured: fall back to the bundled copy.
	return __DIR__ . '/../pygments/pygmentize';
}
|
|
|
|
|
|
|
|
/**
 * Get the version of pygments (cached)
 *
 * The bundled version is memoized for the rest of the process. A configured
 * pygmentize is looked up through the local server cache, backed by the
 * WAN cache, and only shelled out to when both miss.
 */
public static function getVersion(): string {
	static $version = null;

	if ( $version === null && self::useBundled() ) {
		$version = self::getBundledVersion();
	}
	if ( $version !== null ) {
		return $version;
	}

	// Page views, edits and the load.php startup request all end up here,
	// often several times within one request, so the first layer is the
	// low-latency per-server cache (php-apcu).
	//
	// The value is embedded in other cache keys of this class and therefore
	// drives their invalidation, which is why its expiry is kept low. To
	// avoid paying for frequent misses, the value is also shared between
	// servers via Memcached.
	$localCache = MediaWikiServices::getInstance()->getLocalServerObjectCache();
	$localKey = $localCache->makeGlobalKey( 'pygmentize-version' );
	// Spread between 55 min and 1 hour
	$localTtl = mt_rand( 55 * $localCache::TTL_MINUTE, 60 * $localCache::TTL_MINUTE );

	$readShared = static function () {
		$sharedCache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		$sharedKey = $sharedCache->makeGlobalKey( 'pygmentize-version' );
		// Must be under 55 min to avoid renewing stale data in upper layer
		$sharedTtl = 30 * $sharedCache::TTL_MINUTE;

		return $sharedCache->getWithSetCallback(
			$sharedKey,
			$sharedTtl,
			[ self::class, 'fetchVersion' ]
		);
	};

	return $localCache->getWithSetCallback( $localKey, $localTtl, $readShared );
}
|
|
|
|
|
|
|
|
/**
 * Read the version of the bundled pygments from its VERSION file,
 * with surrounding whitespace removed.
 */
private static function getBundledVersion(): string {
	$versionFile = __DIR__ . '/../pygments/VERSION';
	$contents = file_get_contents( $versionFile );

	return trim( $contents );
}
|
|
|
|
|
2022-07-11 16:43:55 +00:00
|
|
|
/**
 * Shell out (`pygmentize -V`) to get the installed pygments version
 *
 * @internal For use by WANObjectCache/BagOStuff only
 * @return string Version token taken from the "Pygments version X," banner
 * @throws PygmentsException If the command exits non-zero or the banner is not found
 */
public static function fetchVersion(): string {
	$command = self::boxedCommand()
		->params( self::getPath(), '-V' )
		->includeStderr();
	$result = $command->execute();
	self::recordShellout( 'version' );

	$banner = $result->getStdout();
	$succeeded = ( $result->getExitCode() == 0 );
	if ( $succeeded && preg_match( '/^Pygments version (\S+),/', $banner, $found ) ) {
		return $found[1];
	}

	// Either the command failed or its output was not the expected banner.
	throw new PygmentsException( $banner );
}
|
|
|
|
|
2021-09-03 02:30:32 +00:00
|
|
|
/**
 * Get the pygments generated CSS (cached)
 *
 * Note: if using bundled, the CSS is already available
 * in modules/pygments.generated.css.
 */
public static function getGeneratedCSS(): string {
	// Rarely called, because the result is HTTP-cached through long-expiry
	// load.php. Calls come in a brief spike of dedicated requests from each
	// wiki after a deployment, so the value is shared via Memcached. It's
	// likely not needed again on the same server for a while after that.
	$wanCache = MediaWikiServices::getInstance()->getMainWANObjectCache();
	$cssKey = $wanCache->makeGlobalKey( 'pygmentize-css', self::getVersion() );

	return $wanCache->getWithSetCallback(
		$cssKey,
		$wanCache::TTL_WEEK,
		[ self::class, 'fetchGeneratedCSS' ]
	);
}
|
|
|
|
|
|
|
|
/**
 * Shell out to get generated CSS from pygments
 *
 * @internal Only public for updateCSS.php
 * @return string Command output (stderr is merged into it)
 * @throws PygmentsException If pygmentize exits with a non-zero code
 */
public static function fetchGeneratedCSS(): string {
	$argv = [
		self::getPath(),
		'-f', 'html',
		'-S', 'default',
		'-a', '.mw-highlight',
	];
	$result = self::boxedCommand()
		->params( ...$argv )
		->includeStderr()
		->execute();
	self::recordShellout( 'generated_css' );

	$css = $result->getStdout();
	if ( $result->getExitCode() == 0 ) {
		return $css;
	}

	throw new PygmentsException( $css );
}
|
|
|
|
|
|
|
|
/**
 * Get the list of supported lexers by pygments (cached)
 *
 * With the bundled pygments the list is loaded from
 * SyntaxHighlight.lexers.php; otherwise it is read through the local
 * server cache and fetched by shelling out on a miss.
 *
 * @return array<string,true>
 */
public static function getLexers(): array {
	if ( self::useBundled() ) {
		return require __DIR__ . '/../SyntaxHighlight.lexers.php';
	}

	// Called during page views and edits, possibly repeatedly. Caching on
	// each server separately trades a higher shell rate for low latency.
	// The long TTL compensates for that, and is safe because the key varies
	// by version, so changes propagate independently of the TTL.
	$localCache = MediaWikiServices::getInstance()->getLocalServerObjectCache();
	$lexersKey = $localCache->makeGlobalKey( 'pygmentize-lexers', self::getVersion() );

	return $localCache->getWithSetCallback(
		$lexersKey,
		$localCache::TTL_WEEK,
		[ self::class, 'fetchLexers' ]
	);
}
|
|
|
|
|
2022-07-17 02:29:53 +00:00
|
|
|
/**
 * Determine if the pygments command line supports the --json option,
 * i.e. whether the pygments version is 2.11.0 or newer.
 *
 * @return bool
 */
private static function pygmentsSupportsJsonOutput(): bool {
	return version_compare( self::getVersion(), '2.11.0', '>=' );
}
|
|
|
|
|
2021-09-03 02:30:32 +00:00
|
|
|
/**
 * Shell out to get supported lexers by pygments
 *
 * Runs `pygmentize -L lexer`, adding `--json` when the installed version
 * supports it, and converts the reported aliases into a sorted lookup map.
 *
 * @internal Only public for updateLexerList.php
 * @return array<string,true>
 * @throws PygmentsException If pygmentize exits with a non-zero code, or
 *  its JSON output cannot be parsed
 */
public static function fetchLexers(): array {
	// Decide the output format exactly once. The flag passed to pygmentize
	// and the parser applied to its output must agree, and the version
	// lookup behind this check is cache-backed, so asking twice could in
	// principle yield two different answers.
	$useJson = self::pygmentsSupportsJsonOutput();

	$cliParams = [ self::getPath(), '-L', 'lexer' ];
	if ( $useJson ) {
		$cliParams[] = '--json';
	}

	$result = self::boxedCommand()
		->params( $cliParams )
		->includeStderr()
		->execute();
	self::recordShellout( 'fetch_lexers' );
	$output = $result->getStdout();
	if ( $result->getExitCode() != 0 ) {
		throw new PygmentsException( $output );
	}

	$lexers = $useJson
		? self::parseLexersFromJson( $output )
		: self::parseLexersFromText( $output );

	sort( $lexers );
	return array_fill_keys( $lexers, true );
}
|
|
|
|
|
|
|
|
/**
 * Parse json output of the pygments lexers list and return as php array
 *
 * The expected document is an object with a "lexers" member that maps each
 * lexer name to an object carrying an "aliases" list. Entries without a
 * usable "aliases" list contribute nothing.
 *
 * @param string $output JSON formatted output of pygments lexers list
 * @return array Flat list of every lexer alias, in document order
 * @throws PygmentsException If the output is not valid JSON or lacks the
 *  "lexers" map
 */
private static function parseLexersFromJson( $output ): array {
	$data = json_decode( $output, true );
	if ( $data === null ) {
		throw new PygmentsException(
			'Got invalid JSON from Pygments: ' . $output );
	}
	// Valid JSON can still have the wrong shape (a scalar, or no "lexers"
	// member). Report that as a pygments failure rather than letting it
	// surface as a TypeError further down.
	if ( !is_array( $data ) || !is_array( $data['lexers'] ?? null ) ) {
		throw new PygmentsException(
			'Got unexpected JSON structure from Pygments: ' . $output );
	}

	$aliasLists = [];
	foreach ( $data['lexers'] as $lexer ) {
		$aliases = $lexer['aliases'] ?? null;
		if ( is_array( $aliases ) ) {
			$aliasLists[] = array_values( $aliases );
		}
	}

	// Merge once at the end instead of re-copying the accumulated list on
	// every iteration. The leading [] keeps the call valid for an empty list.
	return array_merge( [], ...$aliasLists );
}
|
|
|
|
|
|
|
|
/**
 * Parse the plain-text stdout of the pygments lexers list
 *
 * This was the only format available before pygments 2.11.0
 * NOTE: Should be removed when pygments 2.11 is the minimum version expected to be installed
 *
 * @param string $output Textual list of pygments lexers
 * @return array Flat list of lexer aliases, in output order
 */
private static function parseLexersFromText( $output ): array {
	$aliases = [];
	$rows = explode( "\n", $output );

	foreach ( $rows as $row ) {
		// Only rows starting with "*" carry a comma-separated alias list.
		if ( !str_starts_with( $row, '*' ) ) {
			continue;
		}

		$names = explode( ', ', trim( $row, "* :\r\n" ) );

		// Skip internal, unnamed lexers
		if ( $names[0] === '' ) {
			continue;
		}

		$aliases = array_merge( $aliases, $names );
	}

	return $aliases;
}
|
|
|
|
|
|
|
|
/**
 * Actually highlight some text
 *
 * @param string $lexer Lexer name
 * @param string $code Code to highlight
 * @param array $options Options to pass to pygments, joined as "key=value"
 *  pairs for the -O argument
 * @return string Standard output of pygmentize
 * @throws PygmentsException If pygmentize exits non-zero, or the Shellbox
 *  call itself fails
 */
public static function highlight( $lexer, $code, array $options ): string {
	$optionPairs = array_map(
		static function ( $name, $value ) {
			return "{$name}={$value}";
		},
		array_keys( $options ),
		array_values( $options )
	);
	self::recordShellout( 'highlight' );

	try {
		$argv = [
			self::getPath(),
			'-l', $lexer,
			'-f', 'html',
			'-O', implode( ',', $optionPairs ),
			'file',
		];
		$result = self::boxedCommand()
			->params( ...$argv )
			->inputFileFromString( 'file', $code )
			->execute();
	} catch ( ShellboxError $exception ) {
		// When sending to or receiving from Shellbox over the network goes
		// wrong, we technically don't know whether the command succeeded
		// or failed. Treat highlight() as recoverable anyway by wrapping the
		// error in PygmentsException, so that the Parser tag can fall back
		// to plainCodeWrap() instead of fataling the pageview (T292663).
		throw new PygmentsException( 'ShellboxError', 0, $exception );
	}

	$output = $result->getStdout();
	if ( $result->getExitCode() == 0 ) {
		return $output;
	}

	// Stdout was empty, report stderr instead
	$message = ( $output === "" || $output === null )
		? $result->getStderr()
		: $output;

	throw new PygmentsException( (string)$message );
}
|
|
|
|
|
|
|
|
/**
 * Build the base boxed command shared by every pygmentize shellout.
 *
 * The command is created for the 'syntaxhighlight' service with network
 * access disabled, the default firejail seccomp filter requested, and the
 * route name 'syntaxhighlight-pygments'.
 */
private static function boxedCommand(): BoxedCommand {
	$factory = MediaWikiServices::getInstance()->getShellCommandFactory();
	$boxed = $factory->createBoxed( 'syntaxhighlight' )
		->disableNetwork()
		->firejailDefaultSeccomp()
		->routeName( 'syntaxhighlight-pygments' );

	if ( wfIsWindows() ) {
		// Python requires the SystemRoot environment variable to initialize (T300223)
		$windowsEnv = [
			'SystemRoot' => getenv( 'SystemRoot' ),
		];
		$boxed->environment( $windowsEnv );
	}

	return $boxed;
}
|
|
|
|
|
|
|
|
/**
 * Track how often we do each type of shellout in statsd
 *
 * Increments the counter "syntaxhighlight_shell.<type>".
 *
 * @param string $type Type of shellout
 */
private static function recordShellout( $type ) {
	$metric = 'syntaxhighlight_shell.' . $type;

	MediaWikiServices::getInstance()
		->getStatsdDataFactory()
		->increment( $metric );
}
|
|
|
|
}
|