mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto
synced 2024-11-25 16:58:35 +00:00
e5564cf942
mw.text.unstrip is too broad, it's allowing for unstripping things that cause problems when unstripped (e.g. bug 61268). Since the original request was only for unstripping <nowiki>, let's add a function that does only that. We should also add an interface to StripState::killMarkers(), instead of requiring everyone to roll their own work-alike. Then, to fix the bug, we can make mw.text.unstrip be the combination of the two. This is the most like the original behavior of mw.text.unstrip (removes all strip markers, replacing them with text where applicable) without causing issues. Bug: 61268 Change-Id: I3a151fd678b365d629b71b4f1cb0d5d284b98555
288 lines
6.7 KiB
Lua
288 lines
6.7 KiB
Lua
-- Module table; setupInterface() publishes it as mw.text.
local mwtext = {}

-- Both are assigned by setupInterface(): `php` receives the value of the
-- global mw_interface, `options` receives the table passed in by the caller.
local php, options

local util = require 'libraryUtil'
local checkType, checkTypeForNamedArg = util.checkType, util.checkTypeForNamedArg
|
|
|
|
-- One-time initialisation hook.
-- Moves the global mw_interface into the private `php` upvalue, stores `opts`
-- as the module options, and publishes the library both as mw.text and as the
-- loaded package 'mw.text'.
function mwtext.setupInterface( opts )
	-- Boilerplate: remove this function and the global interface handle so
	-- neither stays reachable once setup has run
	mwtext.setupInterface = nil
	php, mw_interface = mw_interface, nil
	options = opts

	-- Register this library in the "mw" global
	mw = mw or {}
	mw.text = mwtext

	-- Make require( 'mw.text' ) resolve to the same table
	package.loaded['mw.text'] = mwtext
end
|
|
|
|
-- Strip leading and trailing characters from `s`.
-- `charset` is spliced verbatim into a ustring pattern bracket class; when it
-- is omitted, tab, CR, LF, form feed and space are stripped.
-- Returns the trimmed string only.
function mwtext.trim( s, charset )
	local class = '[' .. ( charset or '\t\r\n\f ' ) .. ']*'
	-- The outer parentheses discard gsub's second result (the match count)
	return ( mw.ustring.gsub( s, '^' .. class .. '(.-)' .. class .. '$', '%1' ) )
end
|
|
|
|
-- Character -> entity table used by mwtext.encode.
-- The values must be literal entity text: if they are ever rendered back into
-- the characters they stand for, every entry maps a character to itself,
-- encode becomes a no-op and the reverse map below becomes useless.
-- mwtext.encode adds numeric entities for other characters on demand.
local htmlencode_map = {
	['>'] = '&gt;',
	['<'] = '&lt;',
	['&'] = '&amp;',
	['"'] = '&quot;',
	["'"] = '&#039;',
	['\194\160'] = '&nbsp;', -- U+00A0 NO-BREAK SPACE as UTF-8 bytes
}

-- Entity -> character table used by mwtext.decode, seeded as the inverse of
-- htmlencode_map.
local htmldecode_map = {}
for k, v in pairs( htmlencode_map ) do
	htmldecode_map[v] = k
end

-- Named-entity table; stays nil until mwtext.decode first needs it and loads
-- it through php.getEntityTable().
local decode_named_entities = nil
|
|
|
|
-- Replace characters of `s` with HTML entities.
-- `charset` is placed inside a ustring pattern bracket class and selects which
-- characters are replaced; the default covers < > & " ' and the UTF-8
-- non-breaking space.
-- A character with no entry in htmlencode_map gets a decimal numeric entity,
-- which is then remembered in both lookup tables.
function mwtext.encode( s, charset )
	local function toEntity( ch )
		local entity = htmlencode_map[ch]
		if not entity then
			entity = string.format( '&#%d;', mw.ustring.codepoint( ch ) )
			htmlencode_map[ch] = entity
			htmldecode_map[entity] = ch
		end
		return entity
	end

	local class = '[' .. ( charset or '<>&"\'\194\160' ) .. ']'
	-- Parentheses keep only the resulting string, not the replacement count
	return ( mw.ustring.gsub( s, class, toEntity ) )
end
|
|
|
|
-- Replace HTML entities in `s` with the characters they represent.
--   s                   string to decode
--   decodeNamedEntities when truthy, the table returned by
--                       php.getEntityTable() is consulted as well (fetched
--                       once, then kept); otherwise only htmldecode_map is
--                       used
-- Numeric entities (&#NNN; and &#xHHH;) up to U+10FFFF are decoded and cached
-- in htmldecode_map. Anything unrecognised is left in the text unchanged.
-- Returns the decoded string.
function mwtext.decode( s, decodeNamedEntities )
	local dec
	if decodeNamedEntities then
		if decode_named_entities == nil then
			decode_named_entities = php.getEntityTable()
			-- Fall back to the built-in/cached entities for anything the
			-- named-entity table does not contain
			setmetatable( decode_named_entities, { __index = htmldecode_map } )
		end
		dec = decode_named_entities
	else
		dec = htmldecode_map
	end

	-- string.gsub is safe here, because only ASCII chars are in the pattern
	s = string.gsub( s, '(&(#?x?)([a-zA-Z0-9]+);)', function ( m, flg, name )
		local known = dec[m]
		if known then
			return known
		end

		-- Only pure digit strings are numeric references. tonumber() on its
		-- own is more permissive (e.g. it accepts exponent notation such as
		-- "1e2"), which would decode things that are not entities.
		local n = nil
		if flg == '#' then
			if string.find( name, '^%d+$' ) then
				n = tonumber( name, 10 )
			end
		elseif flg == '#x' then
			if string.find( name, '^%x+$' ) then
				n = tonumber( name, 16 )
			end
		end

		if n and n <= 0x10ffff then
			local ch = mw.ustring.char( n )
			if ch then
				-- Cache for later lookups. Deliberately NOT written to
				-- htmlencode_map: doing so would let whatever spelling
				-- appeared in decoded input (e.g. "&#60;") replace the entity
				-- that mwtext.encode emits for that character afterwards.
				htmldecode_map[m] = ch
				return ch
			end
		end

		-- Returning nil makes gsub keep the original match
		return nil
	end )
	return s
end
|
|
|
|
-- Replacement table for the first pass of mwtext.nowiki: each character is
-- swapped for its decimal numeric entity wherever it occurs.
-- The values must remain literal entity text; if they are rendered back into
-- the characters themselves the pass replaces every character with itself.
local nowikiRepl1 = {
	['"'] = '&#34;',
	['&'] = '&#38;',
	["'"] = '&#39;',
	['<'] = '&#60;',
	['='] = '&#61;',
	['>'] = '&#62;',
	['['] = '&#91;',
	[']'] = '&#93;',
	['{'] = '&#123;',
	['|'] = '&#124;',
	['}'] = '&#125;',
}
|
|
|
|
-- Replacement table for the second pass of mwtext.nowiki. Keys are two-byte
-- sequences: a line break followed by a character that mwtext.nowiki matches
-- with '[\r\n][#*:; \n\r\t]'. The line break is kept and the following
-- character is replaced by its numeric entity.
-- Exception: for "\r\n" the CR is the one escaped, so the LF stays in place.
local nowikiRepl2 = {
	["\n#"] = "\n&#35;", ["\r#"] = "\r&#35;",
	["\n*"] = "\n&#42;", ["\r*"] = "\r&#42;",
	["\n:"] = "\n&#58;", ["\r:"] = "\r&#58;",
	["\n;"] = "\n&#59;", ["\r;"] = "\r&#59;",
	["\n "] = "\n&#32;", ["\r "] = "\r&#32;",
	["\n\n"] = "\n&#10;", ["\r\n"] = "&#13;\n",
	["\n\r"] = "\n&#13;", ["\r\r"] = "\r&#13;",
	["\n\t"] = "\n&#9;", ["\r\t"] = "\r&#9;",
}
|
|
|
|
-- Replacement table for the ISBN / RFC / PMID passes of mwtext.nowiki: the
-- keyword is kept and the single whitespace character after it is replaced by
-- its numeric entity. Built by crossing the three keywords with every
-- whitespace character that '%s' can match in those passes' patterns.
local nowikiReplMagic = {}
for sp, esc in pairs( {
	[' '] = '&#32;',
	['\t'] = '&#9;',
	['\r'] = '&#13;',
	['\n'] = '&#10;',
	['\f'] = '&#12;',
} ) do
	nowikiReplMagic['ISBN' .. sp] = 'ISBN' .. esc
	nowikiReplMagic['RFC' .. sp] = 'RFC' .. esc
	nowikiReplMagic['PMID' .. sp] = 'PMID' .. esc
end
|
|
|
|
-- Escape `s` so that it is not interpreted as wikitext.
-- Passes, in order:
--   1. characters listed in nowikiRepl1, anywhere in the string
--   2. list/indent/blank-line characters at the start of the string or right
--      after a line break (nowikiRepl2)
--   3. the first '-' of a '----' that follows a line break or starts the string
--   4. the second underscore of '__'
--   5. the colon of '://'
--   6. the whitespace character following ISBN, RFC or PMID (nowikiReplMagic)
--   7. every entry of options.nowiki_protocols, each key being used as a gsub
--      pattern and each value as its replacement
-- Returns the escaped string.
function mwtext.nowiki( s )
	-- string.gsub is safe here, because we're only caring about ASCII chars
	s = string.gsub( s, '["&\'<=>%[%]{|}]', nowikiRepl1 )

	-- A temporary leading newline lets the "after a line break" patterns also
	-- match at the very start of the string; it is removed again below
	s = '\n' .. s
	s = string.gsub( s, '[\r\n][#*:; \n\r\t]', nowikiRepl2 )
	s = string.gsub( s, '([\r\n])%-%-%-%-', '%1&#45;---' )
	s = string.sub( s, 2 )

	-- The replacements must contain literal entity text; with the plain
	-- character in its place each of these substitutions is a no-op
	s = string.gsub( s, '__', '_&#95;' )
	s = string.gsub( s, '://', '&#58;//' )
	s = string.gsub( s, 'ISBN%s', nowikiReplMagic )
	s = string.gsub( s, 'RFC%s', nowikiReplMagic )
	s = string.gsub( s, 'PMID%s', nowikiReplMagic )
	for k, v in pairs( options.nowiki_protocols ) do
		s = string.gsub( s, k, v )
	end

	return s
end
|
|
|
|
-- Build an HTML-style tag as a string.
-- May be called positionally as tag( name, attrs, content ) or with a single
-- table carrying the fields name, attrs and content.
--   name    string; emitted verbatim after '<'
--   attrs   optional table with string keys. A boolean value emits the bare
--           key when true and nothing when false; a string or number value is
--           emitted as key="value" with the value run through mwtext.encode.
--   content nil    -> opening tag only
--           false  -> self-closing tag
--           string or number -> opening tag, content as given, closing tag
-- Raises an error for wrongly typed arguments, non-string attribute keys,
-- keys containing whitespace or any of / < > " ' =, and unsupported
-- attribute value types.
function mwtext.tag( name, attrs, content )
	local usingNamedArgs = type( name ) == 'table'
	if usingNamedArgs then
		local args = name
		name, attrs, content = args.name, args.attrs, args.content
		checkTypeForNamedArg( 'tag', 'name', name, 'string' )
		checkTypeForNamedArg( 'tag', 'attrs', attrs, 'table', true )
	else
		checkType( 'tag', 1, name, 'string' )
		checkType( 'tag', 2, attrs, 'table', true )
	end

	local pieces = { '<' .. name }

	for key, value in pairs( attrs or {} ) do
		if type( key ) ~= 'string' then
			error( "bad named argument attrs to 'tag' (keys must be strings, found " .. type( key ) .. ")", 2 )
		end
		if string.find( key, '[\t\r\n\f /<>"\'=]' ) then
			error( "bad named argument attrs to 'tag' (invalid key '" .. key .. "')", 2 )
		end

		local valueType = type( value )
		if valueType == 'string' or valueType == 'number' then
			table.insert( pieces, string.format( ' %s="%s"', key, mwtext.encode( tostring( value ) ) ) )
		elseif valueType ~= 'boolean' then
			error( "bad named argument attrs to 'tag' (value for key '" .. key .. "' may not be " .. valueType .. ")", 2 )
		elseif value then
			-- true: attribute without a value; false: attribute omitted
			table.insert( pieces, ' ' .. key )
		end
	end

	local contentType = type( content )
	if contentType == 'string' or contentType == 'number' then
		table.insert( pieces, '>' )
		table.insert( pieces, content )
		table.insert( pieces, '</' .. name .. '>' )
	elseif content == nil then
		table.insert( pieces, '>' )
	elseif content == false then
		table.insert( pieces, ' />' )
	elseif usingNamedArgs then
		-- Any other type is invalid; the "expected type" text is only used
		-- to build the error message
		checkTypeForNamedArg( 'tag', 'content', content, 'string, number, nil, or false' )
	else
		checkType( 'tag', 3, content, 'string, number, nil, or false' )
	end

	return table.concat( pieces )
end
|
|
|
|
-- Thin wrapper: hands `s` to the PHP-side unstrip callback and returns
-- whatever that callback returns.
function mwtext.unstrip( s )
	return php.unstrip( s )
end
|
|
|
|
-- Thin wrapper: hands `s` to the PHP-side unstripNoWiki callback and returns
-- whatever that callback returns.
function mwtext.unstripNoWiki( s )
	return php.unstripNoWiki( s )
end
|
|
|
|
-- Thin wrapper: hands `s` to the PHP-side killMarkers callback and returns
-- whatever that callback returns.
function mwtext.killMarkers( s )
	return php.killMarkers( s )
end
|
|
|
|
-- Split `text` at every match of `pattern` and return the pieces as an array.
-- All three arguments are forwarded unchanged to mwtext.gsplit; this function
-- merely collects what that iterator yields.
function mwtext.split( text, pattern, plain )
	local pieces, count = {}, 0
	for piece in mwtext.gsplit( text, pattern, plain ) do
		count = count + 1
		pieces[count] = piece
	end
	return pieces
end
|
|
|
|
-- Return an iterator that yields the pieces of `text` lying between
-- successive matches of `pattern`.
-- `pattern` and `plain` are passed straight to mw.ustring.find.
-- When the separator matches the empty string, the text is yielded one
-- character at a time.
function mwtext.gsplit( text, pattern, plain )
	-- `pos` is where the next search starts; nil once the text is used up
	local pos = 1
	local textLen = mw.ustring.len( text )

	local function nextPiece()
		if not pos then
			return
		end

		local first, last = mw.ustring.find( text, pattern, pos, plain )

		if not first then
			-- No further separator: the remainder is the final piece
			local tail = mw.ustring.sub( text, pos )
			pos = nil
			return tail
		end

		if last < first then
			-- Empty separator! Emit text up to and including the character
			-- at `first`, then step past it (or stop at the end of the text)
			local piece = mw.ustring.sub( text, pos, first )
			if first < textLen then
				pos = first + 1
			else
				pos = nil
			end
			return piece
		end

		local piece = ''
		if first > pos then
			piece = mw.ustring.sub( text, pos, first - 1 )
		end
		pos = last + 1
		return piece
	end

	return nextPiece, nil, nil
end
|
|
|
|
-- Join the elements of `list` into one string.
-- All elements but the last are joined with `separator` (default
-- options.comma); the last one is attached with `conjunction` (default
-- options['and']).
-- A list with a single element yields that element converted with tostring;
-- an empty list yields ''.
function mwtext.listToText( list, separator, conjunction )
	if not separator then
		separator = options.comma
	end
	if not conjunction then
		conjunction = options['and']
	end

	local count = #list
	if count > 1 then
		local head = table.concat( list, separator, 1, count - 1 )
		return head .. conjunction .. list[count]
	end

	return tostring( list[1] or '' )
end
|
|
|
|
-- Shorten `text` to at most |length| characters, marking the cut with
-- `ellipsis` (default options.ellipsis).
--   length > 0    keep the start of the text, ellipsis appended
--   length <= 0   keep the end of the text, ellipsis prepended
--   adjustLength  when truthy, the ellipsis length is counted against
--                 `length`
-- The original text is returned unchanged when it already fits, or when the
-- truncated form would not be shorter than the original.
function mwtext.truncate( text, length, ellipsis, adjustLength )
	local textLen = mw.ustring.len( text )
	local limit = math.abs( length )
	if textLen <= limit then
		return text
	end

	ellipsis = ellipsis or options.ellipsis

	-- Number of characters reserved for the ellipsis
	local reserved = 0
	if adjustLength then
		reserved = mw.ustring.len( ellipsis )
	end

	local truncated
	if limit <= reserved then
		-- No room left for any of the text itself
		truncated = ellipsis
	elseif length > 0 then
		truncated = mw.ustring.sub( text, 1, length - reserved ) .. ellipsis
	else
		truncated = ellipsis .. mw.ustring.sub( text, length + reserved )
	end

	if mw.ustring.len( truncated ) < textLen then
		return truncated
	end
	return text
end
|
|
|
|
-- Hand the library table back to whatever loaded this chunk.
return mwtext
|