-- Moduli:Smallem
-- Appearance
require('Module:No globals');
local getArgs = require ('Module:Arguments').getArgs;
local cfg = mw.loadData ('Module:Citation/CS1/Configuration'); -- load the configuration module
--[[-------------------------< A D D _ T O _ L I S T >---------------------------------------------------------
adds <code>/<name> pair to <list> table as ['<code>:<name>'] = true; uses name from <override_list> if available
this format avoids duplicates so only unique <code>/<name> pairs are added to <list>
]]
local function add_to_list (list, override_list, code, name)
	-- <list>: set-like table that receives the key '<code>:<name>' (value is always true)
	-- <override_list>: table indexed by <code>; a truthy entry replaces <name>
	-- <code>, <name>: the pair to record
	local chosen_name = override_list[code] or name;							-- prefer the override name when there is one
	list[code .. ':' .. chosen_name] = true;									-- keyed storage means a repeated pair is stored only once
end
--[[-------------------------< L I S T _ F O R M A T >---------------------------------------------------------
formats <code>/<name> pair into a sequence table of find/replace strings for rendering
the original:
['<code>:<name>'] = true → "\|\s*language\s*=\s*<name>\b" "|language=<code>" \
the new so that |language= appearing in non-cs1 template is ignored:
['<code>:<name>'] = true → "(\{\{\s*cit[aeio][^\}]*\|\s*language\s*=\s*)<name>(\s*[\|\}])" "\1<code>\2" \
and another new:
['<code>:<name>'] = true → (r"(\{\{\s*cit[aeio][^\}]*\|\s*language\s*=\s*)<name>(\s*[\|\}])", r"\1<code>\2"),
the above with quotes and escapes
<result> sequence table that receives the find/replace strings
<list> source of code / name pairs (keys of the form '<code>:<name>')
returns the number of find/replace strings added to <result>
]]
local function list_format (result, list)
	-- fixed text of one find/replace tuple; first %s takes the escaped <name>, second %s takes the <code>
	local template = '(r"(\\{\\{\\s*cit[aeio][^\\}]*\\|\\s*language\\s*=\\s*)%s(\\s*[\\|\\}])", r"\\1%s\\2"),\t';
	local added = 0;															-- number of strings appended to <result>

	for key in pairs (list) do													-- only the keys matter; values are always true
		local code, name = key:match ('([^:]+):(.+)');							-- key is '<code>:<name>'; split at the first colon
		local escaped = name:gsub (' +', '\\ ');								-- each run of spaces becomes a single escaped space
		escaped = escaped:gsub ('[%(%)/"]', '\\%1');							-- backslash-escape parens, virgule, and double quote
		result[#result + 1] = template:format (escaped, code);					-- append the finished tuple
		added = added + 1;
	end

	return added;
end
--[[-------------------------< L A N G _ L I S T E R >---------------------------------------------------------
Module entry point
{{#invoke:test|lang_lister|lang=<code>, <code>, <code>, ...}}
There is an issue with pasting Unicode Gothic block text into Windows cmd.exe. Until a better solution arises
this function skips any language name that contains Unicode Gothic block text (U+10330–U+1034A)
https://www.unicode.org/charts/PDF/U10330.pdf
\240\144\140\176-\240\144\141\138 -- decimal equivalent of hex UTF-8 code units F0 90 8C B0 – F0 90 8D 8A
when skipped, this function emits an error message.
There is an issue with the processing outside of this module where some process converts U+200B zero width space,
U+200C zero width non-joiner, and U+200D zero width joiner unicode codepoints to text strings '<200b>', '<200c>',
and '<200d>'. This function skips language names that have these unicode codepoints so that the succeeding process
doesn't have the opportunity to mangle the name in the regex. This function emits an error message for each name
that it skips.
]]
-- translates a matched zero-width codepoint (the UTF-8 byte string held in skip_reason) into plain text for error messages
local reason_map = {
	['\226\128\139'] = 'U+200B zero width space',								-- bytes E2 80 8B
	['\226\128\140'] = 'U+200C zero width non-joiner',							-- bytes E2 80 8C
	['\226\128\141'] = 'U+200D zero width joiner',								-- bytes E2 80 8D
};
-- Module entry point.  Reads these arguments (via Module:Arguments getArgs):
--	|plain=yes	machine readable output; anything else gives human readable output
--	|list=		when present, return only the sorted list of local-wiki language codes; a positive whole
--				number no larger than the number of codes sets the group size for the human readable
--				list, any other value gives groups of 100
--	|lang=		comma separated list of language codes whose language-name lists are to be rendered
-- Returns a wikitext string: the code list, an error message, or the list of find/replace regex strings
-- (preceded by one 'skipped' message for every name that was not rendered).
local function lang_lister (frame)
	local args = getArgs (frame);
	local plain = 'yes' == args.plain;
	local override = cfg.lang_tag_remap;										-- <code> -> <name> overrides from the cs1 configuration
	local en_ref_list = mw.language.fetchLanguageNames ('en', 'all');			-- make a en.wiki language list
	local iw_map = mw.site.interwikiMap ('local');								-- get list of all local wikis

	local wiki_codes = {};														-- prefixes from the interwiki map that are also language codes
	for _, v in pairs (iw_map) do												-- look at each wiki in the iw map
		if en_ref_list[v.prefix] then											-- if the prefix is a language code
			table.insert (wiki_codes, v.prefix);								-- add the prefix to the codes table
		end
	end
	table.sort (wiki_codes);													-- ascending sort

	if args.list then
		if plain then
			return 'CodeListBegin:' .. table.concat (wiki_codes, ', ') .. ':CodeListEnd';	-- make a semi pretty list of the codes and done
		end

		local max = #wiki_codes;												-- local copy of the number of codes
		local group_size = tonumber (args.list);								-- nil when |list= is not a number
		if not group_size														-- not a number
			or group_size < 1													-- zero or negative would never advance the loop below
			or group_size > max													-- |list= cannot be more than the number of codes
			or group_size ~= math.floor (group_size) then						-- fractional sizes give fractional table.concat indices
				group_size = 100;												-- default to 100-item lists
		end

		local out = {'CodeListBegin<div>'};										-- create initialized output table
		for i = 1, max, group_size do											-- for each group of codes (or whatever remains at the end)
			local limit = math.min (i + group_size - 1, max);					-- last index of this group; never past the end of the table
			table.insert (out, table.concat (wiki_codes, ', ', i, limit));		-- concat codes from wiki_codes[i] to wiki_codes[limit] and save in out{}
			if limit ~= max then												-- not yet got to max
				table.insert (out, '<br /><br />');								-- insert line breaks for each group (except last)
			end
		end
		table.insert (out, '</div>CodeListEnd');								-- close the code list
		return table.concat (out);												-- final concat and done
	end

	mw.log (table.concat (wiki_codes, ', '));									-- put a copy of the language code list in the Lua log

	if not args.lang then
		return '<span style="font-size: 100%; font-style: normal;" class="error">missing or empty <code style="color: inherit; background: inherit; border: none; padding: inherit;">|lang=</code></span>';
	end

	local lang_arg = args.lang:gsub ('%s*,$', '');								-- strip trailing comma if present
	local lang_codes = mw.text.split (lang_arg, '%s*,%s*');					-- make a table of lang codes from comma separated list

	for i, code in ipairs (lang_codes) do										-- error check each code
		code = code:lower();
		if not en_ref_list[code] then											-- codes from |lang= must be found in the English list of codes and names
			return '<span style="font-size: 100%; font-style: normal;" class="error">\n' ..
				'<code style="color: inherit; background: inherit; border: none; padding: inherit;">|lang=</code> has invalid code: \n' ..
				'<code style="color: inherit; background: inherit; border: none; padding: inherit;">' .. code .. '</code></span>';
		end
		lang_codes[i] = code;													-- keep the lowercase form so that the code that was validated is the code that is used
	end

	local list = {};															-- set of '<code>:<name>' keys to be rendered
	local skipped = {};															-- one message for each name that was not rendered

	for _, lang_code in ipairs (lang_codes) do									-- for each lang code in the list
		local source_list = mw.language.fetchLanguageNames (lang_code, 'all');	-- make a source list for that language
		for code, name in pairs (source_list) do								-- get <code>/<name> pairs from the source list
			local name_not_ascii = name ~= string.match (name, '[%w%p ]*');	-- true when <name> has anything other than ASCII alphanumerics, punctuation, and spaces
			local skip_reason;													-- init/re-init; holds skip_reason text string or capture from unicode codepoint match

			if name_not_ascii then												-- unicode tests apply only to names that have non-ascii characters
				mw.log (name);
				if mw.ustring.find (name, '[\240\144\140\176-\240\144\141\138]+') then	-- unicode gothic block (U+10330–U+1034A); breaks windows cmd.exe
					skip_reason = 'gothic unicode block';
				else
					skip_reason = mw.ustring.match (name, '[\226\128\139-\226\128\141]');	-- U+200B zero width space, U+200C zero width non-joiner, U+200D zero width joiner
				end
			end

			local lower_name = name:lower();
			if en_ref_list[lower_name] and lower_name ~= code then				-- lowercase name is a known tag but not this name's own tag (tiv for Tiv, ok; ga for Irish; not ok)
				skip_reason = 'language name matches another language\'s tag';	-- 'Ga' is a language name; 'ga' is language tag for Irish
			end

			if skip_reason then													-- if there is a reason to skip
				table.insert (skipped, table.concat ({
					'skipped: ',
					'<code style="color: inherit; background: inherit; border: none; padding: inherit;">',
					code,
					'</code>: ',
					name,
					'; from: ',
					lang_code,
					'.wiki [',
					reason_map[skip_reason] or skip_reason,						-- add the reason for skipping
					']'
					}));
			else
				if name_not_ascii then											-- character delete tests – test only those names that have non-ascii characters
					name = mw.ustring.gsub (name, '[\226\128\142-\226\128\143]', '');	-- replace spurious U+200E left-to-right and U+200F right-to-left marks with empty string
					name = mw.ustring.gsub (name, '\239\187\191', '');			-- replace spurious U+FEFF zero width no-break space with empty string
				end
				add_to_list (list, override, code, name);						-- <list> is where we will add <code>/<name> pairs, will use <name> from <override> if available
			end
		end
	end

	local result = {};															-- temp table
	local out = {};																-- final output goes here
	local count = list_format (result, list);									-- formats <code>/<name> pairs into find/replace strings; debug count of how many items are in result{}
	mw.logObject (count, 'count');

	if 0 ~= #skipped then														-- if we skipped any
		table.insert (out, '<div style="font-size: 100%; font-style: normal;" class="error">' .. table.concat (skipped, '<br />') .. '</div>');	-- make a big string and put it in the output at the top
	end

	table.sort (result);

	if plain then																-- for machine readable version
		if 0 ~= #result then
			result[#result] = (result[#result]:gsub (',%s*$', ''));			-- remove trailing comma and whitespace (list_format() ends each string with comma<tab>) from last regex
		end
		table.insert (result, 1, 'RegexListBegin:');							-- opening keyword at beginning of sequence table
		table.insert (result, ':RegexListEnd');									-- closing keyword at end of sequence table
		table.insert (out, table.concat (result));
		table.insert (out, table.concat ({'<div>Regex count: ', count, '</div>'}));	-- add count of regexes rendered
		return table.concat (out);												-- final concatenation and done
	end
																				-- for human readable version, make a bulleted list in columns
	table.insert (result, 1, 'RegexListBegin<div class="div-col columns column-width" style="column-width:30em">');
	table.insert (out, table.concat (result, '\n*'));
	table.insert (out, '</div>RegexListEnd');
	table.insert (out, table.concat ({'<div>Regex count: ', count, '</div>'}));	-- add count of regexes rendered
	return table.concat (out, '\n');											-- final concatenation and done
end
--[[-------------------------< E X P O R T E D F U N C T I O N S >------------------------------------------
]]
local p = {};																	-- table of functions callable through {{#invoke:}}
p.lang_lister = lang_lister;
return p;