-- Moduli:Smallem

require('Module:No globals');
local getArgs = require ('Module:Arguments').getArgs;

local cfg = mw.loadData ('Module:Citation/CS1/Configuration');					-- load the configuration module


--[[-------------------------< A D D _ T O _ L I S T >---------------------------------------------------------

adds <code>/<name> pair to <list> table as ['<code>:<name>'] = true; uses name from <override_list> if available
this format avoids duplicates so only unique <code>/<name> pairs are added to <list>
]]

-- Records one <code>/<name> pair in <list> under the key '<code>:<name>' (value is always true).
--	<list> table that collects the pairs; re-adding an existing pair simply rewrites the same key
--	<override_list> table of <code> → <name>; when it holds a truthy entry for <code> that name replaces <name>
--	<code> language tag
--	<name> language name to use when <override_list> has nothing for <code>
local function add_to_list (list, override_list, code, name)
	local chosen_name = override_list[code] or name;							-- override name wins over the supplied name
	list[code .. ':' .. chosen_name] = true;
end


--[[-------------------------< L I S T _ F O R M A T >---------------------------------------------------------

formats <code>/<name> pair into a sequence table of find/replace strings for rendering
	the original:
		['<code>:<name>'] = true → "\|\s*language\s*=\s*<name>\b" "|language=<code>" \
	the new so that |language= appearing in non-cs1 template is ignored:
		['<code>:<name>'] = true → "(\{\{\s*cit[aeio][^\}]*\|\s*language\s*=\s*)<name>(\s*[\|\}])" "\1<code>\2" \
	and another new:
		['<code>:<name>'] = true → (r"(\{\{\s*cit[aeio][^\}]*\|\s*language\s*=\s*)<name>(\s*[\|\}])", r"\1<code>\2"),

		the above with quotes and escapes

	<result> list of find/replace strings
	<list> source of code / name pairs
	returns the number of find/replace strings added to <result> (list_format() has no <plain> parameter; it takes only <result> and <list>)

]]

-- Turns every '<code>:<name>' key of <list> into one python-style find/replace tuple string and appends it to <result>.
--	<result> sequence table that receives the rendered strings (order follows pairs(), so it is unspecified)
--	<list> table whose keys are '<code>:<name>'; the values are ignored
-- returns the number of strings appended
local function list_format (result, list)
	local tuple_template = '(r"(\\{\\{\\s*cit[aeio][^\\}]*\\|\\s*language\\s*=\\s*)%s(\\s*[\\|\\}])", r"\\1%s\\2"),\t';
	local added = 0;

	for key in pairs (list) do
		local tag, lang_name = key:match ('([^:]+):(.+)');						-- <code> is everything before the first colon, <name> is the rest
		lang_name = lang_name:gsub (' +', '\\ ');								-- each run of spaces becomes one escaped space
		lang_name = lang_name:gsub ('[%(%)/"]', '\\%1');						-- backslash-escape parens, virgule, and double quote
		result[#result + 1] = tuple_template:format (lang_name, tag);
		added = added + 1;
	end

	return added;
end


--[[-------------------------< L A N G _ L I S T E R >---------------------------------------------------------

Module entry point

{{#invoke:test|lang_lister|lang=<code>, <code>, <code>, ...}}

There is an issue with pasting Unicode Gothic block text into Windows cmd.exe.  Until a better solution arises
this function skips any language name that contains Unicode Gothic block text (U+10330–U+1034A)
https://www.unicode.org/charts/PDF/U10330.pdf

\240\144\140\176-\240\144\141\138		-- decimal equivalent of hex UTF-8 code units F0 90 8C B0 – F0 90 8D 8A

when skipped, this function emits an error message.

There is an issue with the processing outside of this module where some process converts U+200B zero width space,
U+200C zero width non-joiner, and U+200D zero width joiner unicode codepoints to text strings '<200b>', '<200c>',
and '<200d>'.  This function skips language names that have these unicode codepoints so that the succeeding process
doesn't have the opportunity to mangle the name in the regex.  This function emits an error message for each name
that it skips.

]]

-- Lookup from the UTF-8 byte sequence captured as a skip reason to the plain text shown in the 'skipped' messages.
-- Reasons that are already plain text are not listed here; the caller falls back to the reason itself.
local reason_map = {};
reason_map['\226\128\139'] = 'U+200B zero width space';						-- UTF-8 E2 80 8B
reason_map['\226\128\140'] = 'U+200C zero width non-joiner';					-- UTF-8 E2 80 8C
reason_map['\226\128\141'] = 'U+200D zero width joiner';						-- UTF-8 E2 80 8D

--[[
Module entry point.  Renders find/replace regex pairs that map language names to language tags, or, when |list=
is set, only the sorted list of local interwiki prefixes that are also language tags.

Parameters read from <frame> through getArgs():
	|lang=	comma-separated language tags; every tag (compared in lowercase) must be a key of the English
			code/name list, otherwise an error message naming the offending tag is returned
	|plain=	'yes' selects the machine readable rendering; any other value selects the human readable rendering
	|list=	when present, return the code list instead of regexes; for the human readable rendering a numeric
			value between 1 and the number of codes sets the group size (fractions are truncated), anything
			else falls back to groups of 100

Returns a string: an error message, the code list, or the regex list (preceded by the 'skipped' messages if any
language names were skipped).
]]

local function lang_lister (frame)
	local args = getArgs (frame);
	local plain = 'yes' == args.plain;
	local override = cfg.lang_tag_remap;
	local en_ref_list = mw.language.fetchLanguageNames ('en', 'all');			-- make a en.wiki language list

	local error_open = '<span style="font-size: 100%; font-style: normal;" class="error">';
	local code_open = '<code style="color: inherit; background: inherit; border: none; padding: inherit;">';

	local wiki_codes = {};														-- interwiki prefixes that are also language tags
	for _, wiki in pairs (mw.site.interwikiMap ('local')) do					-- look at each wiki in the local interwiki map
		if en_ref_list[wiki.prefix] then										-- if the prefix is a language code
			table.insert (wiki_codes, wiki.prefix);
		end
	end
	table.sort (wiki_codes);													-- ascending sort

	if args.list then
		if plain then
			return 'CodeListBegin:' .. table.concat (wiki_codes, ', ') .. ':CodeListEnd';
		end

		local max = #wiki_codes;
		local group_size = 100;													-- default: 100-item groups
		local requested = tonumber (args.list);
		if requested and requested >= 1 and requested <= max then				-- zero, negative, oversize, and non-numeric values keep the default
			group_size = math.floor (requested);								-- a fractional step would produce fractional table.concat indexes
		end

		local out = {'CodeListBegin<div>'};
		for first = 1, max, group_size do										-- one pass per group (or whatever remains at the end)
			local last = math.min (first + group_size - 1, max);				-- clamp so that table.concat never reads past the end
			table.insert (out, table.concat (wiki_codes, ', ', first, last));
			if last ~= max then
				table.insert (out, '<br /><br />');								-- blank line between groups (not after the last)
			end
		end
		table.insert (out, '</div>CodeListEnd');
		return table.concat (out);
	end

	mw.log (table.concat (wiki_codes, ', '));									-- put a copy of the language code list in the Lua log

	if not args.lang then
		return error_open .. 'missing or empty ' .. code_open .. '|lang=</code></span>';
	end

	local lang_param = args.lang:gsub ('%s*,$', '');							-- strip trailing comma if present (first gsub result only)
	local requested_codes = mw.text.split (lang_param, '%s*,%s*');				-- make a table of lang codes from comma separated list

	for i, code in ipairs (requested_codes) do									-- error check each code
		code = code:lower();
		if not en_ref_list[code] then											-- codes from |lang= must be found in the English list of codes and names
			return error_open .. '\n\t\t\t\t' .. code_open .. '|lang=</code> has invalid code: \n\t\t\t\t' .. code_open .. code .. '</code></span>';
		end
		requested_codes[i] = code;												-- keep the validated (lowercase) form so that later lookups use what was checked
	end

	local list = {};															-- collects unique '<code>:<name>' keys
	local skipped = {};															-- one message per skipped name

	for _, lang_code in ipairs (requested_codes) do								-- for each lang code in the list
		local source_list = mw.language.fetchLanguageNames (lang_code, 'all');	-- code/name pairs as named in that language

		for code, name in pairs (source_list) do
			local name_not_ascii = name ~= string.match (name, '[%w%p ]*');		-- true when <name> has anything other than ASCII alphanumerics, punctuation, and spaces
			local skip_reason;													-- plain-text reason or the captured unicode codepoint; nil when the name is kept

			if name_not_ascii then												-- these tests are needed only for names that have non-ascii characters
				mw.log (name);													-- debug aid: record non-ascii names in the Lua log
				if mw.ustring.find (name, '[\240\144\140\176-\240\144\141\138]+') then	-- unicode gothic block (U+10330–U+1034A); breaks windows cmd.exe
					skip_reason = 'gothic unicode block';
				else
					skip_reason = mw.ustring.match (name, '[\226\128\139-\226\128\141]');	-- U+200B zero width space, U+200C zero width non-joiner, U+200D zero width joiner
				end
			end

			local lower_name = name:lower();
			if en_ref_list[lower_name] and lower_name ~= code then				-- name doubles as some other language's tag ('Ga' is a language name; 'ga' is the tag for Irish)
				skip_reason = 'language name matches another language\'s tag';	-- takes precedence over any earlier reason
			end

			if skip_reason then
				table.insert (skipped, table.concat ({
					'skipped: ',
					code_open,
					code,
					'</code>: ',
					name,
					'; from: ',
					lang_code,
					'.wiki [',
					reason_map[skip_reason] or skip_reason,						-- plain text for captured codepoints; the reason itself otherwise
					']'
				}));
			else
				if name_not_ascii then											-- character delete tests – only for names that have non-ascii characters
					name = mw.ustring.gsub (name, '[\226\128\142-\226\128\143]', '');	-- drop spurious U+200E left-to-right and U+200F right-to-left marks
					name = mw.ustring.gsub (name, '\239\187\191', '');			-- drop spurious U+FEFF zero width no-break space
				end
				add_to_list (list, override, code, name);						-- uses <name> from <override> if available
			end
		end
	end

	local result = {};															-- find/replace strings
	local out = {};																-- final output goes here
	local count = list_format (result, list);									-- number of regexes rendered

	mw.logObject (count, 'count');

	if 0 ~= #skipped then														-- if we skipped any, report them at the top of the output
		table.insert (out, '<div style=\"font-size: 100%; font-style: normal;\" class=\"error\">' .. table.concat (skipped, '<br />') .. '</div>');
	end

	table.sort (result);
	if plain then																-- machine readable version
		table.insert (result, 1, 'RegexListBegin:');							-- opening keyword at beginning of sequence table
		result[#result] = result[#result]:gsub (',%s*$', '');					-- remove trailing comma and whitespace (list_format() ends each regex with comma<tab>)
		table.insert (result, ':RegexListEnd');									-- closing keyword at end of sequence table
		table.insert (out, table.concat (result));
		table.insert (out, table.concat ({'<div>Regex count: ', count, '</div>'}));
		return table.concat (out);
	end

	table.insert (result, 1, 'RegexListBegin<div class="div-col columns column-width" style="column-width:30em">');	-- human readable: bulleted list in columns
	table.insert (out, table.concat (result, '\n*'));
	table.insert (out, '</div>RegexListEnd');
	table.insert (out, table.concat ({'<div>Regex count: ', count, '</div>'}));
	return table.concat (out, '\n');
end


--[[-------------------------< E X P O R T E D   F U N C T I O N S >------------------------------------------
]]

local exports = {};
exports.lang_lister = lang_lister;												-- the only function exported by this module
return exports;