-- Moduli:Lang/data/make is latn data
-- Pamja
-- < Moduli:Lang | data
require ('strict');

-- Find the documentation subpage that accompanies this module.  When the page being viewed is already
-- the ~/doc page, its title object is used as-is; otherwise a title object for '<this page>/doc' is made.
local title_object = mw.title.getCurrentTitle ();
local on_doc_page = title_object.fullText:find ('/doc$');
if not on_doc_page then
	title_object = mw.title.new (title_object.fullText .. '/doc');
end

local content = title_object:getContent();						-- doc page text; scanned later by extension_codepoints_get()

-- accumulators filled by the ~_codepoints_get() functions and consumed by main():
--	~_singles_t are sets: [codepoint] = true
--	~_ranges_t are sequences of {low, high} codepoint pairs
local common_scripts_singles_t, common_scripts_ranges_t = {}, {};
local latn_scripts_singles_t, latn_scripts_ranges_t = {}, {};
local extension_scripts_singles_t, extension_scripts_ranges_t = {}, {};
--[[--------------------------< Z Y Y Y _ L A T N _ C O D E P O I N T S _ G E T >-----------------------------

Collect the Zyyy-script (common) and Latn-script codepoints listed in Module:Unicode data/scripts.  That module
holds individual codepoints in its <singles> table ([codepoint] = script) and codepoint ranges in its <ranges>
sequence ({low, high, script}).

Singles are stored as set members (latn_scripts_singles_t, common_scripts_singles_t); ranges are appended as
{low, high} pairs (latn_scripts_ranges_t, common_scripts_ranges_t).  All other scripts are ignored.

]]

local function zyyy_latn_codepoints_get ()
	local scripts_data = mw.loadData ('Module:Unicode data/scripts');

	local singles_for_script_t = {								-- script code -> destination set
		Latn = latn_scripts_singles_t,
		Zyyy = common_scripts_singles_t,
		};
	local ranges_for_script_t = {								-- script code -> destination sequence
		Latn = latn_scripts_ranges_t,
		Zyyy = common_scripts_ranges_t,
		};

	for code_point, script in pairs (scripts_data.singles) do
		local dest_t = singles_for_script_t[script];
		if dest_t then
			dest_t[code_point] = true;							-- keyed by codepoint so duplicates collapse
		end
	end

	for _, entry_t in ipairs (scripts_data.ranges) do
		local dest_t = ranges_for_script_t[entry_t[3]];
		if dest_t then
			dest_t[#dest_t + 1] = {entry_t[1], entry_t[2]};		-- keep only the low and high codepoints
		end
	end
end
--[[--------------------------< E X T E N S I O N _ C O D E P O I N T S _ G E T >-----------------------------

Read a local copy of the current unicode ScriptExtensions-xx.x.x.txt file (hidden in this module's doc page) and
extract the codepoints and codepoint ranges whose script list includes Latn.  Hex codepoints are converted to
numbers (same format as codepoints extracted from Unicode data/scripts).

Data lines are expected to look like:
	00B7          ; Latn ... # Po       MIDDLE DOT
	0363..036F    ; Latn # Mn  [13] COMBINING LATIN SMALL LETTER A..COMBINING LATIN SMALL LETTER X

Singles are stored in extension_scripts_singles_t ([codepoint] = true); ranges are appended to
extension_scripts_ranges_t as {low, high}.

Raises an error when the doc page content is not available.

]]

local function extension_codepoints_get ()
	if not content then											-- getContent() gave us nothing to scan
		error ('doc page content (local copy of ScriptExtensions.txt) is not available');
	end

	local line_pattern = '%x+[^\r\n]+';							-- from the first hex digit on a line to the end of that line
	local range_pattern = '(%x+)%.%.(%x+)%s*;[^#]*Latn[^#]*#%s*%a%a%s*%[%d+%]%s*.+';
	local single_pattern = '(%x+)%s*;[^#]*Latn[^#]*#%s*%a%a%s*.+';

	for line in content:gmatch (line_pattern) do				-- read each line of extensions text file
		local low, high = line:match (range_pattern);
		if low then
			table.insert (extension_scripts_ranges_t, {tonumber (low, 16), tonumber (high, 16)});
		else
			-- the single pattern is tried only when the line is not a range; it is not anchored so on a
			-- range line it would match the range's second codepoint and record that codepoint as a single
			local single = line:match (single_pattern);
			if single then
				extension_scripts_singles_t[tonumber (single, 16)] = true;
			end
		end
	end
end
--[[--------------------------< B I N A R Y _ S E A R C H >---------------------------------------------------

Report whether <target> lies inside any of the ranges in <ranges_t>.

<target>: number (a codepoint)
<ranges_t>: sequence of {low, high} pairs; must be sorted ascending and the ranges must not overlap.  May be empty.

Returns true when some range satisfies low <= target <= high; false otherwise (including when <ranges_t> is
empty or <target> lies outside of all ranges).

]]

local function binary_search (target, ranges_t)
	local idx_bot = 1;											-- index of first candidate range
	local idx_top = #ranges_t;									-- index of last candidate range; 0 when <ranges_t> is empty

	while idx_bot <= idx_top do									-- empty candidate window means <target> is not in any range
		local idx_mid = math.floor ((idx_bot + idx_top) / 2);
		local range_t = ranges_t[idx_mid];

		if target < range_t[1] then								-- <target> can only be in a lower range
			idx_top = idx_mid - 1;
		elseif target > range_t[2] then							-- <target> can only be in a higher range
			idx_bot = idx_mid + 1;
		else
			return true;										-- low <= target <= high
		end
	end

	return false;
end
--[[--------------------------< E X P A N D _ R A N G E >-----------------------------------------------------

Append every codepoint from <range_t>[1] through <range_t>[2] (inclusive) to the end of sequence <out_t>:
	{10, 15} -> 10, 11, 12, 13, 14, 15

Nothing is appended when <range_t>[1] is greater than <range_t>[2].

]]

local function expand_range (range_t, out_t)
	local low, high = range_t[1], range_t[2];
	local next_idx = #out_t;									-- appended values go after whatever <out_t> already holds

	for code_point = low, high do
		next_idx = next_idx + 1;
		out_t[next_idx] = code_point;
	end
end
--[[--------------------------< M A K E _ R A N G E S _ F R O M _ S I N G L E S >------------------------------

Find runs of contiguous codepoints among the keys of <scripts_singles_t> and append each run of two or more
codepoints to <ranges_from_singles_t> as a {low, high} pair.  Each extracted range is logged.

<scripts_singles_t> ([codepoint] = true) is not modified; the caller is responsible for dropping singles that
are now covered by a range.

]]

local function make_ranges_from_singles (scripts_singles_t, ranges_from_singles_t)
	local sorted_t = {};										-- the codepoints as a sequence so that they can be sorted
	for code_point in pairs (scripts_singles_t) do
		sorted_t[#sorted_t + 1] = code_point;
	end
	table.sort (sorted_t);										-- ascending

	local idx = 1;
	while sorted_t[idx] do
		local first = sorted_t[idx];							-- candidate range bottom
		local last = first;										-- candidate range top

		while sorted_t[idx + 1] == (last + 1) do				-- swallow codepoints for as long as they are contiguous
			idx = idx + 1;
			last = sorted_t[idx];
		end

		if last ~= first then									-- two or more contiguous codepoints make a range
			mw.log (string.format ('%s–%s (%.4X..%.4X) extracted from singles_t', first, last, first, last));
			table.insert (ranges_from_singles_t, {first, last});
		end

		idx = idx + 1;											-- first codepoint after this run
	end
end
--[[--------------------------< M E R G E _ R A N G E S >------------------------------------------------------

<ranges_t> is a sequence of {low, high} ranges sorted ascending by low value.  Returns a new sequence in which
every range that is a subset of, overlaps, or is contiguous with (starts one past the end of) the range before
it has been folded into that range.  <ranges_t> and the ranges that it holds are not modified.

]]

local function merge_ranges (ranges_t)
	local merged_t = {};

	for _, range_t in ipairs (ranges_t) do
		local last_t = merged_t[#merged_t];						-- most recently emitted range; nil on first pass

		if last_t and (range_t[1] <= (last_t[2] + 1)) then		-- starts inside of or immediately after the previous range
			if range_t[2] <= last_t[2] then						-- wholly inside of previous range; drop it
				mw.log ('removed subrange' .. range_t[1] .. '–' .. range_t[2] .. string.format (' (%x..%x) ', range_t[1], range_t[2]));
			else												-- extends previous range
				mw.log (string.format ('joined: %s..%s and %s..%s', last_t[1], last_t[2], range_t[1], range_t[2]));
				last_t[2] = range_t[2];
			end
		else
			table.insert (merged_t, {range_t[1], range_t[2]});	-- a copy so that the caller's tables are left alone
		end
	end

	return merged_t;
end


--[[--------------------------< F O R M A T _ L I N E >--------------------------------------------------------

Make one line of output: a tab, <entry_str>, tab padding computed from the length of <entry_str>, and then
<comment_str> as a lua comment.

]]

local function format_line (entry_str, comment_str)
	local rep = math.ceil ((80 - (4 + entry_str:len())) / 4);	-- string.rep() returns empty string when this is not positive
	return string.format ('\t%s%s-- %s', entry_str, string.rep ('\t', rep), comment_str);
end


--[[--------------------------< R E N D E R _ L I S T I N G >--------------------------------------------------

Wrap the formatted lines in <lines_t> in a syntaxhighlight tag as the body of lua table <name> and return the
whole thing as a single string.  <lines_t> is modified.

]]

local function render_listing (name, lines_t)
	table.insert (lines_t, 1, '<syntaxhighlight lang="lua">local ' .. name .. ' = {');	-- opening stuff
	table.insert (lines_t, '\t}</syntaxhighlight>');			-- to close the table
	return table.concat (lines_t, '\n');
end


--[[--------------------------< M A I N >---------------------------------------------------------------------

{{#invoke:Sandbox/trappist the monk/is latn|main}}

Build composite lists (singles and ranges) of common- and latn-script codepoints.

Duplicate singles and duplicate ranges are swallowed.

Runs of contiguous singles are converted to ranges.

When a range has a different length from another range with the same starting point, this function takes the
longest range.

Ranges are then sorted and merged: a range that is a subset of, overlaps, or is contiguous with (starts one past
the end of) the preceding range is folded into that range.

Singles that fall inside a range are omitted from the singles list.

Expands all ranges into singles and combines with the remaining singles to create one long list of singles
because why not?

Finally the lists are made all pretty-like and rendered for copy pasta into an appropriate data module for use
by Module:Lang.  Three listings are returned, in this order: singles, ranges, expanded singles.

]]

local function main (frame)
	zyyy_latn_codepoints_get();									-- get common- and latn-script codepoints from [[Module:Unicode data/scripts]]
	extension_codepoints_get();									-- get latn-script codepoints from local copy of unicode scripts text file

	local scripts_singles_t = {};								-- union of the three singles sets
	for _, scripts_t in ipairs ({latn_scripts_singles_t, common_scripts_singles_t, extension_scripts_singles_t}) do
		for k, v in pairs (scripts_t) do
			scripts_singles_t[k] = v;							-- duplicates (if any) are swallowed
		end
	end

	local ranges_from_singles_t = {};							-- a sequence of sequences
	make_ranges_from_singles (scripts_singles_t, ranges_from_singles_t);	-- add contiguous singles in <scripts_singles_t> to <ranges_from_singles_t>

	local temp_t = {};											-- for ranges; <k> is range low value, <v> is range high value
	for _, ranges_t in ipairs ({latn_scripts_ranges_t, common_scripts_ranges_t, extension_scripts_ranges_t, ranges_from_singles_t}) do
		for _, range_t in ipairs (ranges_t) do
			local low, high = range_t[1], range_t[2];
			local known_high = temp_t[low];						-- not nil when we have already seen a range that starts at <low>

			if known_high then
				if known_high ~= high then
					mw.log (low .. '–' .. high .. string.format (' (%x..%x) ', low, high) .. 'does not match: ' .. known_high .. string.format (' (%x)', known_high));
					if known_high > high then
						high = known_high;						-- use the greater high value
					end
				else
					mw.log (low .. '–' .. high .. string.format (' (%x..%x) ', low, high) .. ' is duplicate');
				end
			end

			temp_t[low] = high;
		end
	end

	local scripts_ranges_t = {};
	for low, high in pairs (temp_t) do							-- make a sequence of codepoint range sequences
		table.insert (scripts_ranges_t, {low, high});
	end

	table.sort (scripts_ranges_t, function (a_t, b_t)			-- ascending sort by range low value; required by merge_ranges()
		return a_t[1] < b_t[1];
	end);

	scripts_ranges_t = merge_ranges (scripts_ranges_t);			-- fold subset, overlapping, and contiguous ranges together

	local singles_out_t = {};									-- sequence to hold singles that are not covered by a range
	local expanded_out_t = {};									-- sequence to hold singles + expanded ranges
	for k, _ in pairs (scripts_singles_t) do
		if scripts_ranges_t[1] and binary_search (k, scripts_ranges_t) then	-- omit singles that are included in a range; no ranges, nothing to omit
			mw.log (string.format ('removed: %s (%X)', k, k));
		else
			table.insert (singles_out_t, k);
			table.insert (expanded_out_t, k);
		end
	end

	table.sort (singles_out_t);
	for i, v in ipairs (singles_out_t) do						-- replace each codepoint with its formatted line
		singles_out_t[i] = format_line (string.format ('[%s] = true,', v), string.format ('%.4X', v));
	end

	local ranges_out_t = {};
	for _, v_t in ipairs (scripts_ranges_t) do
		table.insert (ranges_out_t, format_line (string.format ('{%s, %s},', v_t[1], v_t[2]), string.format ('%.4X..%.4X', v_t[1], v_t[2])));
		expand_range (v_t, expanded_out_t);						-- expand this range into <expanded_out_t>
	end

	table.sort (expanded_out_t);
	for i, v in ipairs (expanded_out_t) do						-- replace each codepoint with its formatted line
		expanded_out_t[i] = format_line (string.format ('[%s] = true,', v), string.format ('%.4X', v));
	end

	return frame:preprocess (table.concat ({					-- make a big string and done
		render_listing ('singles_t', singles_out_t),
		'\n\n',
		render_listing ('ranges_t', ranges_out_t),
		'\n\n',
		render_listing ('singles_t', expanded_out_t),
		}));
end
--[[--------------------------< E X P O R T S >---------------------------------------------------------------

main() is the only function made available outside of this module.

]]

local exports_t = {};
exports_t.main = main;

return exports_t;