Module:Unicode data: Difference between revisions

m 1 revision imported: Templates and CSS files
m 1 revision imported: template update
 
(3 intermediate revisions by 3 users not shown)
Line 98: Line 98:
{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
{  0x30000,  0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G
{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
{  0x31350,  0x323AF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension H
{  0x2EBF0,  0x2EE5D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension I
{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
{  0xF0000,  0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use
{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
{ 0x100000, 0x10FFFD, "<private-use-%04X>" }  -- Plane 16 Private Use
Line 125: Line 126:
--]]
--]]


-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.is_noncharacter(codepoint)
function p.lookup_name(codepoint)
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned
-- (Cn) and specifically noncharacters:
-- (Cn) and specifically noncharacters:
-- https://www.unicode.org/faq/private_use.html#nonchar4
-- https://www.unicode.org/faq/private_use.html#nonchar4
if 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
return 0xFDD0 <= codepoint and (codepoint <= 0xFDEF
or floor(codepoint % 0x10000) >= 0xFFFE) then
or floor(codepoint % 0x10000) >= 0xFFFE)
end
 
-- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8
function p.lookup_name(codepoint)
if p.is_noncharacter(codepoint) then
return ("<noncharacter-%04X>"):format(codepoint)
return ("<noncharacter-%04X>"):format(codepoint)
end
end
Line 161: Line 167:
end
end


--[[
-- No image data modules on Wikipedia yet.
function p.lookup_image(codepoint)
function p.lookup_image(codepoint)
local data = loader[('images/%03X'):format(codepoint / 0x1000)]
local data = loader[('images/%03X'):format(codepoint / 0x1000)]
Line 170: Line 174:
end
end
end
end
--]]


local planes = {
local planes = {
Line 437: Line 440:
local Latn = false
local Latn = false
local i = 0; -- indexer for use in error messages
for codepoint in mw.ustring.gcodepoint(str) do
for codepoint in mw.ustring.gcodepoint(str) do
i = i + 1; -- bump the indexer
local script = lookup_script(codepoint)
local script = lookup_script(codepoint)
Line 445: Line 450:
elseif not (script == "Zyyy" or script == "Zinh"
elseif not (script == "Zyyy" or script == "Zinh"
or script == "Zzzz") then
or script == "Zzzz") then
return false
return false, i -- abandon as not Latn; identify the offending character's position
end
end
end
end
return Latn
return Latn, (not Latn and i) or nil -- when <Latn> false, return offending charactor's position as second return value; nil else
end
end


Line 481: Line 486:
return result
return result
end
end
--[[--------------------------< I S _ R T L _ F R A M E >------------------------------------------------------
external entry from an {{#invoke:}} to determine if a string of text is rtl.  Strips html and html-like tags so
that those tags don't corrupt the is-rtl-is-not-rtl determination; this added for the cases where the rtl text
has <br /> tags.
]]
function p.is_rtl_frame (frame)
local str = frame.args[1]; -- get the string from the {{#invoke:}} frame
str = str:gsub ('%b<>', ''); -- strip any html and html-like tags
return p.is_rtl (str); -- return if whatever remains rtl; false else
end


local function get_codepoint(args, arg)
local function get_codepoint(args, arg)