Module:za-sortkey

Wiktionary වෙතින්

This module will sort Zhuang-language text. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{sortkey}}. Within a module, use Module:languages#Language:makeSortKey.

For testcases, see Module:za-sortkey/testcases.

Functions

makeSortKey(text, lang, sc)
Generates a sortkey for a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the sort fails, returns nil.
Alphabetic order
a ae (ə) b by c d e f g gv gy h i k l m mb (ƃ) my n nd (ƌ) ng (ŋ) ngv (ŋv) ny o oe (ɵ) p r s t u v w (ɯ) y
A A₂ (A₂ₐ) B B₂ C D E F G G₂ G₃ H I K L M M₂ (M₂ₐ) M₃ N N₂ (N₂ₐ) N₃ (N₃ₐ) N₄ (N₄ₐ) N₅ O O₂ (O₂ₐ) P R S T U V W (Wₐ) Y

Note: letters from the old orthography (in brackets) are sorted immediately after their new equivalents.

Order of tones
z (ƨ) j (з) x (ч) q (ƽ) h (ƅ)
² (²ᵃ) ³ (³ᵃ) ⁴ (⁴ᵃ) ⁵ (⁵ᵃ) ⁶ (⁶ᵃ)

Note: "h" will sort as H if used as a consonant, or as ⁶ if used as a tone letter.

If a syllable has no tone letter but ends with a consonant, then the following tone values are used:

m n ng (ŋ) k p t b d g
M¹ N¹ N₃¹ (N₃ₐ¹) K⁷ P⁷ T⁷ B⁸ D⁸ G⁸

If new_bor=y or new_bor=1 are detected as parameters of {{za-pron}} on the page, then tone 5 is substituted for tone 1 in the sortkey. If {{za-1957 spelling of}} or {{za-1957 orthography of}} are detected on the page, then the page for the new orthography is checked for new_bor=y or new_bor=1 as well.

Examples[සංස්කරණය]

  • N₂A₂³ DAN₃³ VUEN₃²DA₂⁵ SIEN₃¹ DAN₃³ SIEN¹
ndaej dangj vuengzdaeq sieng dangj sien
  • LWG⁸FWN₃²G₃AN₃¹
lwgfwngzgyang
  • LAN₃⁶ BIT⁷ RO₂N₃² RA₂M⁴
langh bit roengz raemx
  • FAN₃²CWN₃²GAN₃³ᵃ
Faŋƨcɯŋƨgaŋз
  • GIEN²N₂AN₃¹CAN₃⁵
gienzndangcangq

Tone 5 substitution:

  • GUN₃¹CAN³DAN₃³
gungcanjdangj
  • GUN₃¹CAN³DAN₃³ᵃ
guŋcanзdaŋз
(due to the new_bor=1 parameter on gungcanjdangj)


  • bya (B₂A¹)
  • byaz (B₂A²)
  • byaƨ (B₂A²ᵃ)
  • byaj (B₂A³)
  • byaз (B₂A³ᵃ)
  • byax (B₂A⁴)
  • byaч (B₂A⁴ᵃ)
  • byaq (B₂A⁵)
  • byaƽ (B₂A⁵ᵃ)
  • byah (B₂A⁶)
  • byaƅ (B₂A⁶ᵃ)
  • byab (B₂AB⁸)
  • byad (B₂AD⁸)
  • byag (B₂AG⁸)
  • byak (B₂AK⁷)
  • byam (B₂AM¹)
  • byan (B₂AN¹)
  • byang (B₂AN₃¹)
  • byaŋ (B₂AN₃¹!)
  • byap (B₂AP⁷)
  • byat (B₂AT⁷)


  • a'a (A¹A¹)
  • aba (A¹BA¹)
  • a'ba (A¹BA¹)
  • a'da (A¹DA¹)
  • ada (A¹DA¹)
  • a'ga (A¹GA¹)
  • aga (A¹GA¹)
  • a'ha (A¹HA¹)
  • aha (A¹HA¹)
  • aka (A¹KA¹)
  • a'ma (A¹MA¹)
  • ama (A¹MA¹)
  • a'na (A¹NA¹)
  • ana (A¹NA¹)
  • anga (A¹N₃A¹)
  • apa (A¹PA¹)
  • ata (A¹TA¹)
  • aza (A²A¹)
  • aƨa (A²A¹!)
  • aja (A³A¹)
  • aзa (A³A¹!)
  • axa (A⁴A¹)
  • aчa (A⁴A¹!)
  • aqa (A⁵A¹)
  • aƽa (A⁵A¹!)
  • ah'a (A⁶A¹)
  • aƅa (A⁶A¹!)
  • abza (AB²A¹)
  • abƨa (AB²A¹!)
  • abja (AB³A¹)
  • abзa (AB³A¹!)
  • abxa (AB⁴A¹)
  • abчa (AB⁴A¹!)
  • abqa (AB⁵A¹)
  • abƽa (AB⁵A¹!)
  • abh'a (AB⁶A¹)
  • abƅa (AB⁶A¹!)
  • ab'a (AB⁸A¹)
  • ab'ha (AB⁸HA¹)
  • abha (AB⁸HA¹)
  • adza (AD²A¹)
  • adƨa (AD²A¹!)
  • adja (AD³A¹)
  • adзa (AD³A¹!)
  • adxa (AD⁴A¹)
  • adчa (AD⁴A¹!)
  • adqa (AD⁵A¹)
  • adƽa (AD⁵A¹!)
  • adh'a (AD⁶A¹)
  • adƅa (AD⁶A¹!)
  • ad'a (AD⁸A¹)
  • ad'ha (AD⁸HA¹)
  • adha (AD⁸HA¹)
  • agza (AG²A¹)
  • agƨa (AG²A¹!)
  • agja (AG³A¹)
  • agзa (AG³A¹!)
  • agxa (AG⁴A¹)
  • agчa (AG⁴A¹!)
  • agqa (AG⁵A¹)
  • agƽa (AG⁵A¹!)
  • agƅ (AG⁶ᵃ)
  • agh'a (AG⁶A¹)
  • ag'a (AG⁸A¹)
  • ag'ha (AG⁸HA¹)
  • agha (AG⁸HA¹)
  • akza (AK²A¹)
  • akƨa (AK²A¹!)
  • akja (AK³A¹)
  • akзa (AK³A¹!)
  • akxa (AK⁴A¹)
  • akчa (AK⁴A¹!)
  • akqa (AK⁵A¹)
  • akƽa (AK⁵A¹!)
  • akh'a (AK⁶A¹)
  • akƅa (AK⁶A¹!)
  • akha (AK⁷HA¹)
  • ak'ha (AK⁷HA¹)
  • am'a (AM¹A¹)
  • am'ha (AM¹HA¹)
  • amha (AM¹HA¹)
  • amza (AM²A¹)
  • amƨa (AM²A¹!)
  • amja (AM³A¹)
  • amзa (AM³A¹!)
  • amxa (AM⁴A¹)
  • amчa (AM⁴A¹!)
  • amqa (AM⁵A¹)
  • amƽa (AM⁵A¹!)
  • amh'a (AM⁶A¹)
  • amƅa (AM⁶A¹!)
  • an'a (AN¹A¹)
  • an'ga (AN¹GA¹)
  • an'ha (AN¹HA¹)
  • anha (AN¹HA¹)
  • anza (AN²A¹)
  • anƨa (AN²A¹!)
  • anja (AN³A¹)
  • anзa (AN³A¹!)
  • anxa (AN⁴A¹)
  • anчa (AN⁴A¹!)
  • anqa (AN⁵A¹)
  • anƽa (AN⁵A¹!)
  • anh'a (AN⁶A¹)
  • anƅa (AN⁶A¹!)
  • aŋ (AN₃¹!)
  • ang'a (AN₃¹A¹)
  • ang'ha (AN₃¹HA¹)
  • angha (AN₃¹HA¹)
  • angza (AN₃²A¹)
  • aŋƨa (AN₃²A¹!)
  • angja (AN₃³A¹)
  • aŋзa (AN₃³A¹!)
  • angxa (AN₃⁴A¹)
  • aŋчa (AN₃⁴A¹!)
  • angqa (AN₃⁵A¹)
  • aŋƽa (AN₃⁵A¹!)
  • angh'a (AN₃⁶A¹)
  • aŋƅa (AN₃⁶A¹!)
  • apza (AP²A¹)
  • apƨa (AP²A¹!)
  • apja (AP³A¹)
  • apзa (AP³A¹!)
  • apxa (AP⁴A¹)
  • apчa (AP⁴A¹!)
  • apqa (AP⁵A¹)
  • apƽa (AP⁵A¹!)
  • aph'a (AP⁶A¹)
  • apƅa (AP⁶A¹!)
  • ap'ha (AP⁷HA¹)
  • apha (AP⁷HA¹)
  • atza (AT²A¹)
  • atƨa (AT²A¹!)
  • atja (AT³A¹)
  • atзa (AT³A¹!)
  • atxa (AT⁴A¹)
  • atчa (AT⁴A¹!)
  • atqa (AT⁵A¹)
  • atƽa (AT⁵A¹!)
  • ath'a (AT⁶A¹)
  • atƅa (AT⁶A¹!)
  • at'ha (AT⁷HA¹)
  • atha (AT⁷HA¹)

local export = {}
local u = mw.ustring.char

-- Lua pattern matching a single UTF-8 encoded character: one ASCII or lead
-- byte followed by any number of continuation bytes.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"

-- Private-use code points appended after a base letter in the final sortkey
-- (e.g. "a" .. a for "ae") so that digraphs sort after the plain letter.
-- Only four are defined; the previously declared e, f and g had no
-- initialisers (always nil) and were never referenced.
local a, b, c, d = u(0xF000), u(0xF001), u(0xF002), u(0xF003)

-- Private-use placeholders that temporarily stand in for consonant clusters,
-- so that each cluster can be treated as one character by later patterns.
local b2 = u(0xF100)
local g2, g3 = u(0xF200), u(0xF201)
local m2, m4 = u(0xF300), u(0xF301)
local n2, n4, n6, n7, n8 = u(0xF400), u(0xF401), u(0xF402), u(0xF403), u(0xF404)

-- Characters stripped from the finished sortkey.
local remove_diacritics = "'" -- apostrophe

-- Unconditional tone letters -> tone digits. Letters from the old orthography
-- carry a trailing "!" marker.
local oneCharInit = {
	["z"] = "2",
	["ƨ"] = "2!",
	["j"] = "3",
	["з"] = "3!",
	["x"] = "4",
	["ч"] = "4!",
	["q"] = "5",
	["ƽ"] = "5!",
	["ƅ"] = "6!"
}

-- Two-letter consonant clusters -> single placeholder characters.
local twoCharsInit = {
	["by"] = b2,
	["gv"] = g2,
	["gy"] = g3,
	["mb"] = m2,
	["my"] = m4,
	["nd"] = n2,
	["ng"] = n4,
	["ŋv"] = n7,
	["ny"] = n8
}

-- Three-letter consonant clusters -> single placeholder characters.
local threeCharsInit = {
	["ngv"] = n6
}

-- Letters that are converted to a tone digit only in certain positions.
local conditionalTones1 = {
	["h"] = "6"
}

-- Final consonants that receive a tone digit when none is written.
local conditionalTones2 = {
	["m"] = "m1",
	["n"] = "n1",
	[n4] = n4 .. "1",
	["ŋ"] = "ŋ1",
	["k"] = "k7",
	["p"] = "p7",
	["t"] = "t7",
	["b"] = "b8",
	["d"] = "d8",
	["g"] = "g8"
}

-- Placeholders and non-ASCII letters -> their final sortkey form: a base
-- letter, an optional ordering character, and "!" for old-orthography letters.
local oneCharFinal = {
	["ə"] = "a" .. a .. "!",
	[b2] = "b" .. a,
	[g2] = "g" .. a,
	[g3] = "g" .. b,
	[m2] = "m" .. a,
	["ƃ"] = "m" .. a .. "!",
	[m4] = "m" .. b,
	[n2] = "n" .. a,
	["ƌ"] = "n" .. a .. "!",
	[n4] = "n" .. b,
	["ŋ"] = "n" .. b .. "!",
	[n6] = "n" .. c,
	[n7] = "n" .. c .. "!",
	[n8] = "n" .. d,
	["ɵ"] = "o" .. a .. "!",
	["ɯ"] = "w!"
}

-- Vowel digraphs -> their final sortkey form.
local twoCharsFinal = {
	["ae"] = "a" .. a,
	["oe"] = "o" .. a
}

-- Returns the wikitext of the page called `titleText`, or "" when the title is
-- invalid (mw.title.new returns nil) or the page has no content.
local function getPageContent(titleText)
	local title = titleText and mw.title.new(titleText)
	if not title then
		return ""
	end
	return title:getContent() or ""
end

-- Returns true if `content` contains a za-pron template call ending in
-- new_bor=1 or new_bor=y.
local function hasNewBor(content)
	return mw.ustring.match(content, "{{za%-pron|.*new_bor=1}}") ~= nil
		or mw.ustring.match(content, "{{za%-pron|.*new_bor=y}}") ~= nil
end

-- Old-orthography templates (as Lua patterns), in the order they are checked.
local oldOrthographyTemplates = { "za%-1957 spelling of", "za%-1957 orthography of" }

-- Returns the argument text of the first old-orthography template found in
-- `content`, or nil if there is none.
local function findModernSpelling(content)
	for _, template in ipairs(oldOrthographyTemplates) do
		local target = mw.ustring.match(content, "{{" .. template .. "|(.-)}}")
		if target then
			return target
		end
	end
	return nil
end

-- Generates a sortkey for `text`.
-- @param text  the term to generate a sortkey for; also used as a page title
--              to look up the new_bor parameter
-- @param lang  language code (not used by this function)
-- @param sc    script code (not used by this function)
-- @return      the upper-cased sortkey string
function export.makeSortKey(text, lang, sc)
	local origText = text
	text = mw.ustring.lower(text)
	
	-- convert any consonant clusters to single characters, which is necessary for later regexes, and unconditional tone letters to numbers
	for from, to in pairs(threeCharsInit) do
		text = text:gsub(from, to)
	end
	
	for from, to in pairs(twoCharsInit) do
		text = text:gsub(from, to)
	end
	
	text = text:gsub(UTF8_char, oneCharInit)
	
	-- conditionally convert any conditional tone letters to numbers (e.g. "h" can be a consonant or a tone letter)
	for from, to in pairs(conditionalTones1) do
		text = text:gsub(from .. "$", to)
		text = mw.ustring.gsub(text, from .. "([^1-8aeiouwəɵɯ])", to .. "%1")
	end
	
	-- conditionally add a tone number to any syllable-final consonants which do not have them
	for from, to in pairs(conditionalTones2) do
		text = text:gsub(from .. "$", to)
		text = mw.ustring.gsub(text, from .. "([^1-8aeiouwəɵɯ])", to .. "%1")
	end
	
	-- conditionally add a tone number to any syllable-final vowels which do not have them
	text = mw.ustring.gsub(text, "([^1-8%s%p])$", "%11")
	text = mw.ustring.gsub(text, "([1-8][" .. a .. "-" .. d .. "])1$", "%1")
	text = mw.ustring.gsub(text, "([aeiouwəɵɯ])([^1-8aeiouwəɵɯ][^1-8])", "%11%2")
	
	-- convert clusters and non-ASCII characters to final form, to achieve correct order
	for from, to in pairs(twoCharsFinal) do
		text = text:gsub(from, to)
	end
	
	text = text:gsub(UTF8_char, oneCharFinal)
	
	-- move "!" to the end and remove any duplicates, to ensure old orthography terms are sorted immediately after their new equivalents
	-- (one pass per "!" in the text; each pass moves the first "!" to the end)
	for _ in text:gmatch("!") do
		text = text:gsub("(!)(.+)", "%2%1")
	end
	text = text:gsub("!+", "!")
	
	-- if tone 5 is substituted for tone 1 in pronunciation, also substitute in sortkey (i.e. as though "q" were written)
	local page = getPageContent(origText)
	if hasNewBor(page) then
		text = mw.ustring.gsub(text, "1", "5")
	else
		-- if the page has an old orthography template, then check the modern orthography page and substitute if present there (i.e. as though "ƽ" were written)
		local target = findModernSpelling(page)
		if target and hasNewBor(getPageContent(target)) then
			text = mw.ustring.gsub(text, "1", "5" .. a)
		end
	end
	
	-- decompose, remove appropriate diacritics, then recompose again
	local stripped = mw.ustring.gsub(mw.ustring.toNFD(text), "[" .. remove_diacritics .. "]", "")
	return mw.ustring.upper(mw.ustring.toNFC(stripped))
end

-- Language object for Zhuang ("za"), obtained from Module:languages.
local za = require("Module:languages").getByCode("za")

-- Passes `text` and the Zhuang language object to tag_text from
-- Module:script utilities and returns its result.
local function tag(text)
	local scriptUtilities = require("Module:script utilities")
	return scriptUtilities.tag_text(text, za)
end

-- Display substitutions: tone digits -> superscript digits.
local showsubst1 = {
	["0"] = "⁰",
	["1"] = "¹",
	["2"] = "²",
	["3"] = "³",
	["4"] = "⁴",
	["5"] = "⁵",
	["6"] = "⁶",
	["7"] = "⁷",
	["8"] = "⁸"
}

-- Display substitutions: old-orthography tone markers and letter + ordering
-- character pairs -> readable subscript/superscript forms.
local showsubst2 = {
	["2!"] = "²ᵃ",
	["3!"] = "³ᵃ",
	["4!"] = "⁴ᵃ",
	["5!"] = "⁵ᵃ",
	["6!"] = "⁶ᵃ",
	["A" .. a] = "A₂",
	["A" .. a .. "!"] = "A₂ₐ",
	["B" .. a] = "B₂",
	["G" .. a] = "G₂",
	["G" .. b] = "G₃",
	["M" .. a] = "M₂",
	["M" .. a .. "!"] = "M₂ₐ",
	["M" .. b] = "M₃",
	["N" .. a] = "N₂",
	["N" .. a .. "!"] = "N₂ₐ",
	["N" .. b] = "N₃",
	["N" .. b .. "!"] = "N₃ₐ",
	["N" .. c] = "N₄",
	["N" .. c .. "!"] = "N₄ₐ",
	["N" .. d] = "N₅",
	["O" .. a] = "O₂",
	["O" .. a .. "!"] = "O₂ₐ",
	["W!"] = "Wₐ"
}

-- Builds the sortkey for `term` and rewrites it into its display form by
-- applying the showsubst2 substitutions followed by the showsubst1 ones.
local function displaySortkey(term)
	local sc = za:findBestScript(term):getCode()
	local sortkey = export.makeSortKey(term, "za", sc)
	for from, to in pairs(showsubst2) do
		sortkey = mw.ustring.gsub(sortkey, from, to)
	end
	for from, to in pairs(showsubst1) do
		sortkey = mw.ustring.gsub(sortkey, from, to)
	end
	return sortkey
end

-- Renders each positional argument of `frame` as a list item showing its
-- display sortkey, followed by the tagged term on an indented line.
-- @param frame  frame object; its positional args are the terms to show
-- @return       the concatenated wikitext
function export.showSortkey(frame)
	local output = {}
	
	for _, word in ipairs(frame.args) do
		local example = "\n* <code>" .. displaySortkey(word) .. "</code>\n: " .. tag(word)
		table.insert(output, example)
	end
	
	return table.concat(output)
end

-- Sorts the positional arguments of `frame` by their sortkeys and renders
-- them as a list, each term followed by its display sortkey in brackets.
-- @param frame  frame object; its positional args are the terms to sort
-- @return       the concatenated wikitext
function export.showSorting(frame)
	local terms = {}
	
	for _, term in ipairs(frame.args) do
		table.insert(terms, term)
	end
	
	-- memoize so that each term's sortkey is not recomputed on every comparison
	local makeSortKey = require("Module:fun").memoize(export.makeSortKey)
	local function comp(term1, term2)
		return makeSortKey(term1) < makeSortKey(term2)
	end
	
	table.sort(terms, comp)
	
	-- terms is a plain sequence, so replace each entry in place, in order
	for i, term in ipairs(terms) do
		terms[i] = "\n* " .. tag(term) .. " (<code>" .. displaySortkey(term) .. "</code>)"
	end
	
	return table.concat(terms)
end

return export
"https://si.wiktionary.org/w/index.php?title=Module:za-sortkey&oldid=163789" වෙතින් සම්ප්‍රවේශනය කෙරිණි