Bước tới nội dung

Mô đun:Ancient Greek

Bách khoa toàn thư mở Wikipedia
-- Transliteration of Ancient Greek text into the Latin alphabet, following
-- either the "Wiktionary" or the "ALA-LC" conventions.
local p = {}

-- Combining diacritics. All processing below is done on NFD-normalized text,
-- so every diacritic appears as a separate combining character.
local macron = mw.ustring.char(0x304)
local breve = mw.ustring.char(0x306)
local rough = mw.ustring.char(0x314)
local smooth = mw.ustring.char(0x313)
local diaeresis = mw.ustring.char(0x308)
local acute = mw.ustring.char(0x301)
local grave = mw.ustring.char(0x300)
local circumflex = mw.ustring.char(0x342)
local Latin_circumflex = mw.ustring.char(0x302)
local subscript = mw.ustring.char(0x345)
-- Pattern: macron, optional diaeresis, Latin circumflex.
local macron_circumflex = macron .. diaeresis .. '?' .. Latin_circumflex

-- Letters before which a gamma token is transliterated as <n>.
local is_velar = { ['κ'] = true, ['γ'] = true, ['χ'] = true, ['ξ'] = true, }

-- Byte pattern matching one UTF-8 encoded character.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"
local basic_Greek = "[\206-\207][\128-\191]" -- excluding first line of Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ

-- Maps a single character to a table describing its properties.
local info = {}

-- The tables are shared among different characters so that they can be checked
-- for equality if needed, and to use less space.
local vowel = { vowel = true, diacritic_seat = true }
local iota = { vowel = true, diacritic_seat = true, offglide = true }
local upsilon = { vowel = true, diacritic_seat = true, offglide = true }
-- Technically rho is only a seat for rough or smooth breathing.
local rho = { consonant = true, diacritic_seat = true }
local consonant = { consonant = true }
local diacritic = { diacritic = true }
-- Needed for equality comparisons.
local breathing = { diacritic = true }

-- Registers the property table `t` in `info` for every character in
-- `characters`, which is either a string (split into UTF-8 characters) or an
-- array of single-character strings.
local function add_info(characters, t)
	if type(characters) == "string" then
		for character in string.gmatch(characters, UTF8_char) do
			info[character] = t
		end
	else
		for _, character in ipairs(characters) do
			info[character] = t
		end
	end
end

add_info({ macron, breve,
		diaeresis,
		acute, grave, circumflex,
		subscript,
	}, diacritic)

add_info({rough, smooth}, breathing)
add_info("ΑΕΗΟΩαεηοω", vowel)
add_info("Ιι", iota)
add_info("Υυ", upsilon)
add_info("ΒΓΔΖΘΚΛΜΝΞΠΡΣΤΦΧΨϜϘϺϷͶϠβγδζθκλμνξπρσςτφχψϝϙϻϸͷϡ", consonant)
-- Must come after the consonant list so that rho gets its own table.
add_info("Ρρ", rho)

-- Any character without an entry yields this empty table, so that fields of
-- info[character] can always be indexed safely.
local not_recognized = {}
setmetatable(info, { __index =
	function()
		return not_recognized
	end
})

-- Wraps `str` in curly double quotation marks.
local function quote(str)
	return "“" ..  str .. "”"
end

-- Transliterations shared by both systems. System-specific entries are looked
-- up through the metatable installed in `p.transliterate`.
local correspondences = {
	-- Vowels
	["α"] = "a",
	["ε"] = "e",
	["η"] = "e" .. macron,
	["ι"] = "i",
	["ο"] = "o",
	["υ"] = "u",
	["ω"] = "o" .. macron,

	-- Consonants
	["β"] = "b",
	["γ"] = "g",
	["δ"] = "d",
	["ζ"] = "z",
	["θ"] = "th",
	["κ"] = "k",
	["λ"] = "l",
	["μ"] = "m",
	["ν"] = "n",
	["ξ"] = "x",
	["π"] = "p",
	["ρ"] = "r",
	["σ"] = "s",
	["ς"] = "s",
	["τ"] = "t",
	["φ"] = "ph",
	["ψ"] = "ps",
	
	-- Archaic letters
	["ϝ"] = "w",
	["ϻ"] = "ś",
	["ϙ"] = "q",
	["ϡ"] = "š",
	["ͷ"] = "v",
	
	-- Diacritics
	[smooth] = '',
	[rough] = '', -- h is added below in the `transliterate` function.
	[breve] = '',
}

-- Entries used only when the system is "ALA-LC".
local ALA_LC = {
	["χ"] = "ch",
	[acute] = '',
	[grave] = '',
	[circumflex] = '',
	[subscript] = '',
	[diaeresis] = '',
	[macron] = '',
}

-- Entries used for any system other than "ALA-LC".
local Wiktionary_transliteration = {
	["χ"] = "kh",
	[circumflex] = Latin_circumflex,
	[subscript] = 'i',
}

-- Sets the __index metamethod of `t` to `index_metamethod`, creating a
-- metatable for `t` if it does not have one yet.
local function add_index_metamethod(t, index_metamethod)
	local mt = getmetatable(t)
	if not mt then
		mt = {}
		setmetatable(t, mt)
	end
	mt.__index = index_metamethod
end

--[=[
		This breaks a word into meaningful "tokens", which are
		individual letters or diphthongs with their diacritics.
		Used by [[Module:grc-accent]] and [[Module:grc-pronunciation]].

		Returns an array of token strings. Raises an error when a
		diacritic follows a character that cannot carry it, or when the
		text starts with a diacritic.
--]=]
local function tokenize(text)
	local tokens, prev_info = {}, {}
	local token_i = 1
	local prev
	for character in string.gmatch(mw.ustring.toNFD(text), UTF8_char) do
		local curr_info = info[character]
		-- Split vowels between tokens if not a diphthong.
		if curr_info.vowel then
			if prev and (not (curr_info.offglide and prev_info.vowel)
					-- υυ → υ, υ
					-- ιυ → ι, υ
					or prev_info.offglide and curr_info == upsilon) then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		elseif curr_info.diacritic then
			if not prev then
				-- Previously this case failed with an unrelated
				-- "attempt to concatenate a nil value" error.
				error("The diacritic " .. quote("◌" .. character) .. " has no preceding character to sit on.")
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
			if prev_info.vowel or prev_info.diacritic then
				if character == diaeresis then
					-- Current token is vowel, vowel, possibly other diacritics,
					-- and a diaeresis.
					-- Split the current token into two:
					-- the first letter, then the second letter plus any diacritics.
					local previous_vowel, vowel_with_diaeresis = string.match(tokens[token_i], "^(" .. basic_Greek .. ")(" .. basic_Greek .. ".+)")
					if previous_vowel then
						tokens[token_i], tokens[token_i + 1] = previous_vowel, vowel_with_diaeresis
						token_i = token_i + 1
					end
				end
			elseif prev_info == rho then
				if curr_info ~= breathing then
					-- Raise instead of returning the message: callers expect
					-- a table of tokens and would otherwise fail on a string.
					error(string.format("The character %s cannot have the accent %s on it.", prev, "◌" .. character))
				end
			else
				error("The character " .. quote(prev) .. " cannot have a diacritic on it.")
			end
		elseif curr_info == rho then
			-- Keep a rho in the current token only when that token is a rho
			-- immediately followed by a breathing mark.
			if prev and not (prev_info == breathing and info[string.match(tokens[token_i], "^" .. basic_Greek)] == rho) then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		else
			if prev then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		end
		prev = character
		prev_info = curr_info
	end
	return tokens
end

-- Transliterates Ancient Greek `text` into Latin script.
--
-- `system` selects the conventions: "ALA-LC" uses the ALA_LC table, anything
-- else (including nil) uses the Wiktionary table. Returns the transliterated
-- string. Errors raised by `tokenize` propagate to the caller.
function p.transliterate(text, system)
	add_index_metamethod(correspondences, system == "ALA-LC" and ALA_LC or Wiktionary_transliteration)
	
	-- A lone spacing rough breathing is transliterated as h.
	if text == '῾' then
		return 'h'
	end
	
	text = mw.ustring.toNFD(text)
	
	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except after an ASCII alphanumeric character (to avoid converting
		semicolons in HTML entities).
	--]]
	text = mw.ustring.gsub(text, "([^A-Za-z0-9])[;" .. mw.ustring.char(0x37E) .. "]", "%1?")
	
	-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
	text = text:gsub("·", ";")
	
	local tokens = tokenize(text)

	--now read the tokens
	local output = {}
	-- ipairs guarantees the tokens are visited in order; pairs does not.
	for i, token in ipairs(tokens) do
		-- substitute each character in the token for its transliteration
		local translit = string.gsub(mw.ustring.lower(token), UTF8_char, correspondences)
		
		if token == 'γ' and is_velar[tokens[i + 1]] then
			-- γ before a velar should be <n>
			translit = 'n'
		elseif token == 'ρ' and tokens[i - 1] == 'ρ' then
			-- ρ after ρ should be <rh>
			translit = 'rh'
		elseif system == "Wiktionary" and mw.ustring.find(token, '^[αΑ].*' .. subscript .. '$') then
			-- add macron to ᾳ
			translit = mw.ustring.gsub(translit, '([aA])', '%1' .. macron)
		end
		
		if token:find(rough) then
			if mw.ustring.find(token, '[Ρρ]') then
				translit = translit .. 'h'
			else -- vowel
				translit = 'h' .. translit
			end
		end
		
		-- In ALA-LC, an upsilon token that contains no iota is written y.
		if system == "ALA-LC" and mw.ustring.find(token, '^[υΥ][^ιΙ]*$') then
			translit = translit:gsub('u', 'y'):gsub('U', 'Y')
		end
		
		-- Remove macron from a vowel that has a circumflex.
		if mw.ustring.find(translit, macron_circumflex) then
			translit = translit:gsub(macron, '')
		end
		
		-- Capitalize first character of transliteration.
		if token ~= mw.ustring.lower(token) then
			translit = mw.ustring.gsub(translit, "^.", mw.ustring.upper)
		end
		
		table.insert(output, translit)
	end
	
	return table.concat(output)
end

-- Template entry point. Reads the text from the first positional argument of
-- the frame (or of its parent frame) and the system from |system=, defaulting
-- to "Wiktionary". Returns the transliteration wrapped in an italic
-- <span lang="grc-Latn">. Raises an error for an unrecognized |system=.
function p.translit(frame)
	local text = frame.args[1] or frame:getParent().args[1]
	
	local system = frame.args.system
	if system == nil or system == "" then
		system = "Wiktionary"
	elseif not (system == "ALA-LC" or system == "Wiktionary") then
		error('Transliteration system in |system= not recognized; choose between "ALA-LC" and "Wiktionary"')
	end
	
	local transliteration = p.transliterate(text, system)
	return '<span title="Ancient Greek transliteration" lang="grc-Latn"><i>' .. transliteration .. '</i></span>'
end

-- Template entry point returning the transliteration without any markup.
-- NOTE(review): no system is passed, so the Wiktionary table is used but the
-- `system == "Wiktionary"` macron step for ᾳ in `p.transliterate` is skipped;
-- confirm whether that difference from `p.translit` is intended.
function p.bare_translit(frame)
	return p.transliterate(frame.args[1] or frame:getParent().args[1])
end

return p