Bước tới nội dung

Mô đun:Ko-translit

Bách khoa toàn thư mở Wikipedia

local p = {}

local find = mw.ustring.find
local gsub = mw.ustring.gsub
local m_data = require('Module:Ko-translit/data')
local m_utils = require('Module:Ko-utils')
local get_args = require('Module:Arguments').getArgs

--[[
IMPORTANT NOTE before editing this module:
1. Make sure that you use a font that displays the following characters differently, and that you know the differences of them:
	ᄀ (U+1100)
	ᆨ (U+11A8)
	ㄱ (U+3131)
2. When dealing with decomposed Hangul,
	a. [ᄀ-ᄒ] should not be directly followed by [ᅡ-ᅵ] because MediaWiki uses Unicode Normalization Form C (NFC), which converts any sequence of [ᄀ-ᄒ][ᅡ-ᅵ] into a precomposed character; write ᄀ[ᅡ] or ᄀ(ᅡ)
	b. ᄀ[ᅡ] or ᄀ(ᅡ) at the end of a pattern is equivalent to not just 가 but [가-갛] in precomposed form. To match a syllabic block without a final consonant at the end of a pattern, use both vowel + [^ᆨ-ᇂ] and vowel + $
		For example, to only match 가 (and not [각-갛]) at the end of a pattern, use both ᄀ[ᅡ][^ᆨ-ᇂ] and ᄀ[ᅡ]$
--]]

-- Applies a list of substitutions to text, in order.
-- replacements: sequence of { pattern, replacement } pairs (entry[1], entry[2])
-- returns the text after every pair has been applied with mw.ustring.gsub
local function gsub_iterate(text, replacements)
	-- the parameter is deliberately not called "table" so that the standard table library is not shadowed
	for _, entry in ipairs(replacements) do
		text = gsub(text, entry[1], entry[2])
	end
	return text
end

-- Repeats one substitution until a pass changes nothing.
-- The number of passes is capped at the length (in code points) of the text as it was on entry;
-- once a pass makes zero substitutions, further passes could not change anything, so the loop stops early.
local function gsub_repeat(text, pattern, repl)
	for _ = 1, mw.ustring.len(text) do
		local count
		text, count = gsub(text, pattern, repl)
		if count == 0 then
			break
		end
	end
	return text
end

-- Strips bold/italic markup, HTML tags (except br), wikilink brackets and strip markers.
local function remove_links_and_markup(text)
	-- these either are unnecessary or interfere with assimilation

	-- remove bold/italic
	-- it is not impossible to allow bold/italic when it does not interfere with assimilation, but determining when to allow or disallow that adds complication for little practical gain
	text = gsub(text, "'''", "")
	text = gsub(text, "''", "")

	-- remove HTML tags (except br)
	-- br is parked as &#10; so that the generic tag removal below does not delete it
	text = gsub(text, "<[Bb][Rr] */?>", "&#10;")
	text = gsub(text, "</?[A-Za-z][^>]->", "")
	text = gsub(text, "&#10;", "<br>")

	-- remove wikilinks
	text = gsub(text, "%[%[[^%|]+%|(..-)%]%]", "%1") -- piped link: keep only the label
	text = gsub(gsub(text, "%[%[", ""), "%]%]", "") -- unpiped link: drop the brackets

	text = mw.text.killMarkers(text)

	return text
end

-- First validation step, shared by RR and MR.
-- Raises an error for invalid input; returns "N/A" when the input contains Hangul that the
-- pattern below treats as unsupported; otherwise returns the text with escaped special
-- characters replaced through m_data.escaped_to_html_enc.
local function disallow_invalid_input(text)
	-- very first step
	-- Hangul status: precomposed (한)

	-- input must contain Hangul (a missing argument is reported the same way)
	if text == nil or not m_utils.contains_hangul(text) then
		error("Input must contain Hangul")
	end

	-- no direct insertion of reference or footnote
	if m_utils.contains_reference(text) then
		error("Input cannot contain references")
	end

	-- if input contains Hangul not supported by RR and MR, change text to "N/A" and skip everything
	if find(text, "[ᄓ-ᅠᅶ-ᆧᇃ-ᇿ〮〯ㅤ-ㆎꥠ-꥿ힰ-퟿]") then
		text = "N/A"
		return text
	end

	text = gsub_iterate(text, m_data.escaped_to_html_enc) -- replacing escaped special chars with placeholders

	-- various validations of input
	if find(text, "[ᄀ-ᄒ]") or find(text, "[ᅡ-ᅵᆨ-ᇂ]") then
		error("Do not input conjoining Hangul jamo directly")
	elseif find(text, "`%*") then
		error("Use *` instead of `*")
	elseif find(text, "@%*") then
		error("Use *@ instead of @*")
	elseif find(text, "%^[^가-힣]") then
		error("^ must be immediately followed by Hangul syllabic block")
	elseif find(text, "[^%*0-9A-Za-z]`")
		or find(text, "[^0-9A-Za-z]%*`")
		or find(text, "`[^가-깋다-딯바-빟자-짛]") then
		error("Found invalid sequence containing `")
	elseif find(text, "[^%*ㄹ가-힣]@")
		or find(text, "[^가-힣]%*@")
		or find(text, "%*@[^가-깋다-딯바-빟자-짛]")
		or find(text, "ㄹ@[^가-깋다-딯바-빟사-싷자-짛]")
		or find(text, "@[^가-깋다-딯라-맇바-빟사-싷야-얳여-옣요-욯유-윶윸-윻이-잍잏자-짛하-힣]") then
		error("Found invalid sequence containing @")
	elseif find(text, "[^가-힣]%$")
		or find(text, "%$[^아어에엔엘여요으은을음읍의이인일임입]") then
		error("Found invalid sequence containing $")
	elseif find(text, "%%$") then
		error("Remove final %")
	elseif find(text, "[ _][ _]") then
		error("No two or more consecutive space characters")
	elseif find(text, "^[%$%*@_`]")
		or find(text, "^%%[^_가-힣]")
		or find(text, "[ _]%*")
		or find(text, "%*[ %*%-_]")
		or find(text, "%-%*")
		or find(text, "[﷐-﷒]") -- the noncharacters used internally by parse_name must not come from the user
		or find(text, "%%_$")
		or find(text, "[%$%*@%^`]$") then
		error("Invalid input")
	end

	return text
end

-- Re-checks spacing and symbol placement once links and markup are gone.
-- Raises an error on failure; returns text unchanged otherwise.
local function check_invalid_seq(text)
	-- validity check after removing links and markups (before decomposing Hangul)
	-- Hangul status: precomposed (한)
	if find(text, "[ _][ _]") then
		error("No two or more consecutive space characters")
	elseif find(text, "^[%$%*@_`]")
		or find(text, "[ _]%*")
		or find(text, "%*[ %*%-_]")
		or find(text, "%-%*")
		or find(text, "[%$%*@%^_`]$") then
		error("Invalid input")
	end

	return text
end

-- Checks @ and $ against the surrounding jamo once Hangul has been decomposed.
-- Raises an error on failure; returns text unchanged otherwise.
local function check_invalid_seq_decomposed_hangul(text)
	-- validity check after decomposing Hangul
	-- Hangul status: decomposed (ᄒ+ᅡ+ᆫ)
	if find(text, "[ᆨ-ᆪᆬ-ᆮᆴ-ᆶᆸᆹᆻᆽ-ᇂ]%*?﷐?@﷐?[ᄀᄃᄇᄉᄌ]")
		or find(text, "ᆰ%*?﷐?@﷐?[ᄀ-ᄊᄌ-ᄑ]")
		or find(text, "ᆲ﷐?@﷐?[ᄀ-ᄊᄌ-ᄑ]")
		or find(text, "ᆺ%*@[ᄀᄇ]")
		or find(text, "ᆺ%*?﷐?@﷐?[ᄁ-ᄆᄈ-ᄊᄌ-ᄑ]")
		or find(text, "[ᅡ-ᅵᆨ-ᆪᆬ-ᇂ]﷐?@﷐?ᄅ")
		or find(text, "[ᅡ-ᅵ]﷐?@﷐?ᄋ")
		or find(text, "[ᅡ-ᅵᆫ-ᆭᆯᆱ-ᆷᆼ]﷐?@﷐?ᄒ") then
		error("Found invalid sequence containing @")
	elseif find(text, "[ᅡ-ᅵᆨᆫᆭ-ᆯᆶ-ᆸᆼ]﷐?%$") then
		error("Found invalid sequence containing $")
	end

	return text
end

-- Handles the %...% name syntax: validates each name, splits it into surname and given name,
-- and inserts the ^, @ and U+FDD0 markers that the RR/MR pipelines consume later.
-- Raises an error for malformed names; returns the text with U+FDD1/U+FDD2 removed again.
local function parse_name(text)
	-- processing people names
	-- Hangul status: precomposed (한)
	local hanja_readings_final_L = "갈걸결골괄굴궐귤글길날녈놜눌닐달돌랄렬률말멸몰물밀발벌별불살설솔술슬실알얼열올왈울월율을일절졸줄즐질찰철촬출칠탈팔필할헐혈홀활훌휼흘힐"
	local hanja_readings_init_DSJ = "다단달담답당대댁덕도독돈돌동두둔득등사삭산살삼삽상새색생서석선설섬섭성세소속손솔송쇄쇠수숙순술숭쉬슬습승시식신실심십자작잔잠잡장재쟁저적전절점접정제조족존졸종좌죄주죽준줄중즉즐즙증지직진질짐집징"

	-- note: internally uses 3 noncharacters
	-- ﷐ (U+FDD0): mostly for given name in RR
	-- ﷑ (U+FDD1): marks beginning of name
	-- ﷒ (U+FDD2): marks end of name

	-- change % to U+FDD1 and U+FDD2 (end of string also terminates name mode)
	text = gsub(text, "%%([^%%]*)%%", "﷑%1﷒")
	text = gsub(text, "%%([^%%]*)$", "﷑%1﷒")

	-- disallow invalid input for name
	-- NOTE(review): the "Invalid character in name" class below also rejects @ and $, so the later
	-- branches that look for @/$ inside a name appear unreachable for user input; confirm which is intended
	if find(text, "﷑_?﷒") then
		error("Name cannot be empty")
	elseif find(text, "﷑[^﷑﷒]*[^가-힣_ ][^﷑﷒]*﷒") then
		error("Invalid character in name")
	elseif find(text, "﷑ ") then
		error("Name cannot begin with space")
	elseif find(text, " ﷒") then
		error("Name cannot end with space")
	elseif find(text, "﷑[^﷒]*[ _][^﷒]*[ _][^﷒]*﷒") then
		error("No more than two components in name")
	elseif find(text, "﷑[가-힣]_") then
		error("No _ after one-syllable surname")
	elseif find(text, "﷑[^﷒]*[" .. hanja_readings_final_L .. "]@[" .. hanja_readings_init_DSJ .. "][^﷒]*﷒") then
		error("Contains unnecessary @ in name") -- see below
	end

	-- separate surname and given name
	-- if input contains _ or space, separate there
	text = gsub(text, "﷑([가-힣%$@]+)_﷒", "﷑^%1_﷒") -- for surname-only string
	text = gsub(text, "﷑_([가-힣%$@]+)﷒", "﷑_^%1﷒") -- for mononym
	text = gsub(text, "﷑([가-힣%$@]+)[ _]([가-힣%$@]+)﷒", "﷑^%1_^%2﷒")
	-- otherwise, separate after first syllabic block
	text = gsub(text, "﷑([가-힣])﷒", "﷑^%1_﷒") -- for surname-only string
	text = gsub(text, "﷑([가-힣])([가-힣%$@]+)﷒", "﷑^%1_^%2﷒")

	-- check invalid input after separating surname and given name
	if find(text, "﷑[^﷒]*_%^[%$@][^﷒]*﷒") then
		error("No @ or $ between surname and given name")
	end

	-- tensification of ㄹ + {ㄷ, ㅅ, ㅈ} (needed for MR; e.g. 홍길동 [홍길똥], 을지문덕 [을찌문덕])
	-- does not occur when same syllable is repeated (e.g. 구구절절 [구구절절], not [구구절쩔]); just using U+FDD0 here too
	-- each pass handles at most one occurrence per name, hence the repetition
	text = gsub_repeat(text, "﷑([^﷒]*)([달돌살설솔술슬실절졸줄즐질])%2([^﷒]*)﷒", "﷑%1%2﷐%2%3﷒")
	-- now apply tensification
	text = gsub_repeat(text, "﷑([^﷒]*)([" .. hanja_readings_final_L .. "])([" .. hanja_readings_init_DSJ .. "])([^﷒]*)﷒", "﷑%1%2@%3%4﷒")

	-- insert U+FDD0 in given name (needed for RR; e.g. 한복남 Han Boknam, not Han Bongnam)
	text = gsub_repeat(text, "﷑([^﷒]*)_%^([^﷒]*)([가-힣%$@])([가-힣%$@])([^﷒]*)﷒", "﷑%1_^%2%3﷐%4%5﷒")

	-- remove _ which was needed for surname-only string and mononym
	text = gsub(text, "_﷒", "﷒")
	text = gsub(text, "﷑_%^", "﷑^")

	text = gsub(text, "[﷑﷒]", "") -- remove U+FDD1 and U+FDD2

	return text
end

-- Last step shared by RR and MR: sanity check, restore escaped characters, handle empty result.
local function final_processing(text)
	-- final processing for RR and MR

	-- result should not contain Hangul
	if m_utils.contains_hangul(text) then
		error("Result contains Hangul; debugging required")
	end

	text = gsub_iterate(text, m_data.html_enc_to_ascii) -- convert HTML encodings back to ASCII

	-- if result is nothing (e.g. when input is just ㅇ)
	if text == "" then
		text = "—"
	end

	return text
end

-- Convert to Revised Romanization
-- Entry point for #invoke; arguments are read through Module:Arguments.
function p.rr(frame)
	return p._rr(get_args(frame))
end

-- Convert to Revised Romanization (callable from other Lua modules).
-- args[1]: Hangul text with the special markup characters validated above
-- returns the romanized string, or "N/A" when disallow_invalid_input reports unsupported Hangul
function p._rr(args)
	local text = args[1]

	text = disallow_invalid_input(text)

	if text == "N/A" then
		return text
	end

	text = parse_name(text)
	text = remove_links_and_markup(text)
	text = check_invalid_seq(text)

	text = gsub(text, "`", "") -- ignore ` (only needed for MR; not needed for RR)
	text = gsub_iterate(text, m_data.enclosed_hangul)
	text = m_utils.decompose_hangul(text) -- decompose Hangul
	text = check_invalid_seq_decomposed_hangul(text)

	-- the order of the following steps matters; each one consumes markers left by the previous ones
	text = gsub(text, "%*", "-") -- * for additional hyphen in romanization only
	text = gsub_iterate(text, m_data.preprocessing)
	text = gsub(text, "ᇂᄉ", "ᄉ") -- convert remaining ㅎ combination
	text = gsub_iterate(text, m_data.neutralize_syl_final_consonants) -- neutralization of syl-final consonants
	text = gsub_iterate(text, m_data.at_dollar_irregularities) -- @ for ㄴㄹ pronounced [ㄴㄴ], $ for 절음 법칙
	text = gsub_iterate(text, m_data.at_irregularities_additional_rr) -- @ for ㄱㅎ/ㄷㅎ/ㅂㅎ → k/t/p
	text = gsub_iterate(text, m_data.consonant_assimilations) -- consonant assimilations
	text = gsub(text, "ᆯᄅ", "ᆯl") -- ㄹㄹ is ll
	text = gsub_iterate(text, m_data.drop_y) -- drop y after {ㅈ, ㅉ, ㅊ}
	text = gsub_iterate(text, m_data.vowels_rr) -- replace Hangul vowels with romanized text
	text = gsub(text, "'([ᄋㅇ]+)'", "&#39;%1&#39;") -- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later)
	text = gsub_iterate(text, m_data.single_consonants_rr) -- replace single consonants with romanized text
	text = gsub(text, "﷐", "") -- now remove U+FDD0 that was needed for converting each syllabic block in given name separately (e.g. 한복남 Han Boknam, not Han Bongnam)

	-- ^ for capitalization
	text = gsub(text, "%^[a-eg-km-pr-uwy]", mw.ustring.upper)
	text = gsub(text, "%^", "")

	text = final_processing(text)

	return text
end

-- Convert to McCune–Reischauer
-- Entry point for #invoke; arguments are read through Module:Arguments.
function p.mr(frame)
	return p._mr(get_args(frame))
end

-- Convert to McCune–Reischauer (callable from other Lua modules).
-- args[1]: Hangul text with the special markup characters validated above
-- returns the romanized string, or "N/A" when disallow_invalid_input reports unsupported Hangul
function p._mr(args)
	local text = args[1]

	text = disallow_invalid_input(text)

	if text == "N/A" then
		return text
	end

	text = parse_name(text)
	text = gsub(text, "﷐", "") -- remove U+FDD0 (only needed for RR; not needed for MR)
	text = remove_links_and_markup(text)
	text = check_invalid_seq(text)

	text = gsub_iterate(text, m_data.enclosed_hangul)
	text = m_utils.decompose_hangul(text) -- decompose Hangul
	text = check_invalid_seq_decomposed_hangul(text)

	-- the order of the following steps matters; each one consumes markers left by the previous ones
	text = gsub(text, "([ᄀᄁᄃ-ᄊᄌ-ᄑ])ᅴ", "%1ᅵ") -- syl-init consonant + ㅢ → syl-init consonant + ㅣ (except 의, 늬, 희)
	text = gsub_iterate(text, m_data.preprocessing)
	text = gsub_iterate(text, m_data.before_neutralizing_syl_final_consonants_mr) -- should be done before neutralization of syl-final consonants
	text = gsub_iterate(text, m_data.neutralize_syl_final_consonants) -- neutralization of syl-final consonants
	text = gsub(text, "([ᅡ-ᅵᆫᆷᆼ])@ᄉ", "%1ᄊ") -- @ for tensification
	text = gsub_iterate(text, m_data.at_dollar_irregularities) -- @ for ㄴㄹ pronounced [ㄴㄴ], $ for 절음 법칙
	text = gsub_iterate(text, m_data.gdbj_mr) -- cases where ㄱ, ㄷ, ㅂ, ㅈ become voiced consonants
	text = gsub(text, "ᆯ%*ᄅ", "ᆯ-l") -- ㄹ-ㄹ should probably be l-l rather than l-r
	text = gsub(text, "%*", "-") -- * for additional hyphen in romanization only
	text = gsub(text, "@", "")

	-- consonant assimilations
	text = gsub_iterate(text, m_data.consonant_assimilations)
	text = gsub_iterate(text, m_data.consonant_assimilations_additional_mr)

	text = gsub_iterate(text, m_data.drop_y) -- drop y after {ㅈ, ㅉ, ㅊ}
	text = gsub_iterate(text, m_data.vowels_mr) -- replace Hangul vowels with romanized text
	text = gsub(text, "([ao])ᄋe", "%1ë") -- ㅏ에 (aë) and ㅗ에 (oë)
	text = gsub(text, "'([ᄋㅇ]+)'", "&#39;%1&#39;") -- to prevent input like 'ㅇ' (with ') from becoming italic markup (as [ᄋㅇ] is simply removed later)
	text = gsub_iterate(text, m_data.single_consonants_mr) -- replace single consonants with romanized text

	-- replace ' with &#39; when followed by another ' or at end of string (to avoid possible clashes with bold/italic markup)
	text = gsub(text, "([hkpt])''", "%1&#39;'")
	text = gsub(text, "([hkpt])'$", "%1&#39;")

	-- ^ for capitalization
	text = gsub(text, "%^[acehikm-pr-uwyŏŭ]", mw.ustring.upper)
	text = gsub(text, "%^", "")

	text = final_processing(text)

	return text
end

-- Removing special chars (except for escaped ones)
-- Entry point for #invoke; arguments are read through Module:Arguments.
function p.clean_hangul(frame)
	return p._clean_hangul(get_args(frame))
end

-- Removing special chars (except for escaped ones); callable from other Lua modules.
-- args[1]: Hangul text that may contain the special markup characters $ % * @ ^ _ `
-- returns the text with unescaped special characters removed and escaped ones restored
function p._clean_hangul(args)
	local text = args[1]

	-- input must contain Hangul (a missing argument is reported the same way)
	if text == nil or not m_utils.contains_hangul(text) then
		error("Input must contain Hangul")
	end

	-- no direct insertion of reference or footnote
	if m_utils.contains_reference(text) then
		error("Input cannot contain references")
	end

	-- symbol should not appear within single syllabic block
	if find(text, "[ᄀ-ᅟꥠ-ꥼ][%$%%%*@%^_`][ᅠ-ᆧힰ-ퟆ]")
		or find(text, "[ᅠ-ᆧ가개갸걔거게겨계고과괘괴교구궈궤귀규그긔기까깨꺄꺠꺼께껴꼐꼬꽈꽤꾀꾜꾸꿔꿰뀌뀨끄끠끼나내냐냬너네녀녜노놔놰뇌뇨누눠눼뉘뉴느늬니다대댜댸더데뎌뎨도돠돼되됴두둬뒈뒤듀드듸디따때땨떄떠떼뗘뗴또똬뙈뙤뚀뚜뚸뛔뛰뜌뜨띄띠라래랴럐러레려례로롸뢔뢰료루뤄뤠뤼류르릐리마매먀먜머메며몌모뫄뫠뫼묘무뭐뭬뮈뮤므믜미바배뱌뱨버베벼볘보봐봬뵈뵤부붜붸뷔뷰브븨비빠빼뺘뺴뻐뻬뼈뼤뽀뽜뽸뾔뾰뿌뿨쀄쀠쀼쁘쁴삐사새샤섀서세셔셰소솨쇄쇠쇼수숴쉐쉬슈스싀시싸쌔쌰썌써쎄쎠쎼쏘쏴쐐쐬쑈쑤쒀쒜쒸쓔쓰씌씨아애야얘어에여예오와왜외요우워웨위유으의이자재쟈쟤저제져졔조좌좨죄죠주줘줴쥐쥬즈즤지짜째쨔쨰쩌쩨쪄쪠쪼쫘쫴쬐쬬쭈쭤쮀쮜쮸쯔쯰찌차채챠챼처체쳐쳬초촤쵀최쵸추춰췌취츄츠츼치카캐캬컈커케켜켸코콰쾌쾨쿄쿠쿼퀘퀴큐크킈키타태탸턔터테텨톄토톼퇘퇴툐투퉈퉤튀튜트틔티파패퍄퍠퍼페펴폐포퐈퐤푀표푸풔풰퓌퓨프픠피하해햐햬허헤혀혜호화홰회효후훠훼휘휴흐희히ힰ-ퟆ][%$%%%*@%^_`][ᆨ-ᇿퟋ-ퟻ]") then
		error("Do not insert symbol within single syllabic block")
	end

	text = gsub_iterate(text, m_data.escaped_to_html_enc) -- replacing escaped special chars with placeholders
	text = gsub(text, "[%$%%%*@%^_`]", "") -- removing non-escaped special chars
	text = gsub_iterate(text, m_data.html_enc_to_ascii) -- convert HTML encodings back to ASCII

	text = mw.text.unstrip(text) -- unstripping test

	return text
end

return p