Modul:ru-translit

Dokumentasjon for modulen kan opprettast på Modul:ru-translit/dok
local export = {}

local u = mw.ustring.char
local rfind = mw.ustring.find
local rsub = mw.ustring.gsub -- WARNING: Don't return this directly in a function, or surround in parens
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local ulower = mw.ustring.lower
local usub = mw.ustring.sub

local GR = u(0x0300) -- grave =  ̀
local TEMP_G = u(0xFFF1) -- substitute to preserve g from changing to v

local function ine(x) -- if not empty
	if x == "" then return nil else return x end
end

-- In this table, we now map Cyrillic е and э to je and e, and handle the
-- post-consonant version (plain e and ɛ) specially.
local tab = {
	["А"]="A", ["Б"]="B", ["В"]="V", ["Г"]="G", ["Д"]="D", ["Е"]="Je", ["Ё"]="Jó", ["Ж"]="Zj", ["З"]="Z", ["И"]="I", ["Й"]="J",
	["К"]="K", ["Л"]="L", ["М"]="M", ["Н"]="N", ["О"]="O", ["П"]="P", ["Р"]="R", ["С"]="S", ["Т"]="T", ["У"]="U", ["Ф"]="F",
	["Х"]="Kh", ["Ц"]="Ts", ["Ч"]="Tsj", ["Ш"]="Sj", ["Щ"]="Sjtsj", ["Ъ"]="ʺ", ["Ы"]="Y", ["Ь"]="ʹ", ["Э"]="E", ["Ю"]="Ju", ["Я"]="Ja",
	['а']='a', ['б']='b', ['в']='v', ['г']='g', ['д']='d', ['е']='je', ['ё']='jó', ['ж']='zj', ['з']='z', ['и']='i', ['й']='j',
	['к']='k', ['л']='l', ['м']='m', ['н']='n', ['о']='o', ['п']='p', ['р']='r', ['с']='s', ['т']='t', ['у']='u', ['ф']='f',
	['х']='kh', ['ц']='ts', ['ч']='tsj', ['ш']='sj', ['щ']='sjtsj', ['ъ']='ʺ', ['ы']='y', ['ь']='ʹ', ['э']='e', ['ю']='ju', ['я']='ja',
	-- Russian style quotes
	['«']='“', ['»']='”',
	-- archaic, pre-1918 letters
	['І']='I', ['і']='i', ['Ѳ']='F', ['ѳ']='f',
	['Ѣ']='Jě', ['ѣ']='jě', ['Ѵ']='I', ['ѵ']='i',
}

-- following based on ru-common for use with is_monosyllabic()
-- any Cyrillic or Latin vowel, including ёЁ and composed Cyrillic vowels with grave accent;
-- not including accented Latin vowels except ě (FIXME, might want to change this)
local vowels = "аеиоуяэыюіѣѵүАЕИОУЯЭЫЮІѢѴҮѐЀѝЍёЁAEIOUYĚƐaeiouyěɛ"

-- FIXME! Doesn't work with ɣ, which gets included in this character set
local non_consonants = "[" .. vowels .. "ЪЬъьʹʺ%A]"
local consonants = "[^" .. vowels .. "ЪЬъьʹʺ%A]"

local map_to_plain_e_map = {["Е"] = "E", ["е"] = "e", ["Ѣ"] = "Ě", ["ѣ"] = "ě", ["Э"] = "Ɛ", ["э"] = "ɛ"}
local function map_to_plain_e(pre, e)
	return pre .. map_to_plain_e_map[e]
end

local map_to_je_map = {["Е"] = "Je", ["е"] = "je", ["Ѣ"] = "Jě", ["ѣ"] = "jě", ["Э"] = "E", ["э"] = "e"}
local function map_to_je(pre, e)
	if e == nil then
		e = pre
		pre = ""
	end
	return pre .. map_to_je_map[e]
end

-- decompose composed grave chars; they will map to uncomposed Latin letters for
-- consistency with other char+grave combinations, and we do this early to
-- avoid problems converting to e or je
local decompose_grave_map = {['ѐ'] = 'е' .. GR, ['Ѐ'] = 'Е' .. GR, ['ѝ'] = 'и' .. GR, ['Ѝ'] = 'И' .. GR}

-- True if Cyrillic or decomposed Latin word has no more than one vowel;
-- includes non-syllabic stems such as льд-; copied from ru-common and modified
-- to avoid having to import that module (which would slow things down
-- significantly)
local function is_monosyllabic(word)
	return not rfind(word, "[" .. vowels .. "].*[" .. vowels .. "]")
end

-- Apply transformations to the Cyrillic to more closely match pronunciation.
-- Return two arguments: the "original" text (after decomposing composed
-- grave characters), and the transformed text. If the two are different,
-- {{ru-IPA}} should display a "phonetic respelling" notation. 
-- NOADJ disables special-casing for adjectives in -го, while FORCEADJ forces
-- special-casing for adjectives, including those in -аго (pre-reform spelling)
-- and disables checking for exceptions (e.g. много, ого). NOSHTO disables
-- special-casing for что and related words.
function export.apply_tr_fixes(text, noadj, noshto, forceadj)
	-- decompose composed grave characters before we convert Cyrillic е to
	-- Latin e or je
	text = rsub(text, "[ѐЀѝЍ]", decompose_grave_map)

	local origtext = text
	-- the second half of the if-statement below is an optimization; see above.
	if not noadj and text:find("го") then
		if not forceadj then
			-- handle много
			text = rsub(text, "%f[%a\204\129\204\128]([Мм]но[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle немного, намного
			text = rsub(text, "%f[%a\204\129\204\128]([Нн][еа]мно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle до́рого [short form of дорогой, adverb]
			text = rsub(text, "%f[%a\204\129\204\128]([Дд]о[\204\129\204\128]?ро)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle недо́рого [short form of недорогой, adverb]
			text = rsub(text, "%f[%a\204\129\204\128]([Нн]едо[\204\129\204\128]?ро)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle стро́го
			text = rsub(text, "%f[%a\204\129\204\128]([Сс]тро[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle нестро́го
			text = rsub(text, "%f[%a\204\129\204\128]([Нн]естро[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle убо́го
			text = rsub(text, "%f[%a\204\129\204\128]([Уу]бо[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle поло́го
			text = rsub(text, "%f[%a\204\129\204\128]([Пп]оло[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle длинноно́го
			text = rsub(text, "%f[%a\204\129\204\128]([Дд]линноно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle коротконо́го
			text = rsub(text, "%f[%a\204\129\204\128]([Кк]оротконо[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle кривоно́го
			text = rsub(text, "%f[%a\204\129\204\128]([Кк]ривоно[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle пе́го [short form of пе́гий "piebald"]
			text = rsub(text, "%f[%a\204\129\204\128]([Пп]е[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle лого, сого, ого
			text = rsub(text, "%f[%a\204\129\204\128]([лсЛС]?[Оо][\204\129\204\128]?)г(о[\204\129\204\128]?)%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "%2")
			-- handle Того, То́го (but not того or Того́, which have /v/)
			text = rsub(text, "%f[%a\204\129\204\128](То́?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle лего
			text = rsub(text, "%f[%a\204\129\204\128]([Лл]е[\204\129\204\128]?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
			-- handle игого, огого; note, we substitute TEMP_G for both г's
			-- because otherwise the ого- at the beginning gets converted to ово
			text = rsub(text, "%f[%a\204\129\204\128]([ИиОо])гог(о[\204\129\204\128]?)%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о" .. TEMP_G .. "%2")
			-- handle Диего
			text = rsub(text, "%f[%a\204\129\204\128](Дие́?)го%f[^%a\204\129\204\128]", "%1" .. TEMP_G .. "о")
		end
		--handle genitive/accusative endings, which are spelled -ого/-его/-аго
		-- (-ogo/-ego/-ago) but transliterated -ovo/-evo/-avo; only for adjectives
		-- and pronouns, excluding words like много, ого (-аго occurs in
		-- pre-reform spelling); \204\129 is an acute accent, \204\128 is a grave accent
		local pattern = "([оеОЕ" .. (forceadj and "аА" or "") .. "][\204\129\204\128]?)([гГ])([оО][\204\129\204\128]?)"
		local reflexive = "([сС][яЯ][\204\129\204\128]?)"
		local v = {["г"] = "в", ["Г"] = "В"}
		local repl = function(e, g, o, sja) return e .. v[g] .. o .. (sja or "") end
		text = rsub(text, pattern .. "%f[^%a\204\129\204\128]", repl)
		text = rsub(text, pattern .. reflexive .. "%f[^%a\204\129\204\128]", repl)
		-- handle сегодня
		text = rsub(text, "%f[%a\204\129\204\128]([Сс]е)г(о[\204\129\204\128]?дня)%f[^%a\204\129\204\128]", "%1в%2")
		-- handle сегодняшн-
		text = rsub(text, "%f[%a\204\129\204\128]([Сс]е)г(о[\204\129\204\128]?дняшн)", "%1в%2")
		-- replace TEMP_G with g; must be done after the -go -> -vo changes
		text = rsub(text, TEMP_G, "г")
	end

	-- the second half of the if-statement below is an optimization; see above.
	if not noshto and text:find("то") then
		local ch2sh = {["ч"] = "ш", ["Ч"] = "Ш"}
		-- Handle что
		text = rsub(text, "%f[%a\204\129\204\128]([Чч])(то[\204\129\204\128]?)%f[^%a\204\129\204\128]",
			function(ch, to) return ch2sh[ch] .. to end)
		-- Handle чтобы, чтоб
		text = rsub(text, "%f[%a\204\129\204\128]([Чч])(то[\204\129\204\128]?бы?)%f[^%a\204\129\204\128]",
			function(ch, to) return ch2sh[ch] .. to end)
		-- Handle ничто
		text = rsub(text, "%f[%a\204\129\204\128]([Нн]и)ч(то[\204\129\204\128]?)%f[^%a\204\129\204\128]", "%1ш%2")
	end

	text = rsub(text, "([МмЛл][яеё][\204\129\204\128]?)г([кч])", "%1х%2")

	return origtext, text
end

-- Transliterate after the pronunciation-related transformations of
-- export.apply_tr_fixes() have been applied. Called from {{ru-IPA}}.
-- INCLUDE_MONOSYLLABIC_JO_ACCENT is as in export.tr().
function export.tr_after_fixes(text, include_monosyllabic_jo_accent)
	-- Remove word-final hard sign, either utterance-finally or followed by
	-- a non-letter character such as space, comma, period, hyphen, etc.
	text = rsub(text, "[Ъъ]$", "")
	text = rsub(text, "[Ъъ]([%A])", "%1")

	 -- the if-statement below isn't necessary but may speed things up,
	 -- particularly when include_monosyllabic_jo_accent isn't set, in that
	 -- in the majority of cases where ё doesn't occur, we avoid a pattern find
	 -- (in is_monosyllabic()) and three pattern subs. The translit module needs
	 -- to be as fast as possible since it may be called hundreds or
	 -- thousands of times on some pages.
	 if rfind(text, "[Ёё]") then
		-- We need to special-case ё after a "hushing" consonant, which becomes
		-- ó (or o), without j. We also need special cases for monosyllabic ё
		-- when INCLUDE_MONOSYLLABIC_JO_ACCENT isn't set, so we don't add the
		-- accent mark that we would otherwise include.
		if not include_monosyllabic_jo_accent and is_monosyllabic(text) then
			text = rsub(text, "([жшчщЖШЧЩ])ё","%1o")
			text = text:gsub("ё", "jo")
			text = text:gsub("Ё", "Jo")
		else
			text = rsub(text, "([жшчщЖШЧЩ])ё","%1ó")
			-- conversion of remaining ё will occur as a result of 'tab'.
		end
	end

	-- ю after ж and ш becomes u (e.g. брошюра, жюри)
	text = rsub(text, "([жшЖШ])ю","%1u")

	 -- the if-statement below isn't necessary but may speed things up in that
	 -- in the majority of cases where the letters below don't occur, we avoid
	 -- six pattern subs.
	 if rfind(text, "[ЕеѢѣЭэ]") then
		-- е after a dash at the beginning of a word becomes e, and э becomes ɛ
		-- (like after a consonant)
		text = rsub(text, "^(%-)([ЕеѢѣЭэ])", map_to_plain_e)
		text = rsub(text, "(%s%-)([ЕеѢѣЭэ])", map_to_plain_e)
		-- don't get confused by single quote or parens between consonant and е;
		-- e.g. Б'''ез''', американ(ец)
		text = rsub(text, "(" .. consonants .. "['%(%)]*)([ЕеѢѣЭэ])", map_to_plain_e)

		-- This is now the default
		-- е after a vowel or at the beginning of a word becomes je, and э becomes e
		-- text = rsub(text, "^([ЕеѢѣЭэ])", map_to_je)
		-- text = rsub(text, "(" .. non_consonants .. ")([ЕеѢѣЭэ])", map_to_je)
		-- -- need to do it twice in case of sequences of such vowels
		-- text = rsub(text, "^([ЕеѢѣЭэ])", map_to_je)
		-- text = rsub(text, "(" .. non_consonants .. ")([ЕеѢѣЭэ])", map_to_je)
	end

	text = (rsub(text,'.',tab))
	return text
end

-- Transliterates text, which should be a single word or phrase. It should
-- include stress marks, which are then preserved in the transliteration.
-- ё is a special case: it is rendered (j)ó in multisyllabic words and
-- monosyllabic words in multi-word phrases, but rendered (j)o without an
-- accent in isolated monosyllabic words, unless INCLUDE_MONOSYLLABIC_JO_ACCENT
-- is specified. (This is used in conjugation and declension tables.)
-- NOADJ disables special-casing for adjectives in -го, while FORCEADJ forces
-- special-casing for adjectives and disables checking for exceptions
-- (e.g. много). NOSHTO disables special-casing for что and related words.
function export.tr(text, lang, sc, include_monosyllabic_jo_accent, noadj, noshto, forceadj)
	local origtext, subbed_text = export.apply_tr_fixes(text, noadj, noshto, forceadj)
	return export.tr_after_fixes(subbed_text, include_monosyllabic_jo_accent)
end

-- translit with various special-case substitutions; NOADJ disables
-- special-casing for adjectives in -го, while FORCEADJ forces special-casing
-- for adjectives and disables checking for expections (e.g. много).
-- NOSHTO disables special-casing for что and related words. SUB is used
-- to implement arbitrary substitutions in the Cyrillic text before other
-- transformations are applied and before translit. It is of the form
-- FROM/TO,FROM/TO,...
function export.tr_sub(text, include_monosyllabic_jo_accent, noadj, noshto, sub,
	forceadj)
	if type(text) == 'table' then -- called directly from a template
		include_monosyllabic_jo_accent = ine(text.args.include_monosyllabic_jo_accent)
		noadj = ine(text.args.noadj)
		noshto = ine(text.args.noshto)
		sub = ine(text.args.sub)
		text = text.args[1]
	end

	if sub then
		local subs = rsplit(sub, ",")
		for _, subpair in ipairs(subs) do
			local subsplit = rsplit(subpair, "/")
			text = rsub(text, subsplit[1], subsplit[2])
		end
	end

	return export.tr(text, nil, nil, include_monosyllabic_jo_accent, noadj, noshto, forceadj)
end

--for adjectives, pronouns
function export.tr_adj(text, include_monosyllabic_jo_accent)
	if type(text) == 'table' then -- called directly from a template
		include_monosyllabic_jo_accent = ine(text.args.include_monosyllabic_jo_accent)
		text = text.args[1]
	end

	-- we have to include "forceadj" because typically when tr_adj() is called
	-- from the noun or adjective modules, it's called with suffix ого, which
	-- would otherwise trigger the exceptional case and be transliterated as ogo
	return export.tr(text, nil, nil, include_monosyllabic_jo_accent, false,
		"noshto", "forceadj")
end

return export

-- For Vim, so we get 4-space tabs
-- vim: set ts=4 sw=4 noet: