-- Name of the module that performs the page-wide processing; it is loaded with
-- require() in the final section of this file.
local headword_page_module = "Module:headword/page"
-- Helper taken from Module:table; used throughout this file to convert the plain
-- lists below into sets.
local list_to_set = require("Module:table").listToSet
-- The table returned by this module; every setting below is stored as a field on it.
local data = {}
------ 1. Lists which are converted into sets. ------

-- Zero-plurals (i.e. invariable plurals). The callback handed to list_to_set
-- returns the term unchanged, so each term is presumably stored as its own plural.
-- NOTE(review): this relies on list_to_set passing the list item as the first
-- callback argument; confirm against Module:table.
local irregular_plurals = list_to_set(
    {
        "ชมาโว", -- cmavo
        "ชเมเน", -- cmene
        "ฟูฮิฝลา", -- fu'ivla
        "กิสมู", -- gismu
        "ฮั้นถื่อ", -- Han tu
        "ฮันจา", -- hanja
        "ฮั่นจื้อ", -- hanzi
        "ยฺหวืดเพ็ง", -- jyutping
        "คานะ", -- kana
        "คันจิ", -- kanji
        "ลุฌโว", -- lujvo
        "หนังสือวลี", -- phrasebook
        "พินอิน", -- pinyin
        "รัฟซี", -- rafsi
    },
    function(term)
        return term
    end
)

-- Irregular non-zero plurals, plus any regular plural whose singular ends in "s"
-- (the module assumes that an input ending in "s" is already a plural). Both the
-- singular and the plural are registered as keys, because a default plural is
-- generated for any input that does not match a key in this table.
do
    local explicit_plurals = {
        mora = "morae",
    }
    for singular, plural in pairs(explicit_plurals) do
        -- Singular and plural both resolve to the plural form.
        irregular_plurals[singular] = plural
        irregular_plurals[plural] = plural
    end
end

data.irregular_plurals = irregular_plurals
-- Set of part-of-speech names classed as lemmas. Names that have been translated
-- into Thai carry their English gloss in the trailing comment; the remaining
-- names are still in English.
data.lemmas = list_to_set({
    "คำย่อ", -- abbreviations
    "acronyms",
    "คำคุณศัพท์", -- adjectives
    "adnominals",
    "adpositions",
    "คำกริยาวิเศษณ์", -- adverbs
    "affixes",
    "ambipositions",
    "คำกำกับนาม", -- articles
    "circumfixes",
    "circumpositions",
    "คำลักษณนาม", -- classifiers
    "ชมาโว", -- cmavo
    "cmavo clusters",
    "ชเมเน", -- cmene
    "combining forms",
    "คำสันธาน", -- conjunctions
    "คำลักษณนาม", -- counters (same Thai name as classifiers, so this repeats the entry above)
    "คำกำหนด", -- determiners
    "diacritical marks",
    "ทวิอักษร", -- digraphs
    "equative adjectives",
    "ฟูฮิฝลา", -- fu'ivla
    "กิสมู", -- gismu
    "อักษรจีน", -- Han characters
    "ฮั้นถื่อ", -- Han tu
    "ฮันจา", -- hanja
    "ฮั่นจื้อ", -- hanzi
    "ideophones",
    "สำนวน", -- idioms
    "อาคม", -- infixes
    "อักษรย่อ", -- initialisms
    "iteration marks",
    "interfixes",
    "คำอุทาน", -- interjections
    "คานะ", -- kana
    "คันจิ", -- kanji
    "ตัวอักษร", -- letters
    "ตัวอักษรควบ", -- ligatures
    "logograms",
    "ลุฌโว", -- lujvo
    "morae",
    "หน่วยคำ", -- morphemes
    "non-constituents",
    "คำนาม", -- nouns
    "จำนวน", -- numbers
    "ตัวเลข", -- numeral symbols
    "เลข", -- numerals
    "คำอนุภาค", -- particles
    "วลี", -- phrases
    "คำปัจฉบท", -- postpositions
    "postpositional phrases",
    "predicatives",
    "อุปสรรค", -- prefixes
    "prepositional phrases",
    "คำบุพบท", -- prepositions
    "preverbs",
    "pronominal adverbs",
    "คำสรรพนาม", -- pronouns
    "คำวิสามานยนาม", -- proper nouns
    "สุภาษิต", -- proverbs
    "เครื่องหมายวรรคตอน", -- punctuation marks
    "relatives",
    "ราก", -- roots
    "stems",
    "ปัจจัย", -- suffixes
    "พยางค์", -- syllables
    "สัญลักษณ์", -- symbols
    "คำกริยา", -- verbs
})
-- Set of part-of-speech names classed as non-lemma forms. Names that have been
-- translated into Thai carry their English gloss in the trailing comment.
data.nonlemmas = list_to_set({
    "รูปผันพาร์ทิซิเพิลกรรตุวาจก", -- active participle forms
    "พาร์ทิซิเพิลกรรตุวาจก", -- active participles
    "พาร์ทิซิเพิลเชิงคุณศัพท์", -- adjectival participles
    "adjective case forms",
    "รูปผันคำคุณศัพท์", -- adjective forms
    "adjective feminine forms",
    "รูปผันคำคุณศัพท์พหูพจน์", -- adjective plural forms
    "รูปผันคำกริยาวิเศษณ์", -- adverb forms
    "พาร์ทิซิเพิลเชิงกริยาวิเศษณ์", -- adverbial participles
    "agent participles",
    "รูปผันคำกำกับนาม", -- article forms
    "circumfix forms",
    "combined forms",
    "รูปผันคำคุณศัพท์ขั้นกว่า", -- comparative adjective forms
    "คำคุณศัพท์ขั้นกว่า", -- comparative adjectives
    "รูปผันคำกริยาวิเศษณ์ขั้นกว่า", -- comparative adverb forms
    "คำกริยาวิเศษณ์ขั้นกว่า", -- comparative adverbs
    "รูปผันคำสันธาน", -- conjunction forms
    "contractions",
    "converbs",
    "รูปผันคำกำหนดขั้นกว่า", -- determiner comparative forms
    "รูปผันคำกำหนด", -- determiner forms
    "รูปผันคำกำหนดขั้นสุด", -- determiner superlative forms
    "คำนามบอกความเล็ก", -- diminutive nouns
    "elative adjectives",
    "equative adjective forms",
    -- NOTE(review): "equative adjectives" is also listed in data.lemmas in this
    -- file; confirm which classification is intended.
    "equative adjectives",
    "พาร์ทิซิเพิลอนาคตกาล", -- future participles
    "gerunds",
    "infinitive forms",
    "infinitives",
    "รูปผันคำอุทาน", -- interjection forms
    "ยฺหวืดเพ็ง", -- jyutping
    "misspellings",
    "negative participles",
    "nominal participles",
    "noun case forms",
    "รูปผันคำนามทวิพจน์", -- noun dual forms
    "รูปผันคำนาม", -- noun forms
    "noun paucal forms",
    "รูปผันคำนามพหูพจน์", -- noun plural forms
    "noun possessive forms",
    "noun singulative forms",
    "รูปผันเลข", -- numeral forms
    "พาร์ทิซิเพิล", -- participles
    "รูปผันพาร์ทิซิเพิล", -- participle forms
    "รูปผันคำอนุภาค", -- particle forms
    "พาร์ทิซิเพิลกรรมวาจก", -- passive participles
    "พาร์ทิซิเพิลกรรตุวาจกอดีตกาล", -- past active participles
    "พาร์ทิซิเพิลอดีตกาล", -- past participles
    "รูปผันพาร์ทิซิเพิลอดีตกาล", -- past participle forms
    "พาร์ทิซิเพิลกรรมวาจกอดีตกาล", -- past passive participles
    "perfect active participles",
    "perfect participles",
    "perfect passive participles",
    "พินอิน", -- pinyin
    "พหูพจน์", -- plurals
    "รูปผันคำปัจฉบท", -- postposition forms
    "รูปผันอุปสรรค", -- prefix forms
    "preposition contractions",
    "รูปผันคำบุพบท", -- preposition forms
    "prepositional pronouns",
    "พาร์ทิซิเพิลกรรตุวาจกปัจจุบันกาล", -- present active participles
    "พาร์ทิซิเพิลปัจจุบันกาล", -- present participles
    "พาร์ทิซิเพิลกรรมวาจกปัจจุบันกาล", -- present passive participles
    "รูปผันคำสรรพนาม", -- pronoun forms
    "pronoun possessive forms",
    "รูปผันคำวิสามานยนาม", -- proper noun forms
    "รูปผันคำวิสามานยนามพหูพจน์", -- proper noun plural forms
    "รัฟซี", -- rafsi
    "การถอดเป็นอักษรโรมัน", -- romanizations
    "root forms",
    "singulatives",
    "รูปผันปัจจัย", -- suffix forms
    "รูปผันคำคุณศัพท์ขั้นสุด", -- superlative adjective forms
    "คำคุณศัพท์ขั้นสุด", -- superlative adjectives
    "รูปผันคำกริยาวิเศษณ์ขั้นสุด", -- superlative adverb forms
    "คำกริยาวิเศษณ์ขั้นสุด", -- superlative adverbs
    "รูปผันคำกริยา", -- verb forms
    "คำกริยานาม", -- verbal nouns
})
-- Language codes for which the individual parts of a headword are not linked
-- separately.
data.no_multiword_links = list_to_set({
    "zh",
})
-- Language codes for which no "LANG multiword terms" category is added. The list
-- is grouped by the reason for the exclusion.
data.no_multiword_cat = list_to_set({
    -------- Languages without spaces between words (sometimes spaces between phrases) --------
    "blt", -- Tai Dam
    "ja", -- Japanese
    "khb", -- Lü
    "km", -- Khmer
    "lo", -- Lao
    "mnw", -- Mon
    "my", -- Burmese
    "nan", -- Min Nan (some words in Latin script; hyphens between syllables)
    "nan-hbl", -- Hokkien (some words in Latin script; hyphens between syllables)
    "nod", -- Northern Thai
    "ojp", -- Old Japanese
    "shn", -- Shan
    "sou", -- Southern Thai
    "tdd", -- Tai Nüa
    "th", -- Thai
    "tts", -- Isan
    "twh", -- Tai Dón
    "txg", -- Tangut
    "zh", -- Chinese (all varieties with Chinese characters)
    "zkt", -- Khitan

    -------- Languages with spaces between syllables --------
    "ahk", -- Akha
    "aou", -- A'ou
    "atb", -- Zaiwa
    "byk", -- Biao
    "cdy", -- Chadong
    --"duu", -- Drung; not sure
    --"hmx-pro", -- Proto-Hmong-Mien
    --"hnj", -- Green Hmong; not sure
    "huq", -- Tsat
    "ium", -- Iu Mien
    --"lis", -- Lisu; not sure
    "mtq", -- Muong
    --"mww", -- White Hmong; not sure
    "onb", -- Lingao
    --"sit-gkh", -- Gokhy; not sure
    --"swi", -- Sui; not sure
    "tbq-lol-pro", -- Proto-Loloish
    "tdh", -- Thulung
    "ukk", -- Muak Sa-aak
    "vi", -- Vietnamese
    "yig", -- Wusa Nasu
    "zng", -- Mang

    -------- Languages with ~ with surrounding spaces used to separate variants --------
    "mkh-ban-pro", -- Proto-Bahnaric
    "sit-pro", -- Proto-Sino-Tibetan

    -------- Other weirdnesses --------
    "mul", -- Translingual; gestures, Morse code, etc.
    "aot", -- Atong (India); bullet is a letter

    -------- All sign languages --------
    "ads", "aed", "aen", "afg", "ase", "asf", "asp", "asq", "asw", "bfi",
    "bfk", "bog", "bqn", "bqy", "bvl", "bzs", "cds", "csc", "csd", "cse",
    "csf", "csg", "csl", "csn", "csq", "csr", "doq", "dse", "dsl", "ecs",
    "esl", "esn", "eso", "eth", "fcs", "fse", "fsl", "fss", "gds", "gse",
    "gsg", "gsm", "gss", "gus", "hab", "haf", "hds", "hks", "hos", "hps",
    "hsh", "hsl", "icl", "iks", "ils", "inl", "ins", "ise", "isg", "isr",
    "jcs", "jhs", "jls", "jos", "jsl", "jus", "kgi", "kvk", "lbs", "lls",
    "lsl", "lso", "lsp", "lst", "lsy", "lws", "mdl", "mfs", "mre", "msd",
    "msr", "mzc", "mzg", "mzy", "nbs", "ncs", "nsi", "nsl", "nsp", "nsr",
    "nzs", "okl", "pgz", "pks", "prl", "prz", "psc", "psd", "psg", "psl",
    "pso", "psp", "psr", "pys", "rms", "rsl", "rsm", "sdl", "sfb", "sfs",
    "sgg", "sgx", "slf", "sls", "sqk", "sqs", "ssp", "ssr", "svk", "swl",
    "syy", "tse", "tsm", "tsq", "tss", "tsy", "tza", "ugn", "ugy", "ukl",
    "uks", "vgt", "vsi", "vsl", "vsv", "xki", "xml", "xms", "ygs", "ysl",
    "zib", "zsl",
})
-- Language codes in which a hyphen does not count as a word separator when
-- deciding on the "multiword terms" category. The reason is given per entry.
data.hyphen_not_multiword_sep = list_to_set({
    "akk", -- Akkadian; hyphens between syllables
    "akl", -- Aklanon; hyphens for mid-word glottal stops
    "ber-pro", -- Proto-Berber; morphemes separated by hyphens
    "ceb", -- Cebuano; hyphens for mid-word glottal stops
    "cnk", -- Khumi Chin; hyphens used in single words
    "cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
    "de", -- too many false positives
    "esx-esk-pro", -- hyphen used to separate morphemes
    "fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
    "hil", -- Hiligaynon; hyphens for mid-word glottal stops
    "hnn", -- Hanunoo; too many false positives
    "ilo", -- Ilocano; hyphens for mid-word glottal stops
    "kne", -- Kankanaey; hyphens for mid-word glottal stops
    "lcp", -- Western Lawa; dash as syllable joiner
    "lwl", -- Eastern Lawa; dash as syllable joiner
    "mfa", -- Pattani Malay in Thai script; dash as syllable joiner
    "mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
    "msb", -- Masbatenyo; too many false positives
    "tl", -- Tagalog; too many false positives
    "war", -- Waray-Waray; too many false positives
    "yo", -- Yoruba; hyphens used to show lengthened nasal vowels
})
-- Language codes for which "LANG masculine nouns" and similar categories are
-- not added.
data.no_gender_cat = list_to_set({
    -- Languages without gender but which use the gender field for other purposes
    "ja",
    "th",
})
-- Set of language codes stored as data.notranslit.
-- NOTE(review): the name suggests that no transliteration is shown for these
-- languages, but the consumer is outside this file; confirm.
data.notranslit = list_to_set({
    "ams",
    "az",
    "bbc",
    "bug",
    "cia",
    "cjm",
    "cmn",
    "cpi",
    "hak",
    --"ja",
    "kzg",
    "lad",
    "lzh",
    "ms",
    "mul",
    "mvi",
    "nan",
    "nan-hbl",
    "nan-hnm",
    "nan-luh",
    "nan-tws",
    "oj",
    "okn",
    "ryn",
    "rys",
    "ryu",
    "sh",
    "tgt",
    --"th",
    "tkn",
    "tly",
    "txg",
    "und",
    --"vi",
    "xug",
    "yoi",
    "yox",
    "yue",
    "za",
    "zh",

    -- Languages that use Thai only, or Thai/Latin; anything else is configured
    -- in `languages`.
    "th", -- Thai
    "bzi", -- Bisu
    "cbn", -- Nyah Kur
    "lcp", -- Western Lawa
    "lwl", -- Eastern Lawa
    "nyw", -- Nyaw
    "skb", -- Saek
    "sou", -- Southern Thai
    "thm", -- Thavung
    "tts", -- Isan
    "ugo", -- Gong
    "urk", -- Urak Lawoi'
})
-- Added locally.
-- NOTE(review): the name suggests that transliteration is forced for these
-- languages, but the consumer is outside this file; confirm.
data.forcetranslit = list_to_set({
    "eo",
    "vi",
})
-- Set of script codes for which a script-tagged display title is added.
data.toBeTagged = list_to_set({
    "Ahom",
    "Arab",
    "fa-Arab",
    "glk-Arab",
    "kk-Arab",
    "ks-Arab",
    "ku-Arab",
    "mzn-Arab",
    "ms-Arab",
    "ota-Arab",
    "pa-Arab",
    "ps-Arab",
    "sd-Arab",
    "tt-Arab",
    "ug-Arab",
    "ur-Arab",
    "Armi",
    "Armn",
    "Avst",
    "Bali",
    "Bamu",
    "Batk",
    "Beng",
    "as-Beng",
    "Bopo",
    "Brah",
    "Brai",
    "Bugi",
    "Buhd",
    "Cakm",
    "Cans",
    "Cari",
    "Cham",
    "Cher",
    "Copt",
    "Cprt",
    "Cyrl",
    "Cyrs",
    "Deva",
    "Dsrt",
    "Egyd",
    "Egyp",
    "Ethi",
    "Geok",
    "Geor",
    "Glag",
    "Goth",
    "Grek",
    "Polyt",
    "polytonic",
    "Gujr",
    "Guru",
    "Hang",
    "Hani",
    "Hano",
    "Hebr",
    "Hira",
    "Hluw",
    "Ital",
    "Java",
    "Kali",
    "Kana",
    "Khar",
    "Khmr",
    "Knda",
    "Kthi",
    "Lana",
    "Laoo",
    "Latn",
    "Latf",
    "Latg",
    "Latnx",
    "Latinx",
    "pjt-Latn",
    "Lepc",
    "Limb",
    "Linb",
    "Lisu",
    "Lyci",
    "Lydi",
    "Mand",
    "Mani",
    "Marc",
    "Merc",
    "Mero",
    "Mlym",
    "Mong",
    "mnc-Mong",
    "sjo-Mong",
    "xwo-Mong",
    "Mtei",
    "Mymr",
    "Narb",
    "Nkoo",
    "Nshu",
    "Ogam",
    "Olck",
    "Orkh",
    "Orya",
    "Osma",
    "Ougr",
    "Palm",
    "Phag",
    "Phli",
    "Phlv",
    "Phnx",
    "Plrd",
    "Prti",
    "Rjng",
    "Runr",
    "Samr",
    "Sarb",
    "Saur",
    "Sgnw",
    "Shaw",
    "Shrd",
    "Sinh",
    "Sora",
    "Sund",
    "Sylo",
    "Syrc",
    "Tagb",
    "Tale",
    "Talu",
    "Taml",
    "Tang",
    "Tavt",
    "Telu",
    "Tfng",
    "Tglg",
    "Thaa",
    "Thai",
    "Tibt",
    "Ugar",
    "Vaii",
    "Xpeo",
    "Xsux",
    "Yiii",
    "Zmth",
    "Zsym",
    "Ipach",
    "Music",
    "Rumin",
})
-- Parts of speech that are not put into categories like "English terms spelled
-- with É" when the term is itself the character in question (e.g. the letter entry
-- for English [[é]]). Compare entries like the French adjective [[m̂]], which is a
-- one-letter word spelled with the letter.
data.pos_not_spelled_with_self = list_to_set({
    "diacritical marks",
    "อักษรจีน", -- Han characters
    "ฮั้นถื่อ", -- Han tu
    "ฮันจา", -- hanja
    "ฮั่นจื้อ", -- hanzi
    "iteration marks",
    "คานะ", -- kana
    "คันจิ", -- kanji
    "ตัวอักษร", -- letters
    "ตัวอักษรควบ", -- ligatures
    "logograms",
    "morae",
    "ตัวเลข", -- numeral symbols
    "เลข", -- numerals
    "เครื่องหมายวรรคตอน", -- punctuation marks
    "พยางค์", -- syllables
    "สัญลักษณ์", -- symbols
})
------ 2. Lists not converted into sets. ------

-- Recognized aliases for parts of speech (param 2=). Each key is the short form;
-- each value is the canonical form in the singular (not pluralized), so that the
-- same table can serve [[Module:form of]] for the p=/POS= param and
-- [[Module:links]] for the pos= param.
data.pos_aliases = {
    a      = "adjective",
    adj    = "adjective",
    adv    = "adverb",
    art    = "article",
    det    = "determiner",
    cnum   = "cardinal number",
    conj   = "conjunction",
    conv   = "converb",
    int    = "interjection",
    interj = "interjection",
    intj   = "interjection",
    n      = "noun",
    num    = "numeral",
    part   = "participle",
    pcl    = "particle",
    phr    = "phrase",
    pn     = "proper noun",
    postp  = "postposition",
    pre    = "preposition",
    prep   = "preposition",
    pro    = "pronoun",
    pron   = "pronoun",
    prop   = "proper noun",
    proper = "proper noun",
    onum   = "ordinal number",
    rom    = "romanization",
    v      = "verb",
    vb     = "verb",
    vi     = "intransitive verb",
    vt     = "transitive verb",
    vti    = "transitive and intransitive verb",
}
-- Parts of speech for which categories like "German masculine nouns" or
-- "Russian imperfective verbs" are generated when the headword has the
-- appropriate gender/number. Each key is a part of speech; its value is the
-- name stored for it in this table.
data.pos_for_gender_number_cat = {
    nouns = "nouns",
    ["proper nouns"] = "nouns",
    suffixes = "suffixes",
    -- Verbs are included because impf and pf are valid "genders".
    verbs = "verbs",
}
------ 3. Page-wide processing (so that it only needs to be done once per page). ------

-- Run the page processor once and keep its result on the returned table.
local page = require(headword_page_module).process_page()
data.page = page

-- FIXME: other code still refers to data.pagename and data.encoded_pagename
-- directly, so both values are mirrored here from data.page.
data.pagename = page.pagename
data.encoded_pagename = page.encoded_pagename

return data