Module:Language/data: Difference between revisions
Jump to navigation
Jump to search
No edit summary |
m (1 revision imported) |
(No difference)
|
Latest revision as of 12:01, 28 March 2024
Documentation for this module may be created at Module:Language/data/doc
local U = mw.ustring.char

-- Diacritics, from the [[Combining Diacritical Marks]] block.
local grave = U(0x300)
local acute = U(0x301)
local circumflex = U(0x302)
local tilde = U(0x303)
local macron = U(0x304)
local breve = U(0x306)
local dot = U(0x307)
local diaeresis = U(0x308)
local double_acute = U(0x30B)
local caron = U(0x30C) -- not referenced by any entry below
local double_grave = U(0x30F)
local invbreve = U(0x311)
local dot_below = U(0x323)
local undertie = U(0x35C)

-- Builds the replacement rules used by every Arabic variety below.
-- A fresh table is returned on each call, so no two language entries share
-- (and can accidentally mutate) the same table.
local function arabic_replacements()
	return {
		-- ālif with wasla is replaced by ālif;
		[U(0x0671)] = U(0x0627),
		-- taṭwīl, fatḥatan, ḍammatan, kasratan,
		-- fatḥa, ḍamma, kasra,
		-- shadda, sukūn, and superscript (dagger) ālif are removed.
		["[" .. U(0x0640) .. U(0x064B) .. U(0x064C) .. U(0x064D)
			.. U(0x064E) .. U(0x064F) .. U(0x0650)
			.. U(0x0651) .. U(0x0652) .. U(0x0670) .. "]"] = "",
	}
end

--[[
This is a table of Wiktionary language codes with data belonging to them.

Fields that appear in the entries below:
	name            the "canonical name" used on Wiktionary.
	article         the Wikipedia article (a plain string).
	Wikipedia_name  alternative name; appears only on the Proto-Hellenic
	                entries (NOTE: exact use is decided by the consuming
	                module, not here).
	Wikipedia_code  the code given on entries marked "Incorrect tag".
	type            "reconstructed" or "appendix".
	direction       "rtl" for right-to-left languages.
	replacements    substitutions applied to entry names. Two shapes occur:
	                a map of pattern -> replacement, or a table with
	                decompose = true plus parallel "from"/"to" lists.
	                A "from" item without a matching "to" item is presumably
	                replaced by nothing (confirm in the consuming module).

The original header also described a "Script" field (ISO 15924 code); no
entry currently sets one (see the commented-out "scripts" lines).
]]
local data = {
	["languages"] = {
		["aaq"] = {
			["name"] = "Penobscot",
		},
		["ab"] = {
			["name"] = "Abkhaz",
		},
		["abe"] = {
			["name"] = "Abenaki",
		},
		["ang"] = {
			["name"] = "Old English",
			-- Plain string, like every other "article" in this table.
			["article"] = "Old English",
			-- Remove macrons, acutes, and overdots
			["replacements"] = {
				decompose = true,
				from = { "[" .. macron .. acute .. dot .. "]" },
			},
		},
		["ar"] = {
			["name"] = "Arabic",
			["article"] = "Arabic language",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["ara"] = {
			["name"] = "Arabic",
			["article"] = "Arabic language",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["arb"] = {
			["name"] = "Modern Standard Arabic",
			["article"] = "Modern Standard Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["apc"] = {
			["name"] = "North Levantine Arabic",
			["article"] = "North Levantine Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["ajp"] = {
			["name"] = "South Levantine Arabic",
			["article"] = "South Levantine Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["arz"] = {
			["name"] = "Egyptian Arabic",
			["article"] = "Egyptian Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = arabic_replacements(),
		},
		["av"] = {
			["name"] = "Avar"
		},
		["be"] = {
			["article"] = "Belarusian language",
			-- Remove acute accents
			["replacements"] = {
				[acute] = "",
			},
		},
		["bn"] = {
			["name"] = "Bengali",
			["article"] = "Bengali language",
		},
		["bua"] = {
			["name"] = "Buryat",
		},
		["cel-pro"] = { -- Incorrect tag
			["name"] = "Proto-Celtic",
			["Wikipedia_code"] = "cel-x-proto",
		},
		["cel-x-proto"] = {
			["name"] = "Proto-Celtic",
		},
		["cel-bry-pro"] = { -- Incorrect tag
			["name"] = "Proto-Brythonic",
			["article"] = "Common Brittonic",
			["type"] = "reconstructed",
		},
		["com"] = {
			["name"] = "Comanche",
			["article"] = "Comanche language",
		},
		["cu"] = {
			["name"] = "Old Church Slavonic",
			["article"] = "Old Church Slavonic",
		},
		["de"] = {
			["name"] = "German",
			["article"] = "German language",
		},
		["en"] = {
			["name"] = "English",
			["article"] = "English language",
		},
		["es"] = {
			["name"] = "Spanish",
			["article"] = "Spanish language",
		},
		["egy"] = {
			["name"] = "Egyptian",
		},
		["evn"] = {
			["name"] = "Evenki",
			["article"] = "Evenki language",
		},
		["fr"] = {
			["name"] = "French",
			["article"] = "French language",
		},
		["frm"] = {
			["name"] = "Middle French",
			["article"] = "Middle French",
		},
		["frp"] = {
			["name"] = "Franco-Provençal",
		},
		["ff"] = {
			["name"] = "Fula",
		},
		["gem-pro"] = { -- Incorrect tag
			["name"] = "Proto-Germanic",
			["article"] = "Proto-Germanic language",
			["type"] = "reconstructed",
			["replacements"] = {},
			["Wikipedia_code"] = "gem-x-proto",
		},
		["gem-x-proto"] = {
			["name"] = "Proto-Germanic",
			["article"] = "Proto-Germanic language",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["gml"] = {
			["name"] = "Middle Low German",
		},
		["gmw-ecg"] = {
			["name"] = "East Central German",
		},
		["gmw-x-proto"] = {
			["name"] = "Proto-West Germanic",
			["article"] = "Proto-West Germanic language",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["gmq-x-gut"] = {
			["name"] = "Gutnish",
			["article"] = "Gutnish",
		},
		["goh"] = {
			-- Remove macrons, circumflexes, and diaereses
			["replacements"] = {
				decompose = true,
				from = {
					"[" .. macron .. circumflex .. diaeresis .. "]",
				},
			},
		},
		["got"] = {
			["name"] = "Gothic",
			["article"] = "Gothic language",
			["replacements"] = {
				-- Latin to Gothic since people will not want to have to copy
				-- and paste Gothic letters in
				["[AÁaáĀā]"] = "𐌰",
				["[Bb]"] = "𐌱",
				["[Gg]"] = "𐌲",
				["[Dd]"] = "𐌳",
				["[EeĒē]"] = "𐌴",
				["[Qq]"] = "𐌵",
				["[Zz]"] = "𐌶",
				["[Hh]"] = "𐌷",
				["[Þþ]"] = "𐌸",
				["[IiÍí]"] = "𐌹",
				["[Kk]"] = "𐌺",
				["[Ll]"] = "𐌻",
				["[Mm]"] = "𐌼",
				["[Nn]"] = "𐌽",
				["[Jj]"] = "𐌾",
				["[UuÚúŪū]"] = "𐌿",
				["[Pp]"] = "𐍀",
				["[Rr]"] = "𐍂",
				["[Ss]"] = "𐍃",
				["[Tt]"] = "𐍄",
				["[WwYy]"] = "𐍅",
				["[Ff]"] = "𐍆",
				["[Xx]"] = "𐍇",
				["[Ƕƕ]"] = "𐍈", -- Not sure if "hw" and "hv" can safely be converted
				["[OoŌō]"] = "𐍉",
			},
		},
		["gsw"] = {
			["name"] = "Alemannic German",
		},
		["grc"] = {
			["name"] = "Ancient Greek",
			["article"] = "Ancient Greek",
			["replacements"] = {
				decompose = true,
				from = {
					-- Replace variant letterforms with standard ones.
					"ϐ", "ϵ", "ϑ", "ϰ", "ϱ", "ϲ", "ϕ",
					-- Remove macrons, breves, and underties.
					"[" .. macron .. breve .. undertie .. "]"
				},
				to = {
					"β", "ε", "θ", "κ", "ρ", "σ", "φ",
				}
			},
		},
		["grk-pro"] = { -- Incorrect tag
			["name"] = "Proto-Hellenic",
			["Wikipedia_name"] = "Proto-Greek",
			["article"] = "Proto-Greek language",
			["type"] = "reconstructed",
			["replacements"] = {},
			["Wikipedia_code"] = "grk-x-proto",
		},
		["grk-x-proto"] = {
			["name"] = "Proto-Hellenic",
			["Wikipedia_name"] = "Proto-Greek",
			["article"] = "Proto-Greek language",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["grt"] = {
			["name"] = "Garo",
		},
		["ha"] = {
			["name"] = "Hausa",
			-- remove tilde, grave, acute, macron, circumflex
			["replacements"] = {
				decompose = true,
				from = { "[" .. grave .. circumflex .. macron .. acute .. tilde .. "]" },
			},
		},
		["hi"] = {
			["name"] = "Hindi",
			["article"] = "Hindi",
		},
		["ine-bsl-pro"] = {
			["name"] = "Proto-Balto-Slavic",
			["article"] = "Proto-Balto-Slavic language",
			["type"] = "reconstructed",
		},
		["ine-pro"] = { -- Incorrect tag
			["name"] = "Proto-Indo-European",
			["article"] = "Proto-Indo-European language",
			["type"] = "reconstructed",
			["replacements"] = {},
			["Wikipedia_code"] = "ine-x-proto",
		},
		["ine-x-proto"] = {
			["name"] = "Proto-Indo-European",
			["article"] = "Proto-Indo-European language",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["ja"] = {
			["name"] = "Japanese",
			["article"] = "Japanese language",
		},
		["jbo"] = { -- Lojban
			["type"] = "appendix",
		},
		["ket"] = {
			["name"] = "Ket",
			["article"] = "Ket language",
		},
		["ksk"] = {
			["name"] = "Kansa",
			["article"] = "Kansa language",
		},
		["la"] = {
			["name"] = "Latin",
			["article"] = "Latin",
			-- Remove macrons, breves, and diaereses
			["replacements"] = {
				decompose = true,
				from = { "[" .. macron .. breve .. diaeresis .. "]" },
			},
		},
		["lt"] = {
			["name"] = "Lithuanian",
			-- remove acute, tilde, grave
			["replacements"] = {
				decompose = true,
				from = { "[" .. acute .. tilde .. grave .. "]" },
			},
		},
		["moe"] = {
			["name"] = "Cree",
		},
		["mul"] = {
			["name"] = "Translingual",
			["article"] = "",
		},
		["nci"] = {
			["name"] = "Classical Nahuatl",
			["article"] = "Classical Nahuatl",
			["replacements"] = {
				decompose = true,
				-- Remove macrons, acutes, circumflexes, graves, and saltillo;
				-- see [[Saltillo (linguistics)]].
				from = { "[" .. grave .. acute .. macron .. circumflex .. "Ꞌꞌʻʼ'ʔ]" },
			},
		},
		["nds-de"] = {
			["name"] = "German Low German",
		},
		["non"] = {
			["name"] = "Old Norse",
		},
		["non-x-proto"] = {
			["name"] = "Proto-Norse",
		},
		["odt"] = {
			["name"] = "Old Dutch",
		},
		["oge"] = {
			["name"] = "Old Georgian",
		},
		["oj"] = {
			["name"] = "Ojibwe",
		},
		["orv"] = {
			["name"] = "Old East Slavic",
			["article"] = "Old East Slavic",
			["replacements"] = {
				[U(0x484)] = "",
			},
		},
		["osx"] = {
			["name"] = "Old Saxon",
		},
		["pt"] = {
			["name"] = "Portuguese",
			["article"] = "Portuguese language",
			-- ["scripts"] = { "Latn" },
		},
		["pa"] = {
			["name"] = "Punjabi",
			["article"] = "Punjabi language",
		},
		["pgl"] = {
			["name"] = "Primitive Irish",
			["article"] = "Primitive Irish",
		},
		["pis"] = {
			["name"] = "Pijin",
			["article"] = "Pijin language",
		},
		["poz-x-poly-proto"] = {
			["name"] = "Proto-Nuclear Polynesian",
			["article"] = "Proto-Polynesian language",
			["type"] = "reconstructed",
		},
		["rap"] = {
			["name"] = "Rapa Nui",
			["article"] = "Rapa Nui language",
		},
		["ru"] = {
			["name"] = "Russian",
			["article"] = "Russian language",
			-- Remove acute accents
			["replacements"] = {
				[acute] = "",
			},
		},
		["rw"] = {
			["name"] = "Rwanda-Rundi",
		},
		["se"] = {
			["replacements"] = {
				-- A consonant, an apostrophe, then the same consonant again
				-- becomes the doubled consonant without the apostrophe.
				["([đflmnŋrsšŧv])'%1"] = "%1%1",
			},
		},
		["sem-pro"] = {
			["name"] = "Proto-Semitic",
			["article"] = "Proto-Semitic",
			["type"] = "reconstructed",
		},
		["sh"] = {
			["article"] = "Serbo-Croatian language",
			["replacements"] = {
				decompose = true,
				-- Strip an accent mark that follows one of the listed Latin
				-- or Cyrillic letters, keeping the letter itself.
				from = { "([AaEeIiOoUuRrАаЕеИиОоУуРр])[" .. double_grave .. grave .. invbreve .. acute .. macron .. tilde .. "]" },
				to = { "%1" },
			},
		},
		["sl"] = {
			["name"] = "Slovene",
			["replacements"] = {
				decompose = true,
				-- remove tonal orthography
				from = { "ł", "[" .. grave .. acute .. macron .. double_grave .. invbreve .. circumflex .. dot_below .. "]" },
				to = { "l" },
			},
		},
		["sla-pro"] = {
			["name"] = "Proto-Slavic", -- also Common Slavic
			["type"] = "reconstructed",
			["replacements"] = {
				["[ÀÁÃĀȀȂ]"] = "A",
				["[àáãāȁȃ]"] = "a",
				["[ÈÉẼĒȄȆ]"] = "E",
				["[èéẽēȅȇ]"] = "e",
				["[ÌÍĨĪȈȊ]"] = "I",
				["[ìíĩīȉȋ]"] = "i",
				["[ÒÓÕŌȌȎŐ]"] = "O",
				["[òóõōȍȏő]"] = "o",
				["[ÙÚŨŪȔȖŰ]"] = "U",
				["[ùúũūȕȗű]"] = "u",
				["[ỲÝỸȲ]"] = "Y",
				["[ỳýỹȳ]"] = "y",
				["Ǭ"] = "Ǫ",
				["ǭ"] = "ǫ",
				["[" .. grave .. acute .. double_acute .. tilde .. macron .. double_grave .. invbreve .. "]"] = "",
				["ĭ"] = "ь",
				["ŭ"] = "ъ",
			},
		},
		["tts"] = {
			["name"] = "Isan", -- also "Northeastern Thai"
			["article"] = "Isan language",
		},
		["ug"] = {
			["name"] = "Uyghur", -- also less commonly "Uighur"
			["article"] = "Uyghur language",
		},
		["uk"] = {
			["article"] = "Ukrainian language",
			-- Remove acute accents
			["replacements"] = {
				[acute] = "",
			}
		},
		["ur"] = {
			["name"] = "Urdu",
			["article"] = "Urdu",
		},
		["xcl"] = {
			["name"] = "Old Armenian",
			["article"] = "Classical Armenian",
			["replacements"] = {
				["[՞՜՛՟]"] = "",
				["և"] = "եւ",
			},
		},
		["xgf"] = {
			["name"] = "Tongva", -- not ISO name "Gabrielino-Fernandeño"
			["article"] = "Tongva language",
			["replacements"] = {
				["['`ʔ]"] = "ʼ",
			},
		},
		["xlu"] = {
			["name"] = "Luwian", -- not ISO name "Cuneiform Luwian"
			["article"] = "Cuneiform Luwian"
		},
		["xpq"] = {
			["name"] = "Mohegan-Pequot",
		},
		["xxt"] = {
			["name"] = "Tambora",
			["article"] = "Tambora language",
		},
		["xvn"] = {
			["name"] = "Vandalic",
			["article"] = "Vandalic language",
		},
		["yua"] = {
			["name"] = "Yucatec Maya",
			["article"] = "Yucatec Maya language",
		},
		["zh"] = {
			["name"] = "Chinese",
			["article"] = "Chinese language",
			-- ["scripts"] = { "Hani" },
		},
	},

	-- Here, keys (for example, "gem") are Wikipedia language codes used in
	-- {{lang}}, and values (for example, "gem-pro") are the equivalent
	-- Wiktionary code.
	-- Subtags are not currently supported.
	["redirects"] = {
		["aae"] = "sq",
		["aiq"] = "fa",
		["aln"] = "sq",
		["als"] = "sq",
		["azb"] = "az",
		["azj"] = "az",
		["bgn"] = "bal",
		["bs"] = "sh",
		["bxr"] = "bua",
		["ciw"] = "oj",
		["cnr"] = "sh",
		["fil"] = "tl",
		["fuf"] = "ff",
		["gem"] = "gem-pro", -- Not correct, but is commonly used.
		["hak"] = "zh",
		["hbo"] = "he",
		["hr"] = "sh",
		["ine"] = "ine-pro", -- Not correct, but might be commonly used.
		["kjv"] = "sh",
		["nan"] = "zh",
		["prs"] = "fa",
		["rn"] = "rw",
		["sli"] = "gmw-ecg",
		["sr"] = "sh",
		["src"] = "sc",
		["sro"] = "sc",
		["tw"] = "ak",
		["wae"] = "gsw",
		["wep"] = "nds-de",
		["yue"] = "zh",
		["xno"] = "fro",
	},
}

return data