Module:scripts/recognition data
- The following documentation is located at Module:scripts/recognition data/documentation. [edit]
- Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox
This module relates Unicode characters (code points) to Wiktionary script codes. It is used by the findBestScriptWithoutLang function in Module:scripts.
It was generated by a series of functions in Module:User:Erutuon/script recognition from the script patterns in Module:scripts/data. It must be regenerated whenever script patterns are added or modified. The highlight_dump and modified_dump functions in Module:debug were used to print the output from Module:User:Erutuon/script recognition.
To explain the format: the first-level keys are the groups of 4096 codepoints (0x1000 in hexadecimal). Key 0 is for the first group of 4096 codepoints, from U+0000 to U+0FFF. Within the table for each group of codepoints, the syntax { 0x41, 0x5A, "Latn" } indicates that all the characters from codepoint 0x41 (U+0041, the character A) to codepoint 0x5A (U+005A, the character Z) belong to the Latin script. The table "individual" contains all the individual codepoints that are defined as belonging to a script, but are not found inside a range.
Conflicts between character sets of different scripts
This module includes fewer scripts than Module:scripts/data, because some scripts do not have characters at all, and some scripts' character sets are in conflict with each other. Here are the ways conflicts have been resolved.
Scripts are omitted if they have the same characters as another more basic script, or if they consist of two or more other scripts. For example, fa-Arab is omitted because it contains the same characters as Arab, and Jpan is omitted because it consists of Hani, Hira and Kana.
If a script consists of the characters of another script, plus some unique characters, only the unique characters are counted as belonging to the script. For instance, Polyt consists of the characters of Grek, which are used in the monotonic orthography of Modern Greek, plus characters used for Ancient Greek; only the Ancient Greek characters are counted as Polyt in this module. It is the same with Cyrs and Cyrl.
-- Script recognition data: relates Unicode codepoints to script codes.
--
-- Structure of the returned table:
--   [group] = { row, row, ..., length = n }
--       `group` is the codepoint divided by 0x1000, so [0x0A] holds the rows
--       for 0x0A000-0x0AFFF. Groups that have no rows (for example 0x0E)
--       have no key at all.
--       Each row is { first, last, code [, code ...] }: an inclusive
--       codepoint range followed by one or more script codes.
--       Within a group the rows are in ascending order and do not overlap,
--       and `length` equals the number of rows in that group.
--       NOTE(review): the lookup code is not visible here; presumably it
--       relies on the row ordering and on `length`, so keep both in sync if
--       rows are ever edited by hand.
--   individual = { [codepoint] = "code" or "code, code, ..." }
--   blocks     = { { firstGroup, lastGroup, code }, ... }
--
-- According to the module documentation this table is generated from the
-- script patterns in Module:scripts/data and should be regenerated, not
-- hand-edited, when those patterns change.
return {
[0x00] = {
{ 0x00041, 0x0005A, "Latn"},
{ 0x00061, 0x0007A, "Latn"},
{ 0x000C0, 0x000D6, "Latn"},
{ 0x000D8, 0x000F6, "Latn"},
{ 0x000F8, 0x0024F, "Latn"},
{ 0x00370, 0x003E1, "Grek", "Polyt" },
{ 0x003E2, 0x003EF, "Copt" },
{ 0x003F0, 0x003FF, "Grek", "Polyt" },
{ 0x00400, 0x0045F, "Cyrl" },
{ 0x00460, 0x00469, "Cyrs" },
{ 0x0046A, 0x0046D, "Cyrl" },
{ 0x0046E, 0x00471, "Cyrs" },
{ 0x00472, 0x00475, "Cyrl" },
{ 0x00476, 0x00489, "Cyrs" },
{ 0x0048A, 0x00527, "Cyrl" },
{ 0x00531, 0x0058F, "Armn" },
{ 0x00590, 0x005FF, "Hebr" },
{ 0x00600, 0x006FF, "Arab" },
{ 0x00700, 0x0074F, "Syrc" },
{ 0x00750, 0x0077F, "Arab" },
{ 0x00780, 0x007B1, "Thaa" },
{ 0x007C0, 0x007FF, "Nkoo" },
{ 0x00800, 0x0083E, "Samr" },
{ 0x00840, 0x0085E, "Mand" },
{ 0x00860, 0x0086A, "Syrc" },
{ 0x008A0, 0x008FF, "Arab" },
{ 0x00900, 0x0097F, "Deva" },
{ 0x00980, 0x00983, "Beng" },
{ 0x00985, 0x0098C, "Beng" },
{ 0x00993, 0x009A8, "Beng" },
{ 0x009AA, 0x009B0, "Beng" },
{ 0x009B6, 0x009B9, "Beng" },
{ 0x009BC, 0x009C4, "Beng" },
{ 0x009CB, 0x009CE, "Beng" },
{ 0x009E0, 0x009E3, "Beng" },
{ 0x009E6, 0x009EF, "Beng" },
{ 0x009F0, 0x009F1, "as-Beng" },
{ 0x00A01, 0x00A76, "Guru" },
{ 0x00A81, 0x00AF1, "Gujr" },
{ 0x00B01, 0x00B77, "Orya" },
{ 0x00B82, 0x00BFA, "Taml" },
{ 0x00C00, 0x00C7F, "Telu" },
{ 0x00C80, 0x00CF2, "Knda" },
{ 0x00D02, 0x00D7F, "Mlym" },
{ 0x00D82, 0x00DF4, "Sinh" },
{ 0x00E01, 0x00E5B, "Thai" },
{ 0x00E81, 0x00EDF, "Laoo" },
{ 0x00F00, 0x00FDA, "Tibt" },
length = 48,
},
[0x01] = {
{ 0x01000, 0x0109F, "Mymr" },
{ 0x010A0, 0x010CD, "Geok" },
{ 0x010D0, 0x010FF, "Geor" },
{ 0x01100, 0x011FF, "Hang" },
{ 0x01200, 0x01399, "Ethi" },
{ 0x013A0, 0x013F4, "Cher" },
{ 0x01400, 0x0167F, "Cans" },
{ 0x01680, 0x0169C, "Ogam" },
{ 0x016A0, 0x016F0, "Runr" },
{ 0x01700, 0x01714, "Tglg" },
{ 0x01720, 0x01734, "Hano" },
{ 0x01740, 0x01753, "Buhd" },
{ 0x01760, 0x01773, "Tagb" },
{ 0x01780, 0x017F9, "Khmr" },
{ 0x01800, 0x018AA, "Mong" },
{ 0x01900, 0x0194F, "Limb" },
{ 0x01950, 0x01974, "Tale" },
{ 0x01980, 0x019DF, "Talu" },
{ 0x019E0, 0x019FF, "Khmr" },
{ 0x01A00, 0x01A1F, "Bugi" },
{ 0x01A20, 0x01AAD, "Lana" },
{ 0x01B00, 0x01B7C, "Bali" },
{ 0x01B80, 0x01BBF, "Sund" },
{ 0x01BC0, 0x01BFF, "Batk" },
{ 0x01C00, 0x01C4F, "Lepc" },
{ 0x01C50, 0x01C7F, "Olck" },
{ 0x01C90, 0x01CBF, "Geor" },
{ 0x01E00, 0x01EFF, "Latn" },
{ 0x01F00, 0x01FFE, "Polyt" },
length = 29,
},
[0x02] = {
{ 0x02190, 0x021FF, "Zsym" },
{ 0x02200, 0x022FF, "Zmth" },
{ 0x02300, 0x023FF, "Zsym" },
{ 0x02500, 0x027BF, "Zsym" },
{ 0x027C0, 0x027EF, "Zmth" },
{ 0x02800, 0x028FF, "Brai" },
{ 0x02980, 0x02AFF, "Zmth" },
{ 0x02B00, 0x02BFE, "Zsym" },
{ 0x02C00, 0x02C5E, "Glag" },
{ 0x02C60, 0x02C7F, "Latn" },
{ 0x02C80, 0x02CFF, "Copt" },
{ 0x02D00, 0x02D2D, "Geok" },
{ 0x02D30, 0x02D7F, "Tfng" },
{ 0x02D80, 0x02DDE, "Ethi" },
{ 0x02E80, 0x02FDF, "Hani" },
length = 15,
},
[0x03] = {
{ 0x03001, 0x03002, "Hani", "Bopo", "Hang", "Hira", "Kana", "Yiii" },
{ 0x03003, 0x03007, "Hani" },
{ 0x03008, 0x03011, "Hani", "Bopo", "Hang", "Hira", "Kana", "Yiii" },
{ 0x03012, 0x03013, "Hani" },
{ 0x03014, 0x0301B, "Hani", "Bopo", "Hang", "Hira", "Kana", "Yiii" },
{ 0x0301C, 0x0301F, "Hani", "Bopo", "Hang", "Hira", "Kana" },
{ 0x03020, 0x0303F, "Hani" },
{ 0x03041, 0x0309F, "Hira" },
{ 0x030A0, 0x030FF, "Kana" },
{ 0x03105, 0x0312F, "Bopo" },
{ 0x03131, 0x0318E, "Hang" },
{ 0x031A0, 0x031BA, "Bopo" },
{ 0x031C0, 0x031E3, "Hani" },
{ 0x031F0, 0x031FF, "Kana" },
{ 0x03220, 0x03247, "Hani" },
{ 0x03280, 0x032B0, "Hani" },
{ 0x032C0, 0x032CB, "Hani" },
{ 0x03300, 0x03357, "Kana" },
{ 0x03358, 0x03370, "Hani" },
{ 0x0337B, 0x0337F, "Hani" },
{ 0x033E0, 0x033FE, "Hani" },
{ 0x03400, 0x03FFF, "Hani" },
length = 22,
},
[0x04] = {
{ 0x04000, 0x04DB5, "Hani" },
{ 0x04E00, 0x04FFF, "Hani" },
length = 2,
},
[0x05] = {
{ 0x05000, 0x05FFF, "Hani" },
length = 1,
},
[0x06] = {
{ 0x06000, 0x06FFF, "Hani" },
length = 1,
},
[0x07] = {
{ 0x07000, 0x07FFF, "Hani" },
length = 1,
},
[0x08] = {
{ 0x08000, 0x08FFF, "Hani" },
length = 1,
},
[0x09] = {
{ 0x09000, 0x09FFF, "Hani" },
length = 1,
},
[0x0A] = {
{ 0x0A000, 0x0A4C6, "Yiii" },
{ 0x0A4D0, 0x0A4FF, "Lisu" },
{ 0x0A500, 0x0A62B, "Vaii" },
{ 0x0A640, 0x0A67F, "Cyrs" },
{ 0x0A680, 0x0A697, "Cyrl" },
{ 0x0A6A0, 0x0A6F7, "Bamu" },
{ 0x0A720, 0x0A7FF, "Latn" },
{ 0x0A800, 0x0A82B, "Sylo" },
{ 0x0A830, 0x0A832, "Deva", "Dogr", "Gujr", "Guru", "Khoj", "Knda", "Kthi", "Mahj", "Modi", "Nand", "Sind", "Takr", "Tirh"},
{ 0x0A833, 0x0A835, "Deva", "Dogr", "Gujr", "Guru", "Khoj", "Knda", "Kthi", "Mahj", "Mlym", "Modi", "Nand", "Sind", "Takr", "Tirh"},
{ 0x0A836, 0x0A839, "Deva", "Dogr", "Gujr", "Guru", "Khoj", "Kthi", "Mahj", "Modi", "Sind", "Takr", "Tirh"},
{ 0x0A840, 0x0A877, "Phag" },
{ 0x0A880, 0x0A8D9, "Saur" },
{ 0x0A8E0, 0x0A8FF, "Deva" },
{ 0x0A900, 0x0A92F, "Kali" },
{ 0x0A930, 0x0A95F, "Rjng" },
{ 0x0A980, 0x0A9DF, "Java" },
{ 0x0A9E0, 0x0A9FE, "Mymr" },
{ 0x0AA00, 0x0AA5F, "Cham" },
{ 0x0AA60, 0x0AA7F, "Mymr" },
{ 0x0AA80, 0x0AADF, "Tavt" },
{ 0x0AAE0, 0x0AAFF, "Mtei" },
{ 0x0AB01, 0x0AB2E, "Ethi" },
{ 0x0AB30, 0x0AB65, "Latn" },
{ 0x0AB70, 0x0ABBF, "Cher" },
{ 0x0ABC0, 0x0ABFF, "Mtei" },
{ 0x0AC00, 0x0AFFF, "Hang" },
length = 27,
},
[0x0B] = {
{ 0x0B000, 0x0BFFF, "Hang" },
length = 1,
},
[0x0C] = {
{ 0x0C000, 0x0CFFF, "Hang" },
length = 1,
},
[0x0D] = {
{ 0x0D000, 0x0D7A3, "Hang" },
length = 1,
},
[0x0F] = {
{ 0x0FA27, 0x0FA29, "Hani" },
{ 0x0FB13, 0x0FB17, "Armn" },
{ 0x0FB1D, 0x0FB4F, "Hebr" },
{ 0x0FB50, 0x0FDFD, "Arab" },
{ 0x0FE45, 0x0FE46, "Hani", "Bopo", "Hang", "Hira", "Kana" },
{ 0x0FE70, 0x0FEFC, "Arab" },
{ 0x0FF61, 0x0FF65, "Hani", "Bopo", "Hang", "Hira", "Kana", "Yiii" },
length = 7,
},
[0x10] = {
{ 0x10000, 0x100FA, "Linb" },
{ 0x10280, 0x1029C, "Lyci" },
{ 0x102A0, 0x102D0, "Cari" },
{ 0x102E1, 0x102FB, "Copt" },
{ 0x10300, 0x10323, "Ital" },
{ 0x10330, 0x1034A, "Goth" },
{ 0x10350, 0x1037A, "Perm" },
{ 0x10380, 0x1039F, "Ugar" },
{ 0x103A0, 0x103D5, "Xpeo" },
{ 0x10400, 0x1044F, "Dsrt" },
{ 0x10450, 0x1047F, "Shaw" },
{ 0x10480, 0x104A9, "Osma" },
{ 0x104B0, 0x104FB, "Osge" },
{ 0x10500, 0x10527, "Elba" },
{ 0x10530, 0x10563, "Aghb" },
{ 0x10600, 0x10767, "Lina" },
{ 0x10800, 0x1083F, "Cprt" },
{ 0x10840, 0x1085F, "Armi" },
{ 0x10860, 0x1087F, "Palm" },
{ 0x10880, 0x108AF, "Nbat" },
{ 0x108E0, 0x108FF, "Hatr" },
{ 0x10900, 0x1091F, "Phnx" },
{ 0x10920, 0x1093F, "Lydi" },
{ 0x10980, 0x1099F, "Mero" },
{ 0x109A0, 0x109BF, "Merc" },
{ 0x10A00, 0x10A58, "Khar" },
{ 0x10A60, 0x10A7F, "Sarb" },
{ 0x10A80, 0x10A9F, "Narb" },
{ 0x10AC0, 0x10AF6, "Mani" },
{ 0x10B00, 0x10B3F, "Avst" },
{ 0x10B40, 0x10B5F, "Prti" },
{ 0x10B60, 0x10B7F, "Phli" },
{ 0x10B80, 0x10BAF, "Phlp" },
{ 0x10C00, 0x10C48, "Orkh" },
{ 0x10C80, 0x10CB2, "Hung" },
{ 0x10D00, 0x10D39, "Rohg" },
{ 0x10E60, 0x10E7E, "Rumin" },
{ 0x10F00, 0x10F27, "Sogo" },
{ 0x10F30, 0x10F59, "Sogd" },
{ 0x10F70, 0x10FAF, "Ougr" },
{ 0x10FE0, 0x10FFF, "Elym" },
length = 41,
},
[0x11] = {
{ 0x11000, 0x1107F, "Brah" },
{ 0x11080, 0x110CD, "Kthi" },
{ 0x110D0, 0x110F9, "Sora" },
{ 0x11100, 0x11146, "Cakm" },
{ 0x11150, 0x11176, "Mahj" },
{ 0x11180, 0x111D9, "Shrd" },
{ 0x11200, 0x1123D, "Khoj" },
{ 0x11280, 0x112A9, "Mult" },
{ 0x112B0, 0x112F9, "Sind" },
{ 0x11301, 0x11374, "Gran" },
{ 0x11400, 0x1145E, "Newa" },
{ 0x11480, 0x114D9, "Tirh" },
{ 0x11580, 0x115DD, "Sidd" },
{ 0x11600, 0x11659, "Modi" },
{ 0x11680, 0x116C9, "Takr" },
{ 0x11700, 0x1173F, "Ahom" },
{ 0x11800, 0x1183B, "Dogr" },
{ 0x118A0, 0x118FF, "Wara" },
{ 0x119A0, 0x119FF, "Nand" },
{ 0x11A00, 0x11A47, "Zanb" },
{ 0x11A50, 0x11AA2, "Soyo" },
{ 0x11AC0, 0x11AF8, "Pauc" },
{ 0x11C00, 0x11C6C, "Bhks" },
{ 0x11C70, 0x11CB6, "Marc" },
{ 0x11D00, 0x11D59, "Gonm" },
{ 0x11D60, 0x11DA9, "Gong" },
{ 0x11EE0, 0x11EF8, "Maka" },
length = 27,
},
[0x12] = {
{ 0x12000, 0x1236E, "Xsux" },
{ 0x12400, 0x12473, "Xsux" },
{ 0x12F90, 0x12FFF, "Cpmn" },
length = 3,
},
[0x13] = {
{ 0x13000, 0x1342E, "Egyp" },
length = 1,
},
[0x14] = {
{ 0x14400, 0x14646, "Hluw" },
length = 1,
},
[0x16] = {
{ 0x16800, 0x16A38, "Bamu" },
{ 0x16A40, 0x16A6F, "Mroo" },
{ 0x16AD0, 0x16AF5, "Bass" },
{ 0x16B00, 0x16B8F, "Hmng" },
{ 0x16E40, 0x16E9A, "Medf" },
{ 0x16F00, 0x16F9F, "Plrd" },
length = 6,
},
[0x17] = {
{ 0x17000, 0x17FFF, "Tang" },
length = 1,
},
[0x18] = {
{ 0x18000, 0x18AF2, "Tang" },
length = 1,
},
[0x1B] = {
{ 0x1B001, 0x1B11E, "Hira" },
{ 0x1B170, 0x1B2FB, "Nshu" },
{ 0x1BC00, 0x1BC9F, "Dupl" },
length = 3,
},
[0x1D] = {
{ 0x1D100, 0x1D1DD, "Music" },
{ 0x1D2E0, 0x1D2F3, "Maya" },
{ 0x1D400, 0x1D7FF, "Zmth" },
{ 0x1D800, 0x1DAAF, "Sgnw" },
length = 4,
},
[0x1E] = {
{ 0x1E000, 0x1E02A, "Glag" },
{ 0x1E800, 0x1E8D6, "Mend" },
{ 0x1E900, 0x1E95F, "Adlm" },
length = 3,
},
[0x1F] = {
{ 0x1F000, 0x1F0F5, "Zsym" },
{ 0x1F300, 0x1FA6D, "Zsym" },
length = 2,
},
[0x20] = {
{ 0x20000, 0x20FFF, "Hani" },
length = 1,
},
[0x21] = {
{ 0x21000, 0x21FFF, "Hani" },
length = 1,
},
[0x22] = {
{ 0x22000, 0x22FFF, "Hani" },
length = 1,
},
[0x23] = {
{ 0x23000, 0x23FFF, "Hani" },
length = 1,
},
[0x24] = {
{ 0x24000, 0x24FFF, "Hani" },
length = 1,
},
[0x25] = {
{ 0x25000, 0x25FFF, "Hani" },
length = 1,
},
[0x26] = {
{ 0x26000, 0x26FFF, "Hani" },
length = 1,
},
[0x27] = {
{ 0x27000, 0x27FFF, "Hani" },
length = 1,
},
[0x28] = {
{ 0x28000, 0x28FFF, "Hani" },
length = 1,
},
[0x29] = {
{ 0x29000, 0x29FFF, "Hani" },
length = 1,
},
[0x2A] = {
{ 0x2A000, 0x2AFFF, "Hani" },
length = 1,
},
[0x2B] = {
{ 0x2B000, 0x2BFFF, "Hani" },
length = 1,
},
[0x2C] = {
{ 0x2C000, 0x2CFFF, "Hani" },
length = 1,
},
[0x2D] = {
{ 0x2D000, 0x2DFFF, "Hani" },
length = 1,
},
[0x2E] = {
{ 0x2E000, 0x2EBE0, "Hani" },
length = 1,
},
-- Single codepoints mapped directly to a script string, keyed by codepoint.
-- Some keys also fall inside a range row above with a different value:
-- 0x00462 and 0x00463 lie within the "Cyrs" row 0x00460-0x00469 but map to
-- "Cyrl" here, and 0x03003 lies within a "Hani"-only row but lists five
-- scripts here.
-- NOTE(review): presumably these entries take precedence over range rows;
-- confirm against the lookup code in Module:scripts.
-- NOTE(review): multi-script values here are one comma-separated string,
-- whereas range rows list each script code as a separate element; confirm
-- that the consumer handles both forms.
individual = {
[0x00462] = "Cyrl",
[0x00463] = "Cyrl",
[0x0060C] = "Arab, Nkoo, Rohg, Syrc, Thaa, Yezi",
[0x0061B] = "Arab, Nkoo, Rohg, Syrc, Thaa, Yezi",
[0x0061F] = "Arab, Adlm, Nkoo, Rohg, Syrc, Thaa, Yezi",
[0x00640] = "Arab, Adlm, Mand, Mani, Ougr, Phlp, Rohg, Sogd, Syrc",
[0x00951] = "Deva, Beng, Gran, Gujr, Guru, Knda, Latn, Mlym, Orya, Shrd, Taml, Telu, Tirh",
[0x00952] = "Deva, Beng, Gran, Gujr, Guru, Knda, Latn, Mlym, Orya, Taml, Telu, Tirh",
[0x00964] = "Deva, Beng, Dogr, Gong, Gonm, Gran, Gujr, Guru, Knda, Mahj, Mlym, Nand, Orya, Sind, Sinh, Sylo, Takr, Taml, Telu, Tirh",
[0x00965] = "Deva, Beng, Dogr, Gong, Gonm, Gran, Gujr, Guru, Knda, Limb, Mahj, Mlym, Nand, Orya, Sind, Sinh, Sylo, Takr, Taml, Telu, Tirh",
[0x0098F] = "Beng",
[0x00990] = "Beng",
[0x009A1] = "Beng",
[0x009A2] = "Beng",
[0x009AF] = "Beng",
[0x009B2] = "Beng",
[0x009BC] = "Beng",
[0x009C7] = "Beng",
[0x009C8] = "Beng",
[0x009D7] = "Beng",
[0x01CDA] = "Deva, Knda, Mlym, Orya, Taml, Telu",
[0x01CF2] = "Deva, Beng, Gran, Knda, Nand, Orya, Telu, Tirh",
[0x02135] = "Zmth",
[0x03000] = "Hani",
[0x03003] = "Hani, Bopo, Hang, Hira, Kana",
[0x03013] = "Hani, Bopo, Hang, Hira, Kana",
[0x03030] = "Hani, Bopo, Hang, Hira, Kana",
[0x03037] = "Hani, Bopo, Hang, Hira, Kana",
[0x030FB] = "Kana, Hani, Bopo, Hang, Hira, Yiii",
[0x032FF] = "Hani",
[0x0FA0E] = "Hani",
[0x0FA0F] = "Hani",
[0x0FA11] = "Hani",
[0x0FA13] = "Hani",
[0x0FA14] = "Hani",
[0x0FA1F] = "Hani",
[0x0FA21] = "Hani",
[0x0FA23] = "Hani",
[0x0FA24] = "Hani",
[0x1056F] = "Aghb",
[0x16FE0] = "Tang",
[0x16FE1] = "Nshu",
[0x1B000] = "Kana",
},
-- Rows of { first group key, last group key, script code }. In every group
-- covered by a row here, all range rows above carry that same script.
-- NOTE(review): presumably this lets the lookup resolve a whole group without
-- scanning its rows. The groups' own rows do not cover every codepoint (for
-- example 0x04DB6-0x04DFF in group 0x04 and 0x0D7A4-0x0DFFF in group 0x0D),
-- so a block-level match would be broader than the rows; confirm that this
-- is intended.
blocks = {
{ 0x04, 0x09, "Hani" },
{ 0x0B, 0x0D, "Hang" },
{ 0x17, 0x18, "Tang" },
{ 0x20, 0x2E, "Hani" },
},
}