Module:Jpan-sortkey
Jump to navigation
Jump to search
- The following documentation is located at Module:Jpan-sortkey/documentation. [edit] Categories were auto-generated by Module:module categorization. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
This module will sort text in the Japanese script. It is used to sort Southern Amami Ōshima, Japanese, Hachijō, Kikai, Translingual, Miyako, Old Japanese, Okinoerabu, Northern Amami Ōshima, Yaeyama, Okinawan, Tokunoshima, Kunigami, Yonaguni, and Yoron.
The module should preferably not be called directly from templates or other modules.
To use it from a template, use {{sortkey}}.
Within a module, use Module:languages#Language:makeSortKey.
For testcases, see Module:Jpan-sortkey/testcases.
Functions
makeSortKey(text, lang, sc)
- Generates a sortkey for a given piece of text written in the script specified by the code sc, and language specified by the code lang.
- When the sort fails, returns nil.
-- Module-level setup: local aliases for library functions and handles to the
-- other modules used by makeSortKey.
local require = require
-- NOTE(review): judging by its name, require_when_needed presumably defers
-- loading the named module (or member) until first use -- confirm against
-- Module:require when needed.
local require_when_needed = require("Module:require when needed")
-- Table of public functions returned at the end of the module.
local export = {}
local concat = table.concat
local find_templates = require_when_needed("Module:template parser", "find_templates")
local get_by_code = require_when_needed("Module:languages", "getByCode")
local get_section = require_when_needed("Module:pages", "get_section")
local Hani_sort = require_when_needed("Module:Hani-sortkey", "makeSortKey")
-- Loaded eagerly: makeSortKey calls Hira_sort on every invocation.
local Hira_sort = require("Module:Hira-sortkey").makeSortKey
local insert = table.insert
local toNFC = mw.ustring.toNFC
local track = require_when_needed("Module:debug/track")
local ugsub = mw.ustring.gsub
local umatch = mw.ustring.match
local usub = require_when_needed("Module:string utilities", "sub")
-- Character ranges from Module:ja/data/range. makeSortKey concatenates these
-- strings inside a "[...]" set, so each is used as a character-class fragment.
local range = mw.loadData("Module:ja/data/range")
local kanji_pattern = range.kanji
local ideograph_pattern = range.ideograph
local kana_graph_pattern = range.kana_graph
local latin_pattern = range.latin
-- Generates a sortkey for `text` in the language with code `lang` and the
-- script with code `sc`.
--
-- Unless `lang` is "mul", and for as long as `text` contains a digit or a
-- character from the kanji, ideograph, kana_graph or latin ranges, the page
-- titled `text` is read and its section for the language is searched for a
-- kana reading:
--   * parameter 2 of {{<lang>-head}} / {{<lang>-pos}};
--   * parameter 1 of {{<lang>-noun}}, {{<lang>-verb}}, {{<lang>-adj}},
--     {{<lang>-phrase}}, {{<lang>-verb form}}, {{<lang>-verb-suru}},
--     {{<lang>-see}}, {{<lang>-see-kango}} or {{<lang>-gv}};
--   * the positional argument following "kana" in {{head}} / {{head-lite}}
--     whose first argument is `lang`;
--   * failing those, the first {{<lang>-kanjitab}}: its `sortkey` parameter,
--     or else the text with its kanji replaced by the listed readings.
-- The first headword reading found wins. The text is then replaced by the
-- reading and the lookup repeats; pages already visited are not read again.
--
-- The result is finally passed through the hiragana sort and then the Hani
-- sort (the latter as a fallback for any kanji still left).
--
-- @param text string to generate a sortkey for
-- @param lang language code (string); "mul" skips the page lookup entirely
-- @param sc   script code, passed through unchanged to the Hira/Hani sorters
-- @return the sortkey produced by the Hira and Hani sort passes
function export.makeSortKey(text, lang, sc)
	-- Determine reading.
	local seen_pages, langname = {}
	while lang ~= "mul" and (not seen_pages[text]) and umatch(text, "[0-9" .. kanji_pattern .. ideograph_pattern .. kana_graph_pattern .. latin_pattern .. "]") do
		-- `repeat ... until true` runs exactly once; a `break` directly inside it
		-- therefore acts as "continue" for the enclosing while loop.
		repeat
			langname = langname or get_by_code(lang):getCanonicalName()
			-- Mark before any early exit so the while condition stops re-reading this page.
			seen_pages[text] = true
			-- mw.title.new returns nil for an invalid title and getContent returns
			-- nil for a missing page; in either case there is nothing to look up.
			local title = mw.title.new(toNFC(text))
			local content = title and title:getContent()
			if not content then
				break
			end
			content = get_section(content, langname, 2)
			if not content then
				break
			end
			-- kanjitab: arguments of the first <lang>-kanjitab seen (fallback source).
			-- br: set once a headword template has supplied a reading.
			local kanjitab, br
			for template in find_templates(content) do
				local name = template:get_name()
				if (
					name == lang .. "-head" or
					name == lang .. "-pos"
				) then
					local reading = template:get_arguments()[2]
					if reading ~= nil then
						text = reading
						br = true
						break
					end
				elseif (
					name == lang .. "-noun" or
					name == lang .. "-verb" or
					name == lang .. "-adj" or
					name == lang .. "-phrase" or
					name == lang .. "-verb form" or
					name == lang .. "-verb-suru" or
					name == lang .. "-see" or
					name == lang .. "-see-kango" or
					name == lang .. "-gv"
				) then
					local reading = template:get_arguments()[1]
					if reading ~= nil then
						text = reading
						br = true
						break
					end
				elseif (
					name == "head" or
					name == "head-lite"
				) then
					local args = template:get_arguments()
					if args[1] == lang then
						for i, arg in ipairs(args) do
							if arg == "kana" then
								local kana = args[i + 1]
								if kana then
									text = kana
									br = true
									break
								end
							end
						end
						-- The break above only leaves the argument scan; also stop
						-- scanning templates so a later headword cannot overwrite
						-- the reading, matching the other branches.
						if br then
							break
						end
					end
				elseif not kanjitab and name == lang .. "-kanjitab" then
					kanjitab = template:get_arguments()
				end
			end
			if kanjitab and not br then
				track{"Jpan-sortkey/kanjitab", "Jpan-sortkey/kanjitab/" .. lang}
				if kanjitab.sortkey then
					text = kanjitab.sortkey
					break
				end
				-- Split the text around each kanji (or 々): non_kanji[n] is the
				-- run of non-kanji text preceding the n-th kanji, and the final
				-- entry is whatever follows the last kanji.
				local non_kanji = {}
				local kanji_border = 1
				ugsub(text, "()([" .. kanji_pattern .. "々])()", function(p1, _, p2)
					insert(non_kanji, usub(text, kanji_border, p1 - 1))
					kanji_border = p2
				end)
				insert(non_kanji, usub(text, kanji_border))
				-- Rebuild the text, substituting each reading for the kanji it covers.
				local sortkey = {non_kanji[1]}
				-- Index into non_kanji of the run that follows the kanji consumed so far.
				local id = 1
				for i in ipairs(kanjitab) do
					-- Each positional argument is a reading optionally followed by the
					-- number of kanji it spans (1 if absent).
					local reading_kana, reading_length = umatch(kanjitab[i] or "", "^([^0-9]*)([0-9]*)$")
					reading_kana = reading_kana ~= "" and reading_kana or nil
					reading_length = reading_kana and tonumber(reading_length) or 1
					local reading
					if reading_kana then
						-- k<i> overrides the reading; o<i> is appended to it.
						reading = (kanjitab["k" .. i] or reading_kana) .. (kanjitab["o" .. i] or "")
						-- A lone "-" contributes nothing to the sortkey.
						if reading == "-" then
							reading = nil
						end
					end
					id = id + reading_length
					insert(sortkey, (reading or "") .. (non_kanji[id] or ""))
				end
				sortkey = concat(sortkey)
				if sortkey ~= "" then
					text = sortkey
				end
			end
		until true
	end
	-- Use hiragana sort.
	text = Hira_sort(text, lang, sc)
	-- Run through Hani sort, to catch any stray kanji. This shouldn't happen but often does, and we still want to handle them sensibly in the time before the entry is fixed. Exclude spaces and punctuation, since otherwise Hani_sort automatically removes them.
	local ret = ugsub(text, "[^%s%p]+", function(str)
		return Hani_sort(str, lang, sc)
	end)
	-- Track cases where the Hani pass actually changed something.
	if not (lang == "mul" or ret == text) then
		track{"Jpan-sortkey/fallback", "Jpan-sortkey/fallback/" .. lang}
	end
	return ret
end
-- Expose the module's public functions (makeSortKey).
return export
Categories:
- Japanese script
- Sortkey-generating modules by script
- Japanese script modules
- Sortkey-generating modules
- Japanese modules
- Kunigami modules
- Northern Amami Ōshima modules
- Translingual modules
- Okinoerabu modules
- Tokunoshima modules
- Old Japanese modules
- Okinawan modules
- Kikai modules
- Hachijō modules
- Yaeyama modules
- Southern Amami Ōshima modules
- Yonaguni modules
- Yoron modules
- Miyako modules