Module:ja-parse
Appearance
- පහත දැක්වෙන උපදෙස්, Module:ja-parse/documentation හි පිහිටා ඇත. [සංස්කරණය]
- ප්‍රයෝජනවත් සබැඳි: උප පිටු ලැයිස්තුව • සබැඳි • transclusions • testcases • sandbox (වෙනස)
Auxiliary functions to parse the source of Japanese entries. Currently only used by Module:ja-see.
local export = {}
local len = mw.ustring.len
local sub = mw.ustring.sub
local gsub = mw.ustring.gsub
local find = mw.ustring.find
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch
local m_ja = require('Module:ja')
-- Auxiliary functions
-- The three *_pattern strings below are character-class bodies without the
-- surrounding brackets; the code in this module wraps them as
-- '[' .. pattern .. ']' before passing them to the mw.ustring functions.
-- Kanji ranges and individual code points.
-- NOTE(review): the literal ends in a dangling '-' after the last range, which
-- suggests a final range endpoint may have been lost -- confirm against upstream.
local kanji_pattern = "一-鿿㐀-䶿﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧-﨩𠀀-𪛟𪜀-𰀀-"
-- Hiragana range, katakana range and the prolonged sound mark.
local kana_pattern = 'ぁ-ゖァ-ヺー'
-- Everything accepted as part of a spelling: kana, kanji, ASCII letters and
-- digits, plus the marks 〆 and 々.
local japanese_pattern = kana_pattern .. kanji_pattern .. 'a-zA-Z0-9〆々'
-- Set of headword template names. export.parse_word consults it to decide
-- which template headers are kept when the wikitext is expanded for categories.
local headword_templates = {
['ja-adj'] = true, ['ja-pos'] = true, ['ja-noun'] = true, ['ja-phrase'] = true,
['ja-verb'] = true, ['ja-verb form'] = true, ['ja-verb-suru'] = true,
}
-- Return the full text of a supported headword template found in `wikitext`
-- (from its opening '{{' through the balancing '}}'), or nothing when no
-- supported template is present.
-- Candidates are tried in a fixed priority order: a name earlier in the list
-- wins even when a later name occurs sooner in the text.
local function find_headword_template(wikitext)
	local candidates = {
		'{{ja%-noun[|}]',
		'{{ja%-adj[|}]',
		'{{ja%-pos[|}]',
		'{{ja%-phrase[|}]',
		'{{ja%-verb[|}]',
		'{{ja%-verb form[|}]',
		'{{ja%-verb%-suru[|}]',
	}
	for _, candidate in ipairs(candidates) do
		local start = wikitext:find(candidate)
		if start then
			-- This assumes that the template has matching braces.
			return wikitext:match('%b{}', start)
		end
	end
end
-- Break a template call such as '{{name|a|k=v}}' into its name and arguments.
-- Only the simplest format is supported: the text is split on every '|'.
-- The one exception is the pipe inside a piped link made only of plain text
-- ([[target|label]]), which is kept intact.
-- Returns the template name and a table of arguments: pieces containing '='
-- are stored under the text before the first '=' (always a string key), the
-- remaining pieces are appended in order.
local function parse_template(wikitext) -- only supports the simplest format
	-- Swap the pipe of piped links for a backtick so the split leaves them alone.
	local protected = wikitext:gsub('%[%[([^%[%]|]-)|([^%[%]|]-)%]%]', '[[%1`%2]]')
	-- Drop the enclosing braces.
	local inner = protected:gsub('^{{', ''):gsub('}}$', '')
	local name
	local args = {}
	for piece in mw.text.gsplit(inner, '|') do
		if name then
			-- Put the protected pipes back.
			local restored = piece:gsub('`', '|')
			local key, value = match(restored, "(.-)=(.*)")
			if key and value then
				args[key] = value
			else
				args[#args + 1] = restored
			end
		else
			-- The first piece is the template name.
			name = piece
		end
	end
	return name, args
end
-- Report whether `item` equals (==) one of the elements list[1] .. list[#list].
local function contains(list, item)
	local found = false
	local position, count = 1, #list
	while not found and position <= count do
		found = list[position] == item
		position = position + 1
	end
	return found
end
-- Part I: functions to parse entries into words
--- Split the Japanese section of a page into words and collect their spellings.
-- The wikitext of `page_title` is read, its ==Japanese== section is cut into
-- level-3 sections, and those are grouped into words: one word per
-- "Etymology N" section when the page has an "Etymology 1" header, otherwise a
-- single word made of every section except the Kanji ones.
-- @param page_title title of the page to read
-- @return a list of entries; each entry holds the word's wikitext at index 1,
--   `type` ('', 'redirect' or 'lemma') and the lists `kana_spellings`,
--   `kanji_spellings` and `historical_spellings`
function export.words(page_title)
	-- mw.title.new returns nil for an invalid title; treat it like a page
	-- without content instead of raising an error.
	local title = mw.title.new(page_title)
	local page = title and title:getContent() or ''
	local l2 = match(page, '==Japanese==\n(.-)\n==[^=]+==\n') or match(page, '==Japanese==\n(.*)') or ''

	-- split into L3 sections
	local l3_sections = {}
	local multi_etym = false

	-- special hack for kanji entries: start an unnamed section in front of
	-- every {{ja-kanjitab}} so the text after it is not part of the Kanji section
	if not find(l2, '===Etymology 1===') and (find(l2, '===Kanji===') or find(l2, '===Kanji %d+===')) then
		l2 = gsub(l2, '{{ja%-kanjitab', '=== ===\n{{ja-kanjitab')
	end

	local current_l3_title = ''
	local current_l3_content = {}
	for v in l2:gmatch('[^\n]+') do
		if find(v, '^===[^=]') then
			-- A new header closes the section collected so far.
			table.insert(l3_sections, { current_l3_title, table.concat(current_l3_content, '\n') })
			current_l3_title = match(v, '^===([^=]+)')
			if current_l3_title == 'Etymology 1' then multi_etym = true end
			current_l3_content = {}
		end
		table.insert(current_l3_content, v)
	end
	table.insert(l3_sections, { current_l3_title, table.concat(current_l3_content, '\n') })

	-- group the L3 sections into words
	local words = {}
	if multi_etym then
		for _, v in ipairs(l3_sections) do
			local header = v[1]
			local content = v[2]
			if find(header, '^Etymology %d+$') then
				table.insert(words, content)
			end
		end
	else
		local word = {}
		for _, v in ipairs(l3_sections) do
			local header = v[1]
			local content = v[2]
			if not (header == 'Kanji' or find(header, '^Kanji %d+$')) then
				table.insert(word, content)
			end
		end
		table.insert(words, table.concat(word, '\n'))
	end

	local result = {}

	-- Append `item` to `list` unless it is already there.
	local function add(list, item)
		if not contains(list, item) then table.insert(list, item) end
	end

	-- File `spelling` under the kanji or the kana spellings of `entry`,
	-- depending on whether m_ja.script reports 'Hani' for it.
	local function insert_spelling(entry, spelling)
		if spelling then
			if find(m_ja.script(spelling), 'Hani') then
				add(entry.kanji_spellings, spelling)
			else
				add(entry.kana_spellings, spelling)
			end
		end
	end

	for _, word in ipairs(words) do
		local entry = {
			word,
			type = '',
			kana_spellings = {},
			kanji_spellings = {},
			historical_spellings = {},
		}
		insert_spelling(entry, page_title)

		-- '-' is a quantifier in Lua patterns, so every hyphen of a template
		-- name has to be escaped; an unescaped one in "see-kango" would never
		-- match the literal template name.
		local ja_see = find(word, '{{ja%-see[|}]') or find(word, '{{ja%-see%-kango[|}]')
		if ja_see then
			entry.type = 'redirect'
			-- Scan only up to the first '}}' after the template start; an
			-- unterminated call contributes nothing.
			local see_call = match(word, '.-}}', ja_see) or ''
			for link_title in gmatch(see_call, '[' .. japanese_pattern .. ']+') do
				insert_spelling(entry, link_title)
			end
		else
			local ja_kanjitab = word:find('{{ja%-kanjitab[|}]')
			local headword_template = find_headword_template(word)
			if ja_kanjitab then
				entry.type = 'lemma'
				-- nil when the braces of the template are not balanced
				local kanjitab = word:match('%b{}', ja_kanjitab)
				if kanjitab then
					local _, args = parse_template(kanjitab)
					if args.alt and args.alt ~= "" and args.alt ~= "-" then
						for alt_spelling in mw.text.gsplit(args.alt, ',') do
							-- keep only the part before the first ':'
							insert_spelling(entry, (alt_spelling:gsub(':.+', '')))
						end
					end
				end
			end
			if headword_template then
				entry.type = 'lemma'
				local _, args = parse_template(headword_template)
				for i = 1, #args do
					if find(args[i], '[' .. japanese_pattern .. ']') then
						insert_spelling(entry, m_ja.remove_ruby_markup(args[i]))
					end
				end
				if args.hhira then add(entry.historical_spellings, args.hhira) end
				if args.hkata then add(entry.historical_spellings, args.hkata) end
			end
		end
		table.insert(result, entry)
	end
	return result
end
-- Part II: functions to extract definitions and categories from a word
--- Extract the definition lines and the category links of one word.
-- @param wikitext wikitext of the word
-- @param lemma title handed to headword templates as hira= when m_ja.script
--   reports a kana script for it
-- @param nonlemma spelling used to filter the definition lines
-- @param frame frame object whose preprocess method expands the wikitext
-- @param reading value passed to makeSortKey to build the category sort key
-- @return def, a list of { text, pos = section title } tables, and cat, the
--   concatenated category links
function export.parse_word(wikitext, lemma, nonlemma, frame, reading)
	local def = {}
	local cat_lines = {}
	local current_section = ''
	for line in wikitext:gmatch('[^\n]+') do
		if line:find('^#+ ') then
			if not line:find('{{rfdef') and not (
				-- the nonlemma entry is a kanji spelling and
				find(nonlemma, '[' .. kanji_pattern .. ']') and
				-- is not listed in {{ja-def}} or the lemma entry has <!-- kana only -->
				-- (plain-text finds, so pattern characters in the title are taken literally)
				(line:find('{{ja%-def|') and not (
					line:find('|' .. nonlemma .. '|', 1, true) or
					line:find('|' .. nonlemma .. '}', 1, true)
				) or line:find('<!%-%- kana only %-%->'))
			) then
				-- gsub also returns a count; the parentheses keep only the text
				table.insert(def, { (line:gsub("<ref[ >].-</ref>", "")), pos = current_section })
			end
		elseif line:find('^===') then
			-- strip the '=' signs around a header to get the section title
			current_section = (line:gsub("^=*(.-)=*$", "%1"))
		else
			table.insert(cat_lines, line)
		end
	end

	-- expand the other parts for categories
	local cat = table.concat(cat_lines, '\n')
	cat = gsub(cat, '<ref', '')

	local templates_to_include = {
		-- Categories generated by these templates are copied.
		-- It is currently empty here.
		-- ['ExampleTemplateName'] = true,
	}

	-- Rewrite the header of one template call.
	-- If the template begins with "{{ja-usex|", a is "ja-usex" and b is "|".
	local function process_template_header(a, b)
		if headword_templates[a] then
			local source_script = m_ja.script(lemma)
			if source_script == 'Hira' or source_script == 'Kana' or source_script == 'Hira+Kana' then
				return '{{' .. a .. '|hira=' .. lemma .. b
			else
				return '{{' .. a .. b
			end
		elseif a:find('^R%:') then
			return '{{=' .. b
		elseif a == 'ja-usex' or a:find('^quote') then -- special hack
			return '[[Category:Japanese terms with usage examples]]{{=' .. b
		elseif not templates_to_include[a] then
			-- every other template has its name replaced by "="
			return '{{=' .. b
		else
			return '{{' .. a .. b
		end
	end
	cat = gsub(cat, '{{([^|}\n]+)\n?([|}])', process_template_header)

	-- NOTE(review): the pass above has already rewritten "{{ja-pron|" to "{{=|",
	-- so this pattern looks unable to match any more -- confirm whether it
	-- should run before the header rewrite.
	cat = gsub(cat, '{{ja%-pron.-}}', function(pron)
		local result = ''
		if not find(pron, '|noipa=') then result = result .. '[[Category:Japanese terms with IPA pronunciation]]' end
		if find(pron, '|a=') or find(pron, '|audio=') then result = result .. '[[Category:Japanese terms with audio links]]' end
		return result
	end)
	cat = frame:preprocess(cat)

	local sortkey -- the same for every category, so built once on first use
	local cat2 = {}
	for link in gmatch(cat, '%[%[Category:.-%]%]') do
		-- drop any sort key the link already carries
		link = gsub(link, '|.*', ']]')
		if link == '[[Category:Japanese lemmas]]' then link = '[[Category:Japanese non-lemma forms]]' end
		if sortkey == nil then
			sortkey = (require("Module:languages").getByCode("ja"):makeSortKey(reading))
		end
		link = gsub(link, '%]%]', '|' .. sortkey .. ']]')
		table.insert(cat2, link)
	end
	cat = table.concat(cat2)
	-- one might want to modify the sortkeys here
	return def, cat
end
return export