Module:etymon
පෙනුම
- පහත දැක්වෙන උපදෙස්, Module:etymon/documentation හි පිහිටා ඇත. [සංස්කරණය]
- ප්රයෝජනවත් සබැඳි: උප පිටු ලැයිස්තුව • සබැඳි • transclusions • testcases • sandbox
This module implements the template {{etymon}}.
--[=[
This module implements the {{etymon}} template for structured etymology data on Wiktionary.
It enables the creation of etymology trees and text by parsing etymon chains,
scraping linked pages for their own {{etymon}} data, and recursively building a tree
of derivational relationships.
Authorship:
- Original implementation: [[User:Ioaxxere]]
- Full refactor (September 2025): [[User:Fenakhay]] ([[Special:Diff/86717746]])
Modules:
- [[Module:etymon]]: main module handling parsing, validation, tree building, and page scraping
- [[Module:etymon/data]]: keyword definitions, configuration, and status constants
- [[Module:etymon/tree]]: etymology tree rendering
- [[Module:etymon/text]]: etymology text generation
- [[Module:etymon/categories]]: category generation logic
]=]
-- Table that this module's public functions (e.g. `export.get_tree`) are attached to.
local export = {}
-- Titles of the sibling modules; they are registered with the lazy loader below.
local etymon_data_module = "Module:etymon/data"
local etymon_text_module = "Module:etymon/text"
local etymon_tree_module = "Module:etymon/tree"
local etymon_categories_module = "Module:etymon/categories"
-- Mutable state shared by every function in this module.
local __state = {
	-- "<lang code>:<page>:<id>" -> scraped {{etymon}} arguments, or a STATUS constant
	-- (REDLINK / MISSING) when nothing usable was found. Id-less templates use "*" as id.
	cached_etymon_args = {},
	-- Same keys as above (plus senseid keys) -> name of the page the data was scraped from.
	cached_etymon_pages = {},
	-- "<lang code>:<page>:<senseid>" -> arguments of the {{etymon}} preceding that {{senseid}}.
	senseid_parent_etymon = {},
	-- "<lang code>:<page>" -> list of { id = ..., pos = ... } for each {{etymon}} found there.
	available_etymon_ids = {},
	-- "<lang code>:<page>" -> arguments of the page's {{etymon}} when it has exactly one.
	single_etymons = {},
	-- Set when an inline <ety:...> was used because scraping found nothing.
	current_page_has_inline_etymology = false,
	-- Set when an inline <ety:...> was given although scraped data exists.
	current_page_has_redundant_etymology = false,
	-- Set when an etymon without an <id:...> was looked up.
	used_idless_etymon = false,
	-- The next three mirror the flags above, but are only set for depth-0 (top-level) nodes.
	toplevel_has_inline_etymology = false,
	toplevel_redundant_etymology = false,
	toplevel_idless_etymon = false,
	-- Set when a requested id is missing although the target page does have etymon ids.
	has_mismatched_id = false,
	-- Deepest recursion depth seen by TreeBuilder.build.
	max_depth_reached = 0,
	-- Number of TreeBuilder.build invocations (one per node).
	total_nodes = 0,
	-- Language code -> number of nodes built for that language.
	language_count = {},
	-- Keyword -> { count, target_langs, source_langs } collected at the top level only.
	toplevel_keyword_stats = {},
	-- Formatted warning strings accumulated by Util.add_warning.
	warnings = {},
}
-- Lazy module loader: maps short aliases to module titles so that modules are
-- only `require`d on first use (see Loader.init).
local Loader = {}
-- Alias -> module title. The alias is the field name used on the loader proxy.
Loader.modules = {
	-- Sub-modules of Module:etymon
	data = etymon_data_module,
	tree = etymon_tree_module,
	text = etymon_text_module,
	categories = etymon_categories_module,
	-- Other modules
	anchors = "Module:anchors",
	etydate = "Module:etydate",
	etymology = "Module:etymology",
	headword_data = "Module:headword/data",
	languages = "Module:languages",
	languages_errorgetby = "Module:languages/errorGetBy",
	links = "Module:links",
	pages = "Module:pages",
	parameters = "Module:parameters",
	parameters_data = "Module:parameters/data",
	string_utilities = "Module:string utilities",
	template_parser = "Module:template parser",
	utilities = "Module:utilities",
	debug = "Module:debug",
	parse_utilities = "Module:parse utilities",
	references = "Module:references",
	track = "Module:debug/track",
	template_styles = "Module:TemplateStyles",
	script_utilities = "Module:script utilities",
	JSON = "Module:JSON",
	yesno = "Module:yesno",
}
--- Build the lazy-loading proxy for the modules registered in `self.modules`.
-- Indexing the proxy with a registered alias `require`s that module on first
-- access and memoises the result on the proxy itself:
--   * a module that is a plain function is stored and returned as-is;
--   * any other module is wrapped in a second proxy that copies each member
--     onto itself the first time it is read.
-- Unregistered aliases yield nil.
-- @param self the Loader table (provides `self.modules`)
-- @return the proxy table
function Loader.init(self)
	-- Wrap a table-valued module so its members are fetched and cached on demand.
	local function make_member_proxy(mod)
		return setmetatable({}, {
			__index = function(proxy, member)
				local value = mod[member]
				rawset(proxy, member, value)
				return value
			end,
		})
	end

	return setmetatable({}, {
		__index = function(cache, alias)
			local module_title = self.modules[alias]
			if not module_title then
				return nil
			end
			local mod = require(module_title)
			local entry
			if type(mod) == "function" then
				entry = mod
			else
				entry = make_member_proxy(mod)
			end
			-- Cached with rawset, so this handler runs at most once per alias.
			rawset(cache, alias, entry)
			return entry
		end,
	})
end
local M = Loader:init()
local Util = {}
--- Wrap a message in an error-styled <span>.
-- @param message text (wikitext/HTML) to show
-- @param preview_only when truthy, the message is only produced in preview mode
-- @return the formatted string, or nil when suppressed outside preview
function Util.format_error(message, preview_only)
	-- is_preview() is only consulted when the message is preview-only.
	local visible = not preview_only or M.pages.is_preview()
	if not visible then
		return nil
	end
	return '<span class="error">' .. message .. '</span>'
end
--- Format a warning and queue it in `__state.warnings`.
-- Nothing is queued when Util.format_error suppresses the message.
-- @param message warning text
-- @param preview_only forwarded to Util.format_error
function Util.add_warning(message, preview_only)
	local html = Util.format_error(message, preview_only)
	if not html then
		return
	end
	local queue = __state.warnings
	queue[#queue + 1] = html
end
--- Look up a language object by code.
-- @param code language code
-- @param no_error when truthy, return whatever the lookup yields (possibly nil);
--   otherwise fall back to Module:languages/errorGetBy when the lookup fails
-- @return the language object (or the lookup's falsy result when `no_error` is set)
function Util.get_lang(code, no_error)
	if not no_error then
		local found = M.languages.getByCode(code, nil, true)
		if found then
			return found
		end
		return (M.languages_errorgetby.code(code, true, true))
	end
	return M.languages.getByCode(code, nil, true)
end
--- Find the exception entry (from `M.data.config.lang_exceptions`) that applies to a language.
-- An entry keyed by the language's own code wins. Otherwise the first entry
-- (in `pairs` order) whose `normalize_from_families` contains a family of the
-- language, and whose `normalize_exclude_families` (if any) does not, is used.
-- @param lang language object
-- @return the exception entry, or nil when none applies
function Util.get_lang_exception(lang)
	-- True when `lang` belongs to at least one of the listed families.
	local function in_any_family(families)
		for _, family in ipairs(families) do
			if lang:inFamily(family) then
				return true
			end
		end
		return false
	end

	local own_code = lang:getCode()
	local exceptions = M.data.config.lang_exceptions
	local direct = exceptions[own_code]
	if direct then
		return direct
	end

	for _, candidate in pairs(exceptions) do
		local include = candidate.normalize_from_families
		if include and in_any_family(include) then
			local exclude = candidate.normalize_exclude_families
			if not (exclude and in_any_family(exclude)) then
				return candidate
			end
		end
	end
	return nil
end
--- Resolve the language used for cache keys.
-- When the language's exception entry names a `normalize_to` code, the language
-- object for that code is returned; otherwise `lang` is returned unchanged.
-- @param lang language object
-- @return language object
function Util.get_norm_lang(lang)
	local exception = Util.get_lang_exception(lang)
	local target_code = exception and exception.normalize_to
	if not target_code then
		return lang
	end
	return M.languages.getByCode(target_code)
end
--- Give bare boolean modifiers an explicit value (e.g. `<unc>` becomes `<unc:1>`).
-- This is needed because Module:parse utilities expects boolean modifiers to
-- carry a value. Modifiers that already have one (`<unc:...>`) are untouched.
-- The modifier name is matched literally: pattern magic characters in the name
-- are escaped, and the replacement is supplied through a function so that a
-- `%` in the name cannot be misread as a capture reference.
-- @param str raw parameter text
-- @param param_mods modifier definitions; only entries with `type == "boolean"` are considered
-- @return `str` with every bare boolean modifier expanded
function Util.add_boolean_defaults(str, param_mods)
	local result = str
	for name, spec in pairs(param_mods) do
		if spec.type == "boolean" then
			local bare_tag = "<" .. name .. ">"
			local valued_tag = "<" .. name .. ":1>"
			-- Escape every punctuation character so the tag is matched as plain text.
			local literal_pattern = bare_tag:gsub("%p", "%%%0")
			result = result:gsub(literal_pattern, function()
				return valued_tag
			end)
		end
	end
	return result
end
--- Central term formatter covering suppressed, top-level, unknown and regular terms.
-- @param term term node; fields read: suppress_term, unknown_term, lang, alt,
--   title, sc, id, tr, ts, t, pos
-- @param is_toplevel when truthy the term is rendered as a bold self-link
--   instead of a link
-- @param opts optional table; `suppress_gloss` / `suppress_pos` drop the
--   gloss / part of speech from the link
-- @return formatted wikitext, or nil for a suppressed term (`-`)
function Util.format_term(term, is_toplevel, opts)
	if term.suppress_term then
		return nil
	end
	local options = opts or {}
	local lang = term.lang
	local exception = Util.get_lang_exception(lang)

	if is_toplevel then
		local shown = term.alt or term.title or ""
		local script = term.sc or lang:findBestScript(shown)
		local strong = mw.html.create("strong")
			:addClass("selflink")
			:wikitext(shown)
		return M.script_utilities.tag_text(tostring(strong), lang, script, "term")
	end

	-- Unknown terms are linked to nothing, and so carry neither a target nor an id.
	local is_known = not term.unknown_term
	local link_id = nil
	if is_known and term.id and term.id ~= "" then
		link_id = term.id
	end

	-- A language exception may forbid showing transliterations.
	local translit = term.tr
	if exception and exception.suppress_tr then
		translit = nil
	end

	local link_params = {
		lang = lang,
		term = is_known and term.title or nil,
		alt = term.alt,
		id = link_id,
		tr = translit,
		ts = term.ts,
	}
	if not options.suppress_gloss then
		link_params.gloss = term.t
	end
	if not options.suppress_pos then
		link_params.pos = term.pos
	end
	return M.links.full_link(link_params, "term")
end
local __is_content_page_cached
--- Report whether the current page is a content page, per Module:pages.
-- The answer is computed once and memoised (a `false` answer is cached too).
-- @return the value returned by `M.pages.is_content_page` for the current title
function Util.is_content_page()
	local cached = __is_content_page_cached
	if cached ~= nil then
		return cached
	end
	cached = M.pages.is_content_page(mw.title.getCurrentTitle())
	__is_content_page_cached = cached
	return cached
end
local __page_data_cached
--- Return the `page` table of Module:headword/data, loaded once via mw.loadData.
-- @return the memoised `page` table
function Util.get_page_data()
	if __page_data_cached then
		return __page_data_cached
	end
	__page_data_cached = mw.loadData(Loader.modules.headword_data).page
	return __page_data_cached
end
--- Extract the bare keyword from a parameter, dropping an optional leading
-- colon and everything from the first `<` onwards (the modifiers).
-- When nothing precedes the first `<`, the parameter is returned with only a
-- leading colon removed.
-- @param param candidate parameter
-- @return the base string, or nil when `param` is not a string
local function get_keyword_base(param)
	if type(param) == "string" then
		local head = param:match("^:?([^<]+)")
		if head then
			return head
		end
		local stripped = param:gsub("^:", "")
		return stripped
	end
	return nil
end
--- Test whether a parameter names a keyword defined in `M.data.keywords`.
-- @param param candidate parameter
-- @param allow_colon_less when truthy, the leading colon is optional
-- @return true when `param` is a string whose base is a known keyword and it
--   either starts with ":" or `allow_colon_less` is set; false otherwise
local function is_keyword(param, allow_colon_less)
	if type(param) ~= "string" then
		return false
	end
	local known = M.data.keywords
	local has_colon = param:sub(1, 1) == ":"
	if not (has_colon or allow_colon_less) then
		return false
	end
	return known[get_keyword_base(param)] ~= nil
end
--- Return the base keyword of a parameter.
-- A parameter starting with ":" always yields its base, whether or not that
-- base is a known keyword. A colon-less parameter yields its base only when
-- `allow_colon_less` is set and the base exists in `M.data.keywords`.
-- @param param candidate parameter
-- @param allow_colon_less accept keywords written without the leading colon
-- @return the base keyword, or nil
local function get_keyword(param, allow_colon_less)
	if type(param) ~= "string" then
		return nil
	end
	local known = M.data.keywords
	local base = get_keyword_base(param)
	if param:sub(1, 1) == ":" then
		return base
	end
	if allow_colon_less and known[base] then
		return base
	end
	return nil
end
--- Make sure a keyword carries its leading colon.
-- @param keyword keyword string, with or without ":"
-- @return the keyword prefixed with ":" when it did not already start with one
local function normalize_keyword(keyword)
	local prefix = ":"
	if keyword:sub(1, 1) == ":" then
		prefix = ""
	end
	return prefix .. keyword
end
-- Namespace for parsing and validating {{etymon}} parameters.
local EtymonParser = {}
-- Keyword modifier definitions (modifiers attached to a keyword, e.g. ":bor<unc>").
-- `type = "boolean"` marks modifiers that may be written without a value.
-- `restrict.keywords` lists the only keywords the modifier may be used with;
-- it is enforced by EtymonParser.check_modifier_restrictions.
EtymonParser.keyword_param_mods = {
	unc = { type = "boolean" },
	ref = {},
	text = { restrict = { keywords = { "from" } } },
	lit = { restrict = { keywords = { "af", "univ" } } },
	conj = {}, -- conjunction for alternatives: "and", "or", "and/or", etc.
}
-- Term modifier definitions (modifiers attached to an etymon, e.g. "la:foo<t:gloss>").
-- Same `type` / `restrict` conventions as above.
EtymonParser.etymon_param_mods = {
	id = {},
	t = {},
	tr = {},
	ts = {},
	pos = {},
	alt = {},
	ety = {}, -- inline etymology, parsed by EtymonParser.parse_inline_ety
	unc = { type = "boolean" },
	ref = {},
	aftype = { restrict = { keywords = { "af", "afeq" } } },
	postype = {}, -- validated elsewhere to be "root" or "word"
	bor = { type = "boolean", restrict = { keywords = { "af" } } },
}
--- Copy a modifier-definition table, leaving out each definition's `restrict` field.
-- The copy is what gets handed to Module:parse utilities.
-- @param param_mods modifier name -> definition table
-- @return a new table of new definition tables (the input is not modified)
local function get_clean_param_mods(param_mods)
	local stripped = {}
	for name, definition in pairs(param_mods) do
		local copy = {}
		for field, setting in pairs(definition) do
			if field ~= "restrict" then
				copy[field] = setting
			end
		end
		stripped[name] = copy
	end
	return stripped
end
--- Enforce the `restrict.keywords` rules of a modifier-definition table.
-- Only modifiers with a truthy value are checked. Raises an error naming the
-- offending modifier and the keywords it is allowed with.
-- @param modifiers modifier name -> value (extra, undefined keys are ignored)
-- @param current_keyword the keyword (without colon) currently in effect
-- @param param_mods modifier definitions to check against
function EtymonParser.check_modifier_restrictions(modifiers, current_keyword, param_mods)
	-- True when the active keyword appears in the allow-list.
	local function is_permitted(allowed)
		for _, candidate in ipairs(allowed) do
			if candidate == current_keyword then
				return true
			end
		end
		return false
	end

	-- Render the allow-list as ":a", ":a or :b" or ":a, :b or :c".
	local function describe(allowed)
		local labels = {}
		for index, candidate in ipairs(allowed) do
			labels[index] = ":" .. candidate
		end
		local total = #labels
		local joined = table.concat(labels, total == 2 and " or " or ", ")
		if total > 2 then
			joined = joined:gsub(", ([^,]+)$", " or %1")
		end
		return joined, total
	end

	for mod_name, mod_value in pairs(modifiers) do
		local definition = mod_value and param_mods[mod_name]
		local allowed = definition and definition.restrict and definition.restrict.keywords
		if allowed and not is_permitted(allowed) then
			local keyword_str, total = describe(allowed)
			local mod_display
			if mod_value == true then
				mod_display = "<" .. mod_name .. ">"
			else
				mod_display = "<" .. mod_name .. ":" .. tostring(mod_value) .. ">"
			end
			local plural = total > 1 and "s " or " "
			error("The modifier `" .. mod_display .. "` is only allowed for the keyword" .. plural .. keyword_str .. ".")
		end
	end
end
--- Split a keyword parameter into its base keyword and its modifiers,
-- e.g. ":bor<unc>" or ":bor<ref:{{R:example}}>".
-- @param param keyword parameter
-- @return base keyword (nil when `param` is not a string), and a modifier
--   table: empty when there are no modifiers, otherwise with the fields
--   unc (false when absent), ref, text, lit and conj
function EtymonParser.parse_keyword_modifiers(param)
	if type(param) ~= "string" then
		return nil, {}
	end
	local keyword = get_keyword_base(param)
	if not keyword then
		return nil, {}
	end
	-- Nothing to parse when the parameter carries no "<...>" part.
	local has_modifiers = param:find("<", 1, true) ~= nil
	if not has_modifiers then
		return keyword, {}
	end

	local definitions = EtymonParser.keyword_param_mods
	-- Same mechanism as etymon parsing: expand bare booleans, then hand the
	-- modifier part (everything after the keyword itself) to the inline parser.
	local with_defaults = Util.add_boolean_defaults(param, definitions)
	local modifier_text = with_defaults:gsub("^:?[^<]+", "")
	local parsed = M.parse_utilities.parse_inline_modifiers(modifier_text, {
		param_mods = get_clean_param_mods(definitions),
		generate_obj = function(ignored)
			return {}
		end,
	})

	local modifiers = {
		unc = parsed.unc or false,
		ref = parsed.ref,
		text = parsed.text,
		lit = parsed.lit,
		conj = parsed.conj,
	}
	EtymonParser.check_modifier_restrictions(modifiers, keyword, definitions)
	return keyword, modifiers
end
--- Split a string into top-level segments: each balanced `<...>` group
-- (nested brackets included) is one segment, and each non-empty run of text
-- between groups is another. Segments are returned in source order and
-- concatenate back to the input.
-- Raises an error on an unmatched `<` or `>`.
-- @param str text to split
-- @return array of segment strings
function EtymonParser.parse_balanced_segments(str)
	local LT, GT = 60, 62 -- byte values of "<" and ">"
	local pieces = {}
	local nesting = 0
	local piece_start = 1 -- index where the segment being scanned begins

	for pos = 1, #str do
		local byte = str:byte(pos)
		if byte == LT then
			-- A top-level "<" ends any plain-text run that precedes it.
			if nesting == 0 and pos > piece_start then
				pieces[#pieces + 1] = str:sub(piece_start, pos - 1)
				piece_start = pos
			end
			nesting = nesting + 1
		elseif byte == GT then
			nesting = nesting - 1
			if nesting == 0 then
				pieces[#pieces + 1] = str:sub(piece_start, pos)
				piece_start = pos + 1
			elseif nesting < 0 then
				error("Unbalanced brackets in etymon: unexpected '>'")
			end
		end
	end

	if nesting ~= 0 then
		error("Unbalanced brackets in etymon: missing '>'")
	end
	if piece_start <= #str then
		pieces[#pieces + 1] = str:sub(piece_start)
	end
	return pieces
end
--- Convert the value of an inline `<ety:...>` modifier into an {{etymon}}-style
-- argument list.
-- The value has the form `keyword<etymon><etymon>...`, optionally with further
-- keywords between the bracketed etymons. The result is
-- `{ <context language code>, ":keyword", etymon, ..., ":keyword", etymon, ... }`.
-- Keywords are trimmed before being recognised, both the leading one and the
-- ones between bracket groups, so `<a> af <b>` behaves like `<a>af<b>`.
-- Any other text between bracket groups, and empty `<>` groups, are ignored.
-- @param ety_string raw text of the `ety` modifier
-- @param context_lang language object of the term carrying the modifier
-- @return argument array as described above
-- Raises an error when the string is empty, has unbalanced brackets, or does
-- not start with a known keyword.
function EtymonParser.parse_inline_ety(ety_string, context_lang)
	local segments = EtymonParser.parse_balanced_segments(ety_string)
	if #segments == 0 then
		error("Empty inline etymology")
	end
	local trim = M.string_utilities.trim
	local keyword = trim(segments[1])
	if not is_keyword(keyword, true) then
		error("Invalid keyword '" .. keyword .. "' in inline etymology <ety:" .. keyword .. "...>")
	end
	local args = { context_lang:getCode(), normalize_keyword(keyword) }
	for i = 2, #segments do
		local segment = segments[i]
		if segment:sub(1, 1) == "<" and segment:sub(-1) == ">" then
			-- Bracketed etymon: strip the outer brackets, keep nested modifiers.
			local inner = segment:sub(2, -2)
			if inner ~= "" then
				table.insert(args, inner)
			end
		else
			-- Text between bracketed segments: only meaningful if it is a keyword.
			local candidate = trim(segment)
			if is_keyword(candidate, true) then
				table.insert(args, normalize_keyword(get_keyword(candidate, true)))
			end
		end
	end
	return args
end
--- Parse one etymon parameter (`[lang:]term[<mod:value>...]`) into a table.
-- @param param raw parameter
-- @param context_lang language object used when `param` has no valid language prefix
-- @return nil when `param` is a keyword or not a string; otherwise a table with
--   `lang`, `term` and any parsed modifiers, plus:
--     * `unknown_term = true` when the term text is empty;
--     * `suppress_term = true` when the term text is "-";
--     * `override = true` when the id was written as `<id:!...>` (the "!" is stripped).
--   `term` is nil for unknown and suppressed terms, with or without modifiers.
function EtymonParser.parse_etymon(param, context_lang)
	if is_keyword(param) then
		return nil
	end
	if type(param) ~= "string" then
		return nil
	end

	-- A language prefix is only looked for before the first "<", so that colons
	-- inside modifiers are not mistaken for one.
	local lang, rest = context_lang, param
	local before_bracket = param:match("^([^<]*)") or param
	local lang_code = before_bracket:match("^([a-zA-Z][a-zA-Z0-9._-]*):(.*)$")
	if lang_code then
		-- An unrecognised prefix is treated as part of the term itself.
		local potential_lang = Util.get_lang(lang_code, true)
		if potential_lang then
			lang = potential_lang
			rest = param:sub(#lang_code + 2)
		end
	end

	if rest == "" then
		M.track("etymon/term/empty")
	elseif rest == "?" then
		M.track("etymon/term/question-mark")
	elseif rest == "-" then
		M.track("etymon/term/hyphen")
	end

	if rest == "" then
		return {
			lang = lang,
			term = nil,
			unknown_term = true
		}
	end
	if rest == "-" then
		return {
			lang = lang,
			term = nil,
			suppress_term = true
		}
	end
	if not rest:find("<", 1, true) then
		return {
			lang = lang,
			term = M.string_utilities.trim(rest)
		}
	end

	local term_text = rest:match("^([^<]*)") or ""
	local is_unknown = (term_text == "")
	local is_suppress = (term_text == "-")
	local function generate_obj(ignored_term)
		-- An explicit branch is required here: `cond and nil or x` always yields
		-- `x`, which would leave "" or "-" in `term`.
		if is_unknown or is_suppress then
			return { term = nil }
		end
		return { term = M.string_utilities.trim(term_text) }
	end

	local rest_with_defaults = Util.add_boolean_defaults(rest, EtymonParser.etymon_param_mods)
	local parsed_obj = M.parse_utilities.parse_inline_modifiers(rest_with_defaults,
		{ param_mods = get_clean_param_mods(EtymonParser.etymon_param_mods), generate_obj = generate_obj })

	-- A leading "!" on the id marks an override; it is not part of the id.
	if parsed_obj.id and parsed_obj.id:match("^!") then
		parsed_obj.id = parsed_obj.id:sub(2)
		parsed_obj.override = true
	end
	parsed_obj.lang = lang
	if is_unknown then
		parsed_obj.unknown_term = true
	elseif is_suppress then
		parsed_obj.suppress_term = true
	end
	return parsed_obj
end
--- Validate the arguments of an {{etymon}} invocation, raising an error on the
-- first problem found.
-- Checks, in order: the `id` (length and not equal to the page title), the
-- `pos` value, then each parameter: unknown `:keyword`s, group sizes,
-- same-language `:from`, ancestry for `:inh`, modifier restrictions, the
-- `postype` value, and (recursively) any inline `<ety:...>`.
-- @param lang language object of the entry
-- @param args argument array
-- @param id optional etymon id
-- @param title page title the id is compared against
-- @param pos optional part-of-speech value
-- @param starts_with_lang_code when truthy, args[1] is a language code and is skipped
function EtymonParser.validate(lang, args, id, title, pos, starts_with_lang_code)
	-- `id` is optional; its constraints only apply when it is supplied.
	if id then
		if mw.ustring.len(id) < 2 then
			error("The `id` parameter must have at least two characters.")
		end
		if id == title or id == Util.get_page_data().pagename then
			error("The `id` parameter must not be the same as the page title.")
		end
	end

	local valid_pos = { prefix = true, suffix = true, interfix = true, infix = true, root = true, word = true }
	if pos and not valid_pos[pos] then
		error("Unknown value provided for `pos`. Valid values: " .. table.concat(require("Module:table").keysToList(valid_pos), ", ") .. ".")
	end

	local VALID_POSTYPES = { root = true, word = true }
	local keywords = M.data.keywords
	local active_keyword = "from"
	local group_size = 0 -- number of etymons seen since the last keyword

	-- Close the group of the active keyword: group keywords other than
	-- `af` / `afeq` need at least two etymons.
	local function close_group()
		local info = keywords[active_keyword]
		local needs_pair = info and info.is_group and active_keyword ~= "af" and active_keyword ~= "afeq"
		if needs_pair and group_size <= 1 then
			error("Detected `:" .. active_keyword .. "` group with fewer than two etymons.")
		end
		group_size = 0
	end

	-- Per-etymon checks under the active keyword.
	local function check_etymon(etymon_data)
		local param_lang = etymon_data.lang
		if active_keyword == "from" and param_lang:getFullCode() ~= lang:getFullCode() then
			error("`:from` is for same-language derivation, but language does not match. " ..
				"Expected '" .. lang:getFullCode() .. "', got '" .. param_lang:getFullCode() .. "'.")
		elseif active_keyword == "inh" then
			M.etymology.check_ancestor(lang, param_lang)
		end
		EtymonParser.check_modifier_restrictions(etymon_data, active_keyword, EtymonParser.etymon_param_mods)
		-- postype must be "root" or "word"
		if etymon_data.postype and not VALID_POSTYPES[etymon_data.postype] then
			error("Invalid <postype:" .. etymon_data.postype .. ">; must be \"root\" or \"word\".")
		end
		if etymon_data.ety then
			local inline_args = EtymonParser.parse_inline_ety(etymon_data.ety, etymon_data.lang)
			EtymonParser.validate(etymon_data.lang, inline_args, nil, nil, nil, true)
		end
	end

	local first = starts_with_lang_code and 2 or 1
	for i = first, #args do
		local param = args[i]
		-- Non-string parameters are ignored.
		if type(param) == "string" then
			if is_keyword(param) then
				local next_keyword = get_keyword(param)
				close_group()
				active_keyword = next_keyword
			elseif param:sub(1, 1) == ":" then
				error("Invalid keyword '" .. param .. "'. Did you mean a valid keyword like ':bor', ':inh', etc.?")
			else
				local etymon_data = EtymonParser.parse_etymon(param, lang)
				-- The parameter counts towards the group whether or not it parsed.
				group_size = group_size + 1
				if etymon_data then
					check_etymon(etymon_data)
				end
			end
		end
	end
	close_group()
end
-- Namespace for scraping {{etymon}} data from other pages.
local DataRetriever = {}
--- Scrape a page for its {{etymon}} (and {{senseid}}) templates and cache the
-- results in the module-level `__state` object. Returns nothing; callers read
-- the caches afterwards.
-- @param etymon_page page name (used to build cache keys and in messages)
-- @param etymon_title title object for that page (must provide `getContent` and `redirect_target`)
-- @param key cache key the caller will look up afterwards ("<lang code>:<page>:<id>")
-- @param etymon_lang language object whose section/templates are wanted
-- @param etymon_id id being looked for ("*" for an id-less lookup)
-- @param redirected_from "<lang code>:<page>" key of the redirect we came from; only set on
--   the recursive call made for a redirect target
-- Raises an error when the page has several etymon templates for the language
-- and at least one of them lacks an id.
function DataRetriever.cache_page_etymons(etymon_page, etymon_title, key, etymon_lang, etymon_id, redirected_from)
	local content = etymon_title:getContent()
	if not content then
		-- No content could be read for the title.
		__state.cached_etymon_args[key] = M.data.STATUS.REDLINK
		return
	end
	-- Check if the linked page is a redirect. If it is, the template parsing
	-- code below will be effectively skipped, and `cache_page_etymons` will be called
	-- again on the redirect target (see the bottom of this function)
	local redirect_target = etymon_title.redirect_target
	if not redirect_target then
		-- Restrict parsing to the level-2 section of the requested language.
		content = M.pages.get_section(content, etymon_lang:getFullName(), 2)
		if not content then
			__state.cached_etymon_args[key] = M.data.STATUS.MISSING
			return
		end
	end
	local etymon_lang_code = etymon_lang:getFullCode()
	-- Prefix shared by every cache key belonging to this language + page.
	local lang_page_key = etymon_lang_code .. ":" .. etymon_page
	local found_templates_for_lang = {}
	local found_ids = {}
	local has_idless_etymon = false
	local get_node_class = M.template_parser.class_else_type
	-- Look for all {{etymon}} templates within the page content using the template parser
	-- This way the same page is never parsed more than once
	-- Build a map from senseids to their parent etymonids.
	-- `active_etymon_args` holds the arguments of the most recent {{etymon}} in the
	-- current etymology section; subsequent {{senseid}}s are attributed to it.
	local active_etymon_args = nil
	for node in M.template_parser.parse(content):iterate_nodes() do
		local node_class = get_node_class(node)
		if node_class == "heading" then
			-- A new L2 or etymology section acts as a barrier: an {{etymon}} usage
			-- used previously cannot be the parent of any subsequent senseids.
			-- Note that we don't have to check for L2s due to the usage of `M.pages.get_section` above.
			if node:get_name():find("^Etymology") then
				active_etymon_args = nil
			end
		elseif node_class == "template" then
			local template_name = node:get_name()
			if template_name == "etymon" then
				local template_args = node:get_arguments()
				-- Check if this etymon is for our language
				if template_args[1] == etymon_lang_code then
					table.insert(found_templates_for_lang, template_args)
					if template_args.id then
						local etymon_key = lang_page_key .. ":" .. template_args.id
						__state.cached_etymon_args[etymon_key] = template_args
						__state.cached_etymon_pages[etymon_key] = tostring(etymon_page)
						table.insert(found_ids, template_args.id)
						active_etymon_args = template_args
					else
						has_idless_etymon = true
						-- Store idless etymon with default key
						local etymon_key = lang_page_key .. ":*"
						__state.cached_etymon_args[etymon_key] = template_args
						__state.cached_etymon_pages[etymon_key] = tostring(etymon_page)
						table.insert(found_ids, "*")
						active_etymon_args = template_args
					end
				end
			elseif active_etymon_args and template_name == "senseid" then
				local template_args = node:get_arguments()
				-- This should always be true for proper usages of {{senseid}}.
				if template_args[1] == etymon_lang_code and template_args[2] then
					-- A sense id resolves to the arguments of the enclosing {{etymon}}.
					local sense_id_key = lang_page_key .. ":" .. template_args[2]
					__state.senseid_parent_etymon[sense_id_key] = active_etymon_args
					__state.cached_etymon_pages[sense_id_key] = tostring(etymon_page)
				end
			end
		end
	end
	-- Error if multiple etymons exist and at least one is missing an id
	if #found_templates_for_lang > 1 and has_idless_etymon then
		error("Page '[[" .. tostring(etymon_page) .. "]]' has " .. #found_templates_for_lang ..
			" etymon templates for " .. etymon_lang:getCanonicalName() ..
			", but at least one is missing an id. All etymons must have unique IDs when there are multiple.")
	end
	-- Record which ids (and their `pos`) exist on this page, for id-less lookups.
	local id_data_list = {}
	for _, args in ipairs(found_templates_for_lang) do
		local id = args.id or "*"
		table.insert(id_data_list, { id = id, pos = args.pos })
	end
	__state.available_etymon_ids[lang_page_key] = id_data_list
	if #found_templates_for_lang == 1 then
		__state.single_etymons[lang_page_key] = found_templates_for_lang[1]
	end
	-- When reached through a redirect, also list this page's ids under the redirect's key.
	if redirected_from and __state.available_etymon_ids[lang_page_key] then
		__state.available_etymon_ids[redirected_from] = __state.available_etymon_ids[redirected_from] or {}
		for _, id_data in ipairs(__state.available_etymon_ids[lang_page_key]) do
			table.insert(__state.available_etymon_ids[redirected_from], id_data)
		end
	end
	if __state.cached_etymon_args[key] ~= nil or __state.senseid_parent_etymon[key] ~= nil then
		-- All done!
		return
	elseif redirect_target and not redirected_from then
		-- Try scraping the redirect.
		-- Only one hop is followed: the recursive call passes `redirected_from`,
		-- which makes this branch unreachable inside it.
		etymon_page = redirect_target.prefixedText
		DataRetriever.cache_page_etymons(etymon_page, redirect_target, lang_page_key .. ":" .. etymon_id, etymon_lang, etymon_id, lang_page_key)
		-- Copy whatever was cached under the redirect target's key to the caller's key.
		-- NOTE(review): this may assign nil if the target had no matching etymon, and
		-- `cached_etymon_pages[key]` is not filled in for the redirect's key — confirm intended.
		__state.cached_etymon_args[key] = __state.cached_etymon_args[etymon_lang_code .. ":" .. etymon_page .. ":" .. etymon_id]
	else
		__state.cached_etymon_args[key] = M.data.STATUS.MISSING
	end
end
--- Given an etymon object, scrape its page (if necessary) and return its own
-- {{etymon}} arguments together with the page they were found on.
-- @param etymon_data parsed etymon (fields read: term, lang, id, postype)
-- @param is_toplevel when truthy, id-less lookups also set `__state.toplevel_idless_etymon`
-- @return args, page, resolved_id:
--   * args: the scraped argument table, a STATUS constant (MISSING, AMBIGUOUS,
--     or whatever the scraper cached, e.g. REDLINK), or nil when nothing was cached;
--   * page: name of the page the arguments came from (nil when unknown);
--   * resolved_id: only for lookups with an id that matched a {{senseid}} —
--     the id of the {{etymon}} that sense belongs to.
-- Raises an error when the link target is not a valid page title.
function DataRetriever.get_etymon_args(etymon_data, is_toplevel)
	local page = M.links.get_link_page(etymon_data.term, etymon_data.lang)
	local norm_lang = Util.get_norm_lang(etymon_data.lang)
	if etymon_data.id then
		local key = norm_lang:getFullCode() .. ":" .. page .. ":" .. etymon_data.id
		local cached_args = __state.cached_etymon_args[key] or __state.senseid_parent_etymon[key]
		if cached_args == nil then
			local title = mw.title.new(page)
			if not title then error('Invalid page title "' .. page .. '" encountered.') end
			DataRetriever.cache_page_etymons(page, title, key, norm_lang, etymon_data.id)
		end
		cached_args = __state.cached_etymon_args[key] or __state.senseid_parent_etymon[key] -- refresh
		-- Get etymon_id from parent if this was resolved via senseid
		local parent_etymon = __state.senseid_parent_etymon[key]
		local resolved_etymon_id = parent_etymon and parent_etymon.id
		return cached_args, __state.cached_etymon_pages[key], resolved_etymon_id
	else
		__state.used_idless_etymon = true
		if is_toplevel then
			__state.toplevel_idless_etymon = true
		end
		local base_key = norm_lang:getFullCode() .. ":" .. page
		-- The id list doubles as the "page already scraped" marker.
		if __state.available_etymon_ids[base_key] == nil then
			local title = mw.title.new(page)
			if not title then error('Invalid page title "' .. page .. '" encountered.') end
			DataRetriever.cache_page_etymons(page, title, base_key .. ":*", norm_lang, "*")
		end
		local ids = __state.available_etymon_ids[base_key] or {}
		local count = #ids
		-- Try to filter by postype if available and we have multiple candidates
		if count > 1 and etymon_data.postype then
			local matching_ids = {}
			for _, id_data in ipairs(ids) do
				if id_data.pos == etymon_data.postype then
					table.insert(matching_ids, id_data)
				end
			end
			if #matching_ids == 1 then
				local matched_id = matching_ids[1].id
				local matched_key = base_key .. ":" .. matched_id
				return __state.cached_etymon_args[matched_key], __state.cached_etymon_pages[matched_key]
			end
		end
		if count == 1 then
			-- The page name is cached under the template's own id ("*" only when
			-- the template has none), so look it up under that id rather than
			-- always under "*".
			local only_id = ids[1].id or "*"
			return __state.single_etymons[base_key], __state.cached_etymon_pages[base_key .. ":" .. only_id]
		elseif count > 1 then
			-- Several candidates and no way to choose: warn (in preview) and list the ids.
			local id_list = {}
			for _, id_data in ipairs(ids) do
				local id = type(id_data) == "table" and id_data.id or id_data
				if id and id ~= "" then
					table.insert(id_list, "\"" .. id .. "\"")
				end
			end
			local suggestion_text = ""
			if #id_list > 0 then
				suggestion_text = " Available IDs: " .. table.concat(id_list, ", ") .. "."
			end
			Util.add_warning("Etymology link to '[[" .. page .. "]]' is ambiguous. The page has " ..
				count .. " etymon templates for " .. norm_lang:getCanonicalName() ..
				". Please specify an ID." .. suggestion_text, true)
			return M.data.STATUS.AMBIGUOUS, nil
		else
			return M.data.STATUS.MISSING, nil
		end
	end
end
local TreeBuilder = {}
--- Parse the raw text of a `<ref:...>` modifier with Module:references.
-- @param refs_text raw reference text (may be nil or empty)
-- @return "" when there is no text, otherwise the result of `parse_references`
local function parse_etymon_references(refs_text)
	local has_text = refs_text and refs_text ~= ""
	if has_text then
		return M.references.parse_references(refs_text)
	end
	return ""
end
--- Walk an etymology tree and, for every node that has a `ref`, store the
-- parsed references in `parsed_ref`. Descends through `children[*].terms[*]`.
-- @param node tree node (modified in place)
local function parse_tree_references(node)
	local raw_ref = node.ref
	if raw_ref then
		node.parsed_ref = parse_etymon_references(raw_ref)
	end
	for _, container in ipairs(node.children or {}) do
		for _, term in ipairs(container.terms or {}) do
			parse_tree_references(term)
		end
	end
end
--- Recursively build the etymology tree node for one term.
-- Each node has `lang`, `title`, `id`, `args`, `status` and `children`; the
-- children are keyword containers (`keyword`, `keyword_info`,
-- `keyword_modifiers`, `terms`), and each entry of `terms` is itself a node.
-- Updates the statistics and flags in `__state` as a side effect.
-- @param lang language object of the term
-- @param title term text (nil for unknown/suppressed terms)
-- @param args the term's {{etymon}} argument table (args[1] is skipped; keywords
--   and etymons start at args[2]), or a STATUS constant / nil when unavailable
-- @param seen table of already-built nodes by key, used to detect duplicates (defaults to {})
-- @param depth recursion depth, 0 for the top-level term (defaults to 0)
-- @param stop_recursion when truthy, children are not built; the node only
--   records whether it would have had visible children
-- @return the node table
function TreeBuilder.build(lang, title, args, seen, depth, stop_recursion)
	seen = seen or {}
	depth = depth or 0
	local is_toplevel = (depth == 0)
	if depth > __state.max_depth_reached then
		__state.max_depth_reached = depth
	end
	__state.total_nodes = __state.total_nodes + 1
	local lang_code = lang:getCode()
	__state.language_count[lang_code] = (__state.language_count[lang_code] or 0) + 1
	local current_id = (type(args) == "table" and args.id) or ""
	-- Same "<lang code>:<page>:<id>" shape as the scrape cache keys.
	local key = Util.get_norm_lang(lang):getFullCode() .. ":" .. (title and M.links.get_link_page(title, lang) or "*") .. ":" .. current_id
	local node = { lang = lang, title = title, id = current_id, args = args, children = {}, status = M.data.STATUS.OK }
	if type(args) ~= "table" or seen[key] then
		-- Leaf: either there is no data (args is a status constant or nil) or the
		-- node was already built elsewhere in the tree.
		-- NOTE(review): for a duplicate whose `args` is a table, `status` ends up
		-- holding that table rather than a STATUS constant — confirm intended.
		node.status = args or M.data.STATUS.MISSING
		-- Mark as duplicate if we've seen this node before
		if seen[key] then
			node.is_duplicate = true
			node.duplicate_key = key
			local original_node = seen[key]
			if type(original_node) == "table" and original_node.children and #original_node.children > 0 then
				node.original_has_children = true
			end
		end
		return node
	end
	node.status = args.status or M.data.STATUS.OK
	seen[key] = node
	-- If stop_recursion is set, skip parsing children but check for visible children
	if stop_recursion then
		local keywords = M.data.keywords
		local has_visible_children = false
		for i = 2, #args do
			local param = args[i]
			if type(param) == "string" then
				local keyword_base = get_keyword_base(param)
				if keyword_base and keywords[keyword_base] then
					-- It's a keyword, check if it's visible
					local keyword_info = keywords[keyword_base]
					if not keyword_info.invisible then
						has_visible_children = true
						break
					end
				elseif param:sub(1, 1) ~= ":" then
					-- It's a term (not a keyword), so there are visible children
					has_visible_children = true
					break
				end
			end
		end
		node.has_visible_children = has_visible_children
		return node
	end
	-- Parse args into keyword containers
	local current_keyword = "from"
	local current_keyword_modifiers = {}
	local current_container = nil
	-- Helper to track keyword usage at top level
	local function track_keyword_usage(keyword, target_lang, source_lang)
		if not is_toplevel then return end
		if not __state.toplevel_keyword_stats[keyword] then
			__state.toplevel_keyword_stats[keyword] = {
				count = 0,
				target_langs = {},
				source_langs = {},
			}
		end
		local keyword_data = __state.toplevel_keyword_stats[keyword]
		keyword_data.count = keyword_data.count + 1
		local target_code = target_lang:getCode()
		keyword_data.target_langs[target_code] = (keyword_data.target_langs[target_code] or 0) + 1
		if source_lang then
			local source_code = source_lang:getCode()
			keyword_data.source_langs[source_code] = (keyword_data.source_langs[source_code] or 0) + 1
		end
	end
	-- Create the container for the current keyword on first use, so keywords
	-- without any etymon do not produce empty containers.
	local function ensure_container()
		if not current_container or current_container.keyword ~= current_keyword then
			current_container = {
				keyword = current_keyword,
				keyword_info = M.data.keywords[current_keyword],
				keyword_modifiers = current_keyword_modifiers,
				terms = {},
			}
			table.insert(node.children, current_container)
		end
	end
	for i = 2, #args do
		local param = args[i]
		if is_keyword(param) then
			local keyword, modifiers = EtymonParser.parse_keyword_modifiers(param)
			current_keyword = keyword
			current_keyword_modifiers = modifiers
			current_container = nil -- Force new container for new keyword
		elseif type(param) == "string" and param:sub(1, 1) == ":" then
			error("Invalid keyword '" .. param .. "'. Did you mean a valid keyword like ':bor', ':inh', etc.?")
		elseif type(param) == "string" then
			local etymon_data = EtymonParser.parse_etymon(param, lang)
			if etymon_data then
				-- Track keyword usage at top level
				track_keyword_usage(current_keyword, lang, etymon_data.lang)
				local term_node = {}
				-- Handle suppress_term (-) and unknown_term (empty) directly
				if etymon_data.suppress_term or etymon_data.unknown_term then
					ensure_container()
					if etymon_data.ety then
						local inline_args = EtymonParser.parse_inline_ety(etymon_data.ety, etymon_data.lang)
						inline_args.id = etymon_data.id
						inline_args.status = M.data.STATUS.INLINE
						term_node = TreeBuilder.build(etymon_data.lang, nil, inline_args, seen, depth + 1)
					else
						term_node = {
							lang = etymon_data.lang,
							children = {},
							status = M.data.STATUS.OK,
						}
					end
					term_node.suppress_term = etymon_data.suppress_term
					term_node.unknown_term = etymon_data.unknown_term
					term_node.is_uncertain = etymon_data.unc
					term_node.ref = etymon_data.ref
					term_node.t = etymon_data.t
					term_node.tr = etymon_data.tr
					term_node.ts = etymon_data.ts
					term_node.alt = etymon_data.alt
					term_node.pos = etymon_data.pos
				else
					-- Regular term: fetch arguments from page
					local etymon_args, page_of, resolved_etymon_id = DataRetriever.get_etymon_args(etymon_data, is_toplevel)
					-- A requested id that does not exist on a page that does have
					-- etymon ids is flagged as a mismatch.
					if etymon_data.id and etymon_args == M.data.STATUS.MISSING and not etymon_data.ety then
						local page = M.links.get_link_page(etymon_data.term, etymon_data.lang)
						local norm_lang = Util.get_norm_lang(etymon_data.lang)
						local base_key = norm_lang:getFullCode() .. ":" .. page
						local available_ids = __state.available_etymon_ids[base_key] or {}
						if #available_ids > 0 then
							__state.has_mismatched_id = true
						end
					end
					-- Check for <ety> inline parameter doesn't override the scraped arguments, unless the latter are missing
					if etymon_data.ety then
						if etymon_args == M.data.STATUS.REDLINK or etymon_args == M.data.STATUS.MISSING then
							__state.current_page_has_inline_etymology = true
							if is_toplevel then
								__state.toplevel_has_inline_etymology = true
							end
							local inline_args = EtymonParser.parse_inline_ety(etymon_data.ety, etymon_data.lang)
							-- Track inline ety keywords too
							local inline_keyword = get_keyword(inline_args[2], true)
							if inline_keyword and #inline_args >= 3 then
								local inline_etymon = EtymonParser.parse_etymon(inline_args[3], etymon_data.lang)
								if inline_etymon then
									track_keyword_usage(inline_keyword, etymon_data.lang, inline_etymon.lang)
								end
							end
							inline_args.id = etymon_data.id
							inline_args.status = M.data.STATUS.INLINE
							etymon_args = inline_args
							-- The inline etymology lives on the same page as the parent.
							-- This must go through `page_of`: `term_node` is replaced
							-- by the build() result below, and its `page_of` field is
							-- then assigned from this local.
							page_of = __state.cached_etymon_pages[key]
						else
							-- Scraped arguments exist, <ety> is redundant and ignored
							__state.current_page_has_redundant_etymology = true
							if is_toplevel then
								__state.toplevel_redundant_etymology = true
							end
						end
					end
					-- Ensure container exists before checking keyword info
					ensure_container()
					-- Check if current keyword has no_child_categories - if so, stop recursion
					local keyword_info = current_container.keyword_info
					local should_stop_recursion = (stop_recursion or (keyword_info and keyword_info.no_child_categories))
					term_node = TreeBuilder.build(etymon_data.lang, etymon_data.term, etymon_args, seen, depth + 1, should_stop_recursion)
					term_node.target_key = Util.get_norm_lang(etymon_data.lang):getFullCode() ..
						":" .. M.links.get_link_page(etymon_data.term, etymon_data.lang)
					term_node.id = etymon_data.id
					term_node.etymon_id = resolved_etymon_id -- The actual etymon id when resolved via senseid
					term_node.t = etymon_data.t
					term_node.tr = etymon_data.tr
					term_node.ts = etymon_data.ts
					term_node.pos = etymon_data.pos
					term_node.alt = etymon_data.alt
					term_node.ref = etymon_data.ref
					term_node.is_uncertain = etymon_data.unc
					term_node.override = etymon_data.override
					term_node.page_of = page_of
					term_node.aftype = etymon_data.aftype
					term_node.postype = etymon_data.postype
					term_node.bor = etymon_data.bor
				end
				table.insert(current_container.terms, term_node)
			end
		end
	end
	return node
end
--- Convert an etymology tree node into a plain, JSON-serialisable table.
-- Containers without `keyword_info` are left out. Falsy flags and an empty id
-- are emitted as nil so they do not appear in the output.
-- @param node tree node as produced by TreeBuilder.build
-- @return table with term, lang, lang_name, id, status, is_uncertain,
--   is_duplicate, gloss, transliteration, transcription, alt, pos and children
local function tree_to_json(node)
	local node_id = node.id
	if not node_id or node_id == "" then
		node_id = nil
	end

	local result = {
		term = node.title,
		lang = node.lang:getCode(),
		lang_name = node.lang:getCanonicalName(),
		id = node_id,
		status = node.status,
		is_uncertain = node.is_uncertain or nil,
		is_duplicate = node.is_duplicate or nil,
		gloss = node.t,
		transliteration = node.tr,
		transcription = node.ts,
		alt = node.alt,
		pos = node.pos,
		children = {},
	}

	for _, container in ipairs(node.children or {}) do
		local info = container.keyword_info
		if info then
			local terms = {}
			for index, term in ipairs(container.terms or {}) do
				terms[index] = tree_to_json(term)
			end
			local modifiers = container.keyword_modifiers
			result.children[#result.children + 1] = {
				keyword = container.keyword,
				keyword_label = info.text,
				keyword_abbrev = info.abbrev,
				is_group = info.is_group or nil,
				is_invisible = info.invisible or nil,
				is_uncertain = (modifiers and modifiers.unc) or nil,
				terms = terms,
			}
		end
	end
	return result
end
--- Emit tracking entries for a numeric value and for the first range it falls into.
-- Every entry is emitted as "etymon/<base_key>/<suffix>" and, when `lang_code`
-- is given, additionally as "etymon/lang/<lang_code>/<base_key>/<suffix>".
-- @param base_key tracking key segment
-- @param value the value being tracked
-- @param ranges array of { label = ..., min = ?, max = ?, exact = ? }; `exact`
--   is only consulted when neither `min` nor `max` is present
-- @param lang_code optional language code
local function track_ranges(base_key, value, ranges, lang_code)
	-- Track one suffix globally and, if applicable, per language.
	local function emit(suffix)
		M.track("etymon/" .. base_key .. "/" .. suffix)
		if lang_code then
			M.track("etymon/lang/" .. lang_code .. "/" .. base_key .. "/" .. suffix)
		end
	end

	-- Does `value` satisfy the bounds of this range?
	local function in_range(range)
		local lower, upper = range.min, range.max
		if lower and upper then
			return value >= lower and value <= upper
		end
		if lower then
			return value >= lower
		end
		if upper then
			return value <= upper
		end
		if range.exact then
			return value == range.exact
		end
		return false
	end

	emit(value)
	for _, range in ipairs(ranges) do
		if in_range(range) then
			-- Only the first matching range is reported.
			emit(range.label)
			break
		end
	end
end
--- Build and return the etymology data tree for a given term.
-- @param lang language object of the term
-- @param title term / page title
-- @param args {{etymon}} arguments; a leading language code equal to
--   `lang:getCode()` is dropped before the tree arguments are assembled
-- @param options optional table:
--   * validate: run EtymonParser.validate on `args` first;
--   * id: etymon id (falls back to `args.id`);
--   * pos: part of speech, passed to validation;
--   * json: return a JSON string instead of the tree table
-- @return the root tree node, or its JSON encoding when `options.json` is set
function export.get_tree(lang, title, args, options)
	options = options or {}
	if options.validate then
		EtymonParser.validate(lang, args, options.id, title, options.pos, false)
	end

	local lang_code = lang:getCode()
	local tree_args = { lang_code, id = options.id or args.id }
	local first = 1
	if args[1] == lang_code then
		first = 2
	end
	for i = first, #args do
		tree_args[#tree_args + 1] = args[i]
	end

	-- Register the term's own arguments in the cache.
	local cache_key = lang_code .. ":" .. title .. ":" .. (tree_args.id or "")
	__state.cached_etymon_args[cache_key] = tree_args

	local ety_data_tree = TreeBuilder.build(lang, title, tree_args)
	parse_tree_references(ety_data_tree)
	if not options.json then
		return ety_data_tree
	end
	return M.JSON.toJSON(tree_to_json(ety_data_tree))
end
-- Given a language code, page name and optionally the id= parameter,
-- render the etymology tree, and only the tree, for the relevant page.
-- Fetches and parses the corresponding {{etymon}} from the requested page,
-- and any further pages needed to render the tree.
--
-- Parameters can be passed either directly through the #invoke or as
-- template parameters *through* an #invoke; a value given on the #invoke
-- takes precedence:
--   1      language code (required)
--   2      page name (required); must resolve to namespace 0 or 118
--   id     ID of the {{etymon}} to use (optional)
--   title  title to display for the top-level term (optional)
--
-- Raises an error when a required parameter is missing, when the page name
-- is not a valid title or lies in an unsupported namespace, or when no
-- matching {{etymon}} is found on the page.
function export.render_tree_for_etymon_on_page(frame)
	local frame_args = frame.args
	-- getParent() yields nil when there is no parent frame; fall back to an
	-- empty argument table instead of indexing nil.
	local parent_frame = frame:getParent()
	local parent_args = parent_frame and parent_frame.args or {}

	local langcode = frame_args[1] or parent_args[1]
	local pagename = frame_args[2] or parent_args[2]
	local id = frame_args["id"] or parent_args["id"]
	local display_title = frame_args["title"] or parent_args["title"]

	if not langcode then
		error("Missing language code (parameter 1) for render_tree_for_etymon_on_page.")
	end
	if not pagename then
		error("Missing page name (parameter 2) for render_tree_for_etymon_on_page.")
	end

	-- mw.title.new returns nil for an invalid title, so check before use.
	local parsed_title = mw.title.new(pagename, 0)
	if not parsed_title then
		error("Invalid page name for render_tree_for_etymon_on_page: '" .. tostring(pagename) .. "'")
	end

	local title
	if parsed_title.namespace == 0 then
		title = M.pages.safe_page_name(parsed_title)
	elseif parsed_title.namespace == 118 then
		-- Pages in namespace 118 get a leading "*"
		-- (presumably the Reconstruction namespace; confirm against site config).
		title = "*" .. M.pages.safe_page_name(parsed_title)
	else
		error("Unsupported namespace for render_tree_for_etymon_on_page: " .. parsed_title.namespace)
	end

	local lang = Util.get_lang(langcode)

	-- Construct etymon_data for DataRetriever.get_etymon_args.
	local etymon_data = {
		lang = lang,
		term = title,
		id = id
	}
	-- Only the first return value is needed here.
	local args = DataRetriever.get_etymon_args(etymon_data, true)
	if args == M.data.STATUS.MISSING then
		error("The etymon template was not found (language " ..
			langcode ..
			", title '" ..
			title ..
			"'" ..
			(id and ", ID '" .. id .. "'" or ", no ID given") .. "). Page contents may have changed in the interim.")
	end

	local ety_data_tree = export.get_tree(lang, display_title or title, args, {
		validate = true,
		id = id,
	})

	local output = {}
	table.insert(output, M.template_styles("Module:etymon/styles.css"))
	table.insert(output, M.tree.render({
		data_tree = ety_data_tree,
		format_term_func = function(term, is_toplevel)
			return Util.format_term(term, is_toplevel, { suppress_gloss = true, suppress_pos = true })
		end,
	}))
	return table.concat(output)
end
-- Main rendering function of this module: reads the arguments of the parent
-- frame, builds the etymology data tree through export.get_tree and returns
-- the concatenated output. In order, the output holds: the tree (if tree= is
-- set), the etymon ID anchor, the etymology text (if text= is set), the
-- expanded "rfe" template (if rfe= is set), the formatted categories and any
-- warnings collected in __state.warnings.
-- With json= set, the value returned by export.get_tree in JSON mode is
-- returned directly and nothing else is rendered.
-- Errors raised in this function: language not matching the current L2 header,
-- tree/text requested for a language whose exception entry disallows it, and
-- an unrecognized text= value.
function export.main(frame)
local parent_args = frame:getParent().args
-- The parameter specification is the "etymon" entry of the parameters data module.
local args = M.parameters.process(parent_args, mw.loadData(Loader.modules.parameters_data).etymon)
-- args[1] is used below as a language object (getCode, getCanonicalName, ...),
-- so it is presumably converted by M.parameters.process; args[2] is the list of
-- etymon arguments handed to get_tree.
local lang = args[1]
local etymon_args = args[2]
local id = args.id
local title = args.title
local text = args.text
local tree = args.tree
local etydate = args.etydate
local rfe = args.rfe
local page_data = Util.get_page_data()
-- Without an explicit title, use the current page name, prefixed with "*"
-- for pages in the Reconstruction namespace.
if not title then
title = page_data.pagename
if page_data.namespace == "Reconstruction" then title = "*" .. title end
end
-- When an L2 header is known, the canonical name of the normalized language
-- must be identical to it; otherwise the call is rejected.
local current_L2 = M.pages.get_current_L2()
if current_L2 then
local norm_lang = Util.get_norm_lang(lang)
local norm_name = norm_lang:getCanonicalName()
if current_L2 ~= norm_name then
local lang_desc = lang:getCode() .. " (" .. lang:getCanonicalName() .. ")"
-- Mention the normalized language only when it differs from the given one.
if norm_lang:getCode() ~= lang:getCode() then
lang_desc = lang_desc .. ", normalized to " .. norm_lang:getCode() .. " (" .. norm_name .. ")"
end
error("Language '" .. lang_desc .. "' does not match the L2 header (" .. current_L2 .. ").")
end
end
local ety_data_tree = export.get_tree(lang, title, etymon_args, {
validate = true,
pos = args.pos,
id = id,
json = args.json,
})
-- In JSON mode get_tree has already returned the serialized form; skip all rendering.
if args.json then
return ety_data_tree
end
local output = {}
-- A per-language exception entry may disallow trees and/or texts; requesting
-- a disallowed feature is an error.
local lang_exc = Util.get_lang_exception(lang)
if lang_exc and lang_exc.disallow then
local disallow = lang_exc.disallow
local error_text = " for " .. lang:getFullName()
if disallow.ref then
error_text = error_text .. "; see " .. disallow.ref
else
error_text = error_text .. "."
end
if tree and disallow.tree then
error("Etymology trees are not allowed" .. error_text)
end
if text and disallow.text then
error("Etymology texts are not allowed" .. error_text)
end
end
-- etydate=: parsed with inline modifiers (ref, refn, nocap), then formatted
-- through M.etydate.format_etydate and stored on the tree root.
if etydate then
local etydate_param_mods = {
ref = { list = true, type = "references", allow_holes = true },
refn = { list = true, allow_holes = true },
nocap = { type = "boolean" },
}
-- The main text is split on commas into a list of trimmed specs.
local function generate_etydate_obj(etydate_text)
local etydate_specs = {}
for spec in etydate_text:gmatch("[^,]+") do
table.insert(etydate_specs, mw.text.trim(spec))
end
return { [1] = etydate_specs }
end
local parsed_etydate = M.parse_utilities.parse_inline_modifiers(etydate, { param_mods = etydate_param_mods, generate_obj = generate_etydate_obj })
local etydate_args = {
[1] = parsed_etydate[1],
nocap = parsed_etydate.nocap or false,
ref = parsed_etydate.ref or {},
refn = parsed_etydate.refn or { maxindex = 0 }
}
-- Recompute maxindex as the largest numeric key actually present in refn.
-- NOTE(review): refn always has a value at this point (default assigned
-- above), so this condition is always true.
if etydate_args.refn then
local max = 0
for k, v in pairs(etydate_args.refn) do
if type(k) == "number" and k > max then
max = k
end
end
etydate_args.refn.maxindex = max
end
ety_data_tree.etydate = M.etydate.format_etydate(etydate_args)
-- Parse and store references separately for text rendering
local refs_text = ""
-- NOTE(review): the separator expression below yields "" in both branches, so
-- the references are concatenated with no separator. Also, ipairs stops at the
-- first hole although the ref modifier is declared with allow_holes.
for _, ref in ipairs(etydate_args.ref) do
refs_text = refs_text .. (refs_text ~= "" and "" or "") .. ref
end
if refs_text ~= "" then
ety_data_tree.etydate_refs = M.references.parse_references(refs_text)
end
end
-- tree=: emit the stylesheet and the rendered tree; glosses and parts of
-- speech are suppressed in the tree's term formatting.
if tree then
table.insert(output, M.template_styles("Module:etymon/styles.css"))
table.insert(output, M.tree.render({
data_tree = ety_data_tree,
format_term_func = function(term, is_toplevel)
return Util.format_term(term, is_toplevel, { suppress_gloss = true, suppress_pos = true })
end,
}))
end
-- Check if there are any visible children (non-invisible keywords)
local has_visible_children = false
for _, child in ipairs(ety_data_tree.children or {}) do
local child_keyword_info = child.keyword_info
if not (child_keyword_info and child_keyword_info.invisible) then
has_visible_children = true
break
end
end
-- The anchor is always emitted; it is flagged as having an empty tree when
-- nothing visible hangs off the root or trees are disallowed for the language.
local tree_disallowed = lang_exc and lang_exc.disallow and lang_exc.disallow.tree
local anchor = M.anchors.etymonid(lang, id, {
no_tree = args.notree,
title = title,
empty_tree = (not has_visible_children) or tree_disallowed
})
table.insert(output, anchor)
-- text=: selects how far the generated etymology text follows the chain.
--   "++"      no depth limit, do not stop at blue links
--   "+"       depth limit 1
--   "*"       no depth limit, stop at the first blue link
--   ":code"   stop at language `code`; max_depth and stop_at_blue_link stay nil.
--             An empty or invalid code adds a warning and falls back to "++".
--   number    depth limit (must be >= 1)
-- Anything else is an error.
if text then
local max_depth, stop_at_blue_link, stop_at_lang
if text == "++" then
max_depth, stop_at_blue_link = false, false
elseif text == "+" then
max_depth, stop_at_blue_link = 1, false
elseif text == "*" then
max_depth, stop_at_blue_link = false, true
elseif text:sub(1, 1) == ":" then
-- Stop at a specific language, e.g., ":ar" stops at first Arabic term
local lang_code = text:sub(2)
if lang_code ~= "" then
-- Validate the language code
local lang_obj = Util.get_lang(lang_code, true)
if lang_obj then
stop_at_lang = lang_code
else
Util.add_warning('Invalid language code "' .. lang_code .. '" in text parameter. Showing full chain instead.')
max_depth, stop_at_blue_link = false, false -- default to ++
end
else
Util.add_warning('Empty language code in text parameter. Showing full chain instead.')
max_depth, stop_at_blue_link = false, false -- default to ++
end
else
local num = tonumber(text)
if num and num >= 1 then
max_depth, stop_at_blue_link = num, false
else
error('Invalid text value "' ..
text .. '". Valid values are: "++" (full chain), "+" (first step only), "*" (until first blue link), a number (max steps), or ":lang" (stop at language)')
end
end
table.insert(output, M.text.render({
data_tree = ety_data_tree,
format_term_func = Util.format_term,
max_depth = max_depth,
stop_at_blue_link = stop_at_blue_link,
curr_page = page_data.pagename,
nodot = args.nodot,
stop_at_lang = stop_at_lang,
}))
end
-- rfe=: parsed with inline modifiers and forwarded to the "rfe" template.
if rfe then
local rfe_param_mods = {
nocat = { type = "boolean" },
sort = {},
y = {},
m = {},
fragment = {},
section = {},
box = { type = "boolean" },
noes = { type = "boolean" },
}
-- A main text that M.yesno accepts as true is treated as a bare flag;
-- anything else is kept as text and passed on as the template's parameter 2.
local function generate_rfe_obj(rfe_text)
-- Check if it's a boolean true value
if M.yesno(rfe_text, false) then
return { is_boolean = true }
else
return { text = rfe_text }
end
end
local rfe_with_defaults = Util.add_boolean_defaults(rfe, rfe_param_mods)
local parsed_rfe = M.parse_utilities.parse_inline_modifiers(rfe_with_defaults, {
param_mods = rfe_param_mods,
generate_obj = generate_rfe_obj
})
local rfe_args = {
[1] = lang:getCode(),
nocat = parsed_rfe.nocat,
sort = parsed_rfe.sort,
y = parsed_rfe.y,
m = parsed_rfe.m,
fragment = parsed_rfe.fragment,
section = parsed_rfe.section,
box = parsed_rfe.box,
noes = parsed_rfe.noes,
}
if not parsed_rfe.is_boolean then
rfe_args[2] = parsed_rfe.text
end
table.insert(output, frame:expandTemplate({
title = "rfe",
args = rfe_args
}))
end
-- Size statistics (depth, node count, number of distinct languages) are
-- tracked only on content pages and only when the recorded depth is above 0.
-- Range lists are ordered so that the first matching entry wins.
if Util.is_content_page() and __state.max_depth_reached > 0 then
local lang_code = lang:getCode()
local depth_ranges = {
{ min = 50, label = "extremely-deep" },
{ min = 20, label = "20+" },
{ min = 10, max = 19, label = "10-19" },
{ min = 5, max = 9, label = "5-9" },
{ min = 3, max = 4, label = "3-4" },
{ max = 2, label = "1-2" }
}
local node_ranges = {
{ min = 100, label = "extremely-large" },
{ min = 50, label = "50+" },
{ min = 20, max = 49, label = "20-49" },
{ min = 10, max = 19, label = "10-19" },
{ min = 5, max = 9, label = "5-9" },
{ max = 4, label = "1-4" }
}
local language_ranges = {
{ min = 10, label = "10+" },
{ min = 5, max = 9, label = "5-9" },
{ min = 3, max = 4, label = "3-4" },
{ exact = 2, label = "2" },
{ exact = 1, label = "1" }
}
track_ranges("depth", __state.max_depth_reached, depth_ranges, lang_code)
track_ranges("nodes", __state.total_nodes, node_ranges, lang_code)
-- Count the distinct keys of __state.language_count.
local unique_languages = 0
for _ in pairs(__state.language_count) do
unique_languages = unique_languages + 1
end
track_ranges("unique-languages", unique_languages, language_ranges, lang_code)
-- A tree with exactly max_depth + 1 nodes is additionally tracked as
-- "linear-depth" (presumably a single unbranched chain; confirm how
-- total_nodes and max_depth_reached are counted).
if __state.total_nodes == __state.max_depth_reached + 1 then
track_ranges("linear-depth", __state.max_depth_reached, depth_ranges, lang_code)
end
end
-- Categories and keyword tracking are produced on content pages only.
local categories = {}
if Util.is_content_page() then
-- The categories module is skipped when the language exception entry
-- suppresses categories or nocat= is set; the categories added further
-- below are not affected by either.
local should_suppress_categories = lang_exc and lang_exc.suppress_categories
if not should_suppress_categories and not args.nocat then
categories = M.categories.render({
data_tree = ety_data_tree,
page_lang = lang,
available_etymon_ids = __state.available_etymon_ids,
senseid_parent_etymon = __state.senseid_parent_etymon,
get_norm_lang_func = Util.get_norm_lang,
lang_exc = lang_exc,
})
end
local target_lang_code = lang:getCode()
for keyword, keyword_data in pairs(__state.toplevel_keyword_stats) do
-- Track keyword globally
M.track("etymon/keyword/" .. keyword)
-- Track keyword per target language
M.track("etymon/keyword/" .. keyword .. "/target/" .. target_lang_code)
-- Track keyword per source language
-- NOTE(review): `count` is not used; only the source language codes matter here.
for source_code, count in pairs(keyword_data.source_langs) do
M.track("etymon/keyword/" .. keyword .. "/source/" .. source_code)
-- Track keyword per target+source combination
M.track("etymon/keyword/" .. keyword .. "/target/" .. target_lang_code .. "/source/" .. source_code)
end
end
-- Maintenance categories derived from the request and from flags in __state.
if tree then
table.insert(categories, "Pages with etymology trees")
table.insert(categories, lang:getCanonicalName() .. " ප්රවේශ, නිරුක්ති ශාඛා සහිත")
end
if text then table.insert(categories, lang:getCanonicalName() .. " entries with etymology texts") end
if args.exnihilo then table.insert(categories, lang:getCanonicalName() .. " terms coined ex nihilo") end
if __state.toplevel_has_inline_etymology then
table.insert(categories, "Pages with inline etymon for redlinks")
end
if __state.toplevel_redundant_etymology then
table.insert(categories, "Pages with redundant inline etymon")
end
if __state.toplevel_idless_etymon then
table.insert(categories, "Pages using etymon with no ID")
end
if __state.has_mismatched_id then
table.insert(categories, lang:getCanonicalName() .. " entries referencing etymons with mismatched IDs")
end
end
if #categories > 0 then
table.insert(output, M.utilities.format_categories(categories, lang))
end
-- Warnings collected in __state.warnings are appended last, one per line.
if __state.warnings then
for _, warning in ipairs(__state.warnings) do
table.insert(output, "\n" .. warning)
end
end
return table.concat(output)
end
return export