-- Module:etymon/categories

-- From Wiktionary


-- Table of public functions; returned at the end of the module.
local export = {}

-- Shared etymon configuration, loaded through mw.loadData.
local data = mw.loadData("Module:etymon/data")
-- Term status values. This module compares term.status against STATUS.MISSING,
-- STATUS.REDLINK, STATUS.AMBIGUOUS, STATUS.OK and STATUS.INLINE.
local STATUS = data.STATUS
-- Transitivity modes: ALWAYS, NEVER, CROSS_LANG and CROSS_LANG_NO_INTERNAL_SOURCE.
local TRANSITIVE = data.TRANSITIVE
-- Base configuration per keyword, indexed by keyword string.
local keywords = data.keywords

-- Decide whether a keyword is transitive for a given term.
--   transitive_mode: one of the TRANSITIVE.* values from the keyword config
--   page_lang, term_lang: language objects exposing :getCode()
-- ALWAYS and NEVER are unconditional; both cross-language modes are transitive
-- exactly when the term's language code differs from the page's.
-- Raises an error for any other mode.
local function is_transitive(transitive_mode, page_lang, term_lang)
	if transitive_mode == TRANSITIVE.ALWAYS then
		return true
	end
	if transitive_mode == TRANSITIVE.NEVER then
		return false
	end

	local is_cross_lang_mode = transitive_mode == TRANSITIVE.CROSS_LANG
		or transitive_mode == TRANSITIVE.CROSS_LANG_NO_INTERNAL_SOURCE
	if not is_cross_lang_mode then
		error("Unknown transitive mode: " .. tostring(transitive_mode))
	end

	return page_lang:getCode() ~= term_lang:getCode()
end

-- Look up the configuration for `keyword`, applying language-specific overrides.
--   keyword: key into the shared `keywords` table
--   lang_exc: optional table; its keyword_overrides[keyword] entry, if present,
--             is layered on top of the base config
-- Returns nil for an unknown keyword, the base config itself when there is
-- nothing to override, and otherwise a fresh shallow-merged table (override
-- fields win; the base config is left untouched).
local function get_keyword_config(keyword, lang_exc)
	local defaults = keywords[keyword]
	if not defaults then
		return nil -- Invalid keyword
	end

	local override_map = lang_exc and lang_exc.keyword_overrides
	local lang_specific = override_map and override_map[keyword]
	if lang_specific then
		-- Copy base fields first so that override fields replace them
		local combined = {}
		for _, layer in ipairs({ defaults, lang_specific }) do
			for field, value in pairs(layer) do
				combined[field] = value
			end
		end
		return combined
	end

	return defaults
end

-- Return the category name for `source`: the second value returned by
-- Module:etymology's get_display_and_cat_name(source, true).
function export.get_cat_name(source)
	local _, category_name = require("Module:etymology").get_display_and_cat_name(source, true)
	return category_name
end

-- Short affix-type codes mapped to the canonical names used in this module
local aftype_aliases = {
	pre = "prefix",
	suf = "suffix",
	["in"] = "infix", -- `in` is a reserved word, so it keeps the bracket form
	inter = "interfix",
	circum = "circumfix",
	naf = "non-affix",
	root = "non-affix",
}

-- Return true when `wanted_id` is among the IDs listed in `target_ids`.
-- Entries are either plain IDs or tables carrying the ID in their `id` field.
local function etymon_id_listed(target_ids, wanted_id)
	for _, id_data in ipairs(target_ids) do
		local stored_id = type(id_data) == "table" and id_data.id or id_data
		if stored_id == wanted_id then
			return true
		end
	end
	return false
end

-- Return true when more than one key of `senseid_parent_etymon` starts with
-- "<target_key>:". Stops scanning as soon as a second match is found.
local function has_multiple_senseids(senseid_parent_etymon, target_key)
	local target_prefix = target_key .. ":"
	local found = 0
	for key in pairs(senseid_parent_etymon) do
		if key:sub(1, #target_prefix) == target_prefix then
			found = found + 1
			if found > 1 then
				return true
			end
		end
	end
	return false
end

-- Work out which ID (if any) should disambiguate `term` in an affix category.
-- Returns nil when the term has no ID or no disambiguation is needed.
local function resolve_disambiguation_id(term, available_etymon_ids, senseid_parent_etymon)
	if not term.id then
		return nil
	end

	local target_ids = available_etymon_ids[term.target_key]
	local has_multiple_ids = target_ids and #target_ids > 1

	-- Case 1: the user-supplied ID is a known senseid of the target page
	if senseid_parent_etymon and senseid_parent_etymon[term.target_key .. ":" .. term.id] then
		if has_multiple_senseids(senseid_parent_etymon, term.target_key) then
			-- Ambiguous senseid: use senseid
			return term.id
		elseif has_multiple_ids then
			-- Unique senseid but ambiguous etymon: use etymon ID
			return term.etymon_id or term.id
		end
		return nil
	end

	-- Case 2: the ID only matters when the target has several etymon IDs
	if not has_multiple_ids then
		return nil
	end
	if etymon_id_listed(target_ids, term.id) then
		-- Ambiguous etymon: use etymon ID
		return term.id
	end
	-- Fallback: check resolved etymon_id (e.g. from previous steps)
	if term.etymon_id and etymon_id_listed(target_ids, term.etymon_id) then
		return term.etymon_id
	end
	return nil
end

-- Collect affix categories from top-level group containers.
--   node: tree node whose `children` are containers with `keyword_info` and `terms`
--   page_lang: language object of the page (needs :getCode())
--   available_etymon_ids: map from term.target_key to a list of etymon IDs
--   senseid_parent_etymon: optional map keyed by "<target_key>:<senseid>"
--   lang_exc: accepted for signature compatibility; not used here
-- Only containers whose keyword_info.affix_categories is set contribute, and
-- terms flagged unknown_term are skipped. The surviving terms are passed as
-- `parts` to Module:affix's get_affix_categories_only; its result is returned
-- as a flat list of category strings (table entries contribute their `cat`).
-- The ID lookups are only performed for terms that actually carry an `id`, and
-- Module:affix is only loaded when there is at least one part.
local function collect_affix_categories(node, page_lang, available_etymon_ids, senseid_parent_etymon, lang_exc)
	local parts = {}

	for _, container in ipairs(node.children or {}) do
		local config = container.keyword_info
		if config and config.affix_categories then
			for _, term in ipairs(container.terms or {}) do
				if not term.unknown_term then
					-- Parts are numbered in the order they are kept
					local part_index = #parts + 1
					local part_data = {
						term = term.title,
						tr = term.tr,
						ts = term.ts,
						alt = term.alt,
						itemno = part_index,
						orig_index = part_index
					}

					-- Determine affix type: explicit aftype > pos=root > auto-detect
					-- (auto-detect = leave `type` unset)
					if term.aftype then
						part_data.type = aftype_aliases[term.aftype] or term.aftype
					elseif term.args and term.args.pos == "root" then
						part_data.type = "non-affix"
					end

					-- Only record the language when it differs from the page's
					if term.lang:getCode() ~= page_lang:getCode() then
						part_data.lang = term.lang
					end

					-- Use the matched ID if found; an explicit override keeps the raw ID
					local matched_id = resolve_disambiguation_id(term, available_etymon_ids, senseid_parent_etymon)
					if term.override or matched_id then
						part_data.id = matched_id or term.id
					end

					parts[part_index] = part_data
				end
			end
		end
	end

	if #parts == 0 then return {} end

	local affix_categories = require("Module:affix").get_affix_categories_only({
		lang = page_lang,
		parts = parts,
		pos = "term",
		sort_key = nil
	})

	local result = {}
	for _, cat in ipairs(affix_categories) do
		if type(cat) == "table" then
			table.insert(result, cat.cat)
		else
			table.insert(result, cat)
		end
	end

	return result
end

-- Add borrowing-related categories (top-level only).
--   categories: set (name -> true) that receives the category names
--   page_lang: language object of the page
--   term: term whose `lang` is the borrowing source
--   config: keyword config; reads `borrowing_type` and `specialized_borrowing`
-- When borrowing_type is "borrowed", the categories produced by
-- Module:etymology's insert_borrowed_cat are added. When specialized_borrowing
-- is set, Module:etymology/specialized is asked to render with notext, and the
-- category names are extracted from the [[Category:...]] links in its output.
-- Each helper module is loaded only when its branch is taken.
local function collect_borrowing_categories(categories, page_lang, term, config)
	if config.borrowing_type == "borrowed" then
		local temp_categories = {}
		require("Module:etymology").insert_borrowed_cat(temp_categories, page_lang, term.lang)
		for _, cat in ipairs(temp_categories) do
			categories[cat] = true
		end
	end

	if config.specialized_borrowing then
		local etymology_specialized_module = require("Module:etymology/specialized")
		local result = etymology_specialized_module.specialized_borrowing {
			bortype = config.specialized_borrowing,
			lang = page_lang,
			sources = { term.lang },
			terms = { { lang = term.lang, term = "-" } },
			notext = true,
			nocat = false,
		}

		-- Capture only the category title. A link may carry a sort key after a
		-- pipe ([[Category:Name|KEY]]); that key is not part of the name and
		-- must not leak into the set, so the capture stops at "|" and the
		-- remainder up to "]]" is skipped.
		for cat_name in result:gmatch("%[%[Category:([^%]|]+)[^%]]*%]%]") do
			categories[cat_name] = true
		end
	end
end

-- Add source-based derivation categories (top-level only).
-- Does nothing unless config.source_category_type is set. Otherwise asks
-- Module:etymology's insert_source_cat_get_display for the categories of
-- page_lang derived from term.lang, passing that value as `borrowing_type`,
-- and records each returned name in the `categories` set.
local function collect_source_derivation_categories(categories, page_lang, term, config)
	local derivation_type = config.source_category_type
	if derivation_type then
		local found = {}
		require("Module:etymology").insert_source_cat_get_display {
			lang = page_lang,
			source = term.lang,
			categories = found,
			borrowing_type = derivation_type,
			nocat = false,
		}

		for _, name in ipairs(found) do
			categories[name] = true
		end
	end
end

-- Add source language categories.
--   categories: set (name -> true) that receives the category names
--   page_lang: language object of the page (needs :getCode())
--   term: term whose `lang` is the source language
--   chain: chain state; `inherited` additionally requests "terms inherited" categories
--   get_norm_lang_func: maps a language object to its normalised language object
-- Nothing is added when the normalised source language has the same code as
-- the page language. The source language is normalised once and reused for
-- every lookup, and Module:etymology is only loaded when categories are needed.
local function collect_source_categories(categories, page_lang, term, chain, get_norm_lang_func)
	local source_lang = get_norm_lang_func(term.lang)
	if page_lang:getCode() == source_lang:getCode() then
		return
	end

	local etymology_module = require("Module:etymology")

	-- Ask Module:etymology for the categories of one kind and merge them into
	-- the set; a nil borrowing_type leaves the kind up to that module.
	local function add_source_categories(borrowing_type)
		local temp_categories = {}
		etymology_module.insert_source_cat_get_display {
			lang = page_lang,
			source = source_lang,
			categories = temp_categories,
			borrowing_type = borrowing_type,
			nocat = false,
		}
		for _, cat in ipairs(temp_categories) do
			categories[cat] = true
		end
	end

	add_source_categories(nil)
	if chain.inherited then
		add_source_categories("terms inherited")
	end
end

-- Add the root/word category for one term to the `categories` set.
--   page_lang: language object of the page
--   root_title: title of the tree's root entry (used to skip self-references)
--   term: term being categorised
--   available_etymon_ids: map from term.target_key to a list of ID entries
--   chain: chain state; reads `inside_affix` and `passed_through`
--   get_norm_lang_func: maps a language object to its normalised language object
--   lang_exc, keyword: used to fetch the keyword config (for pos_override)
-- Nothing is added unless the effective pos is "root" or "word", and nothing
-- is added for unknown terms, for descendants of affix groups, or when the
-- term is the page's own entry in the same language.
local function collect_pos_categories(categories, page_lang, root_title, term, available_etymon_ids, chain,
									  get_norm_lang_func, lang_exc, keyword)
	-- Pos precedence: term-level postype, then the keyword's pos_override, then args.pos
	local keyword_config = get_keyword_config(keyword, lang_exc)
	local pos = term.postype
		or (keyword_config and keyword_config.pos_override)
		or (type(term.args) == "table" and term.args.pos)

	-- Only roots and words are categorised; affix-group descendants never are
	local is_categorised_pos = pos == "root" or pos == "word"
	if not is_categorised_pos or term.unknown_term or chain.inside_affix then
		return
	end

	-- Skip self-references
	local page_code = get_norm_lang_func(page_lang):getFullCode()
	local term_code = get_norm_lang_func(term.lang):getFullCode()
	if page_code == term_code and root_title == term.title then
		return
	end

	-- Use makeEntryName to strip diacritics for category names
	local entry_name = term.lang:makeEntryName(term.title)
	local lang_name = page_lang:getCanonicalName()

	local relation
	if chain.passed_through then
		relation = " terms derived from the " .. export.get_cat_name(term.lang) .. " "
	else
		relation = " terms belonging to the "
	end
	local cat_name = lang_name .. relation .. pos .. " " .. entry_name

	-- Append the ID when the target has more than one entry with this pos.
	-- etymon_id is preferred (set when resolved via senseid); otherwise `id`
	-- is already an etymon ID.
	local candidates = available_etymon_ids[term.target_key]
	local effective_id = term.etymon_id or term.id
	if candidates and effective_id then
		local matches = 0
		for _, entry in ipairs(candidates) do
			if type(entry) == "table" and entry.pos == pos then
				matches = matches + 1
			end
		end
		if matches > 1 then
			cat_name = cat_name .. " (" .. effective_id .. ")"
		end
	end

	categories[cat_name] = true
end

-- Hyphen characters that mark a term as an affix when they begin or end its
-- title: regular hyphen, Hebrew maqqef, Arabic tatweel, Mongolian hyphen.
-- They are kept as whole UTF-8 strings and compared as substrings. Putting
-- them inside a `[...]` class of a byte-oriented string pattern would match
-- their individual bytes instead, so unrelated characters that merely share a
-- lead or trail byte (e.g. a final "à", or an initial Arabic letter) would be
-- mistaken for hyphens.
local AFFIX_HYPHENS = { "-", "־", "ـ", "᠊" }

-- Check if a term is an actual affix (not a non-affix member of an affix group).
--   term: table; reads `aftype`, `args.pos` and `title`
-- Returns a boolean. Precedence: an explicit aftype decides (anything other
-- than "non-affix" after alias normalisation is an affix); then pos=root means
-- non-affix; otherwise the title is inspected for a leading or trailing hyphen.
local function is_actual_affix(term)
	-- Check explicit aftype modifier
	if term.aftype then
		local normalized = aftype_aliases[term.aftype] or term.aftype
		return normalized ~= "non-affix"
	end
	-- Check if pos=root (treated as non-affix)
	if term.args and term.args.pos == "root" then
		return false
	end
	-- Auto-detect by hyphen: prefix ends with -, suffix starts with -, etc.
	if term.title then
		-- Strip leading * for reconstructed terms before checking hyphens
		-- (parentheses drop gsub's second return value)
		local title = (term.title:gsub("^%*", ""))
		for _, hyphen in ipairs(AFFIX_HYPHENS) do
			local width = #hyphen -- byte length of this hyphen's encoding
			if title:sub(1, width) == hyphen or title:sub(-width) == hyphen then
				return true
			end
		end
	end
	-- Default: not an affix
	return false
end

-- Compute the chain state for a term from its parent's chain and the keyword config.
--   parent_chain: chain state of the parent (same fields as the returned table)
--   config: keyword config; reads affix_categories, no_child_categories,
--           transitive and inherited_chain
--   page_lang, term_lang: language objects (need :getCode())
--   get_norm_lang_func: maps a language object to its normalised language object
--   parent_term_lang: language of the parent term, or nil to fall back to page_lang
--   term: the term itself (optional; only used for affix detection)
-- Returns a new table with the fields passed_through, inherited, source, pos,
-- internal_lang, recurse and inside_affix.
local function compute_category_chain(parent_chain, config, page_lang, term_lang, get_norm_lang_func, parent_term_lang, term)
	-- Track if we're inside an actual affix (for suppressing root categories on descendants).
	-- Only set if the term is an actual affix (prefix, suffix, etc.), not a non-affix member.
	local inside_affix = parent_chain.inside_affix
	if config.affix_categories and term and is_actual_affix(term) then
		inside_affix = true
	end

	-- Once any ancestor has passed through, the flag stays set (short-circuit);
	-- otherwise it is set when the term's normalised code differs from the page's.
	local passed_through = parent_chain.passed_through
		or page_lang:getCode() ~= get_norm_lang_func(term_lang):getCode()

	-- If no_child_categories is set, disable everything
	if config.no_child_categories then
		return {
			passed_through = passed_through,
			inherited = false,
			source = false,
			pos = false,
			recurse = false,
			inside_affix = inside_affix,
		}
	end

	local term_is_transitive = is_transitive(config.transitive, page_lang, term_lang)
	local new_source = parent_chain.source and term_is_transitive
	local internal_lang = parent_chain.internal_lang

	-- For CROSS_LANG_NO_INTERNAL_SOURCE: track the internal derivation language.
	-- The term counts as internal when its language matches the current
	-- internal context or, if there is none yet, the parent term's language
	-- (the page language when no parent language was given).
	if config.transitive == TRANSITIVE.CROSS_LANG_NO_INTERNAL_SOURCE then
		local term_lang_code = get_norm_lang_func(term_lang):getCode()
		local context_code = internal_lang
		if not context_code then
			context_code = get_norm_lang_func(parent_term_lang or page_lang):getCode()
		end

		if term_lang_code == context_code then
			-- Internal derivation: stop source categories, remember the language
			new_source = false
			internal_lang = term_lang_code
		else
			-- Cross-language: new_source keeps its value, the internal context ends
			internal_lang = nil
		end
	end

	local new_pos = parent_chain.pos

	return {
		passed_through = passed_through,
		inherited = parent_chain.inherited and config.inherited_chain,
		source = new_source,
		pos = new_pos,
		internal_lang = internal_lang,
		recurse = new_source or new_pos,
		inside_affix = inside_affix,
	}
end

-- Walk an etymon tree and return the list of category names it implies.
-- `opts` fields read here:
--   data_tree: root node (needs `title`; `children` are containers with
--              `keyword` and `terms`)
--   page_lang: language object of the page
--   available_etymon_ids, senseid_parent_etymon: ID lookup tables, passed on
--              to the category collectors
--   get_norm_lang_func: maps a language object to its normalised language object
--   lang_exc: optional language-specific exceptions (keyword overrides)
-- Returns an array of category names without duplicates. The order comes from
-- iterating a set with pairs() and is therefore unspecified.
function export.render(opts)
	opts = opts or {}
	local data_tree = opts.data_tree
	local page_lang = opts.page_lang
	local available_etymon_ids = opts.available_etymon_ids
	local senseid_parent_etymon = opts.senseid_parent_etymon
	local get_norm_lang_func = opts.get_norm_lang_func
	local lang_exc = opts.lang_exc

	local categories = {} -- set: category name -> true
	local seen = {}       -- set: node keys already processed
	local lang_name = page_lang:getCanonicalName()
	local root_title = data_tree.title

	-- Categories that only apply to terms directly under the root node
	local function collect_toplevel_term(term, term_chain, keyword, config)
		-- Missing/ambiguous etymon tracking
		if not term.unknown_term then
			if term.status == STATUS.MISSING or term.status == STATUS.REDLINK then
				categories[lang_name .. " entries referencing missing etymons"] = true
			end
			if term.status == STATUS.AMBIGUOUS then
				categories[lang_name .. " entries referencing ambiguous etymons"] = true
			end
		end

		-- Top-level category (e.g., "undefined derivations")
		if config.toplevel_category then
			categories[lang_name .. " " .. config.toplevel_category] = true
		end

		-- Borrowing categories (bor, lbor, slbor, ubor, obor)
		if config.borrowing_type or config.specialized_borrowing then
			collect_borrowing_categories(categories, page_lang, term, config)
		end

		-- Borrowing categories from <bor> modifier on :af terms
		if keyword == "af" and term.bor then
			collect_borrowing_categories(categories, page_lang, term, { borrowing_type = "borrowed" })
		end

		-- Source-based derivation categories (sl, calque, pcal)
		if config.source_category_type then
			collect_source_derivation_categories(categories, page_lang, term, config)
		end

		-- Skip all child categorisation if no_child_categories is set
		if config.no_child_categories ~= true then
			-- Source categories only if transitive. Transitivity is evaluated
			-- here, the only place it is needed, so nested terms and
			-- no_child_categories keywords never pay for (or fail on) it.
			if is_transitive(config.transitive, page_lang, term.lang) then
				collect_source_categories(categories, page_lang, term, term_chain, get_norm_lang_func)
			end

			-- Pos categories always (unless no_child_categories)
			collect_pos_categories(categories, page_lang, root_title, term, available_etymon_ids, term_chain,
				get_norm_lang_func, lang_exc, keyword)
		end
	end

	-- Categories for terms below the top level: governed by the parent chain
	local function collect_nested_term(term, term_chain, keyword, parent_chain)
		if parent_chain.source then
			collect_source_categories(categories, page_lang, term, term_chain, get_norm_lang_func)
		end

		if parent_chain.pos then
			collect_pos_categories(categories, page_lang, root_title, term, available_etymon_ids, term_chain,
				get_norm_lang_func, lang_exc, keyword)
		end
	end

	-- Collect the tree recursively
	local function collect(node, parent_chain, is_toplevel)
		-- Avoid processing same node twice
		if not node.unknown_term and node.title then
			local key = node.lang:getFullCode() .. ":" .. node.title .. ":" .. (node.id or "")
			if seen[key] then return end
			seen[key] = true
		end

		-- Collect affix categories at top level only
		if is_toplevel then
			local affix_cats = collect_affix_categories(node, page_lang, available_etymon_ids, senseid_parent_etymon, lang_exc)
			for _, cat in ipairs(affix_cats) do
				categories[lang_name .. " " .. cat] = true
			end
		end

		-- Process each container
		for _, container in ipairs(node.children or {}) do
			local keyword = container.keyword
			local config = get_keyword_config(keyword, lang_exc)

			-- Skip invalid keywords
			if config then
				for _, term in ipairs(container.terms or {}) do
					local term_chain = compute_category_chain(parent_chain, config, page_lang, term.lang, get_norm_lang_func, node.lang, term)

					if is_toplevel then
						collect_toplevel_term(term, term_chain, keyword, config)
					else
						collect_nested_term(term, term_chain, keyword, parent_chain)
					end

					-- Recurse into term's children if needed and status allows
					if term_chain.recurse and (term.status == STATUS.OK or term.status == STATUS.INLINE) then
						collect(term, term_chain, false)
					end
				end
			end
		end
	end

	-- Initial chain state
	local initial_chain = {
		passed_through = false,
		inherited = true,
		source = true,
		pos = true,
		internal_lang = nil,
		recurse = true,
		inside_affix = false,
	}

	collect(data_tree, initial_chain, true)

	-- Flatten the set into a list
	local cat_list = {}
	for cat in pairs(categories) do
		table.insert(cat_list, cat)
	end
	return cat_list
end

return export
-- Retrieved from "https://si.wiktionary.org/w/index.php?title=Module:etymon/categories&oldid=226939"