Files
SubMiner/stats/src/lib/reading-utils.ts
T

74 lines
2.2 KiB
TypeScript

/**
 * Report whether `ch` starts with a character from the Unicode Hiragana
 * block (U+3040–U+309F).
 *
 * Only the first UTF-16 code unit of `ch` is inspected; an empty string
 * yields `false`.
 */
function isHiragana(ch: string): boolean {
  const unit = ch.charCodeAt(0);
  // Written as positive range checks so that NaN (empty input) is rejected.
  if (unit >= 0x3040) {
    return unit <= 0x309f;
  }
  return false;
}
/**
 * Report whether `ch` starts with a character from the Unicode Katakana
 * block (U+30A0–U+30FF), which includes the long-vowel mark ー.
 *
 * Only the first UTF-16 code unit of `ch` is inspected; an empty string
 * yields `false`.
 */
function isKatakana(ch: string): boolean {
  const unit = ch.charCodeAt(0);
  return 0x30a0 <= unit && unit <= 0x30ff;
}
/**
 * Convert every katakana letter in `text` to its hiragana counterpart.
 *
 * Characters in U+30A1–U+30F6 (ァ … ヶ) are shifted down by 0x60 to the
 * matching hiragana code point; everything else — including the long-vowel
 * mark ー, kanji, Latin text and emoji — is copied through unchanged.
 *
 * @param text Input string, possibly mixing scripts.
 * @returns A new string with the katakana letters replaced by hiragana.
 */
function katakanaToHiragana(text: string): string {
  // The range is BMP-only, so matching on UTF-16 code units is safe.
  return text.replace(/[\u30a1-\u30f6]/g, (kata) =>
    String.fromCharCode(kata.charCodeAt(0) - 0x60),
  );
}
/**
 * Reconstruct the full word reading from the surface form and the stored
 * (possibly partial) reading.
 *
 * MeCab/Yomitan sometimes stores only the kanji portion's reading. For example,
 * お前 (surface) with reading まえ — the stored reading covers only 前, missing
 * the leading お. This function takes the run of kana at the start and at the
 * end of the surface form (converted to hiragana) and wraps them around the
 * stored reading wherever the stored reading does not already contain them.
 * Kana between the first and last non-kana character are not inspected; that
 * middle portion is assumed to be covered by the stored reading.
 *
 * @param headword Surface form of the word (may mix kanji and kana).
 * @param storedReading Stored reading in hiragana or katakana; may cover only
 *   part of the word.
 * @returns The reading in hiragana with any missing leading/trailing kana
 *   restored. If either argument is empty, `storedReading` is returned
 *   unchanged (or `''` when it is empty).
 */
export function fullReading(headword: string, storedReading: string): string {
  if (!storedReading || !headword) return storedReading || '';

  const reading = katakanaToHiragana(storedReading);
  const chars = [...headword];
  const isKana = (ch: string): boolean => isHiragana(ch) || isKatakana(ch);

  // Index of the first non-kana character; everything before it is leading kana.
  let coreStart = 0;
  while (coreStart < chars.length && isKana(chars[coreStart])) {
    coreStart++;
  }
  if (coreStart === chars.length) {
    // The headword is kana only, so there is nothing to reconstruct.
    return reading;
  }

  // Exclusive end of the core; the scan never crosses the first non-kana
  // character, so the core always keeps at least one character.
  let coreEnd = chars.length;
  while (coreEnd - 1 > coreStart && isKana(chars[coreEnd - 1])) {
    coreEnd--;
  }

  const leadStr = katakanaToHiragana(chars.slice(0, coreStart).join(''));
  const trailStr = katakanaToHiragana(chars.slice(coreEnd).join(''));

  // Strip matching trailing kana from the stored reading to get the core reading.
  let coreReading = reading;
  if (trailStr && coreReading.endsWith(trailStr)) {
    coreReading = coreReading.slice(0, -trailStr.length);
  }

  // If the stored reading already begins with the leading kana, do not add
  // them again. The trailing kana are still appended: they were either
  // stripped above or were missing from the stored reading altogether
  // (e.g. お願い stored as おねが must still yield おねがい).
  const hasLead = leadStr !== '' && coreReading.startsWith(leadStr);
  return (hasLead ? '' : leadStr) + coreReading + trailStr;
}