initial commit

This commit is contained in:
2026-02-09 19:04:19 -08:00
commit f92b57c7b6
531 changed files with 196294 additions and 0 deletions

View File

@@ -0,0 +1,618 @@
/*
* Copyright (C) 2025 Yomitan Authors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
// Mozc's default Romaji to Hiragana list referenced to create ROMAJI_TO_HIRAGANA
// https://github.com/google/mozc/blob/035668c3452fa98ac09462fd2cf556948964aad7/src/data/preedit/romanji-hiragana.tsv
/**
 * Ordered romaji -> hiragana replacement table.
 * Consumers iterate the entries in insertion order and apply each one with `replaceAll`,
 * so longer romaji sequences must stay ahead of the shorter sequences they contain.
 * No key may be an empty string: `replaceAll('', x)` inserts `x` between every character.
 */
export const ROMAJI_TO_HIRAGANA = {
    // Double letters - these **must** always be matched first or further down matches may cause inserting `っ` from double letters to require extra logic
    // There **must** be an entry for every accepted double letter
    // To not disturb further matches, an extra letter must be appended after the `っ`
    'qq': 'っq',
    'vv': 'っv',
    'll': 'っl',
    'xx': 'っx',
    'kk': 'っk',
    'gg': 'っg',
    'ss': 'っs',
    'zz': 'っz',
    'jj': 'っj',
    'tt': 'っt',
    'dd': 'っd',
    'hh': 'っh',
    'ff': 'っf',
    'bb': 'っb',
    'pp': 'っp',
    'mm': 'っm',
    'yy': 'っy',
    'rr': 'っr',
    'ww': 'っw',
    'cc': 'っc',
    // Length 4 - longest matches
    'hwyu': 'ふゅ',
    'xtsu': 'っ',
    'ltsu': 'っ',
    // Length 3
    'vya': 'ゔゃ',
    'vyi': 'ゔぃ',
    'vyu': 'ゔゅ',
    'vye': 'ゔぇ',
    'vyo': 'ゔょ',
    'kya': 'きゃ',
    'kyi': 'きぃ',
    'kyu': 'きゅ',
    'kye': 'きぇ',
    'kyo': 'きょ',
    'gya': 'ぎゃ',
    'gyi': 'ぎぃ',
    'gyu': 'ぎゅ',
    'gye': 'ぎぇ',
    'gyo': 'ぎょ',
    'sya': 'しゃ',
    'syi': 'しぃ',
    'syu': 'しゅ',
    'sye': 'しぇ',
    'syo': 'しょ',
    'sha': 'しゃ',
    'shi': 'し',
    'shu': 'しゅ',
    'she': 'しぇ',
    'sho': 'しょ',
    'zya': 'じゃ',
    'zyi': 'じぃ',
    'zyu': 'じゅ',
    'zye': 'じぇ',
    'zyo': 'じょ',
    'tya': 'ちゃ',
    'tyi': 'ちぃ',
    'tyu': 'ちゅ',
    'tye': 'ちぇ',
    'tyo': 'ちょ',
    'cha': 'ちゃ',
    'chi': 'ち',
    'chu': 'ちゅ',
    'che': 'ちぇ',
    'cho': 'ちょ',
    'cya': 'ちゃ',
    'cyi': 'ちぃ',
    'cyu': 'ちゅ',
    'cye': 'ちぇ',
    'cyo': 'ちょ',
    'dya': 'ぢゃ',
    'dyi': 'ぢぃ',
    'dyu': 'ぢゅ',
    'dye': 'ぢぇ',
    'dyo': 'ぢょ',
    'tsa': 'つぁ',
    'tsi': 'つぃ',
    'tse': 'つぇ',
    'tso': 'つぉ',
    'tha': 'てゃ',
    'thi': 'てぃ',
    'thu': 'てゅ',
    'the': 'てぇ',
    'tho': 'てょ',
    'dha': 'でゃ',
    'dhi': 'でぃ',
    'dhu': 'でゅ',
    'dhe': 'でぇ',
    'dho': 'でょ',
    'twa': 'とぁ',
    'twi': 'とぃ',
    'twu': 'とぅ',
    'twe': 'とぇ',
    'two': 'とぉ',
    'dwa': 'どぁ',
    'dwi': 'どぃ',
    'dwu': 'どぅ',
    'dwe': 'どぇ',
    'dwo': 'どぉ',
    'nya': 'にゃ',
    'nyi': 'にぃ',
    'nyu': 'にゅ',
    'nye': 'にぇ',
    'nyo': 'にょ',
    'hya': 'ひゃ',
    'hyi': 'ひぃ',
    'hyu': 'ひゅ',
    'hye': 'ひぇ',
    'hyo': 'ひょ',
    'bya': 'びゃ',
    'byi': 'びぃ',
    'byu': 'びゅ',
    'bye': 'びぇ',
    'byo': 'びょ',
    'pya': 'ぴゃ',
    'pyi': 'ぴぃ',
    'pyu': 'ぴゅ',
    'pye': 'ぴぇ',
    'pyo': 'ぴょ',
    'fya': 'ふゃ',
    'fyu': 'ふゅ',
    'fyo': 'ふょ',
    'hwa': 'ふぁ',
    'hwi': 'ふぃ',
    'hwe': 'ふぇ',
    'hwo': 'ふぉ',
    'mya': 'みゃ',
    'myi': 'みぃ',
    'myu': 'みゅ',
    'mye': 'みぇ',
    'myo': 'みょ',
    'rya': 'りゃ',
    'ryi': 'りぃ',
    'ryu': 'りゅ',
    'rye': 'りぇ',
    'ryo': 'りょ',
    'lyi': 'ぃ',
    'xyi': 'ぃ',
    'lye': 'ぇ',
    'xye': 'ぇ',
    'xka': 'ヵ',
    'xke': 'ヶ',
    'lka': 'ヵ',
    'lke': 'ヶ',
    'kwa': 'くぁ',
    'kwi': 'くぃ',
    'kwu': 'くぅ',
    'kwe': 'くぇ',
    'kwo': 'くぉ',
    'gwa': 'ぐぁ',
    'gwi': 'ぐぃ',
    'gwu': 'ぐぅ',
    'gwe': 'ぐぇ',
    'gwo': 'ぐぉ',
    'swa': 'すぁ',
    'swi': 'すぃ',
    'swu': 'すぅ',
    'swe': 'すぇ',
    'swo': 'すぉ',
    'zwa': 'ずぁ',
    'zwi': 'ずぃ',
    'zwu': 'ずぅ',
    'zwe': 'ずぇ',
    'zwo': 'ずぉ',
    'jya': 'じゃ',
    'jyi': 'じぃ',
    'jyu': 'じゅ',
    'jye': 'じぇ',
    'jyo': 'じょ',
    'tsu': 'つ',
    'xtu': 'っ',
    'ltu': 'っ',
    'xya': 'ゃ',
    'lya': 'ゃ',
    'wyi': 'ゐ',
    'xyu': 'ゅ',
    'lyu': 'ゅ',
    'wye': 'ゑ',
    'xyo': 'ょ',
    'lyo': 'ょ',
    'xwa': 'ゎ',
    'lwa': 'ゎ',
    'wha': 'うぁ',
    'whi': 'うぃ',
    'whu': 'う',
    'whe': 'うぇ',
    'who': 'うぉ',
    // Length 2
    'nn': 'ん',
    'n\'': 'ん',
    'va': 'ゔぁ',
    'vi': 'ゔぃ',
    'vu': 'ゔ',
    've': 'ゔぇ',
    'vo': 'ゔぉ',
    'fa': 'ふぁ',
    'fi': 'ふぃ',
    'fe': 'ふぇ',
    'fo': 'ふぉ',
    'xn': 'ん',
    'wu': 'う',
    'xa': 'ぁ',
    'xi': 'ぃ',
    'xu': 'ぅ',
    'xe': 'ぇ',
    'xo': 'ぉ',
    'la': 'ぁ',
    'li': 'ぃ',
    'lu': 'ぅ',
    'le': 'ぇ',
    'lo': 'ぉ',
    'ye': 'いぇ',
    'ka': 'か',
    'ki': 'き',
    'ku': 'く',
    'ke': 'け',
    'ko': 'こ',
    'ga': 'が',
    'gi': 'ぎ',
    'gu': 'ぐ',
    'ge': 'げ',
    'go': 'ご',
    'sa': 'さ',
    'si': 'し',
    'su': 'す',
    'se': 'せ',
    'so': 'そ',
    'ca': 'か',
    'ci': 'し',
    'cu': 'く',
    'ce': 'せ',
    'co': 'こ',
    'qa': 'くぁ',
    'qi': 'くぃ',
    'qu': 'く',
    'qe': 'くぇ',
    'qo': 'くぉ',
    'za': 'ざ',
    'zi': 'じ',
    'zu': 'ず',
    'ze': 'ぜ',
    'zo': 'ぞ',
    'ja': 'じゃ',
    'ji': 'じ',
    'ju': 'じゅ',
    'je': 'じぇ',
    'jo': 'じょ',
    'ta': 'た',
    'ti': 'ち',
    'tu': 'つ',
    'te': 'て',
    'to': 'と',
    'da': 'だ',
    'di': 'ぢ',
    'du': 'づ',
    'de': 'で',
    'do': 'ど',
    'na': 'な',
    'ni': 'に',
    'nu': 'ぬ',
    'ne': 'ね',
    'no': 'の',
    'ha': 'は',
    'hi': 'ひ',
    'hu': 'ふ',
    'fu': 'ふ',
    'he': 'へ',
    'ho': 'ほ',
    'ba': 'ば',
    'bi': 'び',
    'bu': 'ぶ',
    'be': 'べ',
    'bo': 'ぼ',
    'pa': 'ぱ',
    'pi': 'ぴ',
    'pu': 'ぷ',
    'pe': 'ぺ',
    'po': 'ぽ',
    'ma': 'ま',
    'mi': 'み',
    'mu': 'む',
    'me': 'め',
    'mo': 'も',
    'ya': 'や',
    'yu': 'ゆ',
    'yo': 'よ',
    'ra': 'ら',
    'ri': 'り',
    'ru': 'る',
    're': 'れ',
    'ro': 'ろ',
    'wa': 'わ',
    'wi': 'うぃ',
    'we': 'うぇ',
    'wo': 'を',
    // Length 1 - shortest matches
    'a': 'あ',
    'i': 'い',
    'u': 'う',
    'e': 'え',
    'o': 'お',
    // Length 1 Special/Symbols
    // Fullwidth punctuation and curly single quotes are written as escapes so that
    // tooling which strips or normalizes those characters cannot empty these entries.
    '.': '。',
    ',': '、',
    ':': '\uff1a', // fullwidth colon
    '/': '・',
    '!': '\uff01', // fullwidth exclamation mark
    '?': '\uff1f', // fullwidth question mark
    '~': '〜',
    '-': 'ー',
    '\u2018': '「', // left single quotation mark
    '\u2019': '」', // right single quotation mark
    '“': '『',
    '”': '』',
    '[': '\uff3b', // fullwidth left square bracket
    ']': '\uff3d', // fullwidth right square bracket
    '(': '\uff08', // fullwidth left parenthesis
    ')': '\uff09', // fullwidth right parenthesis
    '{': '\uff5b', // fullwidth left curly bracket
    '}': '\uff5d', // fullwidth right curly bracket
    ' ': ' ',
    // n -> ん is a special case.
    'n': 'ん',
};
/**
 * Ordered hiragana -> romaji replacement table.
 * Consumers iterate the entries in insertion order and apply each one with `replaceAll`,
 * so two-character kana sequences must stay ahead of the single kana they contain.
 * No key may be an empty string: `replaceAll('', x)` inserts `x` between every character.
 */
export const HIRAGANA_TO_ROMAJI = {
    // Length 2
    'んい': 'n\'i',
    'ゔぁ': 'va',
    'ゔぃ': 'vi',
    'ゔぉ': 'vo',
    'ゔゃ': 'vya',
    'ゔゅ': 'vyu',
    'ゔぇ': 've',
    'ゔょ': 'vyo',
    'きゃ': 'kya',
    'きぃ': 'kyi',
    'きゅ': 'kyu',
    'きぇ': 'kye',
    'きょ': 'kyo',
    'ぎゃ': 'gya',
    'ぎぃ': 'gyi',
    'ぎゅ': 'gyu',
    'ぎぇ': 'gye',
    'ぎょ': 'gyo',
    'しゃ': 'sha',
    'しぃ': 'syi',
    'しゅ': 'shu',
    'しぇ': 'she',
    'しょ': 'sho',
    // `cha` keeps this row consistent with `chi` / `chu` / `che` / `cho` and with `sha`
    'ちゃ': 'cha',
    'ちゅ': 'chu',
    'ちぇ': 'che',
    'ちょ': 'cho',
    'ちぃ': 'cyi',
    'ぢゃ': 'dya',
    'ぢぃ': 'dyi',
    'ぢゅ': 'dyu',
    'ぢぇ': 'dye',
    'ぢょ': 'dyo',
    'つぁ': 'tsa',
    'つぃ': 'tsi',
    'つぇ': 'tse',
    'つぉ': 'tso',
    'てゃ': 'tha',
    'てぃ': 'thi',
    'てゅ': 'thu',
    'てぇ': 'the',
    'てょ': 'tho',
    'でゃ': 'dha',
    'でぃ': 'dhi',
    'でゅ': 'dhu',
    'でぇ': 'dhe',
    'でょ': 'dho',
    'とぁ': 'twa',
    'とぃ': 'twi',
    'とぅ': 'twu',
    'とぇ': 'twe',
    'とぉ': 'two',
    'どぁ': 'dwa',
    'どぃ': 'dwi',
    'どぅ': 'dwu',
    'どぇ': 'dwe',
    'どぉ': 'dwo',
    'にゃ': 'nya',
    'にぃ': 'nyi',
    'にゅ': 'nyu',
    'にぇ': 'nye',
    'にょ': 'nyo',
    'ひゃ': 'hya',
    'ひぃ': 'hyi',
    'ひゅ': 'hyu',
    'ひぇ': 'hye',
    'ひょ': 'hyo',
    'びゃ': 'bya',
    'びぃ': 'byi',
    'びゅ': 'byu',
    'びぇ': 'bye',
    'びょ': 'byo',
    'ぴゃ': 'pya',
    'ぴぃ': 'pyi',
    'ぴゅ': 'pyu',
    'ぴぇ': 'pye',
    'ぴょ': 'pyo',
    'ふゃ': 'fya',
    'ふょ': 'fyo',
    'ふぁ': 'fa',
    'ふゅ': 'fyu',
    'ふぃ': 'fi',
    'ふぇ': 'fe',
    'ふぉ': 'fo',
    'みゃ': 'mya',
    'みぃ': 'myi',
    'みゅ': 'myu',
    'みぇ': 'mye',
    'みょ': 'myo',
    'りゃ': 'rya',
    'りぃ': 'ryi',
    'りゅ': 'ryu',
    'りぇ': 'rye',
    'りょ': 'ryo',
    'くぁ': 'kwa',
    'くぃ': 'kwi',
    'くぅ': 'kwu',
    'くぇ': 'kwe',
    'くぉ': 'kwo',
    'ぐぁ': 'gwa',
    'ぐぃ': 'gwi',
    'ぐぅ': 'gwu',
    'ぐぇ': 'gwe',
    'ぐぉ': 'gwo',
    'すぁ': 'swa',
    'すぃ': 'swi',
    'すぅ': 'swu',
    'すぇ': 'swe',
    'すぉ': 'swo',
    'ずぁ': 'zwa',
    'ずぃ': 'zwi',
    'ずぅ': 'zwu',
    'ずぇ': 'zwe',
    'ずぉ': 'zwo',
    'じゃ': 'ja',
    'じぃ': 'jyi',
    'じゅ': 'ju',
    'じぇ': 'je',
    'じょ': 'jo',
    'うぁ': 'wha',
    'うぃ': 'wi',
    'うぇ': 'we',
    'うぉ': 'who',
    'いぇ': 'ye',
    // Length 1
    'ん': 'n',
    'あ': 'a',
    'い': 'i',
    'う': 'u',
    'え': 'e',
    'お': 'o',
    'ゔ': 'vu',
    'か': 'ka',
    'き': 'ki',
    'く': 'ku',
    'け': 'ke',
    'こ': 'ko',
    'が': 'ga',
    'ぎ': 'gi',
    'ぐ': 'gu',
    'げ': 'ge',
    'ご': 'go',
    'さ': 'sa',
    'し': 'shi',
    'す': 'su',
    'せ': 'se',
    'そ': 'so',
    'ざ': 'za',
    'じ': 'ji',
    'ず': 'zu',
    'ぜ': 'ze',
    'ぞ': 'zo',
    'た': 'ta',
    'ち': 'chi',
    'つ': 'tsu',
    'て': 'te',
    'と': 'to',
    'だ': 'da',
    'ぢ': 'di',
    'づ': 'du',
    'で': 'de',
    'ど': 'do',
    'な': 'na',
    'に': 'ni',
    'ぬ': 'nu',
    'ね': 'ne',
    'の': 'no',
    'は': 'ha',
    'ひ': 'hi',
    'ふ': 'fu',
    'へ': 'he',
    'ほ': 'ho',
    'ば': 'ba',
    'び': 'bi',
    'ぶ': 'bu',
    'べ': 'be',
    'ぼ': 'bo',
    'ぱ': 'pa',
    'ぴ': 'pi',
    'ぷ': 'pu',
    'ぺ': 'pe',
    'ぽ': 'po',
    'ま': 'ma',
    'み': 'mi',
    'む': 'mu',
    'め': 'me',
    'も': 'mo',
    'や': 'ya',
    'ゆ': 'yu',
    'よ': 'yo',
    'ら': 'ra',
    'り': 'ri',
    'る': 'ru',
    'れ': 're',
    'ろ': 'ro',
    'わ': 'wa',
    'ゐ': 'wi',
    'ゑ': 'we',
    'を': 'wo',
    // Length 1 Special/Symbols
    // Fullwidth punctuation and curly single quotes are written as escapes so that
    // tooling which strips or normalizes those characters cannot empty these entries.
    '。': '.',
    '、': ',',
    '\uff1a': ':', // fullwidth colon
    '・': '/',
    '\uff01': '!', // fullwidth exclamation mark
    '\uff1f': '?', // fullwidth question mark
    '〜': '~',
    'ー': '-',
    '「': '\u2018', // left single quotation mark
    '」': '\u2019', // right single quotation mark
    '『': '“',
    '』': '”',
    '\uff3b': '[', // fullwidth left square bracket
    '\uff3d': ']', // fullwidth right square bracket
    '\uff08': '(', // fullwidth left parenthesis
    '\uff09': ')', // fullwidth right parenthesis
    '\uff5b': '{', // fullwidth left curly bracket
    '\uff5d': '}', // fullwidth right curly bracket
    ' ': ' ',
    // Length 1 Small - Even though these are usually represented with `x` or `l` prepending them, in romaji it makes the most sense to not do that
    'ゃ': 'ya',
    'ゅ': 'yu',
    'ょ': 'yo',
    'ゎ': 'wa',
    'ぁ': 'a',
    'ぃ': 'i',
    'ぅ': 'u',
    'ぇ': 'e',
    'ぉ': 'o',
    'ヵ': 'ka',
    'ヶ': 'ke',
    // Double letters - these **must** always be matched last or they will break previous matches
    'っq': 'qq',
    'っv': 'vv',
    'っx': 'xx',
    'っk': 'kk',
    'っg': 'gg',
    'っs': 'ss',
    'っz': 'zz',
    'っj': 'jj',
    'っt': 'tt',
    'っd': 'dd',
    'っh': 'hh',
    'っf': 'ff',
    'っb': 'bb',
    'っp': 'pp',
    'っm': 'mm',
    'っy': 'yy',
    'っr': 'rr',
    'っw': 'ww',
    'っc': 'cc',
    // `っん` is a special case
    'っn': 'n',
    // single `っ` is weird, some converters just remove it, some leave the `っ` in kana, some replace with `xtsu` or `ltsu`
    'っ': '',
};

View File

@@ -0,0 +1,118 @@
/*
* Copyright (C) 2024-2025 Yomitan Authors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import {basicTextProcessorOptions} from '../text-processors.js';
import {convertAlphabeticToKana} from './japanese-wanakana.js';
import {
collapseEmphaticSequences as collapseEmphaticSequencesFunction,
convertAlphanumericToFullWidth,
convertFullWidthAlphanumericToNormal,
convertHalfWidthKanaToFullWidth,
convertHiraganaToKatakana as convertHiraganaToKatakanaFunction,
convertKatakanaToHiragana as convertKatakanaToHiraganaFunction,
normalizeCJKCompatibilityCharacters as normalizeCJKCompatibilityCharactersFunction,
normalizeCombiningCharacters as normalizeCombiningCharactersFunction,
} from './japanese.js';
import {convertVariants} from '../../../lib/kanji-processor.js';
/**
 * Text preprocessor that rewrites half-width katakana as full-width katakana when enabled.
 * @type {import('language').TextProcessor<boolean>}
 */
export const convertHalfWidthCharacters = {
    name: 'Convert half width characters to full width',
    // The left side is the half-width spelling of ヨミチャン, written as escapes so that
    // Unicode normalization cannot turn the example into "X → X".
    description: '\uff96\uff90\uff81\uff6c\uff9d → ヨミチャン',
    options: basicTextProcessorOptions,
    process: (str, setting) => (setting ? convertHalfWidthKanaToFullWidth(str) : str),
};
/**
 * Text preprocessor that turns alphabetic (romaji) input into hiragana when enabled.
 * @type {import('language').TextProcessor<boolean>}
 */
export const alphabeticToHiragana = {
    name: 'Convert alphabetic characters to hiragana',
    description: 'yomichan → よみちゃん',
    options: basicTextProcessorOptions,
    process(str, setting) {
        if (!setting) { return str; }
        return convertAlphabeticToKana(str);
    },
};
/**
 * Text preprocessor that converts between full-width and normal-width alphanumerics.
 * `direct` maps full-width to normal width, `inverse` maps normal width to full-width.
 * @type {import('language').BidirectionalConversionPreprocessor}
 */
export const alphanumericWidthVariants = {
    name: 'Convert between alphabetic width variants',
    // The left side is full-width "yomitan", written as escapes so it cannot be stripped by tooling.
    description: '\uff59\uff4f\uff4d\uff49\uff54\uff41\uff4e → yomitan and vice versa',
    options: ['off', 'direct', 'inverse'],
    process: (str, setting) => {
        switch (setting) {
            case 'direct':
                return convertFullWidthAlphanumericToNormal(str);
            case 'inverse':
                return convertAlphanumericToFullWidth(str);
            case 'off':
            default:
                // Unknown settings behave like 'off' instead of returning undefined.
                return str;
        }
    },
};
/**
 * Text preprocessor that converts between hiragana and katakana.
 * `direct` maps hiragana to katakana, `inverse` maps katakana to hiragana.
 * @type {import('language').BidirectionalConversionPreprocessor}
 */
export const convertHiraganaToKatakana = {
    name: 'Convert hiragana to katakana',
    description: 'よみちゃん → ヨミチャン and vice versa',
    options: ['off', 'direct', 'inverse'],
    process: (str, setting) => {
        switch (setting) {
            case 'direct':
                return convertHiraganaToKatakanaFunction(str);
            case 'inverse':
                return convertKatakanaToHiraganaFunction(str);
            case 'off':
            default:
                // Unknown settings behave like 'off' instead of returning undefined.
                return str;
        }
    },
};
/**
 * Text preprocessor that collapses emphatic character sequences.
 * The setting is a pair: whether to collapse at all, and whether to collapse fully.
 * @type {import('language').TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>}
 */
export const collapseEmphaticSequences = {
    name: 'Collapse emphatic character sequences',
    description: 'すっっごーーい → すっごーい / すごい',
    options: [[false, false], [true, false], [true, true]],
    process(str, [collapseEmphatic, collapseEmphaticFull]) {
        return collapseEmphatic ? collapseEmphaticSequencesFunction(str, collapseEmphaticFull) : str;
    },
};
/**
 * Text preprocessor that merges kana followed by a combining (han)dakuten into the precomposed kana when enabled.
 * @type {import('language').TextProcessor<boolean>}
 */
export const normalizeCombiningCharacters = {
    name: 'Normalize combining characters',
    // Escapes keep the decomposed left side (U+30C8 U+3099) from being composed by Unicode
    // normalization, which would make both sides of the example identical.
    description: '\u30c8\u3099 → \u30c9 (U+30C8 U+3099 → U+30C9)',
    options: basicTextProcessorOptions,
    process: (str, setting) => (setting ? normalizeCombiningCharactersFunction(str) : str),
};
/**
 * Text preprocessor that expands CJK compatibility characters when enabled.
 * @type {import('language').TextProcessor<boolean>}
 */
export const normalizeCJKCompatibilityCharacters = {
    name: 'Normalize CJK Compatibility Characters',
    description: '㌀ → アパート',
    options: basicTextProcessorOptions,
    process(str, setting) {
        if (!setting) { return str; }
        return normalizeCJKCompatibilityCharactersFunction(str);
    },
};
/**
 * Text preprocessor that replaces kanji variants with their modern standard form when enabled.
 * @type {import('language').TextProcessor<boolean>}
 */
export const standardizeKanji = {
    name: 'Convert kanji variants to their modern standard form',
    description: '萬 → 万',
    options: basicTextProcessorOptions,
    process(str, setting) {
        if (!setting) { return str; }
        return convertVariants(str);
    },
};

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,149 @@
/*
* Copyright (C) 2024-2025 Yomitan Authors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import {HIRAGANA_TO_ROMAJI, ROMAJI_TO_HIRAGANA} from './japanese-kana-romaji-dicts.js';
import {convertHiraganaToKatakana} from './japanese.js';
/**
 * Lowercases the text, applies every ROMAJI_TO_HIRAGANA entry in table order,
 * then fills the gaps left between consecutive sokuon.
 * @param {string} text
 * @returns {string}
 */
export function convertToHiragana(text) {
    const converted = Object.entries(ROMAJI_TO_HIRAGANA).reduce(
        (current, [romaji, kana]) => current.replaceAll(romaji, kana),
        text.toLowerCase(),
    );
    return fillSokuonGaps(converted);
}
/**
 * Converts romaji to kana for live (IME-style) input and reports where the text cursor ends up.
 *
 * A trailing `n` or `ny` directly behind the cursor is hidden from the converter, because the
 * user may still be typing `na`, `nya`, `nyu`, and so on (`|` is the cursor):
 * - `たn|` stays `たn|`, and `n|の` stays `n|の`: the `n` touches the cursor.
 * - `ny|` stays `ny|`: it may become `nya`, `nyi`, `nyu`, `nye` or `nyo`.
 * - `たnt|` becomes `たんt|`: the cursor is already one character past the `n`.
 * - `nn|` becomes `ん`: `nn` -> `ん` comes before `n` -> `ん` in the table.
 * - `nnnnn|` becomes `んんn`: complete `nn` pairs convert, the odd trailing `n` is kept.
 * @param {string} text
 * @param {number} selectionStart
 * @returns {import('language').KanaIMEOutput}
 */
export function convertToKanaIME(text, selectionStart) {
    const lowered = text.toLowerCase();

    // Number of characters directly behind the cursor that must not reach the converter.
    let heldLength = 0;
    if (lowered[selectionStart - 1] === 'n' && lowered.slice(0, selectionStart - 1).replaceAll('nn', '').at(-1) !== 'n') {
        heldLength = 1;
    } else if (lowered.slice(selectionStart - 2, selectionStart) === 'ny') {
        heldLength = 2;
    }

    let kanaString;
    if (heldLength === 0) {
        kanaString = convertToKana(text);
    } else {
        const heldStart = selectionStart - heldLength;
        const before = convertToKana(text.slice(0, heldStart));
        const held = text.slice(heldStart, selectionStart);
        const after = convertToKana(text.slice(selectionStart));
        kanaString = before + held + after;
    }

    // The cursor moves by however much the conversion changed the text length.
    const lengthDelta = kanaString.length - text.length;
    return {kanaString, newSelectionStart: selectionStart + lengthDelta};
}
/**
 * Applies every ROMAJI_TO_HIRAGANA entry in table order: lowercase romaji becomes hiragana
 * and uppercase romaji becomes katakana. Gaps between consecutive sokuon are then filled.
 * @param {string} text
 * @returns {string}
 */
export function convertToKana(text) {
    const converted = Object.entries(ROMAJI_TO_HIRAGANA).reduce((current, [romaji, hiragana]) => {
        // Uppercase text converts to katakana
        const katakana = convertHiraganaToKatakana(hiragana).toUpperCase();
        return current.replaceAll(romaji, hiragana).replaceAll(romaji.toUpperCase(), katakana);
    }, text);
    return fillSokuonGaps(converted);
}
/**
 * Replaces the placeholder letter left between two consecutive sokuon with another sokuon.
 * The table replacement is not iterative, so a run of doubled letters leaves such placeholders:
 * `ttttttttttsu` first becomes `っtっtっtっtっつ`, and this pass turns it into `っっっっっっっっっつ`.
 * @param {string} text
 * @returns {string}
 */
function fillSokuonGaps(text) {
    const hiraganaFilled = text.replace(/っ[a-z](?=っ)/g, 'っっ');
    return hiraganaFilled.replace(/ッ[A-Z](?=ッ)/g, 'ッッ');
}
/**
 * Applies every HIRAGANA_TO_ROMAJI entry in table order, matching both the hiragana key
 * and its katakana counterpart.
 * @param {string} text
 * @returns {string}
 */
export function convertToRomaji(text) {
    return Object.entries(HIRAGANA_TO_ROMAJI).reduce(
        (current, [hiragana, romaji]) => current
            .replaceAll(hiragana, romaji)
            .replaceAll(convertHiraganaToKatakana(hiragana), romaji),
        text,
    );
}
/**
 * Converts runs of ASCII / fullwidth Latin letters and dashes into hiragana, leaving every
 * other character in place. Letters are lowercased and narrowed before conversion.
 * @param {string} text
 * @returns {string}
 */
export function convertAlphabeticToKana(text) {
    const LOWER_A = 0x61; // 'a'
    const DASH = 0x2d; // '-'
    let pending = '';
    let output = '';

    // Converts and emits the alphabetic run collected so far, if any.
    const flushPending = () => {
        if (pending.length === 0) { return; }
        output += convertToHiragana(pending);
        pending = '';
    };

    for (const char of text) {
        const codePoint = /** @type {number} */ (char.codePointAt(0));
        let normalized;
        if (codePoint >= 0x41 && codePoint <= 0x5a) { // ['A', 'Z']
            normalized = codePoint - 0x41 + LOWER_A;
        } else if (codePoint >= 0x61 && codePoint <= 0x7a) { // ['a', 'z']
            normalized = codePoint;
        } else if (codePoint >= 0xff21 && codePoint <= 0xff3a) { // fullwidth ['A', 'Z']
            normalized = codePoint - 0xff21 + LOWER_A;
        } else if (codePoint >= 0xff41 && codePoint <= 0xff5a) { // fullwidth ['a', 'z']
            normalized = codePoint - 0xff41 + LOWER_A;
        } else if (codePoint === DASH || codePoint === 0xff0d) { // '-' or fullwidth dash
            normalized = DASH;
        } else {
            // Not part of an alphabetic run: emit the run collected so far, then this character.
            flushPending();
            output += char;
            continue;
        }
        pending += String.fromCodePoint(normalized);
    }
    flushPending();
    return output;
}

View File

@@ -0,0 +1,813 @@
/*
* Copyright (C) 2024-2025 Yomitan Authors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import {CJK_COMPATIBILITY, CJK_IDEOGRAPH_RANGES, CJK_PUNCTUATION_RANGE, FULLWIDTH_CHARACTER_RANGES, isCodePointInRange, isCodePointInRanges} from '../CJK-util.js';
// Code points of individual kana, derived from the characters themselves (all are in the BMP).
const HIRAGANA_SMALL_TSU_CODE_POINT = 'っ'.charCodeAt(0); // U+3063
const KATAKANA_SMALL_TSU_CODE_POINT = 'ッ'.charCodeAt(0); // U+30C3
const KATAKANA_SMALL_KA_CODE_POINT = 'ヵ'.charCodeAt(0); // U+30F5
const KATAKANA_SMALL_KE_CODE_POINT = 'ヶ'.charCodeAt(0); // U+30F6
const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 'ー'.charCodeAt(0); // U+30FC

/**
 * The hiragana range, given as [first, last] code points.
 * @type {import('CJK-util').CodepointRange}
 */
const HIRAGANA_RANGE = [0x3040, 0x309f];

/**
 * The katakana range, given as [first, last] code points.
 * @type {import('CJK-util').CodepointRange}
 */
const KATAKANA_RANGE = [0x30a0, 0x30ff];

/**
 * Hiragana that are shifted by a fixed offset when converting to katakana.
 * @type {import('CJK-util').CodepointRange}
 */
const HIRAGANA_CONVERSION_RANGE = [0x3041, 0x3096];

/**
 * Katakana that are shifted by a fixed offset when converting to hiragana.
 * @type {import('CJK-util').CodepointRange}
 */
const KATAKANA_CONVERSION_RANGE = [0x30a1, 0x30f6];

/**
 * Both kana ranges together.
 * @type {import('CJK-util').CodepointRange[]}
 */
const KANA_RANGES = [HIRAGANA_RANGE, KATAKANA_RANGE];
/**
 * Japanese character ranges, roughly ordered in order of expected frequency.
 * Combines the kana ranges, the CJK ideograph ranges, half-width katakana, kana and CJK
 * punctuation, and the fullwidth character ranges into one list of [first, last] pairs.
 * @type {import('CJK-util').CodepointRange[]}
 */
const JAPANESE_RANGES = [
HIRAGANA_RANGE,
KATAKANA_RANGE,
...CJK_IDEOGRAPH_RANGES,
[0xff66, 0xff9f], // Halfwidth katakana
[0x30fb, 0x30fc], // Katakana punctuation
[0xff61, 0xff65], // Kana punctuation
CJK_PUNCTUATION_RANGE,
...FULLWIDTH_CHARACTER_RANGES,
];
// Small kana that attach to the preceding kana instead of forming a mora of their own.
const SMALL_KANA_SET = new Set([
    'ぁぃぅぇぉゃゅょゎ', // hiragana
    'ァィゥェォャュョヮ', // katakana
].join(''));
/**
 * Maps each half-width katakana character (U+FF65..U+FF9D) to a three-character string:
 * index 0 is the plain full-width form, index 1 the dakuten form and index 2 the handakuten form.
 * `-` marks a form that does not exist.
 * Keys are written as escapes so the half-width characters cannot be widened by Unicode
 * normalization, which would stop half-width input from ever matching.
 */
const HALFWIDTH_KATAKANA_MAPPING = new Map([
    ['\uff65', '・--'],
    ['\uff66', 'ヲヺ-'],
    ['\uff67', 'ァ--'],
    ['\uff68', 'ィ--'],
    ['\uff69', 'ゥ--'],
    ['\uff6a', 'ェ--'],
    ['\uff6b', 'ォ--'],
    ['\uff6c', 'ャ--'],
    ['\uff6d', 'ュ--'],
    ['\uff6e', 'ョ--'],
    ['\uff6f', 'ッ--'],
    ['\uff70', 'ー--'],
    ['\uff71', 'ア--'],
    ['\uff72', 'イ--'],
    ['\uff73', 'ウヴ-'],
    ['\uff74', 'エ--'],
    ['\uff75', 'オ--'],
    ['\uff76', 'カガ-'],
    ['\uff77', 'キギ-'],
    ['\uff78', 'クグ-'],
    ['\uff79', 'ケゲ-'],
    ['\uff7a', 'コゴ-'],
    ['\uff7b', 'サザ-'],
    ['\uff7c', 'シジ-'],
    ['\uff7d', 'スズ-'],
    ['\uff7e', 'セゼ-'],
    ['\uff7f', 'ソゾ-'],
    ['\uff80', 'タダ-'],
    ['\uff81', 'チヂ-'],
    ['\uff82', 'ツヅ-'],
    ['\uff83', 'テデ-'],
    ['\uff84', 'トド-'],
    ['\uff85', 'ナ--'],
    ['\uff86', 'ニ--'],
    ['\uff87', 'ヌ--'],
    ['\uff88', 'ネ--'],
    ['\uff89', 'ノ--'],
    ['\uff8a', 'ハバパ'],
    ['\uff8b', 'ヒビピ'],
    ['\uff8c', 'フブプ'],
    ['\uff8d', 'ヘベペ'],
    ['\uff8e', 'ホボポ'],
    ['\uff8f', 'マ--'],
    ['\uff90', 'ミ--'],
    ['\uff91', 'ム--'],
    ['\uff92', 'メ--'],
    ['\uff93', 'モ--'],
    ['\uff94', 'ヤ--'],
    ['\uff95', 'ユ--'],
    ['\uff96', 'ヨ--'],
    ['\uff97', 'ラ--'],
    ['\uff98', 'リ--'],
    ['\uff99', 'ル--'],
    ['\uff9a', 'レ--'],
    ['\uff9b', 'ロ--'],
    ['\uff9c', 'ワ--'],
    ['\uff9d', 'ン--'],
]);
// Kana grouped by the vowel they end in. Later rows win when a character is listed twice,
// so the final row maps の / ノ to the empty string.
const VOWEL_TO_KANA_MAPPING = new Map([
    ['a', 'ぁあかがさざただなはばぱまゃやらゎわヵァアカガサザタダナハバパマャヤラヮワヵヷ'],
    ['i', 'ぃいきぎしじちぢにひびぴみりゐィイキギシジチヂニヒビピミリヰヸ'],
    ['u', 'ぅうくぐすずっつづぬふぶぷむゅゆるゥウクグスズッツヅヌフブプムュユルヴ'],
    ['e', 'ぇえけげせぜてでねへべぺめれゑヶェエケゲセゼテデネヘベペメレヱヶヹ'],
    ['o', 'ぉおこごそぞとどのほぼぽもょよろをォオコゴソゾトドノホボポモョヨロヲヺ'],
    ['', 'のノ'],
]);

/**
 * Inverse lookup of VOWEL_TO_KANA_MAPPING: kana character -> vowel.
 * @type {Map<string, string>}
 */
const KANA_TO_VOWEL_MAPPING = new Map();
VOWEL_TO_KANA_MAPPING.forEach((characters, vowel) => {
    for (const character of characters) {
        KANA_TO_VOWEL_MAPPING.set(character, vowel);
    }
});
// Triples of (base kana, dakuten form, handakuten form); `-` marks a missing handakuten form.
const kana = 'うゔ-かが-きぎ-くぐ-けげ-こご-さざ-しじ-すず-せぜ-そぞ-ただ-ちぢ-つづ-てで-とど-はばぱひびぴふぶぷへべぺほぼぽワヷ-ヰヸ-ウヴ-ヱヹ-ヲヺ-カガ-キギ-クグ-ケゲ-コゴ-サザ-シジ-スズ-セゼ-ソゾ-タダ-チヂ-ツヅ-テデ-トド-ハバパヒビピフブプヘベペホボポ';

/**
 * Maps a kana carrying a diacritic to its base character and the kind of diacritic.
 * @type {Map<string, {character: string, type: import('japanese-util').DiacriticType}>}
 */
const DIACRITIC_MAPPING = new Map();
for (let offset = 0; offset < kana.length; offset += 3) {
    const [character, dakuten, handakuten] = kana.slice(offset, offset + 3);
    DIACRITIC_MAPPING.set(dakuten, {character, type: 'dakuten'});
    if (handakuten === '-') { continue; }
    DIACRITIC_MAPPING.set(handakuten, {character, type: 'handakuten'});
}
/**
 * Picks the hiragana vowel that a prolonged sound mark stands for after the given character.
 * A character ending in `o` is prolonged with `う`, like one ending in `u`.
 * @param {string} previousCharacter
 * @returns {?string} The prolonging hiragana, or null when the character has no usable vowel.
 */
function getProlongedHiragana(previousCharacter) {
    const vowel = KANA_TO_VOWEL_MAPPING.get(previousCharacter);
    if (vowel === 'u' || vowel === 'o') { return 'う'; }
    if (vowel === 'a') { return 'あ'; }
    if (vowel === 'i') { return 'い'; }
    if (vowel === 'e') { return 'え'; }
    return null;
}
/**
 * Builds a furigana segment from a piece of text and its reading.
 * @param {string} text
 * @param {string} reading
 * @returns {import('japanese-util').FuriganaSegment}
 */
function createFuriganaSegment(text, reading) {
    /** @type {import('japanese-util').FuriganaSegment} */
    const segment = {text, reading};
    return segment;
}
/**
 * Recursively distributes `reading` over the groups from `groupsStart` onwards.
 * Returns the list of furigana segments for those groups, or null when the reading cannot be
 * distributed or when more than one distribution is possible.
 * @param {string} reading Remaining part of the reading.
 * @param {string} readingNormalized Remaining part of the normalized reading, sliced in step with `reading`.
 * @param {import('japanese-util').FuriganaGroup[]} groups
 * @param {number} groupsStart Index of the first group still to be matched.
 * @returns {?(import('japanese-util').FuriganaSegment[])}
 */
function segmentizeFurigana(reading, readingNormalized, groups, groupsStart) {
const groupCount = groups.length - groupsStart;
if (groupCount <= 0) {
// No groups left: succeed only if the reading has been consumed completely
return reading.length === 0 ? [] : null;
}
const group = groups[groupsStart];
const {isKana, text} = group;
const textLength = text.length;
if (isKana) {
// A kana group must appear at the start of the normalized reading
const {textNormalized} = group;
if (textNormalized !== null && readingNormalized.startsWith(textNormalized)) {
const segments = segmentizeFurigana(
reading.substring(textLength),
readingNormalized.substring(textLength),
groups,
groupsStart + 1,
);
if (segments !== null) {
if (reading.startsWith(text)) {
// Text and reading agree exactly, so no furigana is needed
segments.unshift(createFuriganaSegment(text, ''));
} else {
// Only the normalized forms agree; split wherever text and reading differ
segments.unshift(...getFuriganaKanaSegments(text, reading));
}
return segments;
}
}
return null;
} else {
let result = null;
// Try every reading length for this group, longest first, never shorter than the text itself
for (let i = reading.length; i >= textLength; --i) {
const segments = segmentizeFurigana(
reading.substring(i),
readingNormalized.substring(i),
groups,
groupsStart + 1,
);
if (segments !== null) {
if (result !== null) {
// More than one way to segmentize the tail; mark as ambiguous
return null;
}
const segmentReading = reading.substring(0, i);
segments.unshift(createFuriganaSegment(text, segmentReading));
result = segments;
}
// There is only one way to segmentize the last non-kana group
if (groupCount === 1) {
break;
}
}
return result;
}
}
/**
 * Splits `text` into runs of characters that either all match or all differ from `reading`
 * at the same index. Matching runs get an empty reading; differing runs get the
 * corresponding slice of `reading`.
 * @param {string} text
 * @param {string} reading
 * @returns {import('japanese-util').FuriganaSegment[]}
 */
function getFuriganaKanaSegments(text, reading) {
    const segments = [];
    const length = text.length;
    let runStart = 0;
    let runMatches = (reading[0] === text[0]);

    // Emits the run [runStart, end) using the current match state.
    const pushRun = (/** @type {number} */ end) => {
        const runReading = runMatches ? '' : reading.substring(runStart, end);
        segments.push(createFuriganaSegment(text.substring(runStart, end), runReading));
    };

    for (let i = 1; i < length; ++i) {
        const matches = (reading[i] === text[i]);
        if (matches !== runMatches) {
            pushRun(i);
            runMatches = matches;
            runStart = i;
        }
    }
    pushRun(length);
    return segments;
}
/**
 * Returns the length, in UTF-16 code units, of the common prefix of two strings.
 * The prefix never ends in the middle of a surrogate pair.
 * @param {string} text1
 * @param {string} text2
 * @returns {number}
 */
function getStemLength(text1, text2) {
    const limit = Math.min(text1.length, text2.length);
    let index = 0;
    while (index < limit) {
        const codePoint1 = /** @type {number} */ (text1.codePointAt(index));
        const codePoint2 = /** @type {number} */ (text2.codePointAt(index));
        if (codePoint1 !== codePoint2) { return index; }
        const width = String.fromCodePoint(codePoint1).length;
        // Don't consume partial UTF16 surrogate characters
        if (index + width > limit) { return index; }
        index += width;
    }
    return index;
}
// Character code testing functions
/**
 * Tests whether a code point lies in one of the CJK ideograph ranges.
 * @param {number} codePoint Unicode code point to test.
 * @returns {boolean} Result of `isCodePointInRanges` for `CJK_IDEOGRAPH_RANGES`.
 */
export function isCodePointKanji(codePoint) {
return isCodePointInRanges(codePoint, CJK_IDEOGRAPH_RANGES);
}
/**
 * Tests whether a code point lies in the hiragana or katakana range.
 * @param {number} codePoint Unicode code point to test.
 * @returns {boolean} Result of `isCodePointInRanges` for `KANA_RANGES`.
 */
export function isCodePointKana(codePoint) {
return isCodePointInRanges(codePoint, KANA_RANGES);
}
/**
 * Tests whether a code point lies in any of the ranges listed in `JAPANESE_RANGES`.
 * @param {number} codePoint Unicode code point to test.
 * @returns {boolean} Result of `isCodePointInRanges` for `JAPANESE_RANGES`.
 */
export function isCodePointJapanese(codePoint) {
return isCodePointInRanges(codePoint, JAPANESE_RANGES);
}
// String testing functions
/**
 * Tests whether a non-empty string consists only of characters in the kana ranges.
 * @param {string} str
 * @returns {boolean} False for the empty string.
 */
export function isStringEntirelyKana(str) {
    if (str.length === 0) { return false; }
    return Array.from(str).every((character) => {
        const codePoint = /** @type {number} */ (character.codePointAt(0));
        return isCodePointInRanges(codePoint, KANA_RANGES);
    });
}
/**
 * Tests whether at least one character of the string lies in the Japanese ranges.
 * @param {string} str
 * @returns {boolean} False for the empty string.
 */
export function isStringPartiallyJapanese(str) {
    if (str.length === 0) { return false; }
    return Array.from(str).some((character) => {
        const codePoint = /** @type {number} */ (character.codePointAt(0));
        return isCodePointInRanges(codePoint, JAPANESE_RANGES);
    });
}
// Mora functions
/**
 * Tells whether the mora at `moraIndex` is pronounced with high pitch.
 * A string value is read as a per-mora pattern where `H` means high. A numeric value is the
 * downstep position: 0 is high from the second mora on, 1 is high on the first mora only,
 * and any other value is high after the first mora and before that position.
 * @param {number} moraIndex
 * @param {number | string} pitchAccentValue
 * @returns {boolean}
 */
export function isMoraPitchHigh(moraIndex, pitchAccentValue) {
    if (typeof pitchAccentValue === 'string') {
        return pitchAccentValue[moraIndex] === 'H';
    }
    if (pitchAccentValue === 0) {
        return moraIndex > 0;
    }
    if (pitchAccentValue === 1) {
        return moraIndex < 1;
    }
    return moraIndex > 0 && moraIndex < pitchAccentValue;
}
/**
 * Classifies a pitch accent by its (first) downstep position.
 * Position 0 is `heiban`. Verbs and adjectives with a positive position are `kifuku`.
 * For other words, position 1 is `atamadaka`, and larger positions are `odaka` when the
 * position reaches the mora count of `text`, otherwise `nakadaka`.
 * @param {string} text
 * @param {number | string} pitchAccentValue
 * @param {boolean} isVerbOrAdjective
 * @returns {?import('japanese-util').PitchCategory} Null when no category applies.
 */
export function getPitchCategory(text, pitchAccentValue, isVerbOrAdjective) {
    const downstep = (
        typeof pitchAccentValue === 'string' ?
        getDownstepPositions(pitchAccentValue)[0] :
        pitchAccentValue
    );
    if (downstep === 0) { return 'heiban'; }
    // Every remaining category needs a positive downstep position
    if (!(downstep > 0)) { return null; }
    if (isVerbOrAdjective) { return 'kifuku'; }
    if (downstep === 1) { return 'atamadaka'; }
    if (downstep < 1) { return null; }
    return downstep >= getKanaMoraCount(text) ? 'odaka' : 'nakadaka';
}
/**
 * Lists every index at which an `H` mora is directly followed by an `L` mora.
 * When there is no such index, the result is `[0]` for a pattern starting with `L`
 * and `[-1]` otherwise.
 * @param {string} pitchString
 * @returns {number[]}
 */
export function getDownstepPositions(pitchString) {
    const positions = [];
    for (let index = 1; index < pitchString.length; ++index) {
        const fallsHere = pitchString[index - 1] === 'H' && pitchString[index] === 'L';
        if (fallsHere) {
            positions.push(index);
        }
    }
    if (positions.length > 0) {
        return positions;
    }
    return [pitchString.startsWith('L') ? 0 : -1];
}
/**
 * Splits kana text into morae. A small kana is appended to the mora before it;
 * a small kana at the very start forms a mora of its own.
 * @param {string} text
 * @returns {string[]}
 */
export function getKanaMorae(text) {
    /** @type {string[]} */
    const morae = [];
    for (const character of text) {
        if (morae.length > 0 && SMALL_KANA_SET.has(character)) {
            morae[morae.length - 1] += character;
            continue;
        }
        morae.push(character);
    }
    return morae;
}
/**
 * Counts the morae of kana text. A small kana is not counted unless it is the first character.
 * @param {string} text
 * @returns {number}
 */
export function getKanaMoraCount(text) {
    let count = 0;
    for (const character of text) {
        if (count === 0 || !SMALL_KANA_SET.has(character)) {
            count += 1;
        }
    }
    return count;
}
// Conversion functions
/**
 * Converts katakana to hiragana. Small ヵ / ヶ are left unchanged. Unless
 * `keepProlongedSoundMarks` is set, a prolonged sound mark is replaced by the hiragana vowel
 * that prolongs the character already written before it, when there is one.
 * @param {string} text
 * @param {boolean} [keepProlongedSoundMarks]
 * @returns {string}
 */
export function convertKatakanaToHiragana(text, keepProlongedSoundMarks = false) {
    const shift = HIRAGANA_CONVERSION_RANGE[0] - KATAKANA_CONVERSION_RANGE[0];
    let output = '';
    for (const character of text) {
        const codePoint = /** @type {number} */ (character.codePointAt(0));
        let converted = character;
        if (codePoint === KATAKANA_SMALL_KA_CODE_POINT || codePoint === KATAKANA_SMALL_KE_CODE_POINT) {
            // No change
        } else if (codePoint === KANA_PROLONGED_SOUND_MARK_CODE_POINT) {
            if (!keepProlongedSoundMarks && output.length > 0) {
                const prolonged = getProlongedHiragana(output[output.length - 1]);
                if (prolonged !== null) { converted = prolonged; }
            }
        } else if (isCodePointInRange(codePoint, KATAKANA_CONVERSION_RANGE)) {
            converted = String.fromCodePoint(codePoint + shift);
        }
        output += converted;
    }
    return output;
}
/**
 * Converts every hiragana inside the conversion range to the katakana at the same offset;
 * all other characters pass through unchanged.
 * @param {string} text
 * @returns {string}
 */
export function convertHiraganaToKatakana(text) {
    const shift = KATAKANA_CONVERSION_RANGE[0] - HIRAGANA_CONVERSION_RANGE[0];
    return Array.from(text, (character) => {
        const codePoint = /** @type {number} */ (character.codePointAt(0));
        return isCodePointInRange(codePoint, HIRAGANA_CONVERSION_RANGE) ?
            String.fromCodePoint(codePoint + shift) :
            character;
    }).join('');
}
/**
 * Converts ASCII digits and letters to their fullwidth forms; other characters are unchanged.
 * @param {string} text
 * @returns {string}
 */
export function convertAlphanumericToFullWidth(text) {
    return Array.from(text, (character) => {
        const codePoint = /** @type {number} */ (character.codePointAt(0));
        if (codePoint >= 0x30 && codePoint <= 0x39) { // ['0', '9']
            return String.fromCodePoint(codePoint - 0x30 + 0xff10); // 0xff10 = fullwidth '0'
        }
        if (codePoint >= 0x41 && codePoint <= 0x5a) { // ['A', 'Z']
            return String.fromCodePoint(codePoint - 0x41 + 0xff21); // 0xff21 = fullwidth 'A'
        }
        if (codePoint >= 0x61 && codePoint <= 0x7a) { // ['a', 'z']
            return String.fromCodePoint(codePoint - 0x61 + 0xff41); // 0xff41 = fullwidth 'a'
        }
        return character;
    }).join('');
}
/**
 * Converts fullwidth digits and letters to their ASCII forms; other characters are unchanged.
 * @param {string} text
 * @returns {string}
 */
export function convertFullWidthAlphanumericToNormal(text) {
    // Fullwidth digits (U+FF10..U+FF19), uppercase (U+FF21..U+FF3A) and lowercase
    // (U+FF41..U+FF5A) all sit exactly 0xfee0 above their ASCII counterparts.
    return text.replace(
        /[\uff10-\uff19\uff21-\uff3a\uff41-\uff5a]/g,
        (character) => String.fromCharCode(character.charCodeAt(0) - 0xfee0),
    );
}
/**
 * Replaces every character found in HALFWIDTH_KATAKANA_MAPPING with its full-width form.
 * A following half-width dakuten (U+FF9E) or handakuten (U+FF9F) is merged into the character
 * when the mapping has such a form; otherwise the mark is left for the next iteration.
 * @param {string} text
 * @returns {string}
 */
export function convertHalfWidthKanaToFullWidth(text) {
    let output = '';
    // This function is safe to use charCodeAt instead of codePointAt, since all
    // the relevant characters are represented with a single UTF-16 character code.
    for (let i = 0; i < text.length; ++i) {
        const character = text[i];
        const forms = HALFWIDTH_KATAKANA_MAPPING.get(character);
        if (typeof forms !== 'string') {
            output += character;
            continue;
        }

        const nextCode = text.charCodeAt(i + 1);
        let formIndex = 0;
        if (nextCode === 0xff9e) { // Dakuten
            formIndex = 1;
        } else if (nextCode === 0xff9f) { // Handakuten
            formIndex = 2;
        }

        const marked = forms[formIndex];
        if (formIndex > 0 && marked !== '-') {
            // The mark was merged into this character, so skip over it
            output += marked;
            ++i;
        } else {
            // No mark follows, or this character cannot take it
            output += forms[0];
        }
    }
    return output;
}
/**
 * Looks up a character in `DIACRITIC_MAPPING`.
 * @param {string} character The character to look up.
 * @returns {?{character: string, type: import('japanese-util').DiacriticType}} A new object holding
 *   the mapped `character` and `type`, or `null` when the character has no entry.
 */
export function getKanaDiacriticInfo(character) {
    const entry = DIACRITIC_MAPPING.get(character);
    if (typeof entry === 'undefined') {
        return null;
    }
    // Return a copy so callers never hold a reference to the shared mapping entry.
    const {character: mappedCharacter, type} = entry;
    return {character: mappedCharacter, type};
}
/**
 * Checks whether a code point lies in one of the kana ranges treated as able to take a dakuten.
 * The ranges are deliberately coarse: to keep the check cheap they also include some characters
 * that should not take a dakuten but are highly unlikely to be followed by a combining mark.
 * @param {number} codePoint The code point of the character preceding the combining mark.
 * @returns {boolean} `true` if the code point is in one of the accepted ranges.
 */
function dakutenAllowed(codePoint) {
    // かがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでと / はばぱひびぴふぶぷへべぺほ
    const inHiraganaRanges = (
        (codePoint >= 0x304B && codePoint <= 0x3068) ||
        (codePoint >= 0x306F && codePoint <= 0x307B)
    );
    if (inHiraganaRanges) {
        return true;
    }
    // カガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデト / ハバパヒビピフブプヘベペホ
    return (
        (codePoint >= 0x30AB && codePoint <= 0x30C8) ||
        (codePoint >= 0x30CF && codePoint <= 0x30DB)
    );
}
/**
 * Checks whether a code point lies in one of the kana ranges treated as able to take a handakuten.
 * The ranges are deliberately coarse: to keep the check cheap they also include some characters
 * that should not take a handakuten but are highly unlikely to be followed by a combining mark.
 * @param {number} codePoint The code point of the character preceding the combining mark.
 * @returns {boolean} `true` if the code point is in one of the accepted ranges.
 */
function handakutenAllowed(codePoint) {
    // はばぱひびぴふぶぷへべぺほ
    if (codePoint >= 0x306F && codePoint <= 0x307B) {
        return true;
    }
    // ハバパヒビピフブプヘベペホ
    return codePoint >= 0x30CF && codePoint <= 0x30DB;
}
/**
 * Merges combining dakuten (U+3099) and handakuten (U+309A) marks into the kana that precedes
 * them, when `dakutenAllowed` / `handakutenAllowed` accept that kana.
 * The merged character is the base code point + 1 (dakuten) or + 2 (handakuten).
 * Marks that cannot be merged are kept as-is.
 * @param {string} text The text to normalize.
 * @returns {string} The normalized text.
 */
export function normalizeCombiningCharacters(text) {
    let normalized = '';
    let position = text.length - 1;
    // Walk from the end towards the start. Index 0 is handled after the loop because
    // the first character has nothing before it to combine with.
    while (position > 0) {
        const current = text[position];
        const baseCodePoint = text[position - 1].codePointAt(0);
        /** @type {?string} */
        let merged = null;
        if (current === '\u3099') {
            if (baseCodePoint && dakutenAllowed(baseCodePoint)) {
                merged = String.fromCodePoint(baseCodePoint + 1);
            }
        } else if (current === '\u309A') {
            if (baseCodePoint && handakutenAllowed(baseCodePoint)) {
                merged = String.fromCodePoint(baseCodePoint + 2);
            }
        }
        if (merged === null) {
            normalized = current + normalized;
            position -= 1;
        } else {
            // Both the base character and its mark have been consumed.
            normalized = merged + normalized;
            position -= 2;
        }
    }
    // position is -1 when the first two characters were merged (or the text is empty),
    // in which case there is no leftover first character to emit.
    if (position === 0) {
        normalized = text[0] + normalized;
    }
    return normalized;
}
/**
 * Applies NFKD normalization to every character inside the `CJK_COMPATIBILITY` range.
 * Characters outside that range are copied through unchanged.
 * @param {string} text The text to normalize.
 * @returns {string} The normalized text.
 */
export function normalizeCJKCompatibilityCharacters(text) {
    let normalized = '';
    for (let index = 0; index < text.length; index++) {
        const unit = text[index];
        const codePoint = unit.codePointAt(0);
        if (codePoint && isCodePointInRange(codePoint, CJK_COMPATIBILITY)) {
            normalized += unit.normalize('NFKD');
        } else {
            normalized += unit;
        }
    }
    return normalized;
}
// Furigana distribution
/**
 * Splits a term into segments paired with the part of the reading that belongs to each.
 * The term is first partitioned into alternating kana / non-kana runs, which are then
 * handed to `segmentizeFurigana` together with the reading.
 * @param {string} term The term as written.
 * @param {string} reading The reading of the term.
 * @returns {import('japanese-util').FuriganaSegment[]} The furigana segments. When the reading
 *   equals the term, a single segment with an empty reading; when `segmentizeFurigana` returns
 *   `null`, a single segment pairing the whole term with the whole reading.
 */
export function distributeFurigana(term, reading) {
    // Identical term and reading: nothing to annotate.
    if (term === reading) {
        return [createFuriganaSegment(term, '')];
    }
    // Partition the term into maximal runs that are entirely kana or entirely non-kana.
    /** @type {import('japanese-util').FuriganaGroup[]} */
    const groups = [];
    for (const character of term) {
        const isKana = isCodePointKana(/** @type {number} */ (character.codePointAt(0)));
        const lastGroup = groups.length > 0 ? groups[groups.length - 1] : null;
        if (lastGroup !== null && lastGroup.isKana === isKana) {
            lastGroup.text += character;
        } else {
            groups.push({isKana, text: character, textNormalized: null});
        }
    }
    // Only kana runs get a normalized (hiragana) form.
    for (const group of groups) {
        if (!group.isKana) { continue; }
        group.textNormalized = convertKatakanaToHiragana(group.text);
    }
    const segments = segmentizeFurigana(reading, convertKatakanaToHiragana(reading), groups, 0);
    // Fallback: attach the whole reading to the whole term.
    return segments !== null ? segments : [createFuriganaSegment(term, reading)];
}
/**
 * Distributes furigana over an inflected form (`source`) of a dictionary term.
 * The stem shared between `source` and the term (or the reading, if `source` matches the reading
 * at least as well) gets furigana from `distributeFurigana`; whatever follows the stem in
 * `source` is emitted without a reading.
 * @param {string} term The dictionary form of the term.
 * @param {string} reading The reading of the dictionary form.
 * @param {string} source The inflected text as it appeared.
 * @returns {import('japanese-util').FuriganaSegment[]} The furigana segments covering `source`.
 */
export function distributeFuriganaInflected(term, reading, source) {
    const sourceNormalized = convertKatakanaToHiragana(source);
    const termStemLength = getStemLength(convertKatakanaToHiragana(term), sourceNormalized);
    const readingStemLength = getStemLength(convertKatakanaToHiragana(reading), sourceNormalized);
    // The source may have been derived from the reading rather than from the term.
    const sourceFollowsReading = readingStemLength > 0 && readingStemLength >= termStemLength;
    const stemLength = sourceFollowsReading ? readingStemLength : termStemLength;
    /** @type {import('japanese-util').FuriganaSegment[]} */
    const segments = [];
    if (stemLength > 0) {
        // Use the source's own spelling for the stem, followed by the rest of the base text.
        const baseText = sourceFollowsReading ? reading : term;
        const mainText = `${source.substring(0, stemLength)}${baseText.substring(stemLength)}`;
        // When the source follows the reading, the reading is rebuilt the same way and is
        // therefore identical to mainText.
        const mainReading = sourceFollowsReading ? mainText : reading;
        let offset = 0;
        for (const segment of distributeFurigana(mainText, mainReading)) {
            const segmentStart = offset;
            offset += segment.text.length;
            if (offset <= stemLength) {
                // Segment lies entirely within the stem.
                segments.push(segment);
                if (offset === stemLength) { break; }
                continue;
            }
            // Segment straddles the end of the stem: keep only the stem part, without a reading.
            if (segmentStart < stemLength) {
                segments.push(createFuriganaSegment(mainText.substring(segmentStart, stemLength), ''));
            }
            break;
        }
    }
    if (stemLength < source.length) {
        const remainder = source.substring(stemLength);
        const lastSegment = segments.length > 0 ? segments[segments.length - 1] : null;
        if (lastSegment !== null && lastSegment.reading.length === 0) {
            // Extend the last segment when it has no reading of its own.
            lastSegment.text += remainder;
        } else {
            segments.push(createFuriganaSegment(remainder, ''));
        }
    }
    return segments;
}
// Miscellaneous
/**
 * Checks whether a code point is one of the characters treated as emphatic:
 * the hiragana small tsu, the katakana small tsu, or the kana prolonged sound mark.
 * @param {number} codePoint The code point to test.
 * @returns {boolean} `true` if the code point is one of those three characters.
 */
export function isEmphaticCodePoint(codePoint) {
    switch (codePoint) {
        case HIRAGANA_SMALL_TSU_CODE_POINT:
        case KATAKANA_SMALL_TSU_CODE_POINT:
        case KANA_PROLONGED_SOUND_MARK_CODE_POINT:
            return true;
        default:
            return false;
    }
}
/**
 * Collapses runs of repeated emphatic characters (as defined by `isEmphaticCodePoint`) that
 * occur in the interior of the text. Emphatic characters at the very start and very end of
 * the text are always preserved verbatim.
 * @param {string} text The text to process.
 * @param {boolean} fullCollapse When `true`, interior emphatic characters are removed entirely;
 *   when `false`, each run of the same emphatic character is reduced to a single occurrence.
 * @returns {string} The collapsed text. Text consisting solely of emphatic characters
 *   (including the empty string) is returned unchanged.
 */
export function collapseEmphaticSequences(text, fullCollapse) {
    /** @type {(index: number) => boolean} */
    const isEmphaticAt = (index) => isEmphaticCodePoint(/** @type {number} */ (text.codePointAt(index)));
    // [start, end) is the part of the text between the leading and trailing emphatic runs.
    let start = 0;
    while (start < text.length && isEmphaticAt(start)) {
        ++start;
    }
    let end = text.length;
    while (end > 0 && isEmphaticAt(end - 1)) {
        --end;
    }
    // Whole string is emphatic
    if (start >= end) {
        return text;
    }
    let collapsed = text.substring(0, start);
    // Code point of the emphatic run currently being collapsed, or -1 outside a run.
    let activeEmphatic = -1;
    for (let index = start; index < end; ++index) {
        const unit = text[index];
        const codePoint = /** @type {number} */ (unit.codePointAt(0));
        if (!isEmphaticCodePoint(codePoint)) {
            activeEmphatic = -1;
            collapsed += unit;
            continue;
        }
        if (codePoint === activeEmphatic) {
            // Repeat of the current run: drop it.
            continue;
        }
        activeEmphatic = codePoint;
        if (!fullCollapse) {
            // Keep the first character of each run.
            collapsed += unit;
        }
    }
    return collapsed + text.substring(end);
}