/*
 * Copyright (C) 2024-2025  Yomitan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

import {removeSyriacScriptDiacritics} from './aii/assyrian-neo-aramaic-text-preprocessors.js';
import {
    addHamzaBottom,
    addHamzaTop,
    convertAlifMaqsuraToYaa,
    convertHaToTaMarbuta,
    normalizeUnicode,
    removeArabicScriptDiacritics,
    removeTatweel,
} from './ar/arabic-text-preprocessors.js';
import {arabicTransforms} from './ar/arabic-transforms.js';
import {normalizeRadicalCharacters} from './CJK-util.js';
import {eszettPreprocessor} from './de/german-text-preprocessors.js';
import {germanTransforms} from './de/german-transforms.js';
import {removeDoubleAcuteAccents} from './el/modern-greek-processors.js';
import {englishTransforms} from './en/english-transforms.js';
import {esperantoTransforms} from './eo/esperanto-transforms.js';
import {spanishTransforms} from './es/spanish-transforms.js';
import {apostropheVariants} from './fr/french-text-preprocessors.js';
import {frenchTransforms} from './fr/french-transforms.js';
import {irishTransforms} from './ga/irish-transforms.js';
import {convertLatinToGreek} from './grc/ancient-greek-processors.js';
import {ancientGreekTransforms} from './grc/ancient-greek-transforms.js';
import {
    alphabeticToHiragana,
    alphanumericWidthVariants,
    collapseEmphaticSequences,
    convertHalfWidthCharacters,
    convertHiraganaToKatakana,
    normalizeCJKCompatibilityCharacters,
    normalizeCombiningCharacters,
    standardizeKanji,
} from './ja/japanese-text-preprocessors.js';
import {japaneseTransforms} from './ja/japanese-transforms.js';
import {isStringPartiallyJapanese} from './ja/japanese.js';
import {georgianTransforms} from './ka/georgian-transforms.js';
import {disassembleHangul, reassembleHangul} from './ko/korean-text-processors.js';
import {koreanTransforms} from './ko/korean-transforms.js';
import {processDiphtongs} from './la/latin-text-preprocessors.js';
import {latinTransforms} from './la/latin-transforms.js';
import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';
import {oldIrishTransforms} from './sga/old-irish-transforms.js';
import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js';
import {albanianTransforms} from './sq/albanian-transforms.js';
import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
import {tagalogTransforms} from './tl/tagalog-transforms.js';
import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
import {convertFinalLetters, convertYiddishLigatures} from './yi/yiddish-text-postprocessors.js';
import {combineYiddishLigatures, removeYiddishDiacritics} from './yi/yiddish-text-preprocessors.js';
import {yiddishTransforms} from './yi/yiddish-transforms.js';
import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js';

/**
 * The pair of capitalization-related preprocessors reused by many descriptors below.
 * Descriptors that assign this object directly all share this one instance;
 * descriptors that spread it (`...capitalizationPreprocessors`) get their own copy
 * of the two entries, optionally followed by language-specific preprocessors.
 */
const capitalizationPreprocessors = {
    decapitalize,
    capitalizeFirstLetter,
};

/**
 * One descriptor per supported language.
 * Every entry sets `iso`, `iso639_3`, `name`, and `exampleText`. The keys
 * `textPreprocessors`, `textPostprocessors`, `languageTransforms`,
 * `isTextLookupWorthy`, and `readingNormalizer` appear only on some entries.
 * @type {import('language-descriptors').LanguageDescriptorAny[]}
 */
const languageDescriptors = [
    {
        iso: 'aii',
        iso639_3: 'aii',
        name: 'Assyrian Neo-Aramaic',
        exampleText: 'ܟܵܬܹܒ݂',
        textPreprocessors: {
            removeSyriacScriptDiacritics,
        },
    },
    {
        iso: 'ar',
        iso639_3: 'ara',
        name: 'Arabic (MSA)',
        exampleText: 'قَرَأَ',
        textPreprocessors: {
            removeArabicScriptDiacritics,
            removeTatweel,
            normalizeUnicode,
            addHamzaTop,
            addHamzaBottom,
            convertAlifMaqsuraToYaa,
        },
        languageTransforms: arabicTransforms,
    },
    {
        iso: 'arz',
        iso639_3: 'arz',
        name: 'Arabic (Egyptian)',
        exampleText: 'قَرَأَ',
        textPreprocessors: {
            removeArabicScriptDiacritics,
            removeTatweel,
            normalizeUnicode,
            addHamzaTop,
            addHamzaBottom,
            convertAlifMaqsuraToYaa,
            convertHaToTaMarbuta,
        },
        languageTransforms: arabicTransforms,
    },
    {
        iso: 'bg',
        iso639_3: 'bul',
        name: 'Bulgarian',
        exampleText: 'чета',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'cs',
        iso639_3: 'ces',
        name: 'Czech',
        exampleText: 'číst',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'da',
        iso639_3: 'dan',
        name: 'Danish',
        exampleText: 'læse',
        textPreprocessors: {
            ...capitalizationPreprocessors,
        },
    },
    {
        iso: 'de',
        iso639_3: 'deu',
        name: 'German',
        exampleText: 'lesen',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            eszettPreprocessor,
        },
        languageTransforms: germanTransforms,
    },
    {
        iso: 'el',
        iso639_3: 'ell',
        name: 'Greek',
        exampleText: 'διαβάζω',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeDoubleAcuteAccents,
        },
    },
    {
        iso: 'en',
        iso639_3: 'eng',
        name: 'English',
        exampleText: 'read',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: englishTransforms,
    },
    {
        iso: 'eo',
        iso639_3: 'epo',
        name: 'Esperanto',
        exampleText: 'legi',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: esperantoTransforms,
    },
    {
        iso: 'es',
        iso639_3: 'spa',
        name: 'Spanish',
        exampleText: 'leer',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: spanishTransforms,
    },
    {
        iso: 'et',
        iso639_3: 'est',
        name: 'Estonian',
        exampleText: 'lugema',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'fa',
        iso639_3: 'fas',
        name: 'Persian',
        exampleText: 'خواندن',
        textPreprocessors: {
            removeArabicScriptDiacritics,
        },
    },
    {
        iso: 'fi',
        iso639_3: 'fin',
        name: 'Finnish',
        exampleText: 'lukea',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'fr',
        iso639_3: 'fra',
        name: 'French',
        exampleText: 'lire',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            apostropheVariants,
        },
        languageTransforms: frenchTransforms,
    },
    {
        iso: 'ga',
        iso639_3: 'gle',
        name: 'Irish',
        exampleText: 'léigh',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: irishTransforms,
    },
    {
        iso: 'grc',
        iso639_3: 'grc',
        name: 'Ancient Greek',
        // Example text means 'to write'.
        exampleText: 'γράφω',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
            convertLatinToGreek,
        },
        languageTransforms: ancientGreekTransforms,
    },
    {
        // Hawaiian has no two-letter ISO code, so the three-letter code is used for `iso` as well.
        iso: 'haw',
        iso639_3: 'haw',
        name: 'Hawaiian',
        exampleText: 'heluhelu',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'he',
        iso639_3: 'heb',
        name: 'Hebrew',
        exampleText: 'קריאה',
    },
    {
        iso: 'hi',
        iso639_3: 'hin',
        name: 'Hindi',
        exampleText: 'पढ़ने के लिए',
    },
    {
        iso: 'hu',
        iso639_3: 'hun',
        name: 'Hungarian',
        exampleText: 'olvasni',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'id',
        iso639_3: 'ind',
        name: 'Indonesian',
        exampleText: 'baca',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
    },
    {
        iso: 'it',
        iso639_3: 'ita',
        name: 'Italian',
        exampleText: 'leggere',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
    },
    {
        iso: 'la',
        iso639_3: 'lat',
        name: 'Latin',
        exampleText: 'legō',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
            processDiphtongs,
        },
        languageTransforms: latinTransforms,
    },
    {
        iso: 'lo',
        iso639_3: 'lao',
        name: 'Lao',
        exampleText: 'ອ່ານ',
    },
    {
        iso: 'lv',
        iso639_3: 'lav',
        name: 'Latvian',
        exampleText: 'lasīt',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'ja',
        iso639_3: 'jpn',
        name: 'Japanese',
        exampleText: '読め',
        isTextLookupWorthy: isStringPartiallyJapanese,
        textPreprocessors: {
            convertHalfWidthCharacters,
            alphabeticToHiragana,
            normalizeCombiningCharacters,
            normalizeCJKCompatibilityCharacters,
            normalizeRadicalCharacters,
            alphanumericWidthVariants,
            convertHiraganaToKatakana,
            collapseEmphaticSequences,
            standardizeKanji,
        },
        languageTransforms: japaneseTransforms,
    },
    {
        iso: 'ka',
        iso639_3: 'kat',
        name: 'Georgian',
        // Georgian for “read”
        exampleText: 'კითხვა',
        languageTransforms: georgianTransforms,
    },
    {
        iso: 'kn',
        iso639_3: 'kan',
        name: 'Kannada',
        exampleText: 'ಓದು',
    },
    {
        iso: 'km',
        iso639_3: 'khm',
        name: 'Khmer',
        exampleText: 'អាន',
    },
    {
        iso: 'ko',
        iso639_3: 'kor',
        name: 'Korean',
        exampleText: '읽어',
        textPreprocessors: {
            disassembleHangul,
        },
        textPostprocessors: {
            reassembleHangul,
        },
        languageTransforms: koreanTransforms,
    },
    {
        iso: 'mn',
        iso639_3: 'mon',
        name: 'Mongolian',
        exampleText: 'унших',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'mt',
        iso639_3: 'mlt',
        name: 'Maltese',
        exampleText: 'kiteb',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'nl',
        iso639_3: 'nld',
        name: 'Dutch',
        exampleText: 'lezen',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'no',
        iso639_3: 'nor',
        name: 'Norwegian',
        exampleText: 'lese',
        textPreprocessors: {
            ...capitalizationPreprocessors,
        },
    },
    {
        iso: 'pl',
        iso639_3: 'pol',
        name: 'Polish',
        exampleText: 'czytać',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'pt',
        iso639_3: 'por',
        name: 'Portuguese',
        exampleText: 'ler',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'ro',
        iso639_3: 'ron',
        name: 'Romanian',
        exampleText: 'citi',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
    },
    {
        iso: 'ru',
        iso639_3: 'rus',
        name: 'Russian',
        exampleText: 'читать',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            yoToE,
            removeRussianDiacritics,
        },
    },
    {
        iso: 'sga',
        iso639_3: 'sga',
        name: 'Old Irish',
        exampleText: 'légaid',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
        languageTransforms: oldIrishTransforms,
    },
    {
        iso: 'sh',
        iso639_3: 'hbs',
        name: 'Serbo-Croatian',
        exampleText: 'čìtati',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeSerboCroatianAccentMarks,
        },
    },
    {
        iso: 'sq',
        iso639_3: 'sqi',
        name: 'Albanian',
        // Example text means 'to help'.
        exampleText: 'ndihmoj',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: albanianTransforms,
    },
    {
        iso: 'sv',
        iso639_3: 'swe',
        name: 'Swedish',
        exampleText: 'läsa',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'th',
        iso639_3: 'tha',
        name: 'Thai',
        exampleText: 'อ่าน',
    },
    {
        iso: 'tl',
        iso639_3: 'tgl',
        name: 'Tagalog',
        exampleText: 'basahin',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
        languageTransforms: tagalogTransforms,
    },
    {
        iso: 'tr',
        iso639_3: 'tur',
        name: 'Turkish',
        exampleText: 'okumak',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'tok',
        iso639_3: 'tok',
        name: 'Toki Pona',
        exampleText: 'wile',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'uk',
        iso639_3: 'ukr',
        name: 'Ukrainian',
        exampleText: 'читати',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'vi',
        iso639_3: 'vie',
        name: 'Vietnamese',
        exampleText: 'đọc',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            normalizeDiacritics,
        },
    },
    {
        iso: 'cy',
        iso639_3: 'cym',
        name: 'Welsh',
        exampleText: 'ddarllen',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'yi',
        iso639_3: 'yid',
        name: 'Yiddish',
        exampleText: 'באַשאַפֿן',
        textPreprocessors: {
            removeYiddishDiacritics,
            combineYiddishLigatures,
        },
        textPostprocessors: {
            convertFinalLetters,
            convertYiddishLigatures,
        },
        languageTransforms: yiddishTransforms,
    },
    {
        iso: 'yue',
        iso639_3: 'yue',
        name: 'Cantonese',
        exampleText: '讀',
        textPreprocessors: {
            normalizeRadicalCharacters,
        },
    },
    {
        iso: 'zh',
        iso639_3: 'zho',
        name: 'Chinese',
        exampleText: '读',
        isTextLookupWorthy: isStringPartiallyChinese,
        readingNormalizer: normalizePinyin,
        textPreprocessors: {
            normalizeRadicalCharacters,
        },
    },
];

/**
 * Lookup table from a descriptor's `iso` value to the descriptor object itself.
 * It is filled once, when this module is evaluated, in the same order as the
 * `languageDescriptors` array.
 * @type {Map<string, import('language-descriptors').LanguageDescriptorAny>}
 */
export const languageDescriptorMap = new Map();
for (const descriptor of languageDescriptors) {
    languageDescriptorMap.set(descriptor.iso, descriptor);
}