From 2211c086c062a8454f97591b52c95ffa80338433 Mon Sep 17 00:00:00 2001
From: sudacode
Date: Sun, 15 Feb 2026 21:00:00 -0800
Subject: [PATCH] refactor: consolidate JLPT token filter utilities

---
 src/core/services/index.ts                    | 10 ++++
 src/core/services/jlpt-excluded-terms.ts      | 29 -----------
 src/core/services/jlpt-token-filter-config.ts | 23 ---------
 ...red-mecab-pos1.ts => jlpt-token-filter.ts} | 49 ++++++++++++++++++-
 src/core/services/tokenizer-service.ts        |  6 ++-
 5 files changed, 61 insertions(+), 56 deletions(-)
 delete mode 100644 src/core/services/jlpt-excluded-terms.ts
 delete mode 100644 src/core/services/jlpt-token-filter-config.ts
 rename src/core/services/{jlpt-ignored-mecab-pos1.ts => jlpt-token-filter.ts} (56%)

diff --git a/src/core/services/index.ts b/src/core/services/index.ts
index bbf444b..1ce9d73 100644
--- a/src/core/services/index.ts
+++ b/src/core/services/index.ts
@@ -38,6 +38,16 @@ export {
 export { openYomitanSettingsWindow } from "./yomitan-settings-service";
 export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service";
 export { createJlptVocabularyLookupService } from "./jlpt-vocab-service";
+export {
+  getIgnoredPos1Entries,
+  JlptIgnoredPos1Entry,
+  JLPT_EXCLUDED_TERMS,
+  JLPT_IGNORED_MECAB_POS1,
+  JLPT_IGNORED_MECAB_POS1_ENTRIES,
+  JLPT_IGNORED_MECAB_POS1_LIST,
+  shouldIgnoreJlptByTerm,
+  shouldIgnoreJlptForMecabPos1,
+} from "./jlpt-token-filter";
 export { loadYomitanExtensionService } from "./yomitan-extension-loader-service";
 export {
   getJimakuLanguagePreferenceService,
diff --git a/src/core/services/jlpt-excluded-terms.ts b/src/core/services/jlpt-excluded-terms.ts
deleted file mode 100644
index 1139300..0000000
--- a/src/core/services/jlpt-excluded-terms.ts
+++ /dev/null
@@ -1,29 +0,0 @@
-// Token-level lexical terms excluded from JLPT highlighting.
-// These are not tied to POS and act as a safety layer for non-dictionary cases.
-export const JLPT_EXCLUDED_TERMS = new Set([
-  "この",
-  "その",
-  "あの",
-  "どの",
-  "これ",
-  "それ",
-  "あれ",
-  "どれ",
-  "ここ",
-  "そこ",
-  "あそこ",
-  "どこ",
-  "こと",
-  "ああ",
-  "ええ",
-  "うう",
-  "おお",
-  "はは",
-  "へえ",
-  "ふう",
-  "ほう",
-]);
-
-export function shouldIgnoreJlptByTerm(term: string): boolean {
-  return JLPT_EXCLUDED_TERMS.has(term);
-}
diff --git a/src/core/services/jlpt-token-filter-config.ts b/src/core/services/jlpt-token-filter-config.ts
deleted file mode 100644
index 7ef63c7..0000000
--- a/src/core/services/jlpt-token-filter-config.ts
+++ /dev/null
@@ -1,23 +0,0 @@
-import {
-  JlptIgnoredPos1Entry,
-  JLPT_IGNORED_MECAB_POS1,
-  JLPT_IGNORED_MECAB_POS1_ENTRIES,
-} from "./jlpt-ignored-mecab-pos1";
-
-export { JLPT_IGNORED_MECAB_POS1_ENTRIES, JlptIgnoredPos1Entry };
-
-// Data-driven MeCab POS names (pos1) used for JLPT filtering.
-export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] =
-  JLPT_IGNORED_MECAB_POS1;
-
-const JLPT_IGNORED_MECAB_POS1_SET = new Set(
-  JLPT_IGNORED_MECAB_POS1_LIST,
-);
-
-export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] {
-  return JLPT_IGNORED_MECAB_POS1_ENTRIES;
-}
-
-export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean {
-  return JLPT_IGNORED_MECAB_POS1_SET.has(pos1);
-}
diff --git a/src/core/services/jlpt-ignored-mecab-pos1.ts b/src/core/services/jlpt-token-filter.ts
similarity index 56%
rename from src/core/services/jlpt-ignored-mecab-pos1.ts
rename to src/core/services/jlpt-token-filter.ts
index 6d8b198..f340421 100644
--- a/src/core/services/jlpt-ignored-mecab-pos1.ts
+++ b/src/core/services/jlpt-token-filter.ts
@@ -1,10 +1,40 @@
-// MeCab POS1 categories that should be excluded from JLPT-level token tagging.
-// These are filtered out because they are typically functional or non-lexical words.
 export type JlptIgnoredPos1Entry = {
   pos1: string;
   reason: string;
 };
 
+// Token-level lexical terms excluded from JLPT highlighting.
+// These are not tied to POS and act as a safety layer for non-dictionary cases.
+export const JLPT_EXCLUDED_TERMS = new Set([
+  "この",
+  "その",
+  "あの",
+  "どの",
+  "これ",
+  "それ",
+  "あれ",
+  "どれ",
+  "ここ",
+  "そこ",
+  "あそこ",
+  "どこ",
+  "こと",
+  "ああ",
+  "ええ",
+  "うう",
+  "おお",
+  "はは",
+  "へえ",
+  "ふう",
+  "ほう",
+]);
+
+export function shouldIgnoreJlptByTerm(term: string): boolean {
+  return JLPT_EXCLUDED_TERMS.has(term);
+}
+
+// MeCab POS1 categories that should be excluded from JLPT-level token tagging.
+// These are filtered out because they are typically functional or non-lexical words.
 export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
   {
     pos1: "助詞",
@@ -43,3 +73,18 @@ export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
 export const JLPT_IGNORED_MECAB_POS1 = JLPT_IGNORED_MECAB_POS1_ENTRIES.map(
   (entry) => entry.pos1,
 );
+
+export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] =
+  JLPT_IGNORED_MECAB_POS1;
+
+const JLPT_IGNORED_MECAB_POS1_SET = new Set(
+  JLPT_IGNORED_MECAB_POS1_LIST,
+);
+
+export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] {
+  return JLPT_IGNORED_MECAB_POS1_ENTRIES;
+}
+
+export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean {
+  return JLPT_IGNORED_MECAB_POS1_SET.has(pos1);
+}
diff --git a/src/core/services/tokenizer-service.ts b/src/core/services/tokenizer-service.ts
index 0cac83e..a276a68 100644
--- a/src/core/services/tokenizer-service.ts
+++ b/src/core/services/tokenizer-service.ts
@@ -8,8 +8,10 @@ import {
   SubtitleData,
   Token,
 } from "../../types";
-import { shouldIgnoreJlptForMecabPos1 } from "./jlpt-token-filter-config";
-import { shouldIgnoreJlptByTerm } from "./jlpt-excluded-terms";
+import {
+  shouldIgnoreJlptForMecabPos1,
+  shouldIgnoreJlptByTerm,
+} from "./jlpt-token-filter";
 
 interface YomitanParseHeadword {
   term?: unknown;