refactor: consolidate JLPT token filter utilities

This commit is contained in:
2026-02-15 21:00:00 -08:00
parent e14dad410e
commit 2211c086c0
5 changed files with 61 additions and 56 deletions

View File

@@ -38,6 +38,16 @@ export {
export { openYomitanSettingsWindow } from "./yomitan-settings-service";
export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service";
export { createJlptVocabularyLookupService } from "./jlpt-vocab-service";
export {
getIgnoredPos1Entries,
JlptIgnoredPos1Entry,
JLPT_EXCLUDED_TERMS,
JLPT_IGNORED_MECAB_POS1,
JLPT_IGNORED_MECAB_POS1_ENTRIES,
JLPT_IGNORED_MECAB_POS1_LIST,
shouldIgnoreJlptByTerm,
shouldIgnoreJlptForMecabPos1,
} from "./jlpt-token-filter";
export { loadYomitanExtensionService } from "./yomitan-extension-loader-service";
export {
getJimakuLanguagePreferenceService,

View File

@@ -1,29 +0,0 @@
// Lexical surface forms that must never receive JLPT highlighting.
// Not driven by POS tags; this is a safety net for tokens that fall
// outside the dictionary (demonstratives, interjection-like sounds).
const EXCLUDED_TERM_LIST: readonly string[] = [
  "この", "その", "あの", "どの",
  "これ", "それ", "あれ", "どれ",
  "ここ", "そこ", "あそこ", "どこ",
  "こと",
  "ああ", "ええ", "うう", "おお",
  "はは", "へえ", "ふう", "ほう",
];

export const JLPT_EXCLUDED_TERMS: Set<string> = new Set(EXCLUDED_TERM_LIST);

// Returns true when `term` is on the exclusion list above.
export function shouldIgnoreJlptByTerm(term: string): boolean {
  return JLPT_EXCLUDED_TERMS.has(term);
}

View File

@@ -1,23 +0,0 @@
// Facade module: re-exports the MeCab POS1 exclusion data and exposes
// set-backed helpers used by JLPT token filtering.
import {
JlptIgnoredPos1Entry,
JLPT_IGNORED_MECAB_POS1,
JLPT_IGNORED_MECAB_POS1_ENTRIES,
} from "./jlpt-ignored-mecab-pos1";
// Re-export the raw entries and their type so callers only need this module.
export { JLPT_IGNORED_MECAB_POS1_ENTRIES, JlptIgnoredPos1Entry };
// Data-driven MeCab POS names (pos1) used for JLPT filtering.
export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] =
JLPT_IGNORED_MECAB_POS1;
// Set view of the ignored pos1 names, built once at module load for
// O(1) membership checks in shouldIgnoreJlptForMecabPos1.
const JLPT_IGNORED_MECAB_POS1_SET = new Set<string>(
JLPT_IGNORED_MECAB_POS1_LIST,
);
// Returns the full entry objects (pos1 plus its exclusion reason).
// NOTE(review): callers receive the underlying array; the readonly return
// type prevents compile-time mutation but not runtime mutation — confirm
// no caller mutates it.
export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] {
return JLPT_IGNORED_MECAB_POS1_ENTRIES;
}
// True when the given MeCab pos1 category is excluded from JLPT tagging.
export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean {
return JLPT_IGNORED_MECAB_POS1_SET.has(pos1);
}

View File

@@ -1,10 +1,40 @@
// One MeCab POS1 category excluded from JLPT-level token tagging, paired
// with the rationale for excluding it (typically functional/non-lexical
// words).
export type JlptIgnoredPos1Entry = {
  pos1: string;
  reason: string;
};

// Token-level surface forms excluded from JLPT highlighting. These are
// not tied to POS and act as a safety layer for non-dictionary cases.
const excludedTermList: readonly string[] = [
  "この", "その", "あの", "どの",
  "これ", "それ", "あれ", "どれ",
  "ここ", "そこ", "あそこ", "どこ",
  "こと",
  "ああ", "ええ", "うう", "おお",
  "はは", "へえ", "ふう", "ほう",
];

export const JLPT_EXCLUDED_TERMS: Set<string> = new Set(excludedTermList);

// Returns whether `term` should be skipped for JLPT highlighting.
export function shouldIgnoreJlptByTerm(term: string): boolean {
  return JLPT_EXCLUDED_TERMS.has(term);
}
// MeCab POS1 categories that should be excluded from JLPT-level token tagging.
// These are filtered out because they are typically functional or non-lexical words.
export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
{
pos1: "助詞",
@@ -43,3 +73,18 @@ export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
export const JLPT_IGNORED_MECAB_POS1 = JLPT_IGNORED_MECAB_POS1_ENTRIES.map(
(entry) => entry.pos1,
);
export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] =
JLPT_IGNORED_MECAB_POS1;
const JLPT_IGNORED_MECAB_POS1_SET = new Set<string>(
JLPT_IGNORED_MECAB_POS1_LIST,
);
export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] {
return JLPT_IGNORED_MECAB_POS1_ENTRIES;
}
export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean {
return JLPT_IGNORED_MECAB_POS1_SET.has(pos1);
}

View File

@@ -8,8 +8,10 @@ import {
SubtitleData,
Token,
} from "../../types";
import { shouldIgnoreJlptForMecabPos1 } from "./jlpt-token-filter-config";
import { shouldIgnoreJlptByTerm } from "./jlpt-excluded-terms";
import {
shouldIgnoreJlptForMecabPos1,
shouldIgnoreJlptByTerm,
} from "./jlpt-token-filter";
interface YomitanParseHeadword {
term?: unknown;