refactor: consolidate JLPT token filter utilities

2026-06-15 03:13:33 -07:00 · 2026-02-15 21:00:00 -08:00
parent e14dad410e
commit 2211c086c0
5 changed files with 61 additions and 56 deletions
@@ -38,6 +38,16 @@ export {
 export { openYomitanSettingsWindow } from "./yomitan-settings-service";
 export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service";
 export { createJlptVocabularyLookupService } from "./jlpt-vocab-service";
 export {
  getIgnoredPos1Entries,
  JlptIgnoredPos1Entry,
  JLPT_EXCLUDED_TERMS,
  JLPT_IGNORED_MECAB_POS1,
  JLPT_IGNORED_MECAB_POS1_ENTRIES,
  JLPT_IGNORED_MECAB_POS1_LIST,
  shouldIgnoreJlptByTerm,
  shouldIgnoreJlptForMecabPos1,
 } from "./jlpt-token-filter";
 export { loadYomitanExtensionService } from "./yomitan-extension-loader-service";
 export {
  getJimakuLanguagePreferenceService,
@@ -1,29 +0,0 @@
 // Exact surface forms that are never given JLPT highlighting.
 // Membership is purely lexical (independent of POS) and serves as a
 // safety layer for cases the dictionary path does not cover.
 export const JLPT_EXCLUDED_TERMS = new Set<string>([
  "この", "その", "あの", "どの",
  "これ", "それ", "あれ", "どれ",
  "ここ", "そこ", "あそこ", "どこ",
  "こと",
  "ああ", "ええ", "うう", "おお",
  "はは", "へえ", "ふう", "ほう",
 ]);
 // Reports whether `term` is an exact member of JLPT_EXCLUDED_TERMS.
 export function shouldIgnoreJlptByTerm(term: string): boolean {
  const isExcluded = JLPT_EXCLUDED_TERMS.has(term);
  return isExcluded;
 }
@@ -1,23 +0,0 @@
 import {
  JlptIgnoredPos1Entry,
  JLPT_IGNORED_MECAB_POS1,
  JLPT_IGNORED_MECAB_POS1_ENTRIES,
 } from "./jlpt-ignored-mecab-pos1";
 export { JLPT_IGNORED_MECAB_POS1_ENTRIES, JlptIgnoredPos1Entry };
 // MeCab pos1 names used for JLPT filtering, exposed with a read-only array type.
 export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] = JLPT_IGNORED_MECAB_POS1;
 // Set built once from the list; shouldIgnoreJlptForMecabPos1 checks membership against it.
 const JLPT_IGNORED_MECAB_POS1_SET: Set<string> = new Set(JLPT_IGNORED_MECAB_POS1_LIST);
 // Accessor for the ignored-pos1 entry table. Hands back the
 // JLPT_IGNORED_MECAB_POS1_ENTRIES array itself (no copy), typed as read-only.
 export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] {
  const entries: readonly JlptIgnoredPos1Entry[] = JLPT_IGNORED_MECAB_POS1_ENTRIES;
  return entries;
 }
 // Reports whether `pos1` exactly matches one of the ignored MeCab pos1 names.
 export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean {
  const isIgnored = JLPT_IGNORED_MECAB_POS1_SET.has(pos1);
  return isIgnored;
 }
@@ -1,10 +1,40 @@
 // Describes one MeCab POS1 category that is excluded from JLPT-level token
 // tagging, together with the reason for excluding it.
 export type JlptIgnoredPos1Entry = {
  // MeCab pos1 label for the category (e.g. "助詞").
  pos1: string;
  // Text explaining why this category is excluded.
  reason: string;
 };
 // Exact surface forms that are never given JLPT highlighting.
 // Membership is purely lexical (independent of POS) and serves as a
 // safety layer for cases the dictionary path does not cover.
 export const JLPT_EXCLUDED_TERMS = new Set<string>([
  "この", "その", "あの", "どの",
  "これ", "それ", "あれ", "どれ",
  "ここ", "そこ", "あそこ", "どこ",
  "こと",
  "ああ", "ええ", "うう", "おお",
  "はは", "へえ", "ふう", "ほう",
 ]);
 // Reports whether `term` is an exact member of JLPT_EXCLUDED_TERMS.
 export function shouldIgnoreJlptByTerm(term: string): boolean {
  const isExcluded = JLPT_EXCLUDED_TERMS.has(term);
  return isExcluded;
 }
 // MeCab POS1 categories that should be excluded from JLPT-level token tagging.
 // These are filtered out because they are typically functional or non-lexical words.
 export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
  {
    pos1: "助詞",
@@ -43,3 +73,18 @@ export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
 // The pos1 name of every entry in JLPT_IGNORED_MECAB_POS1_ENTRIES, in table order.
 export const JLPT_IGNORED_MECAB_POS1 = JLPT_IGNORED_MECAB_POS1_ENTRIES.map(
  ({ pos1 }) => pos1,
 );
 // Same array as JLPT_IGNORED_MECAB_POS1, exposed with a read-only array type.
 export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] = JLPT_IGNORED_MECAB_POS1;
 // Set built once from the list; shouldIgnoreJlptForMecabPos1 checks membership against it.
 const JLPT_IGNORED_MECAB_POS1_SET: Set<string> = new Set(JLPT_IGNORED_MECAB_POS1_LIST);
 // Accessor for the ignored-pos1 entry table. Hands back the
 // JLPT_IGNORED_MECAB_POS1_ENTRIES array itself (no copy), typed as read-only.
 export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] {
  const entries: readonly JlptIgnoredPos1Entry[] = JLPT_IGNORED_MECAB_POS1_ENTRIES;
  return entries;
 }
 // Reports whether `pos1` exactly matches one of the ignored MeCab pos1 names.
 export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean {
  const isIgnored = JLPT_IGNORED_MECAB_POS1_SET.has(pos1);
  return isIgnored;
 }
@@ -8,8 +8,10 @@ import {
  SubtitleData,
  Token,
 } from "../../types";
-import { shouldIgnoreJlptForMecabPos1 } from "./jlpt-token-filter-config";
+import {
-import { shouldIgnoreJlptByTerm } from "./jlpt-excluded-terms";
+  shouldIgnoreJlptForMecabPos1,
+  shouldIgnoreJlptByTerm,
+} from "./jlpt-token-filter";
 interface YomitanParseHeadword {
  term?: unknown;