mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-27 18:22:41 -08:00
refactor: consolidate JLPT token filter utilities
This commit is contained in:
@@ -38,6 +38,16 @@ export {
|
||||
export { openYomitanSettingsWindow } from "./yomitan-settings-service";
export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service";
export { createJlptVocabularyLookupService } from "./jlpt-vocab-service";
// JLPT token filtering: the excluded-term and MeCab pos1 helpers all come from one module.
export {
  getIgnoredPos1Entries,
  JLPT_EXCLUDED_TERMS,
  JLPT_IGNORED_MECAB_POS1,
  JLPT_IGNORED_MECAB_POS1_ENTRIES,
  JLPT_IGNORED_MECAB_POS1_LIST,
  shouldIgnoreJlptByTerm,
  shouldIgnoreJlptForMecabPos1,
} from "./jlpt-token-filter";
// `JlptIgnoredPos1Entry` is a type alias with no runtime value, so it is re-exported
// type-only; a value re-export of a type breaks under `isolatedModules`.
export type { JlptIgnoredPos1Entry } from "./jlpt-token-filter";
export { loadYomitanExtensionService } from "./yomitan-extension-loader-service";
|
||||
export {
|
||||
getJimakuLanguagePreferenceService,
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
/**
 * Token-level lexical terms excluded from JLPT highlighting.
 *
 * These are not tied to POS and act as a safety layer for non-dictionary cases.
 * Lookups are exact string matches against the entries below.
 */
export const JLPT_EXCLUDED_TERMS = new Set<string>([
  "この", "その", "あの", "どの",
  "これ", "それ", "あれ", "どれ",
  "ここ", "そこ", "あそこ", "どこ",
  "こと",
  "ああ", "ええ", "うう", "おお",
  "はは", "へえ", "ふう", "ほう",
]);
|
||||
|
||||
/**
 * Reports whether a term is excluded from JLPT highlighting.
 *
 * @param term - Token text to look up; matched exactly against `JLPT_EXCLUDED_TERMS`.
 * @returns `true` when `term` is a member of `JLPT_EXCLUDED_TERMS`.
 */
export function shouldIgnoreJlptByTerm(term: string): boolean {
  return JLPT_EXCLUDED_TERMS.has(term);
}
|
||||
@@ -1,23 +0,0 @@
|
||||
import {
  JLPT_IGNORED_MECAB_POS1,
  JLPT_IGNORED_MECAB_POS1_ENTRIES,
} from "./jlpt-ignored-mecab-pos1";
// Imported and re-exported type-only: it is used purely as a type in this module,
// and a value re-export of a type breaks under `isolatedModules`.
import type { JlptIgnoredPos1Entry } from "./jlpt-ignored-mecab-pos1";

export { JLPT_IGNORED_MECAB_POS1_ENTRIES };
export type { JlptIgnoredPos1Entry };
|
||||
|
||||
// Data-driven MeCab POS names (pos1) used for JLPT filtering.
export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] =
  JLPT_IGNORED_MECAB_POS1;

// Set built from the list so pos1 membership checks use `Set.has` rather than an array scan.
const JLPT_IGNORED_MECAB_POS1_SET = new Set<string>(
  JLPT_IGNORED_MECAB_POS1_LIST,
);
|
||||
|
||||
/**
 * Returns the MeCab pos1 entries that are ignored for JLPT filtering.
 *
 * @returns `JLPT_IGNORED_MECAB_POS1_ENTRIES` itself (not a copy), typed as read-only.
 */
export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] {
  return JLPT_IGNORED_MECAB_POS1_ENTRIES;
}
|
||||
|
||||
/**
 * Reports whether a MeCab pos1 value is on the JLPT ignore list.
 *
 * @param pos1 - MeCab pos1 string to look up; matched exactly.
 * @returns `true` when `pos1` is a member of the ignored-pos1 set.
 */
export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean {
  return JLPT_IGNORED_MECAB_POS1_SET.has(pos1);
}
|
||||
@@ -1,10 +1,40 @@
|
||||
// MeCab POS1 categories that should be excluded from JLPT-level token tagging.
|
||||
// These are filtered out because they are typically functional or non-lexical words.
|
||||
/** One MeCab pos1 value that is excluded from JLPT-level token tagging. */
export type JlptIgnoredPos1Entry = {
  /** The MeCab pos1 string this entry applies to. */
  pos1: string;
  /** Free-text explanation stored alongside the pos1 value. */
  reason: string;
};
|
||||
|
||||
/**
 * Token-level lexical terms excluded from JLPT highlighting.
 *
 * These are not tied to POS and act as a safety layer for non-dictionary cases.
 * Lookups are exact string matches against the entries below.
 */
export const JLPT_EXCLUDED_TERMS = new Set<string>([
  "この", "その", "あの", "どの",
  "これ", "それ", "あれ", "どれ",
  "ここ", "そこ", "あそこ", "どこ",
  "こと",
  "ああ", "ええ", "うう", "おお",
  "はは", "へえ", "ふう", "ほう",
]);
|
||||
|
||||
/**
 * Reports whether a term is excluded from JLPT highlighting.
 *
 * @param term - Token text to look up; matched exactly against `JLPT_EXCLUDED_TERMS`.
 * @returns `true` when `term` is a member of `JLPT_EXCLUDED_TERMS`.
 */
export function shouldIgnoreJlptByTerm(term: string): boolean {
  return JLPT_EXCLUDED_TERMS.has(term);
}
|
||||
|
||||
// MeCab POS1 categories that should be excluded from JLPT-level token tagging.
|
||||
// These are filtered out because they are typically functional or non-lexical words.
|
||||
export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
|
||||
{
|
||||
pos1: "助詞",
|
||||
@@ -43,3 +73,18 @@ export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
|
||||
/** The `pos1` value of every entry in `JLPT_IGNORED_MECAB_POS1_ENTRIES`, in entry order. */
export const JLPT_IGNORED_MECAB_POS1 = JLPT_IGNORED_MECAB_POS1_ENTRIES.map(
  (entry) => entry.pos1,
);
|
||||
|
||||
/** Read-only view of `JLPT_IGNORED_MECAB_POS1` (the same array, typed as `readonly string[]`). */
export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] =
  JLPT_IGNORED_MECAB_POS1;

// Set built from the list so pos1 membership checks use `Set.has` rather than an array scan.
const JLPT_IGNORED_MECAB_POS1_SET = new Set<string>(
  JLPT_IGNORED_MECAB_POS1_LIST,
);
|
||||
|
||||
/**
 * Returns the MeCab pos1 entries that are ignored for JLPT filtering.
 *
 * @returns `JLPT_IGNORED_MECAB_POS1_ENTRIES` itself (not a copy), typed as read-only.
 */
export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] {
  return JLPT_IGNORED_MECAB_POS1_ENTRIES;
}
|
||||
|
||||
/**
 * Reports whether a MeCab pos1 value is on the JLPT ignore list.
 *
 * @param pos1 - MeCab pos1 string to look up; matched exactly.
 * @returns `true` when `pos1` is a member of the ignored-pos1 set.
 */
export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean {
  return JLPT_IGNORED_MECAB_POS1_SET.has(pos1);
}
|
||||
@@ -8,8 +8,10 @@ import {
|
||||
SubtitleData,
|
||||
Token,
|
||||
} from "../../types";
|
||||
import { shouldIgnoreJlptForMecabPos1 } from "./jlpt-token-filter-config";
|
||||
import { shouldIgnoreJlptByTerm } from "./jlpt-excluded-terms";
|
||||
import {
|
||||
shouldIgnoreJlptForMecabPos1,
|
||||
shouldIgnoreJlptByTerm,
|
||||
} from "./jlpt-token-filter";
|
||||
|
||||
interface YomitanParseHeadword {
|
||||
term?: unknown;
|
||||
|
||||
Reference in New Issue
Block a user