mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-27 18:22:41 -08:00
refactor: consolidate JLPT token filter utilities
This commit is contained in:
@@ -38,6 +38,16 @@ export {
|
|||||||
export { openYomitanSettingsWindow } from "./yomitan-settings-service";
|
export { openYomitanSettingsWindow } from "./yomitan-settings-service";
|
||||||
export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service";
|
export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service";
|
||||||
export { createJlptVocabularyLookupService } from "./jlpt-vocab-service";
|
export { createJlptVocabularyLookupService } from "./jlpt-vocab-service";
|
||||||
|
export {
|
||||||
|
getIgnoredPos1Entries,
|
||||||
|
JlptIgnoredPos1Entry,
|
||||||
|
JLPT_EXCLUDED_TERMS,
|
||||||
|
JLPT_IGNORED_MECAB_POS1,
|
||||||
|
JLPT_IGNORED_MECAB_POS1_ENTRIES,
|
||||||
|
JLPT_IGNORED_MECAB_POS1_LIST,
|
||||||
|
shouldIgnoreJlptByTerm,
|
||||||
|
shouldIgnoreJlptForMecabPos1,
|
||||||
|
} from "./jlpt-token-filter";
|
||||||
export { loadYomitanExtensionService } from "./yomitan-extension-loader-service";
|
export { loadYomitanExtensionService } from "./yomitan-extension-loader-service";
|
||||||
export {
|
export {
|
||||||
getJimakuLanguagePreferenceService,
|
getJimakuLanguagePreferenceService,
|
||||||
|
|||||||
@@ -1,29 +0,0 @@
|
|||||||
// Token-level lexical terms excluded from JLPT highlighting.
|
|
||||||
// These are not tied to POS and act as a safety layer for non-dictionary cases.
|
|
||||||
export const JLPT_EXCLUDED_TERMS = new Set([
|
|
||||||
"この",
|
|
||||||
"その",
|
|
||||||
"あの",
|
|
||||||
"どの",
|
|
||||||
"これ",
|
|
||||||
"それ",
|
|
||||||
"あれ",
|
|
||||||
"どれ",
|
|
||||||
"ここ",
|
|
||||||
"そこ",
|
|
||||||
"あそこ",
|
|
||||||
"どこ",
|
|
||||||
"こと",
|
|
||||||
"ああ",
|
|
||||||
"ええ",
|
|
||||||
"うう",
|
|
||||||
"おお",
|
|
||||||
"はは",
|
|
||||||
"へえ",
|
|
||||||
"ふう",
|
|
||||||
"ほう",
|
|
||||||
]);
|
|
||||||
|
|
||||||
export function shouldIgnoreJlptByTerm(term: string): boolean {
|
|
||||||
return JLPT_EXCLUDED_TERMS.has(term);
|
|
||||||
}
|
|
||||||
@@ -1,23 +0,0 @@
|
|||||||
import {
|
|
||||||
JlptIgnoredPos1Entry,
|
|
||||||
JLPT_IGNORED_MECAB_POS1,
|
|
||||||
JLPT_IGNORED_MECAB_POS1_ENTRIES,
|
|
||||||
} from "./jlpt-ignored-mecab-pos1";
|
|
||||||
|
|
||||||
export { JLPT_IGNORED_MECAB_POS1_ENTRIES, JlptIgnoredPos1Entry };
|
|
||||||
|
|
||||||
// Data-driven MeCab POS names (pos1) used for JLPT filtering.
|
|
||||||
export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] =
|
|
||||||
JLPT_IGNORED_MECAB_POS1;
|
|
||||||
|
|
||||||
const JLPT_IGNORED_MECAB_POS1_SET = new Set<string>(
|
|
||||||
JLPT_IGNORED_MECAB_POS1_LIST,
|
|
||||||
);
|
|
||||||
|
|
||||||
export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] {
|
|
||||||
return JLPT_IGNORED_MECAB_POS1_ENTRIES;
|
|
||||||
}
|
|
||||||
|
|
||||||
export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean {
|
|
||||||
return JLPT_IGNORED_MECAB_POS1_SET.has(pos1);
|
|
||||||
}
|
|
||||||
@@ -1,10 +1,40 @@
|
|||||||
// MeCab POS1 categories that should be excluded from JLPT-level token tagging.
|
|
||||||
// These are filtered out because they are typically functional or non-lexical words.
|
|
||||||
export type JlptIgnoredPos1Entry = {
|
export type JlptIgnoredPos1Entry = {
|
||||||
pos1: string;
|
pos1: string;
|
||||||
reason: string;
|
reason: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Token-level lexical terms excluded from JLPT highlighting.
|
||||||
|
// These are not tied to POS and act as a safety layer for non-dictionary cases.
|
||||||
|
export const JLPT_EXCLUDED_TERMS = new Set([
|
||||||
|
"この",
|
||||||
|
"その",
|
||||||
|
"あの",
|
||||||
|
"どの",
|
||||||
|
"これ",
|
||||||
|
"それ",
|
||||||
|
"あれ",
|
||||||
|
"どれ",
|
||||||
|
"ここ",
|
||||||
|
"そこ",
|
||||||
|
"あそこ",
|
||||||
|
"どこ",
|
||||||
|
"こと",
|
||||||
|
"ああ",
|
||||||
|
"ええ",
|
||||||
|
"うう",
|
||||||
|
"おお",
|
||||||
|
"はは",
|
||||||
|
"へえ",
|
||||||
|
"ふう",
|
||||||
|
"ほう",
|
||||||
|
]);
|
||||||
|
|
||||||
|
export function shouldIgnoreJlptByTerm(term: string): boolean {
|
||||||
|
return JLPT_EXCLUDED_TERMS.has(term);
|
||||||
|
}
|
||||||
|
|
||||||
|
// MeCab POS1 categories that should be excluded from JLPT-level token tagging.
|
||||||
|
// These are filtered out because they are typically functional or non-lexical words.
|
||||||
export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
|
export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
|
||||||
{
|
{
|
||||||
pos1: "助詞",
|
pos1: "助詞",
|
||||||
@@ -43,3 +73,18 @@ export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
|
|||||||
export const JLPT_IGNORED_MECAB_POS1 = JLPT_IGNORED_MECAB_POS1_ENTRIES.map(
|
export const JLPT_IGNORED_MECAB_POS1 = JLPT_IGNORED_MECAB_POS1_ENTRIES.map(
|
||||||
(entry) => entry.pos1,
|
(entry) => entry.pos1,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] =
|
||||||
|
JLPT_IGNORED_MECAB_POS1;
|
||||||
|
|
||||||
|
const JLPT_IGNORED_MECAB_POS1_SET = new Set<string>(
|
||||||
|
JLPT_IGNORED_MECAB_POS1_LIST,
|
||||||
|
);
|
||||||
|
|
||||||
|
export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] {
|
||||||
|
return JLPT_IGNORED_MECAB_POS1_ENTRIES;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean {
|
||||||
|
return JLPT_IGNORED_MECAB_POS1_SET.has(pos1);
|
||||||
|
}
|
||||||
@@ -8,8 +8,10 @@ import {
|
|||||||
SubtitleData,
|
SubtitleData,
|
||||||
Token,
|
Token,
|
||||||
} from "../../types";
|
} from "../../types";
|
||||||
import { shouldIgnoreJlptForMecabPos1 } from "./jlpt-token-filter-config";
|
import {
|
||||||
import { shouldIgnoreJlptByTerm } from "./jlpt-excluded-terms";
|
shouldIgnoreJlptForMecabPos1,
|
||||||
|
shouldIgnoreJlptByTerm,
|
||||||
|
} from "./jlpt-token-filter";
|
||||||
|
|
||||||
interface YomitanParseHeadword {
|
interface YomitanParseHeadword {
|
||||||
term?: unknown;
|
term?: unknown;
|
||||||
|
|||||||
Reference in New Issue
Block a user