mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-28 06:22:45 -08:00
Add opt-in JLPT tagging flow
This commit is contained in:
@@ -37,6 +37,7 @@ export {
|
||||
} from "./runtime-config-service";
|
||||
export { openYomitanSettingsWindow } from "./yomitan-settings-service";
|
||||
export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service";
|
||||
export { createJlptVocabularyLookupService } from "./jlpt-vocab-service";
|
||||
export { loadYomitanExtensionService } from "./yomitan-extension-loader-service";
|
||||
export {
|
||||
getJimakuLanguagePreferenceService,
|
||||
|
||||
29
src/core/services/jlpt-excluded-terms.ts
Normal file
29
src/core/services/jlpt-excluded-terms.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
/**
 * Exact token strings that never receive a JLPT highlight.
 *
 * Membership is checked by plain string equality (see `shouldIgnoreJlptByTerm`),
 * independently of any part-of-speech information, so it also covers tokens
 * for which no POS data is available.
 */
export const JLPT_EXCLUDED_TERMS = new Set([
  // ko-/so-/a-/do- demonstratives
  "この", "その", "あの", "どの",
  "これ", "それ", "あれ", "どれ",
  "ここ", "そこ", "あそこ", "どこ",
  // Formal noun
  "こと",
  // Short exclamation-like kana strings
  "ああ", "ええ", "うう", "おお",
  "はは", "へえ", "ふう", "ほう",
]);
|
||||
|
||||
/**
 * Reports whether `term` is on the JLPT exclusion list.
 *
 * The comparison is an exact match against `JLPT_EXCLUDED_TERMS`; the caller
 * is responsible for any trimming or kana normalization.
 *
 * @param term Token text to test.
 * @returns `true` when the term must not be JLPT-tagged.
 */
export function shouldIgnoreJlptByTerm(term: string): boolean {
  const isExcluded = JLPT_EXCLUDED_TERMS.has(term);
  return isExcluded;
}
|
||||
45
src/core/services/jlpt-ignored-mecab-pos1.ts
Normal file
45
src/core/services/jlpt-ignored-mecab-pos1.ts
Normal file
@@ -0,0 +1,45 @@
|
||||
// MeCab POS1 categories that should be excluded from JLPT-level token tagging.
|
||||
// These are filtered out because they are typically functional or non-lexical words.
|
||||
/** One MeCab POS1 category that is excluded from JLPT tagging. */
export type JlptIgnoredPos1Entry = {
  /** The POS1 label exactly as it appears on a token (e.g. "助詞"). */
  pos1: string;
  /** Human-readable explanation of why the category is ignored. */
  reason: string;
};
|
||||
|
||||
/**
 * POS1 categories skipped by JLPT tagging, each paired with the rationale.
 *
 * Declared `as const` so the literal `pos1` values are preserved in the type,
 * and checked with `satisfies` so every entry has the expected shape.
 */
export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
  // Grammatical function words
  {
    pos1: "助詞",
    reason:
      "Particles (ko/kara/nagara etc.): mostly grammatical glue, not independent vocabulary.",
  },
  {
    pos1: "助動詞",
    reason: "Auxiliary verbs (past tense, politeness, modality): grammar helpers.",
  },
  // Symbols
  { pos1: "記号", reason: "Symbols/punctuation and symbols-like tokens." },
  { pos1: "補助記号", reason: "Auxiliary symbols (e.g. bracket-like or markup tokens)." },
  // Other non-target categories
  { pos1: "連体詞", reason: 'Adnominal forms (e.g. demonstratives like "この").' },
  { pos1: "感動詞", reason: "Interjections/onomatopoeia-style exclamations." },
  {
    pos1: "接続詞",
    reason: "Conjunctions that connect clauses, usually not target vocab items.",
  },
  { pos1: "接頭詞", reason: "Prefixes/prefix-like grammatical elements." },
] as const satisfies readonly JlptIgnoredPos1Entry[];
|
||||
|
||||
/** Just the POS1 labels from `JLPT_IGNORED_MECAB_POS1_ENTRIES`, in declaration order. */
export const JLPT_IGNORED_MECAB_POS1 = JLPT_IGNORED_MECAB_POS1_ENTRIES.map(
  ({ pos1 }) => pos1,
);
|
||||
23
src/core/services/jlpt-token-filter-config.ts
Normal file
23
src/core/services/jlpt-token-filter-config.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
import {
|
||||
JlptIgnoredPos1Entry,
|
||||
JLPT_IGNORED_MECAB_POS1,
|
||||
JLPT_IGNORED_MECAB_POS1_ENTRIES,
|
||||
} from "./jlpt-ignored-mecab-pos1";
|
||||
|
||||
// Re-export the entry table and its element type for consumers of this module.
// The type is re-exported with `export type` so the statement remains valid
// when each file is transpiled in isolation (`isolatedModules`).
export { JLPT_IGNORED_MECAB_POS1_ENTRIES };
export type { JlptIgnoredPos1Entry };
|
||||
|
||||
/** MeCab POS1 labels ignored by JLPT filtering, exposed as a read-only string list. */
export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] = JLPT_IGNORED_MECAB_POS1;

/** Set view of the list above, used for membership checks. */
const JLPT_IGNORED_MECAB_POS1_SET: Set<string> = new Set<string>(JLPT_IGNORED_MECAB_POS1_LIST);
|
||||
|
||||
/**
 * Returns the ignored POS1 categories together with their rationale.
 *
 * @returns The shared `JLPT_IGNORED_MECAB_POS1_ENTRIES` array itself (not a copy).
 */
export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] {
  const entries: readonly JlptIgnoredPos1Entry[] = JLPT_IGNORED_MECAB_POS1_ENTRIES;
  return entries;
}
|
||||
|
||||
/**
 * Reports whether tokens of the given MeCab POS1 category are excluded from
 * JLPT tagging.
 *
 * @param pos1 POS1 label to test; matched exactly, without normalization.
 * @returns `true` when the label is one of the ignored categories.
 */
export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean {
  const isIgnored = JLPT_IGNORED_MECAB_POS1_SET.has(pos1);
  return isIgnored;
}
|
||||
194
src/core/services/jlpt-vocab-service.ts
Normal file
194
src/core/services/jlpt-vocab-service.ts
Normal file
@@ -0,0 +1,194 @@
|
||||
import * as fs from "fs";
|
||||
import * as path from "path";
|
||||
|
||||
import type { JlptLevel } from "../../types";
|
||||
|
||||
/** Inputs for `createJlptVocabularyLookupService`. */
export interface JlptVocabLookupOptions {
  /** Candidate dictionary directories, tried in order. */
  searchPaths: string[];
  /** Sink for progress and diagnostic messages. */
  log: (message: string) => void;
}
|
||||
|
||||
/**
 * Bank files read from a dictionary directory and the JLPT level assigned to
 * every term found in each one. Files are processed in this order.
 */
const JLPT_BANK_FILES: { level: JlptLevel; filename: string }[] = [
  { filename: "term_meta_bank_1.json", level: "N1" },
  { filename: "term_meta_bank_2.json", level: "N2" },
  { filename: "term_meta_bank_3.json", level: "N3" },
  { filename: "term_meta_bank_4.json", level: "N4" },
  { filename: "term_meta_bank_5.json", level: "N5" },
];

/** Lookup used when no dictionary could be loaded: every term resolves to `null`. */
const NOOP_LOOKUP = (): null => null;
|
||||
|
||||
/**
 * Canonicalizes a term for use as a dictionary key.
 *
 * @param value Raw term text.
 * @returns `value` with leading and trailing whitespace removed.
 */
function normalizeJlptTerm(value: string): string {
  const trimmed = value.trim();
  return trimmed;
}
|
||||
|
||||
/**
 * Checks whether a bank entry's metadata carries `frequency.displayValue`.
 *
 * Only the presence of an own `displayValue` property on an object-valued
 * `frequency` is tested; its value is not inspected.
 *
 * @param meta Untrusted metadata value taken from a parsed bank entry.
 * @returns `true` when `meta.frequency` is an object owning `displayValue`.
 */
function hasFrequencyDisplayValue(meta: unknown): boolean {
  const isObjectLike = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null;

  if (!isObjectLike(meta)) {
    return false;
  }

  const { frequency } = meta;
  return (
    isObjectLike(frequency) &&
    Object.prototype.hasOwnProperty.call(frequency, "displayValue")
  );
}
|
||||
|
||||
/**
 * Merges the entries of one parsed bank file into `terms`.
 *
 * Each usable entry is an array whose first element is the term (a string)
 * and whose third element is metadata containing `frequency.displayValue`;
 * anything else is skipped. When a term is already present, the level with
 * the higher precedence is kept, where N1 outranks N2, which outranks N3,
 * and so on down to N5.
 *
 * @param rawEntries Parsed JSON of a bank file; ignored unless it is an array.
 * @param level JLPT level assigned to every term in this bank.
 * @param terms Accumulator mapping normalized term to level; mutated in place.
 * @param log Receives a message when an existing, different level is kept.
 */
function addEntriesToMap(
  rawEntries: unknown,
  level: JlptLevel,
  terms: Map<string, JlptLevel>,
  log: (message: string) => void,
): void {
  if (!Array.isArray(rawEntries)) {
    return;
  }

  // Lower index = higher precedence. Values outside the table rank last.
  const precedence: readonly string[] = ["N1", "N2", "N3", "N4", "N5"];
  const rankOf = (value: string): number => {
    const index = precedence.indexOf(value);
    return index === -1 ? precedence.length : index;
  };

  for (const rawEntry of rawEntries) {
    if (!Array.isArray(rawEntry)) {
      continue;
    }

    const term: unknown = rawEntry[0];
    const meta: unknown = rawEntry[2];
    if (typeof term !== "string") {
      continue;
    }

    const normalizedTerm = normalizeJlptTerm(term);
    if (!normalizedTerm || !hasFrequencyDisplayValue(meta)) {
      continue;
    }

    const existingLevel = terms.get(normalizedTerm);
    if (existingLevel === undefined || rankOf(level) < rankOf(existingLevel)) {
      terms.set(normalizedTerm, level);
      continue;
    }

    // The same term commonly occurs several times within one bank; only a
    // genuine conflict between levels is worth reporting.
    if (existingLevel !== level) {
      log(
        `JLPT dictionary already has ${normalizedTerm} as ${existingLevel}; keeping that level instead of ${level}`,
      );
    }
  }
}
|
||||
|
||||
/**
 * Loads every JLPT bank file present in `dictionaryPath` into a single map.
 *
 * Bank files that do not exist are skipped silently. Files that exist but
 * cannot be read or parsed as JSON are skipped and reported through `log`,
 * so a damaged dictionary can be diagnosed instead of looking empty.
 *
 * @param dictionaryPath Directory expected to contain `term_meta_bank_*.json`.
 * @param log Sink for diagnostics.
 * @returns Map from normalized term to JLPT level; empty when nothing loaded.
 */
function collectDictionaryFromPath(
  dictionaryPath: string,
  log: (message: string) => void,
): Map<string, JlptLevel> {
  const terms = new Map<string, JlptLevel>();

  for (const bank of JLPT_BANK_FILES) {
    const bankPath = path.join(dictionaryPath, bank.filename);
    if (!fs.existsSync(bankPath)) {
      continue;
    }

    let rawEntries: unknown;
    try {
      rawEntries = JSON.parse(fs.readFileSync(bankPath, "utf-8")) as unknown;
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      log(`Skipping JLPT bank file ${bankPath}: ${reason}`);
      continue;
    }

    addEntriesToMap(rawEntries, bank.level, terms, log);
  }

  return terms;
}
|
||||
|
||||
/**
 * Builds a term -> JLPT level lookup from the first usable dictionary directory.
 *
 * Search paths are tried in order. The first directory that yields at least
 * one term wins and later paths are not inspected. When nothing can be
 * loaded, the attempted paths are logged and a lookup that always returns
 * `null` is returned. Although declared `async`, all file access in this
 * function is synchronous.
 *
 * @param options Candidate directories and a log sink.
 * @returns A function mapping a term (trimmed before lookup) to its level, or
 *   `null` when the term is empty or unknown.
 */
export async function createJlptVocabularyLookupService(
  options: JlptVocabLookupOptions,
): Promise<(term: string) => JlptLevel | null> {
  const attemptedPaths: string[] = [];
  let foundDirectoryCount = 0;

  for (const dictionaryPath of options.searchPaths) {
    attemptedPaths.push(dictionaryPath);

    // statSync throws on e.g. permission errors or a path removed after the
    // existence check; treat that as "not usable" and try the next candidate.
    let isDirectory = false;
    try {
      isDirectory =
        fs.existsSync(dictionaryPath) && fs.statSync(dictionaryPath).isDirectory();
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      options.log(`JLPT dictionary path could not be inspected: ${dictionaryPath} (${reason})`);
      continue;
    }
    if (!isDirectory) {
      continue;
    }

    foundDirectoryCount += 1;

    const terms = collectDictionaryFromPath(dictionaryPath, options.log);
    if (terms.size > 0) {
      options.log(
        `JLPT dictionary loaded from ${dictionaryPath} (${terms.size} entries)`,
      );
      return (term: string): JlptLevel | null => {
        if (!term) return null;
        const normalized = normalizeJlptTerm(term);
        return normalized ? terms.get(normalized) ?? null : null;
      };
    }

    options.log(
      `JLPT dictionary directory exists but contains no readable term_meta_bank_*.json files: ${dictionaryPath}`,
    );
  }

  options.log(
    `JLPT dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`,
  );
  // Reaching this point means no directory produced any terms.
  if (foundDirectoryCount > 0) {
    options.log(
      "JLPT dictionary directories found, but none contained valid term_meta_bank_*.json files.",
    );
  }
  return NOOP_LOOKUP;
}
|
||||
@@ -92,6 +92,7 @@ export async function runAppReadyRuntimeService(
|
||||
): Promise<void> {
|
||||
deps.loadSubtitlePosition();
|
||||
deps.resolveKeybindings();
|
||||
await deps.createMecabTokenizerAndCheck();
|
||||
deps.createMpvClient();
|
||||
|
||||
deps.reloadConfig();
|
||||
@@ -117,7 +118,6 @@ export async function runAppReadyRuntimeService(
|
||||
deps.log("mpv_websocket detected, skipping built-in WebSocket server");
|
||||
}
|
||||
|
||||
await deps.createMecabTokenizerAndCheck();
|
||||
deps.createSubtitleTimingTracker();
|
||||
await deps.loadYomitanExtension();
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ function makeDeps(
|
||||
setYomitanParserInitPromise: () => {},
|
||||
isKnownWord: () => false,
|
||||
getKnownWordMatchMode: () => "headword",
|
||||
getJlptLevel: () => null,
|
||||
tokenizeWithMecab: async () => null,
|
||||
...overrides,
|
||||
};
|
||||
@@ -43,10 +44,171 @@ function makeDepsFromMecabTokenizer(
|
||||
getMecabTokenizer: () => ({
|
||||
tokenize,
|
||||
}),
|
||||
getJlptLevel: () => null,
|
||||
...overrides,
|
||||
});
|
||||
}
|
||||
|
||||
test("tokenizeSubtitleService assigns JLPT level to parsed Yomitan tokens", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫です",
|
||||
makeDeps({
|
||||
getYomitanExt: () => ({ id: "dummy-ext" } as any),
|
||||
getYomitanParserWindow: () => ({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async () => [
|
||||
{
|
||||
source: "scanning-parser",
|
||||
index: 0,
|
||||
content: [
|
||||
[
|
||||
{
|
||||
text: "猫",
|
||||
reading: "ねこ",
|
||||
headwords: [[{ term: "猫" }]],
|
||||
},
|
||||
{
|
||||
text: "です",
|
||||
reading: "です",
|
||||
headwords: [[{ term: "です" }]],
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
} as unknown as Electron.BrowserWindow),
|
||||
tokenizeWithMecab: async () => null,
|
||||
getJlptLevel: (text) => (text === "猫" ? "N5" : null),
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.jlptLevel, "N5");
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService skips JLPT level for excluded demonstratives", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"この",
|
||||
makeDeps({
|
||||
getYomitanExt: () => ({ id: "dummy-ext" } as any),
|
||||
getYomitanParserWindow: () => ({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async () => [
|
||||
{
|
||||
source: "scanning-parser",
|
||||
index: 0,
|
||||
content: [
|
||||
[
|
||||
{
|
||||
text: "この",
|
||||
reading: "この",
|
||||
headwords: [[{ term: "この" }]],
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
} as unknown as Electron.BrowserWindow),
|
||||
tokenizeWithMecab: async () => null,
|
||||
getJlptLevel: (text) => (text === "この" ? "N5" : null),
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService skips JLPT level for repeated kana SFX", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"ああ",
|
||||
makeDeps({
|
||||
getYomitanExt: () => ({ id: "dummy-ext" } as any),
|
||||
getYomitanParserWindow: () => ({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async () => [
|
||||
{
|
||||
source: "scanning-parser",
|
||||
index: 0,
|
||||
content: [
|
||||
[
|
||||
{
|
||||
text: "ああ",
|
||||
reading: "ああ",
|
||||
headwords: [[{ term: "ああ" }]],
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
} as unknown as Electron.BrowserWindow),
|
||||
tokenizeWithMecab: async () => null,
|
||||
getJlptLevel: (text) => (text === "ああ" ? "N5" : null),
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService assigns JLPT level to mecab tokens", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫です",
|
||||
makeDepsFromMecabTokenizer(async () => [
|
||||
{
|
||||
word: "猫",
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
pos1: "",
|
||||
pos2: "",
|
||||
pos3: "",
|
||||
pos4: "",
|
||||
inflectionType: "",
|
||||
inflectionForm: "",
|
||||
headword: "猫",
|
||||
katakanaReading: "ネコ",
|
||||
pronunciation: "ネコ",
|
||||
},
|
||||
], {
|
||||
getJlptLevel: (text) => (text === "猫" ? "N4" : null),
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.jlptLevel, "N4");
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService skips JLPT level for mecab tokens marked as ineligible", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"は",
|
||||
makeDepsFromMecabTokenizer(async () => [
|
||||
{
|
||||
word: "は",
|
||||
partOfSpeech: PartOfSpeech.particle,
|
||||
pos1: "助詞",
|
||||
pos2: "",
|
||||
pos3: "",
|
||||
pos4: "",
|
||||
inflectionType: "",
|
||||
inflectionForm: "",
|
||||
headword: "は",
|
||||
katakanaReading: "ハ",
|
||||
pronunciation: "ハ",
|
||||
},
|
||||
], {
|
||||
getJlptLevel: (text) => (text === "は" ? "N5" : null),
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.pos1, "助詞");
|
||||
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService returns null tokens for empty normalized text", async () => {
|
||||
const result = await tokenizeSubtitleService(" \\n ", makeDeps());
|
||||
assert.deepEqual(result, { text: " \\n ", tokens: null });
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
import { BrowserWindow, Extension, session } from "electron";
|
||||
import { markNPlusOneTargets, mergeTokens } from "../../token-merger";
|
||||
import {
|
||||
JlptLevel,
|
||||
MergedToken,
|
||||
NPlusOneMatchMode,
|
||||
PartOfSpeech,
|
||||
SubtitleData,
|
||||
Token,
|
||||
} from "../../types";
|
||||
import { shouldIgnoreJlptForMecabPos1 } from "./jlpt-token-filter-config";
|
||||
import { shouldIgnoreJlptByTerm } from "./jlpt-excluded-terms";
|
||||
|
||||
interface YomitanParseHeadword {
|
||||
term?: unknown;
|
||||
@@ -34,6 +37,8 @@ export interface TokenizerServiceDeps {
|
||||
setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
|
||||
isKnownWord: (text: string) => boolean;
|
||||
getKnownWordMatchMode: () => NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getJlptEnabled?: () => boolean;
|
||||
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
|
||||
}
|
||||
|
||||
@@ -51,6 +56,8 @@ export interface TokenizerDepsRuntimeOptions {
|
||||
setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
|
||||
isKnownWord: (text: string) => boolean;
|
||||
getKnownWordMatchMode: () => NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getMecabTokenizer: () => MecabTokenizerLike | null;
|
||||
}
|
||||
|
||||
@@ -67,6 +74,8 @@ export function createTokenizerDepsRuntimeService(
|
||||
setYomitanParserInitPromise: options.setYomitanParserInitPromise,
|
||||
isKnownWord: options.isKnownWord,
|
||||
getKnownWordMatchMode: options.getKnownWordMatchMode,
|
||||
getJlptLevel: options.getJlptLevel,
|
||||
getJlptEnabled: options.getJlptEnabled,
|
||||
tokenizeWithMecab: async (text) => {
|
||||
const mecabTokenizer = options.getMecabTokenizer();
|
||||
if (!mecabTokenizer) {
|
||||
@@ -112,6 +121,142 @@ function applyKnownWordMarking(
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Chooses the text used as the primary key for a JLPT lookup.
 *
 * @param token Token to inspect.
 * @returns The headword when non-empty, otherwise the reading when non-empty,
 *   otherwise the surface form.
 */
function resolveJlptLookupText(token: MergedToken): string {
  // For strings, "present and non-empty" is exactly truthiness.
  return token.headword || token.reading || token.surface;
}
|
||||
|
||||
/**
 * Prepares text for comparison against the exclusion rules.
 *
 * Surrounding whitespace is trimmed and katakana in U+30A1..U+30F6 is shifted
 * down by 0x60 to the corresponding hiragana code point. All other
 * characters pass through unchanged.
 *
 * @param text Raw candidate text.
 * @returns The trimmed, hiragana-folded text; `""` for blank input.
 */
function normalizeJlptTextForExclusion(text: string): string {
  const katakanaToHiraganaOffset = 0x60;
  return text
    .trim()
    .replace(/[\u30a1-\u30f6]/g, (katakana) =>
      String.fromCharCode(katakana.charCodeAt(0) - katakanaToHiraganaOffset),
    );
}
|
||||
|
||||
/**
 * Tests whether the first code point of `char` falls in one of the accepted
 * kana ranges: U+3041..U+3096, U+309B..U+309F, U+30A0..U+30FA, U+30FD..U+30FF.
 *
 * U+30FB and U+30FC lie between the last two ranges and are therefore not
 * accepted. Only the first character of the argument is examined.
 *
 * @param char String whose first character is classified.
 * @returns `true` when that character is in an accepted range; `false` for
 *   an empty string.
 */
function isKanaChar(char: string): boolean {
  return /^[\u3041-\u3096\u309b-\u309f\u30a0-\u30fa\u30fd-\u30ff]/.test(char);
}
|
||||
|
||||
/**
 * Heuristic for kana-only strings dominated by repetition (e.g. "ああ").
 *
 * After trimming, the text must consist solely of characters accepted by
 * `isKanaChar`. It is then reported as repeated when either
 *  - two identical characters sit next to each other, or
 *  - the most frequent character occurs at least twice (for texts of up to
 *    two characters) or at least `ceil(length / 2)` times (for longer texts).
 *
 * @param text Candidate text.
 * @returns `true` when the text matches the repetition heuristic.
 */
function isRepeatedKanaSfx(text: string): boolean {
  const chars = Array.from(text.trim());
  if (chars.length === 0 || !chars.every(isKanaChar)) {
    return false;
  }

  // An adjacent repeat qualifies regardless of length.
  const hasAdjacentRepeat = chars.some(
    (kana, index) => index > 0 && kana === chars[index - 1],
  );
  if (hasAdjacentRepeat) {
    return true;
  }

  const occurrences = new Map<string, number>();
  for (const kana of chars) {
    occurrences.set(kana, (occurrences.get(kana) ?? 0) + 1);
  }
  const mostFrequent = Math.max(...occurrences.values());

  const requiredCount = chars.length <= 2 ? 2 : Math.ceil(chars.length / 2);
  return mostFrequent >= requiredCount;
}
|
||||
|
||||
/**
 * Decides whether a token may receive a JLPT level.
 *
 * A token is rejected when its POS1 category is on the ignore list, or when
 * one of the strings that will actually be looked up (the resolved lookup
 * text and the surface form) is an excluded term or matches the
 * repeated-kana heuristic. Each string is tested both as written and after
 * katakana-to-hiragana folding.
 *
 * The reading is deliberately NOT tested on its own when a headword exists.
 * Otherwise ordinary kanji vocabulary would be rejected merely because its
 * pronunciation is repetitive or collides with an excluded kana string,
 * e.g. 母 (はは), 父 (ちち), 耳 (みみ), 心 (こころ). When no headword is
 * available the reading becomes the lookup text and is still checked.
 *
 * @param token Token to classify.
 * @returns `true` when the token should go through JLPT lookup.
 */
function isJlptEligibleToken(token: MergedToken): boolean {
  if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) return false;

  // De-duplicate: the lookup text is frequently identical to the surface.
  const candidates = new Set(
    [resolveJlptLookupText(token), token.surface].filter(
      (candidate): candidate is string =>
        typeof candidate === "string" && candidate.length > 0,
    ),
  );

  for (const candidate of candidates) {
    const trimmedCandidate = candidate.trim();
    if (!trimmedCandidate) {
      continue;
    }
    const normalizedCandidate = normalizeJlptTextForExclusion(trimmedCandidate);

    if (
      shouldIgnoreJlptByTerm(trimmedCandidate) ||
      shouldIgnoreJlptByTerm(normalizedCandidate)
    ) {
      return false;
    }

    if (
      isRepeatedKanaSfx(trimmedCandidate) ||
      isRepeatedKanaSfx(normalizedCandidate)
    ) {
      return false;
    }
  }

  return true;
}
|
||||
|
||||
/**
 * Returns copies of `tokens` with `jlptLevel` filled in or cleared.
 *
 * Ineligible tokens get `jlptLevel: undefined`. For eligible tokens the level
 * of the resolved lookup text is preferred, then the level of the surface
 * form, then whatever level the token already carried. Both lookups are
 * always performed. The input array and its tokens are not mutated.
 *
 * @param tokens Tokens to annotate.
 * @param getJlptLevel Resolves a text to its JLPT level, or `null`.
 * @returns A new array of new token objects.
 */
function applyJlptMarking(
  tokens: MergedToken[],
  getJlptLevel: (text: string) => JlptLevel | null,
): MergedToken[] {
  const markToken = (token: MergedToken): MergedToken => {
    if (isJlptEligibleToken(token)) {
      const levelFromLookupText = getJlptLevel(resolveJlptLookupText(token));
      const levelFromSurface = getJlptLevel(token.surface);
      return {
        ...token,
        jlptLevel: levelFromLookupText ?? levelFromSurface ?? token.jlptLevel,
      };
    }

    return { ...token, jlptLevel: undefined };
  };

  return tokens.map((token) => markToken(token));
}
|
||||
|
||||
function extractYomitanHeadword(segment: YomitanParseSegment): string {
|
||||
const headwords = segment.headwords;
|
||||
if (!Array.isArray(headwords) || headwords.length === 0) {
|
||||
@@ -131,6 +276,7 @@ function mapYomitanParseResultsToMergedTokens(
|
||||
parseResults: unknown,
|
||||
isKnownWord: (text: string) => boolean,
|
||||
knownWordMatchMode: NPlusOneMatchMode,
|
||||
getJlptLevel: (text: string) => JlptLevel | null,
|
||||
): MergedToken[] | null {
|
||||
if (!Array.isArray(parseResults) || parseResults.length === 0) {
|
||||
return null;
|
||||
@@ -205,6 +351,7 @@ function mapYomitanParseResultsToMergedTokens(
|
||||
startPos: start,
|
||||
endPos: end,
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
pos1: "",
|
||||
isMerged: true,
|
||||
isNPlusOneTarget: false,
|
||||
isKnown: (() => {
|
||||
@@ -221,6 +368,94 @@ function mapYomitanParseResultsToMergedTokens(
|
||||
return tokens.length > 0 ? tokens : null;
|
||||
}
|
||||
|
||||
/**
 * Finds the POS1 of the MeCab token that best covers `token`'s character span.
 *
 * Candidates without a `pos1` or without any positional overlap are ignored.
 * Among the rest, the winner is chosen by, in order: largest overlap with
 * `token`, then longest own span, then earliest start. A missing `startPos`
 * is treated as 0 and a missing `endPos` as start + surface length.
 *
 * @param token Token whose POS1 is wanted.
 * @param mecabTokens MeCab tokens for the same text.
 * @returns The winning `pos1`, or `undefined` when nothing overlaps.
 */
function pickClosestMecabPos1(
  token: MergedToken,
  mecabTokens: MergedToken[],
): string | undefined {
  if (mecabTokens.length === 0) {
    return undefined;
  }

  const tokenStart = token.startPos ?? 0;
  const tokenEnd = token.endPos ?? tokenStart + token.surface.length;

  let found = false;
  let winnerPos1: string | undefined;
  let winnerOverlap = 0;
  let winnerSpan = 0;
  let winnerStart = Number.MAX_SAFE_INTEGER;

  for (const candidate of mecabTokens) {
    if (!candidate.pos1) {
      continue;
    }

    const start = candidate.startPos ?? 0;
    const end = candidate.endPos ?? start + candidate.surface.length;
    const overlap = Math.min(tokenEnd, end) - Math.max(tokenStart, start);
    // Written as a negated ">" so that a NaN overlap is rejected as well.
    if (!(overlap > 0)) {
      continue;
    }

    const span = end - start;
    const tiesOnOverlap = overlap === winnerOverlap;
    const beatsWinner =
      !found ||
      overlap > winnerOverlap ||
      (tiesOnOverlap && span > winnerSpan) ||
      (tiesOnOverlap && span === winnerSpan && start < winnerStart);

    if (beatsWinner) {
      found = true;
      winnerPos1 = candidate.pos1;
      winnerOverlap = overlap;
      winnerSpan = span;
      winnerStart = start;
    }
  }

  return found ? winnerPos1 : undefined;
}
|
||||
|
||||
/**
 * Fills in missing `pos1` values on Yomitan tokens using MeCab output.
 *
 * `text` is tokenized through `deps.tokenizeWithMecab`. Each token that has
 * no `pos1` receives the one selected by `pickClosestMecabPos1`. Tokens that
 * already have a `pos1`, or for which nothing is selected, are returned
 * as-is. If MeCab throws, a warning is written to the console and the input
 * tokens are returned unchanged; the same happens when MeCab yields nothing.
 *
 * @param tokens Tokens produced from the Yomitan parse.
 * @param deps Tokenizer dependencies; only `tokenizeWithMecab` is used here.
 * @param text The text the tokens were produced from.
 * @returns The original array when nothing could be enriched, otherwise a new
 *   array.
 */
async function enrichYomitanPos1(
  tokens: MergedToken[],
  deps: TokenizerServiceDeps,
  text: string,
): Promise<MergedToken[]> {
  if (!tokens || tokens.length === 0) {
    return tokens;
  }

  let mecabResult: MergedToken[] | null;
  try {
    mecabResult = await deps.tokenizeWithMecab(text);
  } catch (err) {
    console.warn(
      "Failed to enrich Yomitan tokens with MeCab POS:",
      (err as Error).message,
    );
    return tokens;
  }

  if (!mecabResult || mecabResult.length === 0) {
    return tokens;
  }

  // Bind to a const so the non-null type holds inside the callback below.
  const mecabTokens: MergedToken[] = mecabResult;

  return tokens.map((token) => {
    const pos1 = token.pos1 ? undefined : pickClosestMecabPos1(token, mecabTokens);
    return pos1 ? { ...token, pos1 } : token;
  });
}
|
||||
|
||||
async function ensureYomitanParserWindow(
|
||||
deps: TokenizerServiceDeps,
|
||||
): Promise<boolean> {
|
||||
@@ -356,11 +591,17 @@ async function parseWithYomitanInternalParser(
|
||||
script,
|
||||
true,
|
||||
);
|
||||
return mapYomitanParseResultsToMergedTokens(
|
||||
const yomitanTokens = mapYomitanParseResultsToMergedTokens(
|
||||
parseResults,
|
||||
deps.isKnownWord,
|
||||
deps.getKnownWordMatchMode(),
|
||||
deps.getJlptLevel,
|
||||
);
|
||||
if (!yomitanTokens || yomitanTokens.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return enrichYomitanPos1(yomitanTokens, deps, text);
|
||||
} catch (err) {
|
||||
console.error("Yomitan parser request failed:", (err as Error).message);
|
||||
return null;
|
||||
@@ -385,6 +626,7 @@ export async function tokenizeSubtitleService(
|
||||
.replace(/\n/g, " ")
|
||||
.replace(/\s+/g, " ")
|
||||
.trim();
|
||||
const jlptEnabled = deps.getJlptEnabled?.() !== false;
|
||||
|
||||
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps);
|
||||
if (yomitanTokens && yomitanTokens.length > 0) {
|
||||
@@ -393,7 +635,10 @@ export async function tokenizeSubtitleService(
|
||||
deps.isKnownWord,
|
||||
deps.getKnownWordMatchMode(),
|
||||
);
|
||||
return { text: displayText, tokens: markNPlusOneTargets(knownMarkedTokens) };
|
||||
const jlptMarkedTokens = jlptEnabled
|
||||
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
|
||||
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
|
||||
return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
|
||||
}
|
||||
|
||||
try {
|
||||
@@ -404,7 +649,10 @@ export async function tokenizeSubtitleService(
|
||||
deps.isKnownWord,
|
||||
deps.getKnownWordMatchMode(),
|
||||
);
|
||||
return { text: displayText, tokens: markNPlusOneTargets(knownMarkedTokens) };
|
||||
const jlptMarkedTokens = jlptEnabled
|
||||
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
|
||||
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
|
||||
return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
|
||||
}
|
||||
} catch (err) {
|
||||
console.error("Tokenization error:", (err as Error).message);
|
||||
|
||||
@@ -59,6 +59,7 @@ export async function loadYomitanExtensionService(
|
||||
deps: YomitanExtensionLoaderDeps,
|
||||
): Promise<Extension | null> {
|
||||
const searchPaths = [
|
||||
path.join(__dirname, "..", "..", "vendor", "yomitan"),
|
||||
path.join(__dirname, "..", "..", "..", "vendor", "yomitan"),
|
||||
path.join(process.resourcesPath, "yomitan"),
|
||||
"/usr/share/SubMiner/yomitan",
|
||||
|
||||
Reference in New Issue
Block a user