Fix tokenizer and JLPT issues

This commit is contained in:
2026-02-15 17:06:27 -08:00
parent f492622a8b
commit 2a2eee825c
4 changed files with 150 additions and 184 deletions

View File

@@ -15,6 +15,13 @@ const JLPT_BANK_FILES: { level: JlptLevel; filename: string }[] = [
{ level: "N4", filename: "term_meta_bank_4.json" },
{ level: "N5", filename: "term_meta_bank_5.json" },
];
// Higher number = higher precedence. When a term appears in multiple JLPT
// banks, the hardest level wins (N1 outranks N5); see shouldUpdateLevel.
const JLPT_LEVEL_PRECEDENCE: Record<JlptLevel, number> = {
N1: 5,
N2: 4,
N3: 3,
N4: 2,
N5: 1,
};
// Fallback lookup used when no JLPT dictionary data is available.
const NOOP_LOOKUP = (): null => null;
@@ -38,6 +45,14 @@ function addEntriesToMap(
terms: Map<string, JlptLevel>,
log: (message: string) => void,
): void {
const shouldUpdateLevel = (
  existingLevel: JlptLevel | undefined,
  incomingLevel: JlptLevel,
): boolean => {
  // First sighting of a term always wins; afterwards the stored level is
  // only replaced by a strictly harder one (per JLPT_LEVEL_PRECEDENCE).
  if (existingLevel === undefined) {
    return true;
  }
  return (
    JLPT_LEVEL_PRECEDENCE[incomingLevel] > JLPT_LEVEL_PRECEDENCE[existingLevel]
  );
};
if (!Array.isArray(rawEntries)) {
return;
}
@@ -61,55 +76,14 @@ function addEntriesToMap(
continue;
}
if (!terms.has(normalizedTerm)) {
const existingLevel = terms.get(normalizedTerm);
if (shouldUpdateLevel(existingLevel, level)) {
terms.set(normalizedTerm, level);
continue;
}
if (terms.get(normalizedTerm) !== "N1" && level === "N1") {
terms.set(normalizedTerm, level);
continue;
}
if (terms.get(normalizedTerm) !== "N1" && terms.get(normalizedTerm) !== "N2" && level === "N2") {
terms.set(normalizedTerm, level);
continue;
}
if (
terms.get(normalizedTerm) !== "N1" &&
terms.get(normalizedTerm) !== "N2" &&
terms.get(normalizedTerm) !== "N3" &&
level === "N3"
) {
terms.set(normalizedTerm, level);
continue;
}
if (
terms.get(normalizedTerm) !== "N1" &&
terms.get(normalizedTerm) !== "N2" &&
terms.get(normalizedTerm) !== "N3" &&
terms.get(normalizedTerm) !== "N4" &&
level === "N4"
) {
terms.set(normalizedTerm, level);
continue;
}
if (
terms.get(normalizedTerm) !== "N1" &&
terms.get(normalizedTerm) !== "N2" &&
terms.get(normalizedTerm) !== "N3" &&
terms.get(normalizedTerm) !== "N4" &&
terms.get(normalizedTerm) !== "N5" &&
level === "N5"
) {
terms.set(normalizedTerm, level);
}
log(
`JLPT dictionary already has ${normalizedTerm} as ${terms.get(normalizedTerm)}; keeping that level instead of ${level}`,
`JLPT dictionary already has ${normalizedTerm} as ${existingLevel}; keeping that level instead of ${level}`,
);
}
}

View File

@@ -16,8 +16,8 @@ interface YomitanParseHeadword {
}
interface YomitanParseSegment {
text?: unknown;
reading?: unknown;
text?: string;
reading?: string;
headwords?: unknown;
}
@@ -27,6 +27,20 @@ interface YomitanParseResultItem {
content?: unknown;
}
type YomitanParseLine = YomitanParseSegment[];
// Katakana letters (U+30A1..U+30F6) map onto their hiragana counterparts by
// subtracting a fixed code-point offset of 0x60.
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
function isObject(value: unknown): value is Record<string, unknown> {
  // `typeof null` is "object", so null must be excluded explicitly.
  return typeof value === "object" && value !== null;
}
// Type guard: narrows unknown input to string for downstream field checks.
function isString(value: unknown): value is string {
return typeof value === "string";
}
export interface TokenizerServiceDeps {
getYomitanExt: () => Extension | null;
getYomitanParserWindow: () => BrowserWindow | null;
@@ -144,8 +158,8 @@ function normalizeJlptTextForExclusion(text: string): string {
continue;
}
if (code >= 0x30a1 && code <= 0x30f6) {
normalized += String.fromCodePoint(code - 0x60);
if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) {
normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET);
continue;
}
@@ -238,6 +252,67 @@ function isJlptEligibleToken(token: MergedToken): boolean {
return true;
}
function isYomitanParseResultItem(
  value: unknown,
): value is YomitanParseResultItem {
  // Only items produced by Yomitan's scanning parser, carrying an array
  // payload of content lines, are usable downstream.
  if (!isObject(value)) {
    return false;
  }
  const candidate = value as YomitanParseResultItem;
  return (
    candidate.source === "scanning-parser" && Array.isArray(candidate.content)
  );
}
function isYomitanParseLine(value: unknown): value is YomitanParseLine {
  // A parse line is an array of segment objects, each with string `text`.
  if (!Array.isArray(value)) {
    return false;
  }
  for (const entry of value) {
    if (!isObject(entry)) {
      return false;
    }
    if (!isString((entry as YomitanParseSegment).text)) {
      return false;
    }
  }
  return true;
}
function isYomitanHeadwordRows(value: unknown): value is YomitanParseHeadword[][] {
  // Headwords arrive as groups (arrays) of objects whose `term` is a string.
  if (!Array.isArray(value)) {
    return false;
  }
  for (const group of value) {
    if (!Array.isArray(group)) {
      return false;
    }
    for (const entry of group) {
      if (!isObject(entry) || !isString((entry as YomitanParseHeadword).term)) {
        return false;
      }
    }
  }
  return true;
}
function extractYomitanHeadword(segment: YomitanParseSegment): string {
  // Walk headword groups in order and return the leading term of the first
  // group that yields one; empty string when nothing usable is attached.
  const rows = segment.headwords;
  if (!isYomitanHeadwordRows(rows)) {
    return "";
  }
  for (const group of rows) {
    if (group.length === 0) {
      continue;
    }
    const leading = group[0];
    if (leading !== undefined && isString(leading.term)) {
      return leading.term;
    }
  }
  return "";
}
function applyJlptMarking(
tokens: MergedToken[],
getJlptLevel: (text: string) => JlptLevel | null,
@@ -250,46 +325,25 @@ function applyJlptMarking(
const primaryLevel = getJlptLevel(resolveJlptLookupText(token));
const fallbackLevel = getJlptLevel(token.surface);
return {
...token,
jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
};
return {
...token,
jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
};
});
}
// NOTE(review): legacy (removed-side) variant left by the diff render — it is
// superseded by the type-guarded implementation above, which scans every
// headword group instead of only the first. Kept byte-identical here.
function extractYomitanHeadword(segment: YomitanParseSegment): string {
const headwords = segment.headwords;
if (!Array.isArray(headwords) || headwords.length === 0) {
return "";
}
// Only the first group is considered; later groups are ignored.
const firstGroup = headwords[0];
if (!Array.isArray(firstGroup) || firstGroup.length === 0) {
return "";
}
const firstHeadword = firstGroup[0] as YomitanParseHeadword;
return typeof firstHeadword?.term === "string" ? firstHeadword.term : "";
}
function mapYomitanParseResultsToMergedTokens(
parseResults: unknown,
isKnownWord: (text: string) => boolean,
knownWordMatchMode: NPlusOneMatchMode,
getJlptLevel: (text: string) => JlptLevel | null,
): MergedToken[] | null {
if (!Array.isArray(parseResults) || parseResults.length === 0) {
return null;
}
const scanningItems = parseResults.filter((item) => {
const resultItem = item as YomitanParseResultItem;
return (
resultItem &&
resultItem.source === "scanning-parser" &&
Array.isArray(resultItem.content)
);
}) as YomitanParseResultItem[];
const scanningItems = parseResults.filter(
(item): item is YomitanParseResultItem => isYomitanParseResultItem(item),
);
if (scanningItems.length === 0) {
return null;
@@ -304,24 +358,21 @@ function mapYomitanParseResultsToMergedTokens(
const tokens: MergedToken[] = [];
let charOffset = 0;
let validLineCount = 0;
for (const line of content) {
if (!Array.isArray(line)) {
if (!isYomitanParseLine(line)) {
continue;
}
validLineCount += 1;
let surface = "";
let reading = "";
let headword = "";
for (const rawSegment of line) {
const segment = rawSegment as YomitanParseSegment;
if (!segment || typeof segment !== "object") {
continue;
}
for (const segment of line) {
const segmentText = segment.text;
if (typeof segmentText !== "string" || segmentText.length === 0) {
if (!segmentText || segmentText.length === 0) {
continue;
}
@@ -365,6 +416,9 @@ function mapYomitanParseResultsToMergedTokens(
});
}
if (validLineCount === 0) {
return null;
}
return tokens.length > 0 ? tokens : null;
}
@@ -428,14 +482,22 @@ async function enrichYomitanPos1(
try {
mecabTokens = await deps.tokenizeWithMecab(text);
} catch (err) {
const error = err as Error;
console.warn(
"Failed to enrich Yomitan tokens with MeCab POS:",
(err as Error).message,
error.message,
`tokenCount=${tokens.length}`,
`textLength=${text.length}`,
);
return tokens;
}
if (!mecabTokens || mecabTokens.length === 0) {
console.warn(
"MeCab enrichment returned no tokens; preserving Yomitan token output.",
`tokenCount=${tokens.length}`,
`textLength=${text.length}`,
);
return tokens;
}
@@ -591,11 +653,10 @@ async function parseWithYomitanInternalParser(
script,
true,
);
const yomitanTokens = mapYomitanParseResultsToMergedTokens(
const yomitanTokens = mapYomitanParseResultsToMergedTokens(
parseResults,
deps.isKnownWord,
deps.getKnownWordMatchMode(),
deps.getJlptLevel,
);
if (!yomitanTokens || yomitanTokens.length === 0) {
return null;

View File

@@ -469,106 +469,37 @@ function loadSubtitlePosition(): SubtitlePosition | null {
function getJlptDictionarySearchPaths(): string[] {
const homeDir = os.homedir();
const userDataPath = app.getPath("userData");
return [
const dictionaryRoots = [
// Source checkout paths (development + source tree)
path.join(__dirname, "..", "..", "vendor", "yomitan-jlpt-vocab"),
path.join(
__dirname,
"..",
"..",
"vendor",
"yomitan-jlpt-vocab",
"yomitan-jlpt-vocab",
),
path.join(__dirname, "..", "..", "..", "vendor", "yomitan-jlpt-vocab"),
path.join(
__dirname,
"..",
"..",
"..",
"vendor",
"yomitan-jlpt-vocab",
"yomitan-jlpt-vocab",
),
path.join(process.resourcesPath, "yomitan-jlpt-vocab"),
path.join(
process.resourcesPath,
"yomitan-jlpt-vocab",
"yomitan-jlpt-vocab",
),
path.join(app.getAppPath(), "vendor", "yomitan-jlpt-vocab"),
path.join(
app.getAppPath(),
"vendor",
"yomitan-jlpt-vocab",
"yomitan-jlpt-vocab",
),
// Runtime package bundle paths
path.join(process.resourcesPath, "yomitan-jlpt-vocab"),
path.join(process.resourcesPath, "app.asar", "vendor", "yomitan-jlpt-vocab"),
path.join(
process.resourcesPath,
"app.asar",
"vendor",
"yomitan-jlpt-vocab",
"yomitan-jlpt-vocab",
),
path.join(USER_DATA_PATH, "yomitan-jlpt-vocab"),
path.join(USER_DATA_PATH, "yomitan-jlpt-vocab", "yomitan-jlpt-vocab"),
path.join(userDataPath, "yomitan-jlpt-vocab"),
path.join(userDataPath, "yomitan-jlpt-vocab", "yomitan-jlpt-vocab"),
path.join(homeDir, ".config", "SubMiner", "yomitan-jlpt-vocab"),
path.join(
homeDir,
".config",
"SubMiner",
"yomitan-jlpt-vocab",
"yomitan-jlpt-vocab",
),
path.join(homeDir, ".config", "subminer", "yomitan-jlpt-vocab"),
path.join(
homeDir,
".config",
"subminer",
"yomitan-jlpt-vocab",
"yomitan-jlpt-vocab",
),
path.join(
homeDir,
"Library",
"Application Support",
"SubMiner",
"yomitan-jlpt-vocab",
),
path.join(
homeDir,
"Library",
"Application Support",
"SubMiner",
"yomitan-jlpt-vocab",
"yomitan-jlpt-vocab",
),
path.join(
homeDir,
"Library",
"Application Support",
"subminer",
"yomitan-jlpt-vocab",
),
path.join(
homeDir,
"Library",
"Application Support",
"subminer",
"yomitan-jlpt-vocab",
"yomitan-jlpt-vocab",
),
path.join(process.cwd(), "vendor", "yomitan-jlpt-vocab"),
path.join(
process.cwd(),
"vendor",
"yomitan-jlpt-vocab",
"yomitan-jlpt-vocab",
),
// User-configurable override locations
USER_DATA_PATH,
app.getPath("userData"),
path.join(homeDir, ".config", "SubMiner"),
path.join(homeDir, ".config", "subminer"),
path.join(homeDir, "Library", "Application Support", "SubMiner"),
path.join(homeDir, "Library", "Application Support", "subminer"),
// CLI invocation path (when launched from project root)
process.cwd(),
];
const searchPaths: string[] = [];
for (const dictionaryRoot of dictionaryRoots) {
searchPaths.push(dictionaryRoot);
searchPaths.push(path.join(dictionaryRoot, "vendor", "yomitan-jlpt-vocab"));
searchPaths.push(path.join(dictionaryRoot, "yomitan-jlpt-vocab"));
}
const uniquePaths = new Set<string>();
for (const searchPath of searchPaths) {
uniquePaths.add(searchPath);
}
return [...uniquePaths];
}
async function initializeJlptDictionaryLookup(): Promise<void> {

View File

@@ -65,7 +65,7 @@ test("JLPT CSS rules use underline-only styling in renderer stylesheet", () => {
assert.ok(block.length > 0, `word-jlpt-n${level} class should exist`);
assert.match(block, /text-decoration-line:\s*underline;/);
assert.match(block, /text-decoration-thickness:\s*2px;/);
assert.match(block, /text-underline-offset:\s*2px;/);
assert.match(block, /text-underline-offset:\s*4px;/);
assert.match(block, /color:\s*inherit;/);
}
});