Mirror of https://github.com/ksyasuda/SubMiner.git, synced 2026-02-28 06:22:45 -08:00.
Fix tokenizer and JLPT issues
This commit is contained in:
@@ -15,6 +15,13 @@ const JLPT_BANK_FILES: { level: JlptLevel; filename: string }[] = [
|
||||
{ level: "N4", filename: "term_meta_bank_4.json" },
|
||||
{ level: "N5", filename: "term_meta_bank_5.json" },
|
||||
];
|
||||
const JLPT_LEVEL_PRECEDENCE: Record<JlptLevel, number> = {
|
||||
N1: 5,
|
||||
N2: 4,
|
||||
N3: 3,
|
||||
N4: 2,
|
||||
N5: 1,
|
||||
};
|
||||
|
||||
const NOOP_LOOKUP = (): null => null;
|
||||
|
||||
@@ -38,6 +45,14 @@ function addEntriesToMap(
|
||||
terms: Map<string, JlptLevel>,
|
||||
log: (message: string) => void,
|
||||
): void {
|
||||
const shouldUpdateLevel = (
|
||||
existingLevel: JlptLevel | undefined,
|
||||
incomingLevel: JlptLevel,
|
||||
): boolean =>
|
||||
existingLevel === undefined ||
|
||||
JLPT_LEVEL_PRECEDENCE[incomingLevel] >
|
||||
JLPT_LEVEL_PRECEDENCE[existingLevel];
|
||||
|
||||
if (!Array.isArray(rawEntries)) {
|
||||
return;
|
||||
}
|
||||
@@ -61,55 +76,14 @@ function addEntriesToMap(
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!terms.has(normalizedTerm)) {
|
||||
const existingLevel = terms.get(normalizedTerm);
|
||||
if (shouldUpdateLevel(existingLevel, level)) {
|
||||
terms.set(normalizedTerm, level);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (terms.get(normalizedTerm) !== "N1" && level === "N1") {
|
||||
terms.set(normalizedTerm, level);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (terms.get(normalizedTerm) !== "N1" && terms.get(normalizedTerm) !== "N2" && level === "N2") {
|
||||
terms.set(normalizedTerm, level);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (
|
||||
terms.get(normalizedTerm) !== "N1" &&
|
||||
terms.get(normalizedTerm) !== "N2" &&
|
||||
terms.get(normalizedTerm) !== "N3" &&
|
||||
level === "N3"
|
||||
) {
|
||||
terms.set(normalizedTerm, level);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (
|
||||
terms.get(normalizedTerm) !== "N1" &&
|
||||
terms.get(normalizedTerm) !== "N2" &&
|
||||
terms.get(normalizedTerm) !== "N3" &&
|
||||
terms.get(normalizedTerm) !== "N4" &&
|
||||
level === "N4"
|
||||
) {
|
||||
terms.set(normalizedTerm, level);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (
|
||||
terms.get(normalizedTerm) !== "N1" &&
|
||||
terms.get(normalizedTerm) !== "N2" &&
|
||||
terms.get(normalizedTerm) !== "N3" &&
|
||||
terms.get(normalizedTerm) !== "N4" &&
|
||||
terms.get(normalizedTerm) !== "N5" &&
|
||||
level === "N5"
|
||||
) {
|
||||
terms.set(normalizedTerm, level);
|
||||
}
|
||||
|
||||
log(
|
||||
`JLPT dictionary already has ${normalizedTerm} as ${terms.get(normalizedTerm)}; keeping that level instead of ${level}`,
|
||||
`JLPT dictionary already has ${normalizedTerm} as ${existingLevel}; keeping that level instead of ${level}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,8 +16,8 @@ interface YomitanParseHeadword {
|
||||
}
|
||||
|
||||
interface YomitanParseSegment {
|
||||
text?: unknown;
|
||||
reading?: unknown;
|
||||
text?: string;
|
||||
reading?: string;
|
||||
headwords?: unknown;
|
||||
}
|
||||
|
||||
@@ -27,6 +27,20 @@ interface YomitanParseResultItem {
|
||||
content?: unknown;
|
||||
}
|
||||
|
||||
type YomitanParseLine = YomitanParseSegment[];
|
||||
|
||||
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
|
||||
const KATAKANA_CODEPOINT_START = 0x30a1;
|
||||
const KATAKANA_CODEPOINT_END = 0x30f6;
|
||||
|
||||
function isObject(value: unknown): value is Record<string, unknown> {
|
||||
return Boolean(value && typeof value === "object");
|
||||
}
|
||||
|
||||
function isString(value: unknown): value is string {
|
||||
return typeof value === "string";
|
||||
}
|
||||
|
||||
export interface TokenizerServiceDeps {
|
||||
getYomitanExt: () => Extension | null;
|
||||
getYomitanParserWindow: () => BrowserWindow | null;
|
||||
@@ -144,8 +158,8 @@ function normalizeJlptTextForExclusion(text: string): string {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (code >= 0x30a1 && code <= 0x30f6) {
|
||||
normalized += String.fromCodePoint(code - 0x60);
|
||||
if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) {
|
||||
normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -238,6 +252,67 @@ function isJlptEligibleToken(token: MergedToken): boolean {
|
||||
return true;
|
||||
}
|
||||
|
||||
function isYomitanParseResultItem(
|
||||
value: unknown,
|
||||
): value is YomitanParseResultItem {
|
||||
if (!isObject(value)) {
|
||||
return false;
|
||||
}
|
||||
if ((value as YomitanParseResultItem).source !== "scanning-parser") {
|
||||
return false;
|
||||
}
|
||||
if (!Array.isArray((value as YomitanParseResultItem).content)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
function isYomitanParseLine(value: unknown): value is YomitanParseLine {
|
||||
if (!Array.isArray(value)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return value.every((segment) => {
|
||||
if (!isObject(segment)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const candidate = segment as YomitanParseSegment;
|
||||
return isString(candidate.text);
|
||||
});
|
||||
}
|
||||
|
||||
function isYomitanHeadwordRows(value: unknown): value is YomitanParseHeadword[][] {
|
||||
return (
|
||||
Array.isArray(value) &&
|
||||
value.every(
|
||||
(group) =>
|
||||
Array.isArray(group) &&
|
||||
group.every((item) =>
|
||||
isObject(item) && isString((item as YomitanParseHeadword).term),
|
||||
),
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
function extractYomitanHeadword(segment: YomitanParseSegment): string {
|
||||
const headwords = segment.headwords;
|
||||
if (!isYomitanHeadwordRows(headwords)) {
|
||||
return "";
|
||||
}
|
||||
|
||||
for (const group of headwords) {
|
||||
if (group.length > 0) {
|
||||
const firstHeadword = group[0] as YomitanParseHeadword;
|
||||
if (isString(firstHeadword?.term)) {
|
||||
return firstHeadword.term;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
function applyJlptMarking(
|
||||
tokens: MergedToken[],
|
||||
getJlptLevel: (text: string) => JlptLevel | null,
|
||||
@@ -250,46 +325,25 @@ function applyJlptMarking(
|
||||
const primaryLevel = getJlptLevel(resolveJlptLookupText(token));
|
||||
const fallbackLevel = getJlptLevel(token.surface);
|
||||
|
||||
return {
|
||||
...token,
|
||||
jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
|
||||
};
|
||||
return {
|
||||
...token,
|
||||
jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
function extractYomitanHeadword(segment: YomitanParseSegment): string {
|
||||
const headwords = segment.headwords;
|
||||
if (!Array.isArray(headwords) || headwords.length === 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
const firstGroup = headwords[0];
|
||||
if (!Array.isArray(firstGroup) || firstGroup.length === 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
const firstHeadword = firstGroup[0] as YomitanParseHeadword;
|
||||
return typeof firstHeadword?.term === "string" ? firstHeadword.term : "";
|
||||
}
|
||||
|
||||
function mapYomitanParseResultsToMergedTokens(
|
||||
parseResults: unknown,
|
||||
isKnownWord: (text: string) => boolean,
|
||||
knownWordMatchMode: NPlusOneMatchMode,
|
||||
getJlptLevel: (text: string) => JlptLevel | null,
|
||||
): MergedToken[] | null {
|
||||
if (!Array.isArray(parseResults) || parseResults.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const scanningItems = parseResults.filter((item) => {
|
||||
const resultItem = item as YomitanParseResultItem;
|
||||
return (
|
||||
resultItem &&
|
||||
resultItem.source === "scanning-parser" &&
|
||||
Array.isArray(resultItem.content)
|
||||
);
|
||||
}) as YomitanParseResultItem[];
|
||||
const scanningItems = parseResults.filter(
|
||||
(item): item is YomitanParseResultItem => isYomitanParseResultItem(item),
|
||||
);
|
||||
|
||||
if (scanningItems.length === 0) {
|
||||
return null;
|
||||
@@ -304,24 +358,21 @@ function mapYomitanParseResultsToMergedTokens(
|
||||
|
||||
const tokens: MergedToken[] = [];
|
||||
let charOffset = 0;
|
||||
let validLineCount = 0;
|
||||
|
||||
for (const line of content) {
|
||||
if (!Array.isArray(line)) {
|
||||
if (!isYomitanParseLine(line)) {
|
||||
continue;
|
||||
}
|
||||
validLineCount += 1;
|
||||
|
||||
let surface = "";
|
||||
let reading = "";
|
||||
let headword = "";
|
||||
|
||||
for (const rawSegment of line) {
|
||||
const segment = rawSegment as YomitanParseSegment;
|
||||
if (!segment || typeof segment !== "object") {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (const segment of line) {
|
||||
const segmentText = segment.text;
|
||||
if (typeof segmentText !== "string" || segmentText.length === 0) {
|
||||
if (!segmentText || segmentText.length === 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -365,6 +416,9 @@ function mapYomitanParseResultsToMergedTokens(
|
||||
});
|
||||
}
|
||||
|
||||
if (validLineCount === 0) {
|
||||
return null;
|
||||
}
|
||||
return tokens.length > 0 ? tokens : null;
|
||||
}
|
||||
|
||||
@@ -428,14 +482,22 @@ async function enrichYomitanPos1(
|
||||
try {
|
||||
mecabTokens = await deps.tokenizeWithMecab(text);
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
console.warn(
|
||||
"Failed to enrich Yomitan tokens with MeCab POS:",
|
||||
(err as Error).message,
|
||||
error.message,
|
||||
`tokenCount=${tokens.length}`,
|
||||
`textLength=${text.length}`,
|
||||
);
|
||||
return tokens;
|
||||
}
|
||||
|
||||
if (!mecabTokens || mecabTokens.length === 0) {
|
||||
console.warn(
|
||||
"MeCab enrichment returned no tokens; preserving Yomitan token output.",
|
||||
`tokenCount=${tokens.length}`,
|
||||
`textLength=${text.length}`,
|
||||
);
|
||||
return tokens;
|
||||
}
|
||||
|
||||
@@ -591,11 +653,10 @@ async function parseWithYomitanInternalParser(
|
||||
script,
|
||||
true,
|
||||
);
|
||||
const yomitanTokens = mapYomitanParseResultsToMergedTokens(
|
||||
const yomitanTokens = mapYomitanParseResultsToMergedTokens(
|
||||
parseResults,
|
||||
deps.isKnownWord,
|
||||
deps.getKnownWordMatchMode(),
|
||||
deps.getJlptLevel,
|
||||
);
|
||||
if (!yomitanTokens || yomitanTokens.length === 0) {
|
||||
return null;
|
||||
|
||||
Reference in New Issue
Block a user