feat: merge AniList character dictionaries by recent usage

2026-03-07 03:22:17 -08:00 · 2026-03-06 01:01:31 -08:00
parent e2b51c6306
commit 8c2c950564
17 changed files with 1386 additions and 517 deletions
--- a/src/core/services/tokenizer.ts
+++ b/src/core/services/tokenizer.ts
@@ -263,6 +263,7 @@ function isKanaChar(char: string): boolean {
  return (
    (code >= 0x3041 && code <= 0x3096) ||
    (code >= 0x309b && code <= 0x309f) ||
+    code === 0x30fc ||
    (code >= 0x30a0 && code <= 0x30fa) ||
    (code >= 0x30fd && code <= 0x30ff)
  );
--- a/src/core/services/tokenizer/annotation-stage.ts
+++ b/src/core/services/tokenizer/annotation-stage.ts
@@ -231,6 +231,7 @@ function isKanaChar(char: string): boolean {
  return (
    (code >= 0x3041 && code <= 0x3096) ||
    (code >= 0x309b && code <= 0x309f) ||
+    code === 0x30fc ||
    (code >= 0x30a0 && code <= 0x30fa) ||
    (code >= 0x30fd && code <= 0x30ff)
  );
--- a/src/core/services/tokenizer/parser-selection-stage.test.ts
+++ b/src/core/services/tokenizer/parser-selection-stage.test.ts
@@ -127,3 +127,88 @@ test('drops scanning parser tokens which have no dictionary headword', () => {
    ],
  );
 });
+
+test('prefers the longest dictionary headword across merged segments', () => { // one group of two segments that together spell a single word
+  const parseResults = [
+    makeParseItem('scanning-parser', [
+      [
+        { text: 'バニ', reading: 'ばに', headword: 'バニ' }, // headword identical to the segment text
+        { text: 'ール', reading: 'ーる', headword: 'バニール' }, // headword longer than the segment text
+      ],
+    ]),
+  ];
+
+  const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword');
+  assert.deepEqual(
+    tokens?.map((token) => ({ // compare only the three fields under test
+      surface: token.surface,
+      reading: token.reading,
+      headword: token.headword,
+    })),
+    [
+      {
+        surface: 'バニール', // both segment texts concatenated
+        reading: 'ばにーる', // both segment readings concatenated
+        headword: 'バニール', // the second segment's headword, which equals the combined surface
+      },
+    ],
+  );
+});
+
+test('keeps the first headword when later segments are standalone words', () => { // one group of two segments, each a word on its own
+  const parseResults = [
+    makeParseItem('scanning-parser', [
+      [
+        { text: '猫', reading: 'ねこ', headword: '猫' }, // headword identical to the segment text
+        { text: 'です', reading: 'です', headword: 'です' }, // headword identical to the segment text
+      ],
+    ]),
+  ];
+
+  const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword');
+  assert.deepEqual(
+    tokens?.map((token) => ({ // compare only the three fields under test
+      surface: token.surface,
+      reading: token.reading,
+      headword: token.headword,
+    })),
+    [
+      {
+        surface: '猫です', // both segment texts concatenated
+        reading: 'ねこです', // both segment readings concatenated
+        headword: '猫', // the first segment's headword is retained
+      },
+    ],
+  );
+});
+
+test('merges trailing katakana continuation without headword into previous token', () => { // three single-segment groups
+  const parseResults = [
+    makeParseItem('scanning-parser', [
+      [{ text: 'カズ', reading: 'かず', headword: 'カズマ' }], // headword is one character longer than the text
+      [{ text: 'マ', reading: 'ま' }], // no headword; supplies the character missing from the previous text
+      [{ text: '魔王軍', reading: 'まおうぐん', headword: '魔王軍' }], // ordinary entry whose headword equals its text
+    ]),
+  ];
+
+  const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword');
+  assert.deepEqual(
+    tokens?.map((token) => ({ // compare only the three fields under test
+      surface: token.surface,
+      reading: token.reading,
+      headword: token.headword,
+    })),
+    [
+      {
+        surface: 'カズマ', // 'カズ' extended with the headword-less 'マ'
+        reading: 'かずま', // readings of both groups concatenated
+        headword: 'カズマ',
+      },
+      {
+        surface: '魔王軍', // third group is emitted as its own token
+        reading: 'まおうぐん',
+        headword: '魔王軍',
+      },
+    ],
+  );
+});
--- a/src/core/services/tokenizer/parser-selection-stage.ts
+++ b/src/core/services/tokenizer/parser-selection-stage.ts
@@ -49,6 +49,7 @@ function isKanaChar(char: string): boolean {
  return (
    (code >= 0x3041 && code <= 0x3096) ||
    (code >= 0x309b && code <= 0x309f) ||
+    code === 0x30fc ||
    (code >= 0x30a0 && code <= 0x30fa) ||
    (code >= 0x30fd && code <= 0x30ff)
  );
@@ -111,6 +112,51 @@ function extractYomitanHeadword(segment: YomitanParseSegment): string {
  return '';
 }

+function selectMergedHeadword( // Chooses the single headword to report for one merged group of segments.
+  firstHeadword: string, // fallback headword; may be '' when none was found
+  expandedHeadwords: string[], // candidate headwords that take priority over firstHeadword when present
+  surface: string, // combined surface text the candidates are compared against
+): string {
+  if (expandedHeadwords.length > 0) {
+    const exactSurfaceMatch = expandedHeadwords.find((headword) => headword === surface); // a candidate equal to the whole surface wins outright
+    if (exactSurfaceMatch) {
+      return exactSurfaceMatch;
+    }
+
+    return expandedHeadwords.reduce((best, current) => { // otherwise keep the longest candidate
+      if (current.length !== best.length) {
+        return current.length > best.length ? current : best;
+      }
+      return best; // equal lengths: the earlier candidate is kept
+    });
+  }
+
+  if (!firstHeadword) {
+    return ''; // no headword available at all
+  }
+  return firstHeadword; // no expanded candidates: fall back to the first headword
+}
+
+function isKanaOnlyText(text: string): boolean { // True only for non-empty text in which every character passes isKanaChar.
+  return text.length > 0 && Array.from(text).every((char) => isKanaChar(char)); // Array.from iterates the string by code point
+}
+
+function shouldMergeKanaContinuation( // Decides whether a surface should be appended to the previous token as a continuation.
+  previousToken: MergedToken | undefined, // candidate token to extend; undefined yields false
+  continuationSurface: string, // surface text that would be appended
+): previousToken is MergedToken { // type guard: a true result also narrows previousToken to MergedToken
+  if (!previousToken || !continuationSurface || !isKanaOnlyText(continuationSurface)) {
+    return false; // requires a previous token and a non-empty, kana-only continuation
+  }
+
+  if (!previousToken.headword || previousToken.headword.length <= previousToken.surface.length) {
+    return false; // the previous headword must be longer than its surface for there to be anything to extend
+  }
+
+  const appendedSurface = previousToken.surface + continuationSurface;
+  return previousToken.headword.startsWith(appendedSurface); // the extended surface must still be a prefix of the headword
+}
+
 export function mapYomitanParseResultItemToMergedTokens(
  parseResult: YomitanParseResultItem,
  isKnownWord: (text: string) => boolean,
@@ -140,7 +186,8 @@ export function mapYomitanParseResultItemToMergedTokens(

    let combinedSurface = '';
    let combinedReading = '';
-    let combinedHeadword = '';
+    let firstHeadword = '';
+    const expandedHeadwords: string[] = [];

    for (const segment of line) {
      const segmentText = segment.text;
@@ -152,8 +199,14 @@ export function mapYomitanParseResultItemToMergedTokens(
      if (typeof segment.reading === 'string') {
        combinedReading += segment.reading;
      }
-      if (!combinedHeadword) {
-        combinedHeadword = extractYomitanHeadword(segment);
+      const segmentHeadword = extractYomitanHeadword(segment);
+      if (segmentHeadword) {
+        if (!firstHeadword) {
+          firstHeadword = segmentHeadword;
+        }
+        if (segmentHeadword.length > segmentText.length) {
+          expandedHeadwords.push(segmentHeadword);
+        }
      }
    }

@@ -164,7 +217,20 @@ export function mapYomitanParseResultItemToMergedTokens(
    const start = charOffset;
    const end = start + combinedSurface.length;
    charOffset = end;
+    const combinedHeadword = selectMergedHeadword(
+      firstHeadword,
+      expandedHeadwords,
+      combinedSurface,
+    );
    if (!combinedHeadword) {
+      const previousToken = tokens[tokens.length - 1];
+      if (shouldMergeKanaContinuation(previousToken, combinedSurface)) {
+        previousToken.surface += combinedSurface;
+        previousToken.reading += combinedReading;
+        previousToken.endPos = end;
+        continue;
+      }
+
      // No dictionary-backed headword for this merged unit; skip it entirely so
      // downstream keyboard/frequency/JLPT flows only operate on lookup-backed tokens.
      continue;