fix(subtitle): restore known and JLPT token annotations

2026-04-12 04:19:25 -07:00 · 2026-03-19 18:03:20 -07:00
parent 1b5f0c6999
commit 43a0d11446
16 changed files with 258 additions and 43 deletions
--- a/src/core/services/tokenizer/annotation-stage.test.ts
+++ b/src/core/services/tokenizer/annotation-stage.test.ts
@@ -55,6 +55,29 @@ test('annotateTokens known-word match mode uses headword vs surface', () => {
  assert.equal(surfaceResult[0]?.isKnown, false);
 });

+test('annotateTokens falls back to reading for known-word matches when headword lookup misses', () => {
+  const tokens = [
+    makeToken({
+      surface: '大体',
+      headword: '大体',
+      reading: 'だいたい', // the only text the isKnownWord stub below accepts
+      frequencyRank: 1895,
+    }),
+  ];
+
+  const result = annotateTokens(
+    tokens,
+    makeDeps({
+      isKnownWord: (text) => text === 'だいたい', // rejects the kanji surface/headword '大体'
+      getJlptLevel: (text) => (text === '大体' ? 'N4' : null), // level is returned only for the kanji form
+    }),
+  );
+
+  assert.equal(result[0]?.isKnown, true); // the stub matches nothing but the reading
+  assert.equal(result[0]?.jlptLevel, 'N4');
+  assert.equal(result[0]?.frequencyRank, 1895); // same rank the input token was built with
+});
+
 test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 exclusions', () => {
  const tokens = [
    makeToken({
--- a/src/core/services/tokenizer/annotation-stage.ts
+++ b/src/core/services/tokenizer/annotation-stage.ts
@@ -560,12 +560,7 @@ function isJlptEligibleToken(token: MergedToken): boolean {
    return false;
  }

-  const candidates = [
-    resolveJlptLookupText(token),
-    token.surface,
-    token.reading,
-    token.headword,
-  ].filter(
+  const candidates = [resolveJlptLookupText(token), token.surface, token.headword].filter(
    (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
  );

@@ -659,7 +654,16 @@ function computeTokenKnownStatus(
  knownWordMatchMode: NPlusOneMatchMode,
 ): boolean {
  const matchText = resolveKnownWordText(token.surface, token.headword, knownWordMatchMode);
-  return token.isKnown || (matchText ? isKnownWord(matchText) : false);
+  if (token.isKnown || (matchText ? isKnownWord(matchText) : false)) {
+    return true;
+  }
+
+  const normalizedReading = token.reading.trim();
+  if (!normalizedReading) {
+    return false;
+  }
+
+  return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
 }

 function filterTokenFrequencyRank(