fix(tokenizer): preserve known-word highlight when POS filters suppress other annotations

- Known-word cache matches now set isKnown=true even for tokens excluded by POS filters
- POS exclusion gate suppresses N+1, frequency, and JLPT only; known status is computed before the gate
- Jellyfin subtitle preload continues after cleanup failures instead of aborting
- Update config docs and option description to document the known-word bypass behavior
2026-05-26 12:55:16 -07:00 · 2026-05-22 02:27:40 -07:00
parent 3de7ed8b54
commit 9ba7f909b5
9 changed files with 111 additions and 52 deletions
@@ -129,7 +129,7 @@ test('tokenizeSubtitle splits same-line grammar endings before applying annotati
  assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
  assert.equal(result.tokens?.[0]?.frequencyRank, 40);
  assert.equal(result.tokens?.[1]?.surface, 'です');
-  assert.equal(result.tokens?.[1]?.isKnown, false);
+  assert.equal(result.tokens?.[1]?.isKnown, true);
  assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
  assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
  assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
@@ -3365,7 +3365,7 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
 });

-test('tokenizeSubtitle clears known-word highlight for exact non-independent kanji noun tokens', async () => {
+test('tokenizeSubtitle keeps known-word highlight for exact non-independent kanji noun tokens', async () => {
  const result = await tokenizeSubtitle(
    'その点',
    makeDepsFromYomitanTokens(
@@ -3413,7 +3413,7 @@ test('tokenizeSubtitle clears known-word highlight for exact non-independent kan
  assert.equal(result.tokens?.length, 2);
  assert.equal(result.tokens?.[0]?.isKnown, false);
  assert.equal(result.tokens?.[1]?.surface, '点');
-  assert.equal(result.tokens?.[1]?.isKnown, false);
+  assert.equal(result.tokens?.[1]?.isKnown, true);
  assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
  assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
  assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
@@ -4028,7 +4028,7 @@ test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper
      {
        surface: 'これで',
        headword: 'これ',
-        isKnown: false,
+        isKnown: true,
        isNPlusOneTarget: false,
        frequencyRank: undefined,
        jlptLevel: undefined,
@@ -4143,7 +4143,7 @@ test('tokenizeSubtitle clears all annotations for explanatory pondering endings'
      {
        surface: 'のかな',
        headword: 'の',
-        isKnown: false,
+        isKnown: true,
        isNPlusOneTarget: false,
        frequencyRank: undefined,
        jlptLevel: undefined,
@@ -4672,7 +4672,7 @@ test('tokenizeSubtitle clears annotations for ja-nai explanatory endings and aru
    {
      surface: 'ある',
      headword: 'ある',
-      isKnown: false,
+      isKnown: true,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
@@ -4717,7 +4717,7 @@ test('tokenizeSubtitle clears annotations for standalone polite copula endings w
    {
      surface: 'ですよ',
      headword: 'です',
-      isKnown: false,
+      isKnown: true,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
@@ -5044,7 +5044,7 @@ test('tokenizeSubtitle clears annotations for auxiliary inflection fragments whi
    {
      surface: 'れた',
      headword: 'れる',
-      isKnown: false,
+      isKnown: true,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
@@ -5181,7 +5181,7 @@ test('tokenizeSubtitle clears annotations for te-kureru auxiliary helper spans',
    {
      surface: 'てく',
      headword: 'てく',
-      isKnown: false,
+      isKnown: true,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
@@ -5192,7 +5192,7 @@ test('tokenizeSubtitle clears annotations for te-kureru auxiliary helper spans',
    {
      surface: 'れた',
      headword: 'れる',
-      isKnown: false,
+      isKnown: true,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
@@ -425,6 +425,21 @@ test('shouldExcludeTokenFromSubtitleAnnotations keeps lexical tokens outside exp
  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), false);
 });

+test('shouldExcludeTokenFromSubtitleAnnotations still excludes lexical non-independent kanji nouns from non-known annotations', () => {
+  const token = makeToken({
+    surface: '以外',
+    headword: '以外',
+    reading: 'イガイ',
+    partOfSpeech: PartOfSpeech.noun,
+    pos1: '名詞',
+    pos2: '非自立',
+    pos3: '副詞可能',
+  });
+
+  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
+  assert.equal(shouldExcludeTokenFromVocabularyPersistence(token), true);
+});
+
 test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone particles auxiliaries and adnominals', () => {
  const tokens = [
    makeToken({
@@ -971,8 +986,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
  );

  assert.equal(result[0]?.isKnown, false);
-  assert.equal(result[1]?.isKnown, false);
-  assert.equal(result[2]?.isKnown, false);
+  assert.equal(result[1]?.isKnown, true);
+  assert.equal(result[2]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
 });

@@ -1186,7 +1201,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
  assert.equal(result[0]?.isNPlusOneTarget, false);
 });

-test('annotateTokens clears known-word status for non-independent kanji noun tokens', () => {
+test('annotateTokens keeps known-word status for non-independent kanji noun tokens', () => {
  const tokens = [
    makeToken({
      surface: '点',
@@ -1211,12 +1226,41 @@ test('annotateTokens clears known-word status for non-independent kanji noun tok
    { minSentenceWordsForNPlusOne: 1 },
  );

-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });

+test('annotateTokens keeps known-word status for lexical non-independent kanji nouns', () => {
+  const tokens = [
+    makeToken({
+      surface: '以外',
+      reading: 'イガイ',
+      headword: '以外',
+      partOfSpeech: PartOfSpeech.noun,
+      pos1: '名詞',
+      pos2: '非自立',
+      pos3: '副詞可能',
+      startPos: 2,
+      endPos: 4,
+      frequencyRank: 437,
+    }),
+  ];
+
+  const result = annotateTokens(
+    tokens,
+    makeDeps({
+      isKnownWord: (text) => text === '以外',
+    }),
+    { minSentenceWordsForNPlusOne: 1 },
+  );
+
+  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.frequencyRank, undefined);
+  assert.equal(result[0]?.isNPlusOneTarget, false);
+});
+
 test('annotateTokens clears all annotations for non-independent kanji noun tokens under unified gate', () => {
  const tokens = [
    makeToken({
@@ -1401,7 +1445,7 @@ test('annotateTokens excludes composite tokens when all component pos tags are e
  assert.equal(result[0]?.isNPlusOneTarget, false);
 });

-test('annotateTokens applies one shared exclusion gate across known N+1 frequency and JLPT', () => {
+test('annotateTokens lets known words bypass the shared exclusion gate for known status only', () => {
  const tokens = [
    makeToken({
      surface: 'これで',
@@ -1425,13 +1469,13 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
    { minSentenceWordsForNPlusOne: 1 },
  );

-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens clears known status and other annotations for kana-only non-independent noun helper merges', () => {
+test('annotateTokens keeps known status while clearing other annotations for kana-only non-independent noun helper merges', () => {
  const tokens = [
    makeToken({
      surface: 'ことに',
@@ -1455,13 +1499,13 @@ test('annotateTokens clears known status and other annotations for kana-only non
    { minSentenceWordsForNPlusOne: 1 },
  );

-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens clears known status and other annotations for standalone auxiliary inflection fragments', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone auxiliary inflection fragments', () => {
  const tokens = [
    makeToken({
      surface: 'れる',
@@ -1497,14 +1541,14 @@ test('annotateTokens clears known status and other annotations for standalone au
  );

  for (const token of result) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
    assert.equal(token.isNPlusOneTarget, false, token.surface);
    assert.equal(token.frequencyRank, undefined, token.surface);
    assert.equal(token.jlptLevel, undefined, token.surface);
  }
 });

-test('annotateTokens clears known status and other annotations for auxiliary-only te-kureru helper spans', () => {
+test('annotateTokens keeps known status while clearing other annotations for auxiliary-only te-kureru helper spans', () => {
  const tokens = [
    makeToken({
      surface: 'てく',
@@ -1540,7 +1584,7 @@ test('annotateTokens clears known status and other annotations for auxiliary-onl
  );

  for (const token of result) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
    assert.equal(token.isNPlusOneTarget, false, token.surface);
    assert.equal(token.frequencyRank, undefined, token.surface);
    assert.equal(token.jlptLevel, undefined, token.surface);
@@ -1576,7 +1620,7 @@ test('annotateTokens keeps lexical くれる forms eligible for annotation', ()
  assert.equal(result[0]?.jlptLevel, 'N4');
 });

-test('annotateTokens clears known status and other annotations for standalone して helper fragments', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone して helper fragments', () => {
  const tokens = [
    makeToken({
      surface: 'してる',
@@ -1600,13 +1644,13 @@ test('annotateTokens clears known status and other annotations for standalone
    { minSentenceWordsForNPlusOne: 1 },
  );

-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens clears known status and other annotations for standalone particle fragments without POS tags', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone particle fragments without POS tags', () => {
  const tokens = [
    makeToken({
      surface: 'と',
@@ -1630,13 +1674,13 @@ test('annotateTokens clears known status and other annotations for standalone pa
    { minSentenceWordsForNPlusOne: 1 },
  );

-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens clears known status from standalone particles even when the known-word cache contains them', () => {
+test('annotateTokens keeps known status on standalone particles when the known-word cache contains them', () => {
  const tokens = [
    makeToken({
      surface: 'に',
@@ -1671,7 +1715,7 @@ test('annotateTokens clears known status from standalone particles even when the
    { minSentenceWordsForNPlusOne: 1 },
  );

-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
@@ -1728,7 +1772,7 @@ test('annotateTokens does not mark standalone connective particles as N+1', () =
  assert.equal(result[1]?.jlptLevel, undefined);
 });

-test('annotateTokens clears known status and other annotations for rhetorical もんか grammar particle phrases', () => {
+test('annotateTokens keeps known status while clearing other annotations for rhetorical もんか grammar particle phrases', () => {
  const tokens = [
    makeToken({
      surface: 'もんか',
@@ -1752,13 +1796,13 @@ test('annotateTokens clears known status and other annotations for rhetorical
    { minSentenceWordsForNPlusOne: 1 },
  );

-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens clears known status and other annotations for bare くれ auxiliary fragments', () => {
+test('annotateTokens keeps known status while clearing other annotations for bare くれ auxiliary fragments', () => {
  const tokens = [
    makeToken({
      surface: 'くれ',
@@ -1782,13 +1826,13 @@ test('annotateTokens clears known status and other annotations for bare くれ a
    { minSentenceWordsForNPlusOne: 1 },
  );

-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens clears known status and other annotations for aru existence verbs', () => {
+test('annotateTokens keeps known status while clearing other annotations for aru existence verbs', () => {
  const tokens = [
    makeToken({
      surface: '有る',
@@ -1818,14 +1862,14 @@ test('annotateTokens clears known status and other annotations for aru existence

  assert.equal(result[0]?.surface, '有る');
  assert.equal(result[0]?.headword, '有る');
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.isNameMatch, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens clears known status and other annotations for standalone quote particle and auxiliary grammar terms', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone quote particle and auxiliary grammar terms', () => {
  const tokens = [
    makeToken({
      surface: 'って',
@@ -1861,14 +1905,14 @@ test('annotateTokens clears known status and other annotations for standalone qu
  );

  for (const token of result) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
    assert.equal(token.isNPlusOneTarget, false, token.surface);
    assert.equal(token.frequencyRank, undefined, token.surface);
    assert.equal(token.jlptLevel, undefined, token.surface);
  }
 });

-test('annotateTokens clears known status and other annotations from standalone あ interjections without POS tags', () => {
+test('annotateTokens keeps known status while clearing other annotations from standalone あ interjections without POS tags', () => {
  const tokens = [
    makeToken({
      surface: 'あ',
@@ -1898,13 +1942,13 @@ test('annotateTokens clears known status and other annotations from standalone
  assert.equal(result[0]?.surface, 'あ');
  assert.equal(result[0]?.headword, 'あ');
  assert.equal(result[0]?.reading, 'あ');
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens clears all annotations from expressive subtitle interjections without POS tags', () => {
+test('annotateTokens keeps known status while clearing other annotations from expressive subtitle interjections without POS tags', () => {
  const tokens = [
    makeToken({
      surface: 'ハァ',
@@ -1960,7 +2004,7 @@ test('annotateTokens clears all annotations from expressive subtitle interjectio
  );

  for (const token of result.slice(0, 2)) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
    assert.equal(token.isNPlusOneTarget, false, token.surface);
    assert.equal(token.frequencyRank, undefined, token.surface);
    assert.equal(token.jlptLevel, undefined, token.surface);
@@ -680,6 +680,11 @@ export function annotateTokens(

  // Single pass: compute known word status, frequency filtering, and JLPT level together
  const annotated = tokens.map((token, index) => {
+    const isKnownForMatching = shouldComputeKnownStatus
+      ? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
+      : false;
+    nPlusOneKnownStatuses[index] = isKnownForMatching;
+
    if (
      sharedShouldExcludeTokenFromSubtitleAnnotations(token, {
        pos1Exclusions,
@@ -690,18 +695,13 @@ export function annotateTokens(
        pos1Exclusions,
        pos2Exclusions,
      });
-      nPlusOneKnownStatuses[index] = false;
      return {
        ...strippedToken,
-        isKnown: false,
+        isKnown: knownWordsEnabled ? isKnownForMatching : false,
      };
    }

    const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true;
-    const isKnownForMatching = shouldComputeKnownStatus
-      ? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
-      : false;
-    nPlusOneKnownStatuses[index] = isKnownForMatching;

    const frequencyRank =
      frequencyEnabled && !prioritizedNameMatch