feat(tokenizer): use Yomitan word classes for subtitle POS filtering

- Carry matched headword wordClasses from termsFind into YomitanScanToken
- Map recognized Yomitan wordClasses to SubMiner coarse POS before annotation
- MeCab enrichment now fills only missing POS fields, preserving existing coarse pos1
- Exclude standalone grammar particles, して helper fragments, and single-kana surfaces from annotations
- Respect source-text punctuation gaps when counting N+1 sentence words
- Preserve known-word highlight on excluded kanji-containing tokens
- Add backlog tasks 304 (N+1 boundary bug) and 305 (wordClasses POS, done)
2026-05-03 04:19:27 -07:00 · 2026-04-25 23:08:33 -07:00
parent 13e2b5f8c8
commit 4b3ebd6ef6
11 changed files with 926 additions and 39 deletions
--- a/src/core/services/tokenizer/yomitan-parser-runtime.ts
+++ b/src/core/services/tokenizer/yomitan-parser-runtime.ts
@@ -53,6 +53,7 @@ export interface YomitanScanToken {
  endPos: number;
  isNameMatch?: boolean;
  frequencyRank?: number;
+  wordClasses?: string[];
 }

 interface YomitanProfileMetadata {
@@ -91,7 +92,10 @@ function isScanTokenArray(value: unknown): value is YomitanScanToken[] {
        typeof entry.startPos === 'number' &&
        typeof entry.endPos === 'number' &&
        (entry.isNameMatch === undefined || typeof entry.isNameMatch === 'boolean') &&
-        (entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number'),
+        (entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number') &&
+        (entry.wordClasses === undefined ||
+          (Array.isArray(entry.wordClasses) &&
+            entry.wordClasses.every((wordClass) => typeof wordClass === 'string'))),
    )
  );
 }
@@ -975,6 +979,11 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
        return best;
      }
      function getPreferredHeadword(dictionaryEntries, token, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
+        function normalizeWordClasses(headword) {
+          if (!Array.isArray(headword?.wordClasses)) { return undefined; }
+          const classes = headword.wordClasses.filter((wordClass) => typeof wordClass === "string" && wordClass.trim().length > 0);
+          return classes.length > 0 ? classes : undefined;
+        }
        function appendDictionaryNames(target, value) {
          if (!value || typeof value !== 'object') {
            return;
@@ -1033,6 +1042,7 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
          return {
            term: preferredMatch.headword.term,
            reading: preferredMatch.headword.reading,
+            wordClasses: normalizeWordClasses(preferredMatch.headword),
            isNameMatch: matchedNameDictionary || isNameDictionaryEntry(preferredMatch.dictionaryEntry),
            frequencyRank: getBestFrequencyRankForMatches(
              exactFrequencyMatches.length > 0 ? exactFrequencyMatches : exactPrimaryMatches,
@@ -1099,7 +1109,7 @@ ${YOMITAN_SCANNING_HELPERS}
          if (preferredHeadword && typeof preferredHeadword.term === "string") {
            const reading = typeof preferredHeadword.reading === "string" ? preferredHeadword.reading : "";
            const segments = distributeFuriganaInflected(preferredHeadword.term, reading, source);
-            tokens.push({
+            const tokenPayload = {
              surface: segments.map((segment) => segment.text).join("") || source,
              reading: segments.map((segment) => typeof segment.reading === "string" ? segment.reading : "").join(""),
              headword: preferredHeadword.term,
@@ -1110,7 +1120,11 @@ ${YOMITAN_SCANNING_HELPERS}
                typeof preferredHeadword.frequencyRank === "number" && Number.isFinite(preferredHeadword.frequencyRank)
                  ? Math.max(1, Math.floor(preferredHeadword.frequencyRank))
                  : undefined,
-            });
+            };
+            if (Array.isArray(preferredHeadword.wordClasses) && preferredHeadword.wordClasses.length > 0) {
+              tokenPayload.wordClasses = preferredHeadword.wordClasses;
+            }
+            tokens.push(tokenPayload);
            i += originalTextLength;
            continue;
          }