diff --git a/backlog/tasks/task-315 - Suppress-annotations-for-standalone-じゃない-and-です-ending-tokens.md b/backlog/tasks/task-315 - Suppress-annotations-for-standalone-じゃない-and-です-ending-tokens.md index 28f41904..ed32daef 100644 --- a/backlog/tasks/task-315 - Suppress-annotations-for-standalone-じゃない-and-です-ending-tokens.md +++ b/backlog/tasks/task-315 - Suppress-annotations-for-standalone-じゃない-and-です-ending-tokens.md @@ -5,7 +5,7 @@ status: Done assignee: - codex created_date: '2026-05-03 00:02' -updated_date: '2026-05-03 00:31' +updated_date: '2026-05-03 06:05' labels: - bug - tokenizer @@ -25,6 +25,7 @@ Standalone `じゃない` grammar ending tokens should not display or persist su - [x] #2 Common polite/question variants such as `じゃないですか` and `ですよ` remain excluded when tokenized as a single ending token. - [x] #3 Regression coverage proves same-line Yomitan segments split content from trailing grammar endings so the content word can be annotated without coloring the ending. - [x] #4 Auxiliary-only helper spans such as `てく` + `れた` in `ベアトリスがいてくれたから` have known-word, N+1, frequency, and JLPT annotation metadata cleared. +- [x] #5 Hard-coded grammar-ending phrase permutations are replaced by shared pattern matching, with parser selection and subtitle annotation filtering using the same grammar-ending classifier. ## Implementation Plan @@ -35,6 +36,10 @@ Standalone `じゃない` grammar ending tokens should not display or persist su 3. Patch the shared subtitle annotation filter so kana-only auxiliary helper spans made only of grammar POS components are excluded while preserving lexical content tokens. 4. Re-run targeted tokenizer/annotation tests, then run SubMiner change verification classifier/verifier for the touched files. 5. Update TASK-315 acceptance criteria, notes, and final summary with commands and outcomes. + +Replace explicit standalone grammar-ending permutations with a compact shared matcher used by parser selection and annotation filtering. + +Add regression tests first for non-enumerated polite copula / ja-nai variants so the matcher behavior is proven, then refactor implementation and verify targeted lanes. ## Implementation Notes @@ -45,14 +50,18 @@ Implemented as one focused tokenizer fix. Parser selection now splits dictionary 2026-05-03: Reopened for approved add-on covering auxiliary-only `てく` + `れた` helper highlighting report. 2026-05-03: Added regression coverage for `ベアトリスがいてくれたから` where Yomitan emits `てく` + `れた` and MeCab enrichment tags `てく` as `助詞|動詞` / `接続助詞|非自立`. The regression initially failed because `てく` kept `isKnown: true` and `jlptLevel: N4`. Added a shared-filter helper for kana-only particle+non-independent-verb helper spans, preserving lexical `自立` verbs. Verification: `bun test src/core/services/tokenizer/annotation-stage.test.ts`, `bun test src/core/services/tokenizer.test.ts`, `bun test src/core/services/tokenizer/parser-selection-stage.test.ts`, `bun x prettier --check ...`, and `bun run typecheck` passed. SubMiner verifier core lane passed typecheck but `bun run test:fast` failed on unrelated existing cross-suite issues: `window.electronAPI` undefined in `src/renderer/handlers/keyboard.ts` during `src/core/services/subsync.test.ts`, followed by Bun `node:test` nested-test cascade. + +2026-05-03: Reopened for follow-up requested by user: remove hard-coded standalone grammar-ending permutation list and lean on pattern/POS filtering where possible. + +2026-05-03: Added shared `grammar-ending.ts` matcher for polite copula, negative copula, and explanatory endings. Parser selection now uses the standalone-ending matcher instead of `STANDALONE_GRAMMAR_ENDINGS`. Shared subtitle filter now uses the same grammar classifier instead of generated phrase sets. Removed stale duplicate subtitle-exclusion helpers from `annotation-stage.ts`; annotation-stage continues to delegate subtitle exclusion to the shared filter. Verification passed: targeted tokenizer/parser/annotation tests, Prettier check, `bun run typecheck`, `bun run test:fast`, `bun run test:env`, `bun run build`, and `bun run test:smoke:dist`. `bun run changelog:lint` remains blocked by pre-existing malformed fragment `changes/319-interjection-annotation-filter.md`. ## Final Summary -Split dictionary-backed trailing grammar ending segments (`です`, `じゃない*`) from preceding Yomitan same-line content before annotation, and added bare `です` to the explicit polite copula exclusion set. +Replaced grammar-ending phrase permutations with shared pattern matching. `parser-selection-stage.ts` now splits standalone grammar endings through `grammar-ending.ts` instead of `STANDALONE_GRAMMAR_ENDINGS`; `subtitle-annotation-filter.ts` uses the same classifier for polite copula, negative copula, and explanatory endings instead of generated exact phrase sets. -Added the approved auxiliary-helper fix for `ベアトリスがいてくれたから`: kana-only `てく` + `れた` helper spans now clear known-word, N+1, frequency, and JLPT annotation metadata when POS enrichment shows a particle + non-independent verb helper, while lexical `自立` verb forms like `くれ`/`くれる` remain eligible. +Kept exclusion ownership cleaner: subtitle annotation exclusion remains in the shared filter, while `annotation-stage.ts` no longer carries stale duplicate subtitle-exclusion constants/helpers. Added regressions for pattern coverage including `ではないですか` splitting and no-POS grammar-ending annotation clearing. -Verification passed for targeted tokenizer/annotation/parser tests, Prettier check on touched files, and `bun run typecheck`. The SubMiner core verifier's `test:fast` step remains blocked by unrelated pre-existing cross-suite failures in `subsync`/renderer keyboard globals plus Bun `node:test` cascade; artifact: `.tmp/skill-verification/subminer-verify-20260502-173004-CMu3ai/`. +Verification passed: targeted tokenizer/parser/annotation tests, Prettier check, `bun run typecheck`, `bun run test:fast`, `bun run test:env`, `bun run build`, and `bun run test:smoke:dist`. `bun run changelog:lint` is blocked by pre-existing malformed `changes/319-interjection-annotation-filter.md`; new fragment `changes/321-grammar-ending-pattern-filter.md` uses the current metadata format. diff --git a/backlog/tasks/task-321 - Preserve-word-audio-during-manual-clipboard-card-updates.md b/backlog/tasks/task-321 - Preserve-word-audio-during-manual-clipboard-card-updates.md new file mode 100644 index 00000000..431bbeb0 --- /dev/null +++ b/backlog/tasks/task-321 - Preserve-word-audio-during-manual-clipboard-card-updates.md @@ -0,0 +1,63 @@ +--- +id: TASK-321 +title: Preserve word audio during manual clipboard card updates +status: Done +assignee: + - '@Codex' +created_date: '2026-05-03 06:22' +updated_date: '2026-05-03 06:23' +labels: + - anki + - mining +dependencies: [] +priority: medium +--- + +## Description + + +Manual Ctrl+Shift+C/Ctrl+V card updates on already-mined cards should refresh the sentence content and generated sentence media without removing or replacing the existing word/expression audio. The word is unchanged in this flow, so the configured word audio field must be left untouched while sentence audio remains forced-overwrite behavior from TASK-299. + + +## Acceptance Criteria + +- [x] #1 Manual clipboard subtitle update replaces the resolved sentence audio field with newly generated sentence audio. +- [x] #2 Manual clipboard subtitle update does not include the configured word/expression audio field in Anki field updates. +- [x] #3 Animated image generation still uses the existing word audio duration for lead-in sync when configured. +- [x] #4 A regression test covers preserving word/expression audio during manual clipboard update. + + +## Implementation Plan + + +1. Update the focused manual clipboard card update regression so generated audio is written only to the resolved sentence audio field and the configured word/expression audio field is absent from updateNoteFields payloads. +2. Run the focused test and confirm it fails for the existing TASK-299 behavior. +3. Change CardCreationService.updateLastAddedFromClipboard to stop merging/updating expression audio while preserving forced overwrite for sentence audio. +4. Run the focused test; then run adjacent Anki card-creation tests if the focused gate passes. +5. Update task acceptance criteria/final notes with verification results. + + +## Implementation Notes + + +Implemented narrow manual clipboard update change in CardCreationService.updateLastAddedFromClipboard: generated audio now force-overwrites only the resolved sentence audio field and no longer writes the configured word/expression audio field. Animated AVIF lead-in still runs from the original note info before image generation, preserving existing word-audio sync behavior. + + +## Final Summary + + +Summary: +- Manual Ctrl+Shift+C/Ctrl+V card updates now leave the configured word/expression audio field untouched while force-replacing the resolved sentence audio field. +- Updated the regression test to assert the Anki update payload omits ExpressionAudio and only merges SentenceAudio with forced overwrite. +- Updated docs-site behavior notes and added a changelog fragment for the sentence-only manual audio replacement behavior. + +Verification: +- bun test src/anki-integration/card-creation-manual-update.test.ts src/anki-integration/card-creation.test.ts src/anki-integration/animated-image-sync.test.ts +- bun run typecheck +- bun run docs:test +- bun run docs:build +- git diff --check -- src/anki-integration/card-creation.ts src/anki-integration/card-creation-manual-update.test.ts docs-site/mining-workflow.md docs-site/anki-integration.md docs-site/configuration.md changes/322-preserve-word-audio-manual-update.md + +Blocked gate: +- bun run changelog:lint is blocked by pre-existing malformed changes/319-interjection-annotation-filter.md, which is outside this task's files. + diff --git a/changes/321-grammar-ending-pattern-filter.md b/changes/321-grammar-ending-pattern-filter.md new file mode 100644 index 00000000..d09a4057 --- /dev/null +++ b/changes/321-grammar-ending-pattern-filter.md @@ -0,0 +1,5 @@ +type: fixed +area: tokenizer + +- Tokenizer: Replaced hard-coded standalone grammar-ending permutations with shared pattern matching for polite copula, negative copula, and explanatory subtitle endings. +- Tokenizer: Kept grammar annotation exclusion logic in the shared subtitle filter and removed stale duplicate exclusion helpers from the annotation stage. diff --git a/changes/322-preserve-word-audio-manual-update.md b/changes/322-preserve-word-audio-manual-update.md new file mode 100644 index 00000000..42e0ce67 --- /dev/null +++ b/changes/322-preserve-word-audio-manual-update.md @@ -0,0 +1,4 @@ +type: fixed +area: anki + +- Anki: Manual clipboard subtitle updates now preserve existing word audio while replacing sentence audio and animated-image media. diff --git a/docs-site/anki-integration.md b/docs-site/anki-integration.md index 7a1c05f8..a024336d 100644 --- a/docs-site/anki-integration.md +++ b/docs-site/anki-integration.md @@ -213,7 +213,7 @@ Animated AVIF requires an AV1 encoder (`libaom-av1`, `libsvtav1`, or `librav1e`) } ``` -`overwriteAudio` applies to automatic card updates and duplicate-card enrichment. Manual clipboard subtitle updates (`Ctrl/Cmd+C`, then `Ctrl/Cmd+V`) always replace generated audio in both the expression audio field and sentence audio field. +`overwriteAudio` applies to automatic card updates and duplicate-card enrichment. Manual clipboard subtitle updates (`Ctrl/Cmd+C`, then `Ctrl/Cmd+V`) always replace generated sentence audio, while leaving the word audio field unchanged. ## AI Translation diff --git a/docs-site/configuration.md b/docs-site/configuration.md index eb8e2fbf..8acab511 100644 --- a/docs-site/configuration.md +++ b/docs-site/configuration.md @@ -893,7 +893,7 @@ This example is intentionally compact. The option table below documents availabl | `media.audioPadding` | number (seconds) | Padding around audio clip timing (default: `0.5`) | | `media.fallbackDuration` | number (seconds) | Default duration if timing unavailable (default: `3.0`) | | `media.maxMediaDuration` | number (seconds) | Max duration for generated media from multi-line copy (default: `30`, `0` to disable) | -| `behavior.overwriteAudio` | `true`, `false` | Replace existing audio on updates; when `false`, new audio is appended/prepended per `behavior.mediaInsertMode`; manual clipboard updates always replace generated audio (default: `true`) | +| `behavior.overwriteAudio` | `true`, `false` | Replace existing audio on updates; when `false`, new audio is appended/prepended per `behavior.mediaInsertMode`; manual clipboard updates always replace generated sentence audio (default: `true`) | | `behavior.overwriteImage` | `true`, `false` | Replace existing images on updates; when `false`, new images are appended/prepended per `behavior.mediaInsertMode` (default: `true`) | | `behavior.mediaInsertMode` | `"append"`, `"prepend"` | Where to insert new media when overwrite is off (default: `"append"`) | | `behavior.highlightWord` | `true`, `false` | Highlight the word in sentence context (default: `true`) | diff --git a/docs-site/mining-workflow.md b/docs-site/mining-workflow.md index 5154a470..406dc15d 100644 --- a/docs-site/mining-workflow.md +++ b/docs-site/mining-workflow.md @@ -100,7 +100,7 @@ If you prefer a hands-on approach (animecards-style), you can copy the current s - For multiple lines: press `Ctrl/Cmd+Shift+C`, then a digit `1`–`9` to select how many recent subtitle lines to combine. The combined text is copied to the clipboard. 3. Press `Ctrl/Cmd+V` to update the last-added card with the clipboard contents plus audio, image, and translation — the same fields auto-update would fill. -Manual clipboard updates always replace generated audio in both the expression audio field and sentence audio field, even when `ankiConnect.behavior.overwriteAudio` is disabled. The manual flow assumes you are intentionally replacing the proxy-generated clip on the newest card. +Manual clipboard updates always replace generated sentence audio, even when `ankiConnect.behavior.overwriteAudio` is disabled. The word audio field is left unchanged because the word itself does not change in this flow. This is useful when auto-update is disabled or when you want explicit control over which subtitle line gets attached to the card. diff --git a/src/anki-integration/card-creation-manual-update.test.ts b/src/anki-integration/card-creation-manual-update.test.ts index fe1bea29..8ac6efe2 100644 --- a/src/anki-integration/card-creation-manual-update.test.ts +++ b/src/anki-integration/card-creation-manual-update.test.ts @@ -126,7 +126,7 @@ function createManualUpdateService(overrides: Partial = {}): { }; } -test('manual clipboard subtitle update replaces expression and sentence audio even when overwriteAudio is disabled', async () => { +test('manual clipboard subtitle update replaces sentence audio without touching expression audio', async () => { const { service, updatedFields, mergeCalls, storedMedia } = createManualUpdateService(); await service.updateLastAddedFromClipboard('字幕'); @@ -134,10 +134,10 @@ test('manual clipboard subtitle update replaces expression and sentence audio ev assert.equal(updatedFields.length, 1); assert.equal(storedMedia.length, 1); const audioValue = `[sound:${storedMedia[0]}]`; - assert.equal(updatedFields[0]?.ExpressionAudio, audioValue); assert.equal(updatedFields[0]?.SentenceAudio, audioValue); + assert.equal('ExpressionAudio' in updatedFields[0]!, false); assert.deepEqual( mergeCalls.map((call) => call.overwrite), - [true, true], + [true], ); }); diff --git a/src/anki-integration/card-creation.ts b/src/anki-integration/card-creation.ts index 2d2fc536..43bb40a1 100644 --- a/src/anki-integration/card-creation.ts +++ b/src/anki-integration/card-creation.ts @@ -219,10 +219,6 @@ export class CardCreationService { this.deps.getConfig(), ); const sentenceAudioField = this.getResolvedSentenceAudioFieldName(noteInfo); - const expressionAudioField = this.deps.resolveConfiguredFieldName( - noteInfo, - this.deps.getConfig().fields?.audio || 'ExpressionAudio', - ); const sentenceField = this.deps.getEffectiveSentenceCardConfig().sentenceField; const sentence = blocks.join(' '); @@ -252,22 +248,15 @@ export class CardCreationService { if (audioBuffer) { await this.deps.client.storeMediaFile(audioFilename, audioBuffer); - if (sentenceAudioField || expressionAudioField) { + if (sentenceAudioField) { const audioValue = `[sound:${audioFilename}]`; - const audioFields = new Set( - [sentenceAudioField, expressionAudioField].filter( - (fieldName): fieldName is string => Boolean(fieldName), - ), + const existingAudio = noteInfo.fields[sentenceAudioField]?.value || ''; + // Manual clipboard updates intentionally replace old captured sentence audio. + updatedFields[sentenceAudioField] = this.deps.mergeFieldValue( + existingAudio, + audioValue, + true, ); - for (const audioField of audioFields) { - const existingAudio = noteInfo.fields[audioField]?.value || ''; - // Manual clipboard updates intentionally replace old captured audio. - updatedFields[audioField] = this.deps.mergeFieldValue( - existingAudio, - audioValue, - true, - ); - } } miscInfoFilename = audioFilename; updatePerformed = true; diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts index d0f30737..77b0f147 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -301,6 +301,31 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone polite copul } }); +test('shouldExcludeTokenFromSubtitleAnnotations excludes grammar-ending patterns without enumerating variants', () => { + const tokens = [ + makeToken({ + surface: 'ですわ', + headword: 'です', + reading: 'デスワ', + partOfSpeech: PartOfSpeech.other, + pos1: '', + pos2: '', + }), + makeToken({ + surface: 'ではないですか', + headword: 'ない', + reading: 'デハナイデスカ', + partOfSpeech: PartOfSpeech.other, + pos1: '', + pos2: '', + }), + ]; + + for (const token of tokens) { + assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface); + } +}); + test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => { const token = makeToken({ surface: 'そうだ', diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts index 2eab6c80..d8193365 100644 --- a/src/core/services/tokenizer/annotation-stage.ts +++ b/src/core/services/tokenizer/annotation-stage.ts @@ -18,57 +18,6 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; const KATAKANA_CODEPOINT_START = 0x30a1; const KATAKANA_CODEPOINT_END = 0x30f6; const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048; -const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([ - 'ああ', - 'ええ', - 'うう', - 'おお', - 'はあ', - 'はは', - 'へえ', - 'ふう', - 'ほう', -]); -const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの']; -const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [ - 'だ', - 'です', - 'でした', - 'だった', - 'では', - 'じゃ', - 'でしょう', - 'だろう', -] as const; -const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [ - '', - 'か', - 'ね', - 'よ', - 'な', - 'けど', - 'よね', - 'かな', - 'かね', -] as const; -const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set( - SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) => - SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) => - SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES.map( - (particle) => `${prefix}${core}${particle}`, - ), - ), - ), -); -const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([ - 'って', - 'ってよ', - 'ってね', - 'ってな', - 'ってさ', - 'ってか', - 'ってば', -]); const jlptLevelLookupCaches = new WeakMap< (text: string) => JlptLevel | null, @@ -104,10 +53,6 @@ function normalizePos1Tag(pos1: string | undefined): string { return typeof pos1 === 'string' ? pos1.trim() : ''; } -const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set(['感動詞']); -const SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1 = new Set(['助詞', '助動詞', '連体詞']); -const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']); - function splitNormalizedTagParts(normalizedTag: string): string[] { if (!normalizedTag) { return []; @@ -129,57 +74,6 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet exclusions.has(part)); } -function isExcludedFromSubtitleAnnotationsByPos1(normalizedPos1: string): boolean { - const parts = splitNormalizedTagParts(normalizedPos1); - if (parts.some((part) => SUBTITLE_ANNOTATION_EXCLUDED_POS1.has(part))) { - return true; - } - - return parts.length > 0 && parts.every((part) => SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1.has(part)); -} - -function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean { - const normalizedSurface = normalizeJlptTextForExclusion(token.surface); - const normalizedHeadword = normalizeJlptTextForExclusion(token.headword); - if ( - !normalizedSurface || - !normalizedHeadword || - !normalizedSurface.startsWith(normalizedHeadword) - ) { - return false; - } - - const suffix = normalizedSurface.slice(normalizedHeadword.length); - if (!SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES.has(suffix)) { - return false; - } - - const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1)); - if (pos1Parts.length < 2) { - return false; - } - - const [leadingPos1, ...trailingPos1] = pos1Parts; - if (!leadingPos1 || SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1.has(leadingPos1)) { - return false; - } - - return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞'); -} - -function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean { - const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1)); - if ( - pos1Parts.length === 0 || - !pos1Parts.every((part) => AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(part)) - ) { - return false; - } - - const pos3Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos3)); - return pos3Parts.includes('助動詞語幹'); -} - function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet { if (options.pos1Exclusions) { return options.pos1Exclusions; @@ -609,44 +503,6 @@ function isJlptEligibleToken(token: MergedToken): boolean { return true; } -function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean { - const candidates = [token.surface, token.reading, resolveJlptLookupText(token)].filter( - (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0, - ); - - for (const candidate of candidates) { - const trimmedCandidate = candidate.trim(); - if (!trimmedCandidate) { - continue; - } - - const normalizedCandidate = normalizeJlptTextForExclusion(trimmedCandidate); - if (!normalizedCandidate) { - continue; - } - - if ( - SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmedCandidate) || - SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalizedCandidate) || - SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmedCandidate) || - SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalizedCandidate) - ) { - return true; - } - - if ( - isTrailingSmallTsuKanaSfx(trimmedCandidate) || - isTrailingSmallTsuKanaSfx(normalizedCandidate) || - isReduplicatedKanaSfxWithOptionalTrailingTo(trimmedCandidate) || - isReduplicatedKanaSfxWithOptionalTrailingTo(normalizedCandidate) - ) { - return true; - } - } - - return false; -} - export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean { return sharedShouldExcludeTokenFromSubtitleAnnotations(token); } @@ -771,9 +627,7 @@ export function annotateTokens( }); return { ...strippedToken, - isKnown: - nPlusOneEnabled && - computeExcludedTokenKnownStatus(token, deps.isKnownWord), + isKnown: nPlusOneEnabled && computeExcludedTokenKnownStatus(token, deps.isKnownWord), }; } diff --git a/src/core/services/tokenizer/grammar-ending.ts b/src/core/services/tokenizer/grammar-ending.ts new file mode 100644 index 00000000..3f833b15 --- /dev/null +++ b/src/core/services/tokenizer/grammar-ending.ts @@ -0,0 +1,124 @@ +const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; +const KATAKANA_CODEPOINT_START = 0x30a1; +const KATAKANA_CODEPOINT_END = 0x30f6; + +const SENTENCE_FINAL_PARTICLE_SUFFIXES = ['', 'か', 'ね', 'よ', 'な', 'わ'] as const; +const EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'] as const; +const EXPLANATORY_ENDING_CORES = [ + 'だ', + 'です', + 'でした', + 'だった', + 'では', + 'じゃ', + 'でしょう', + 'だろう', +] as const; +const EXPLANATORY_ENDING_TRAILING_PARTICLES = [ + '', + 'か', + 'ね', + 'よ', + 'な', + 'けど', + 'よね', + 'かな', + 'かね', +] as const; +const EXPLANATORY_ENDING_THOUGHT_SUFFIXES = ['か', 'かな', 'かね'] as const; +const NEGATIVE_COPULA_PREFIXES = ['じゃ', 'では'] as const; + +export function normalizeGrammarEndingText(text: string): string { + const raw = text.trim(); + if (!raw) { + return ''; + } + + let normalized = ''; + for (const char of raw) { + const code = char.codePointAt(0); + if (code === undefined) { + continue; + } + + if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) { + normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET); + continue; + } + + normalized += char; + } + + return normalized; +} + +function matchesSuffix(text: string, suffixes: readonly string[]): boolean { + return suffixes.some((suffix) => text === suffix); +} + +function matchesPoliteCopulaEnding(text: string): boolean { + if (!text.startsWith('です')) { + return false; + } + + return matchesSuffix(text.slice('です'.length), SENTENCE_FINAL_PARTICLE_SUFFIXES); +} + +function matchesNegativeCopulaEnding(text: string): boolean { + for (const prefix of NEGATIVE_COPULA_PREFIXES) { + const negativeStem = `${prefix}ない`; + if (!text.startsWith(negativeStem)) { + continue; + } + + const suffix = text.slice(negativeStem.length); + return ( + matchesSuffix(suffix, SENTENCE_FINAL_PARTICLE_SUFFIXES) || matchesPoliteCopulaEnding(suffix) + ); + } + + return false; +} + +function matchesExplanatoryEnding(text: string): boolean { + for (const prefix of EXPLANATORY_ENDING_PREFIXES) { + if (EXPLANATORY_ENDING_THOUGHT_SUFFIXES.some((suffix) => text === `${prefix}${suffix}`)) { + return true; + } + + if (!text.startsWith(prefix)) { + continue; + } + + const suffix = text.slice(prefix.length); + for (const core of EXPLANATORY_ENDING_CORES) { + if (!suffix.startsWith(core)) { + continue; + } + + if (matchesSuffix(suffix.slice(core.length), EXPLANATORY_ENDING_TRAILING_PARTICLES)) { + return true; + } + } + } + + return false; +} + +export function isStandaloneGrammarEndingText(text: string): boolean { + const normalized = normalizeGrammarEndingText(text); + if (!normalized) { + return false; + } + + return matchesPoliteCopulaEnding(normalized) || matchesNegativeCopulaEnding(normalized); +} + +export function isSubtitleGrammarEndingText(text: string): boolean { + const normalized = normalizeGrammarEndingText(text); + if (!normalized) { + return false; + } + + return isStandaloneGrammarEndingText(normalized) || matchesExplanatoryEnding(normalized); +} diff --git a/src/core/services/tokenizer/parser-selection-stage.test.ts b/src/core/services/tokenizer/parser-selection-stage.test.ts index 59d29a36..59f9c5e1 100644 --- a/src/core/services/tokenizer/parser-selection-stage.test.ts +++ b/src/core/services/tokenizer/parser-selection-stage.test.ts @@ -219,6 +219,38 @@ test('splits trailing ja-nai grammar endings from preceding content', () => { ); }); +test('splits trailing negative-copula grammar endings by pattern', () => { + const parseResults = [ + makeParseItem('scanning-parser', [ + [ + { text: '問題', reading: 'もんだい', headword: '問題' }, + { text: 'ではないですか', reading: 'ではないですか', headword: 'ない' }, + ], + ]), + ]; + + const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword'); + assert.deepEqual( + tokens?.map((token) => ({ + surface: token.surface, + reading: token.reading, + headword: token.headword, + })), + [ + { + surface: '問題', + reading: 'もんだい', + headword: '問題', + }, + { + surface: 'ではないですか', + reading: 'ではないですか', + headword: 'ない', + }, + ], + ); +}); + test('merges trailing katakana continuation without headword into previous token', () => { const parseResults = [ makeParseItem('scanning-parser', [ diff --git a/src/core/services/tokenizer/parser-selection-stage.ts b/src/core/services/tokenizer/parser-selection-stage.ts index 4298f5bd..f51b8fb1 100644 --- a/src/core/services/tokenizer/parser-selection-stage.ts +++ b/src/core/services/tokenizer/parser-selection-stage.ts @@ -1,4 +1,5 @@ import { MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types'; +import { isStandaloneGrammarEndingText } from './grammar-ending'; interface YomitanParseHeadword { term?: unknown; @@ -24,24 +25,6 @@ export interface YomitanParseCandidate { tokens: MergedToken[]; } -const STANDALONE_GRAMMAR_ENDINGS = new Set([ - 'です', - 'ですか', - 'ですね', - 'ですよ', - 'ですな', - 'じゃない', - 'じゃないか', - 'じゃないね', - 'じゃないよ', - 'じゃないな', - 'じゃないです', - 'じゃないですか', - 'じゃないですね', - 'じゃないですよ', - 'じゃないですな', -]); - function isObject(value: unknown): value is Record { return Boolean(value && typeof value === 'object'); } @@ -164,7 +147,7 @@ function isStandaloneGrammarEndingSegment(segment: YomitanParseSegment): boolean const headword = extractYomitanHeadword(segment).trim(); return ( headword.length > 0 && - (STANDALONE_GRAMMAR_ENDINGS.has(surface) || STANDALONE_GRAMMAR_ENDINGS.has(headword)) + (isStandaloneGrammarEndingText(surface) || isStandaloneGrammarEndingText(headword)) ); } diff --git a/src/core/services/tokenizer/subtitle-annotation-filter.ts b/src/core/services/tokenizer/subtitle-annotation-filter.ts index a1446901..508dc98e 100644 --- a/src/core/services/tokenizer/subtitle-annotation-filter.ts +++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts @@ -8,6 +8,7 @@ import { } from '../../../token-pos2-exclusions'; import { MergedToken, PartOfSpeech } from '../../../types'; import { shouldIgnoreJlptByTerm } from '../jlpt-token-filter'; +import { isSubtitleGrammarEndingText } from './grammar-ending'; const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; const KATAKANA_CODEPOINT_START = 0x30a1; @@ -58,61 +59,6 @@ export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([ 'ものか', ...STANDALONE_GRAMMAR_PARTICLE_PHRASES, ]); -const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの']; -const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [ - 'だ', - 'です', - 'でした', - 'だった', - 'では', - 'じゃ', - 'でしょう', - 'だろう', -] as const; -const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [ - '', - 'か', - 'ね', - 'よ', - 'な', - 'けど', - 'よね', - 'かな', - 'かね', -] as const; -const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES = [ - 'か', - 'かな', - 'かね', -] as const; -const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES = ['', 'か', 'ね', 'よ', 'な'] as const; -const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES = [ - '', - 'か', - 'ね', - 'よ', - 'な', - 'です', - 'ですか', - 'ですよ', - 'ですね', - 'ですな', -] as const; -const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set( - SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) => - SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) => - SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES.map( - (particle) => `${prefix}${core}${particle}`, - ), - ), - ), -); -const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS = new Set( - SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES.map((suffix) => `です${suffix}`), -); -const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS = new Set( - SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES.map((suffix) => `じゃない${suffix}`), -); const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([ 'って', 'ってよ', @@ -460,25 +406,11 @@ function isExcludedByTerm(token: MergedToken): boolean { continue; } - if ( - SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.some((prefix) => - SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES.some( - (suffix) => normalized === `${prefix}${suffix}`, - ), - ) - ) { - return true; - } - if ( SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmed) || SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) || - SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) || - SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) || - SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(trimmed) || - SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(normalized) || - SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(trimmed) || - SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(normalized) || + isSubtitleGrammarEndingText(trimmed) || + isSubtitleGrammarEndingText(normalized) || shouldIgnoreJlptByTerm(trimmed) || shouldIgnoreJlptByTerm(normalized) ) {