Compare commits

..

13 Commits

Author SHA1 Message Date
sudacode 756b368020 Fix kana-only N+1 tokenizer regression test
- Use a pure-kana fixture for the subtitle token N+1 case
- Update task notes for the latest CodeRabbit follow-up
2026-05-02 16:05:10 -07:00
sudacode c7996e3daa Suppress subtitle annotations for grammar fragments
- Hide annotation metadata for auxiliary inflection and ja-nai endings
- Preserve lexical `くれる` forms and add regression coverage
2026-05-02 15:43:26 -07:00
sudacode 8a1ad6e4f3 fix: suppress N+1 for kana-only candidates and fix minSentenceWords count
- Treat kana-only tokens with surrounding subtitle punctuation (…, ―, etc.) as kana-only so they are not promoted to N+1 targets
- Exclude unknown tokens filtered from N+1 targeting from the minSentenceWords count so filtered kana-only unknowns cannot satisfy sentence length threshold
- Add regression tests for kana-only candidate suppression and filtered-unknown padding cases
2026-05-02 15:43:26 -07:00
sudacode 833afd451e Cancel pending Linux MPV fullscreen overlay refresh bursts
- return a cancel handle from the Linux refresh burst scheduler
- clear pending refresh bursts when overlays hide or windows close
- tighten the burst test polling to wait for the async refresh
2026-05-02 15:43:26 -07:00
sudacode d9a8636486 fix: accept modified digits for multi-line sentence mining 2026-05-02 15:43:26 -07:00
sudacode 35ba8778f3 fix: address CodeRabbit review comments 2026-05-02 15:43:26 -07:00
sudacode bdbacb1304 fix: address fullscreen and n-plus-one review notes 2026-05-02 15:43:26 -07:00
sudacode dccc83f120 fix: refresh overlay on Hyprland fullscreen 2026-05-02 15:43:26 -07:00
sudacode c016ce0249 fix: exclude kana-only n+1 targets 2026-05-02 15:43:26 -07:00
sudacode 901f65440d fix: restore jlpt subtitle underlines 2026-05-02 15:43:26 -07:00
sudacode c289435251 fix(tokenizer): preserve annotation and enrichment behavior 2026-05-02 15:43:26 -07:00
sudacode 4b3ebd6ef6 feat(tokenizer): use Yomitan word classes for subtitle POS filtering
- Carry matched headword wordClasses from termsFind into YomitanScanToken
- Map recognized Yomitan wordClasses to SubMiner coarse POS before annotation
- MeCab enrichment now fills only missing POS fields, preserving existing coarse pos1
- Exclude standalone grammar particles, して helper fragments, and single-kana surfaces from annotations
- Respect source-text punctuation gaps when counting N+1 sentence words
- Preserve known-word highlight on excluded kanji-containing tokens
- Add backlog tasks 304 (N+1 boundary bug) and 305 (wordClasses POS, done)
2026-05-02 15:43:26 -07:00
sudacode 13e2b5f8c8 Handle mpv reload buffering as same media
- Keep overlay alive across same-media mpv reloads
- Avoid rearming startup gate and repeating AniSkip lookups
- Add regression coverage for reload/end-file/file-loaded sequence
2026-05-02 15:42:54 -07:00
13 changed files with 749 additions and 9 deletions
@@ -2,9 +2,10 @@
id: TASK-305
title: Use Yomitan word classes for subtitle token POS filtering
status: Done
assignee: []
assignee:
- Codex
created_date: '2026-04-26 05:56'
updated_date: '2026-04-26 05:59'
updated_date: '2026-05-02 22:47'
labels:
- tokenizer
- yomitan
@@ -34,22 +35,30 @@ Subtitle annotation filtering currently uses Yomitan token spans, then enriches
3. Map recognized Yomitan wordClasses to SubMiner coarse PartOfSpeech/POS metadata before annotation filtering.
4. Keep MeCab whole-line enrichment as fallback/detail-fill for missing POS fields.
5. Run focused tokenizer tests and typecheck.
2026-05-02 review follow-up: inspect latest CodeRabbit review on PR #57, classify each finding as actionable/not actionable, patch scoped issues, run focused verification, then update final notes. User request to address/assess the review is the approval for this follow-up.
<!-- SECTION:PLAN:END -->
## Implementation Notes
<!-- SECTION:NOTES:BEGIN -->
Implemented app-only wordClasses extraction from termsFind results; no vendored Yomitan changes required. Recognized classes currently map prt, aux, v*, adj-i/adj-ix, adj-na, and noun-like classes to SubMiner POS metadata. MeCab enrichment now skips only tokens with complete pos1/pos2/pos3 and otherwise fills missing fields while preserving existing coarse pos1. Verification: bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts; bun run typecheck.
2026-05-02 CodeRabbit latest review assessment: only current actionable finding was in src/core/services/tokenizer/annotation-stage.test.ts, where a kana-only regression fixture used mixed-script/punctuation surface text. Earlier CodeRabbit findings in this PR were already marked addressed by prior commits. Patched the fixture to use pure-kana surface/headword and renamed the test to match the exercised behavior. Verification: bun test src/core/services/tokenizer/annotation-stage.test.ts; bun run typecheck.
<!-- SECTION:NOTES:END -->
## Final Summary
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Implemented app-only Yomitan wordClasses support for subtitle token annotation filtering. The scanner now carries matched headword wordClasses from termsFind results, tokenizer maps recognized classes into SubMiner coarse POS metadata before annotation, and MeCab whole-line enrichment continues to fill missing detailed POS fields without requiring vendored Yomitan changes.
Implemented app-only Yomitan wordClasses support for subtitle token annotation filtering. The scanner carries matched headword wordClasses from termsFind results, tokenizer maps recognized classes into SubMiner coarse POS metadata before annotation, and MeCab whole-line enrichment continues to fill missing detailed POS fields without requiring vendored Yomitan changes.
2026-05-02 CodeRabbit follow-up:
- Assessed the latest CodeRabbit review on PR #57. Only one new actionable finding remained: the kana-only N+1 regression test used a mixed/punctuated surface.
- Updated the fixture in src/core/services/tokenizer/annotation-stage.test.ts to use a pure-kana unknown target and renamed the test accordingly.
Tests run:
- bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts
- bun test src/core/services/tokenizer/annotation-stage.test.ts
- bun run typecheck
Note: the working tree already had unrelated tokenizer/annotation edits and task-304 before this work; those were left intact.
Note: earlier CodeRabbit findings on this PR were already marked addressed in prior commits; no further latest-review issues were left unresolved in this pass.
<!-- SECTION:FINAL_SUMMARY:END -->
@@ -0,0 +1,43 @@
---
id: TASK-311
title: Suppress auxiliary inflection fragments from subtitle annotations
status: Done
assignee: []
created_date: '2026-05-02 09:07'
updated_date: '2026-05-02 09:10'
labels:
- tokenizer
- annotations
- bug
dependencies: []
priority: medium
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
Suppress standalone Japanese auxiliary/inflection subtitle fragments such as `れる` and `れた` from frequency/JLPT/N+1/known annotation styling while keeping lexical verbs such as `くれ` / `くれる` annotatable. Tokens must remain hoverable; only annotation metadata should be stripped.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [x] #1 `れる` and `れた`-style standalone helper fragments render as plain hoverable subtitle tokens.
- [x] #2 Lexical verbs like `くれ` / `くれる` remain eligible for annotation.
- [x] #3 Regression tests cover unit filter behavior and tokenizer integration.
<!-- AC:END -->
## Implementation Notes
<!-- SECTION:NOTES:BEGIN -->
Implemented with TDD. Added failing coverage first for standalone `れる`/`れた` auxiliary fragments and a lexical `くれ`/`くれる` guard. Updated the shared subtitle annotation filter to strip annotation metadata for kana-only auxiliary inflection fragments identified by MeCab POS (`助動詞` only, or `動詞/接尾` with optional trailing `助動詞`) while preserving lexical `くれ` as `くれる` when tagged `動詞/自立`. Added tokenizer integration coverage for `れた` and neighboring lexical N+1 behavior.
<!-- SECTION:NOTES:END -->
## Final Summary
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Suppressed annotation metadata for standalone auxiliary inflection fragments such as `れる` and `れた` in subtitle tokens, leaving them hoverable but plain. Preserved lexical `くれ` -> `くれる` verb metadata when MeCab tags it as `動詞/自立`.
Added unit and tokenizer regression coverage, plus a release fragment in `changes/311-auxiliary-inflection-annotation-filter.md`.
Validation: targeted annotation/tokenizer tests passed; `bun run typecheck` passed; `bun run changelog:lint` passed. `bun run test:fast` was attempted twice and failed in unrelated `src/core/services/subsync.test.ts` cross-file state (`window.electronAPI` undefined), while `bun test src/core/services/subsync.test.ts` passes by itself.
<!-- SECTION:FINAL_SUMMARY:END -->
@@ -0,0 +1,42 @@
---
id: TASK-312
title: Suppress ja-nai explanatory ending subtitle annotations
status: Done
assignee: []
created_date: '2026-05-02 09:55'
updated_date: '2026-05-02 10:03'
labels:
- tokenizer
- annotations
- bug
dependencies: []
priority: medium
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
Suppress subtitle annotation styling for grammar-only explanatory endings like `じゃない` and `じゃないですか` while preserving nearby lexical content annotations.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [x] #1 `じゃない` and `じゃないですか`-style endings render as plain hoverable subtitle tokens.
- [x] #2 The reported phrase `みたいなのあるじゃないですか` does not annotate `じゃない`/`じゃないですか` as lexical/frequency content.
- [x] #3 Regression tests cover unit filter behavior and tokenizer integration without suppressing lexical content tokens.
- [x] #4 Standalone polite copula endings such as `です` / `ですよ` render as plain hoverable subtitle tokens even if POS metadata is missing or too lexical.
<!-- AC:END -->
## Implementation Notes
<!-- SECTION:NOTES:BEGIN -->
Added failing coverage first for `じゃない` / `じゃないですか` and `ですよ` leaking annotation metadata when POS metadata is missing or too lexical. Implemented term-family exclusions in the shared subtitle annotation filter for the `じゃない` explanatory family and polite copula suffix endings (`ですか`, `ですね`, `ですよ`, `ですな`). Kept bare `です` term-only behavior unchanged to preserve existing no-POS frequency tests; POS-tagged `です` is already stripped by the grammar POS exclusion path.
<!-- SECTION:NOTES:END -->
## Final Summary
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Suppressed subtitle annotation metadata for grammar-only endings like `じゃないですか` and `ですよ`, while preserving nearby lexical content annotations. Added unit and tokenizer regression coverage for the reported `みたいなのあるじゃないですか` and `感じですよ` shapes, plus changelog fragment `changes/312-grammar-ending-annotation-filter.md`.
Validation: `bun test src/core/services/tokenizer/annotation-stage.test.ts`; `bun test src/core/services/tokenizer.test.ts`; `bun run typecheck`; `bun run changelog:lint`; `git diff --check`.
<!-- SECTION:FINAL_SUMMARY:END -->
@@ -0,0 +1,28 @@
---
id: TASK-313
title: Fix mpv buffering reload overlay lifecycle
status: To Do
assignee: []
created_date: '2026-05-02 22:12'
labels:
- bug
- mpv
- overlay
dependencies: []
priority: high
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
macOS local playback can emit an mpv reload/end-file/file-loaded sequence during buffering. SubMiner should treat same-media reload churn as a continuation, not a fresh playback session, so the visible overlay remains available and startup-only tokenization/AniSkip work is not repeated unnecessarily.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [ ] #1 Same-media mpv reload buffering does not hide the visible overlay.
- [ ] #2 Same-media mpv reload buffering does not re-arm the pause-until-ready startup gate or wait for a second tokenization-ready signal.
- [ ] #3 Same-media mpv reload buffering does not repeat AniSkip lookup work for the already-loaded media.
- [ ] #4 Normal new-file playback still clears per-media state, applies managed subtitle defaults, auto-starts/updates the overlay, and runs needed startup checks.
- [ ] #5 Regression coverage exercises the buffering reload/end-file/file-loaded sequence in the mpv plugin lifecycle.
<!-- AC:END -->
@@ -0,0 +1,4 @@
type: fixed
area: overlay
- Suppressed subtitle annotation styling for standalone auxiliary inflection fragments such as `れる` and `れた` while keeping lexical `くれる` forms eligible for lookup metadata.
@@ -0,0 +1,4 @@
type: fixed
area: overlay
- Suppressed subtitle annotation styling for grammar-only endings such as `じゃないですか` and standalone polite copula tails like `ですよ` / `ですね`; bare `です` is still suppressed only when it is POS-tagged as grammar.
@@ -0,0 +1,4 @@
type: fixed
area: mpv
- Kept the visible overlay alive across same-media mpv reloads during buffering, avoiding duplicate startup gates and AniSkip lookups.
+49
View File
@@ -11,6 +11,29 @@ function M.create(ctx)
local subminer_log = ctx.log.subminer_log
local show_osd = ctx.log.show_osd
local function resolve_media_identity()
	-- Probe mpv properties from most to least specific and return the first
	-- one that is a non-empty string; nil means no identity is available.
	for _, property_name in ipairs({ "path", "filename", "media-title" }) do
		local value = mp.get_property(property_name)
		if type(value) == "string" and value ~= "" then
			return value
		end
	end
	return nil
end
local function is_reload_end_file(reason)
	-- True only for the end-file reasons "reload" and "redirect".
	if reason == "reload" then
		return true
	end
	return reason == "redirect"
end
local function schedule_aniskip_fetch(trigger_source, delay_seconds)
local delay = tonumber(delay_seconds) or 0
mp.add_timeout(delay, function()
@@ -41,6 +64,25 @@ function M.create(ctx)
end
local function on_file_loaded()
local media_identity = resolve_media_identity()
local same_media_reload = (
media_identity ~= nil
and state.pending_reload_media_identity ~= nil
and media_identity == state.pending_reload_media_identity
)
state.pending_reload_media_identity = nil
state.current_media_identity = media_identity
if same_media_reload then
subminer_log("debug", "lifecycle", "Skipping startup lifecycle for same-media mpv reload")
if state.overlay_running and resolve_auto_start_enabled() and process.has_matching_mpv_ipc_socket(opts.socket_path) then
process.run_control_command_async("show-visible-overlay", {
socket_path = opts.socket_path,
})
end
return
end
aniskip.clear_aniskip_state()
process.disarm_auto_play_ready_gate()
local has_matching_socket = rearm_managed_subtitle_defaults()
@@ -73,6 +115,8 @@ function M.create(ctx)
aniskip.clear_aniskip_state()
hover.clear_hover_overlay()
process.disarm_auto_play_ready_gate()
state.current_media_identity = nil
state.pending_reload_media_identity = nil
end
local function register_lifecycle_hooks()
@@ -85,6 +129,11 @@ function M.create(ctx)
process.disarm_auto_play_ready_gate()
hover.clear_hover_overlay()
local reason = type(event) == "table" and event.reason or nil
if is_reload_end_file(reason) then
state.pending_reload_media_identity = state.current_media_identity or resolve_media_identity()
return
end
state.pending_reload_media_identity = nil
if state.overlay_running and reason ~= "quit" then
process.hide_visible_overlay()
end
+2
View File
@@ -33,6 +33,8 @@ function M.new()
auto_play_ready_timeout = nil,
auto_play_ready_osd_timer = nil,
suppress_ready_overlay_restore = false,
current_media_identity = nil,
pending_reload_media_identity = nil,
session_binding_generation = 0,
session_binding_names = {},
session_numeric_binding_names = {},
+53
View File
@@ -461,6 +461,20 @@ local function has_async_curl_for(async_calls, needle)
return false
end
local function count_async_curl_for(async_calls, needle)
	-- A call matches when its first argument is "curl" and its last argument
	-- is a string containing `needle` as a plain (non-pattern) substring.
	local function targets_needle(call)
		local argv = call.args or {}
		if argv[1] ~= "curl" then
			return false
		end
		local last_arg = argv[#argv] or ""
		return type(last_arg) == "string" and last_arg:find(needle, 1, true) ~= nil
	end

	local total = 0
	for _, call in ipairs(async_calls) do
		if targets_needle(call) then
			total = total + 1
		end
	end
	return total
end
local function has_property_set(property_sets, name, value)
for _, call in ipairs(property_sets) do
if call.name == name and call.value == value then
@@ -578,6 +592,45 @@ do
)
end
do
	-- Regression scenario: file-loaded, autoplay-ready, end-file(reason=reload),
	-- then file-loaded again for the same path. The second load must not hide
	-- the overlay, pause again, or issue a second AniSkip request.
	local media_path = "/media/Sample Show S01E01.mkv"
	local socket_path = "/tmp/subminer-socket"
	local scenario = {
		process_list = "",
		option_overrides = {
			binary_path = binary_path,
			auto_start = "yes",
			auto_start_visible_overlay = "yes",
			auto_start_pause_until_ready = "yes",
			socket_path = socket_path,
		},
		input_ipc_server = socket_path,
		path = media_path,
		media_title = "Sample Show S01E01",
		mal_lookup_stdout = "__MAL_FOUND__",
		aniskip_stdout = "__ANISKIP_FOUND__",
		files = {
			[binary_path] = true,
		},
	}

	local recorded, err = run_plugin_scenario(scenario)
	assert_true(recorded ~= nil, "plugin failed to load for same-media reload scenario: " .. tostring(err))

	-- Initial load, then the ready signal from the app.
	fire_event(recorded, "file-loaded")
	recorded.script_messages["subminer-autoplay-ready"]()

	-- Reload of the same media.
	fire_event(recorded, "end-file", { reason = "reload" })
	fire_event(recorded, "file-loaded")

	local hide_calls = count_control_calls(recorded.async_calls, "--hide-visible-overlay")
	assert_true(hide_calls == 0, "same-media reload should not hide the visible overlay")

	local pause_sets = count_property_set(recorded.property_sets, "pause", true)
	assert_true(pause_sets == 1, "same-media reload should not re-arm pause-until-ready")

	local aniskip_requests = count_async_curl_for(recorded.async_calls, "api.aniskip.com")
	assert_true(aniskip_requests == 1, "same-media reload should not repeat AniSkip lookup")
end
do
local recorded, err = run_plugin_scenario({
process_list = "",
+313
View File
@@ -4227,6 +4227,211 @@ test('tokenizeSubtitle clears all annotations for explanatory contrast endings',
);
});
test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while preserving lexical content', async () => {
  // Flags shared by every raw MeCab token in this fixture.
  const mecabFlags = { isMerged: false, isKnown: false, isNPlusOneTarget: false };
  const knownWords = new Set(['みたい', 'の']);

  const result = await tokenizeSubtitle(
    'みたいなのあるじゃないですか',
    makeDepsFromYomitanTokens(
      [
        { surface: 'みたいな', reading: 'みたいな', headword: 'みたい' },
        { surface: 'の', reading: 'の', headword: 'の' },
        { surface: 'ある', reading: 'ある', headword: 'ある' },
        { surface: 'じゃないですか', reading: 'じゃないですか', headword: 'じゃない' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        getFrequencyRank: (text) => {
          switch (text) {
            case 'みたい':
              return 320;
            case 'ある':
              return 240;
            case 'じゃない':
              return 80;
            default:
              return null;
          }
        },
        getJlptLevel: (text) => {
          switch (text) {
            case 'みたい':
              return 'N4';
            case 'ある':
              return 'N5';
            case 'じゃない':
              return 'N5';
            default:
              return null;
          }
        },
        isKnownWord: (text) => knownWords.has(text),
        getMinSentenceWordsForNPlusOne: () => 1,
        tokenizeWithMecab: async () => [
          {
            ...mecabFlags,
            headword: 'みたい',
            surface: 'みたい',
            reading: 'ミタイ',
            startPos: 0,
            endPos: 3,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '非自立',
            pos3: '形容動詞語幹',
          },
          {
            ...mecabFlags,
            headword: 'だ',
            surface: 'な',
            reading: 'ナ',
            startPos: 3,
            endPos: 4,
            partOfSpeech: PartOfSpeech.bound_auxiliary,
            pos1: '助動詞',
            pos2: '*',
          },
          {
            ...mecabFlags,
            headword: 'の',
            surface: 'の',
            reading: '',
            startPos: 4,
            endPos: 5,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '非自立',
          },
          {
            ...mecabFlags,
            headword: 'ある',
            surface: 'ある',
            reading: 'アル',
            startPos: 5,
            endPos: 7,
            partOfSpeech: PartOfSpeech.verb,
            pos1: '動詞',
            pos2: '自立',
          },
          {
            ...mecabFlags,
            headword: 'じゃない',
            surface: 'じゃない',
            reading: 'ジャナイ',
            startPos: 7,
            endPos: 11,
            partOfSpeech: PartOfSpeech.i_adjective,
            pos1: '接続詞|形容詞',
            pos2: '*|自立',
          },
          {
            ...mecabFlags,
            headword: 'です',
            surface: 'です',
            reading: 'デス',
            startPos: 11,
            endPos: 13,
            partOfSpeech: PartOfSpeech.bound_auxiliary,
            pos1: '助動詞',
            pos2: '*',
          },
          {
            ...mecabFlags,
            headword: 'か',
            surface: 'か',
            reading: 'カ',
            startPos: 13,
            endPos: 14,
            partOfSpeech: PartOfSpeech.particle,
            pos1: '助詞',
            pos2: '副助詞/並立助詞/終助詞',
          },
        ],
      },
    ),
  );

  // Annotation-relevant view of the first output token with the given surface.
  const summaryFor = (surface: string) => {
    const match = result.tokens?.find((token) => token.surface === surface);
    if (match === undefined) {
      return undefined;
    }
    return {
      surface: match.surface,
      headword: match.headword,
      isKnown: match.isKnown,
      isNPlusOneTarget: match.isNPlusOneTarget,
      frequencyRank: match.frequencyRank,
      jlptLevel: match.jlptLevel,
    };
  };

  // The grammar-only ending loses every annotation.
  assert.deepEqual(summaryFor('じゃないですか'), {
    surface: 'じゃないですか',
    headword: 'じゃない',
    isKnown: false,
    isNPlusOneTarget: false,
    frequencyRank: undefined,
    jlptLevel: undefined,
  });

  // The neighbouring lexical verb keeps its frequency and JLPT metadata.
  assert.deepEqual(summaryFor('ある'), {
    surface: 'ある',
    headword: 'ある',
    isKnown: false,
    isNPlusOneTarget: false,
    frequencyRank: 240,
    jlptLevel: 'N5',
  });
});
test('tokenizeSubtitle clears annotations for standalone polite copula endings without POS metadata', async () => {
  const knownWords = new Set(['現実', 'は', 'です']);

  const result = await tokenizeSubtitle(
    '現実は感じですよ',
    makeDepsFromYomitanTokens(
      [
        { surface: '現実', reading: 'げんじつ', headword: '現実' },
        { surface: 'は', reading: 'は', headword: 'は' },
        { surface: '感じ', reading: 'かんじ', headword: '感じ' },
        { surface: 'ですよ', reading: 'ですよ', headword: 'です' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        getFrequencyRank: (text) => {
          switch (text) {
            case '現実':
              return 600;
            case '感じ':
              return 240;
            case 'です':
              return 50;
            default:
              return null;
          }
        },
        getJlptLevel: (text) => {
          switch (text) {
            case '現実':
              return 'N3';
            case '感じ':
              return 'N4';
            case 'です':
              return 'N5';
            default:
              return null;
          }
        },
        isKnownWord: (text) => knownWords.has(text),
        getMinSentenceWordsForNPlusOne: () => 1,
        // No MeCab output: the filter has to work without POS metadata.
        tokenizeWithMecab: async () => null,
      },
    ),
  );

  // Annotation-relevant view of the first output token with the given surface.
  const summaryFor = (surface: string) => {
    const match = result.tokens?.find((token) => token.surface === surface);
    if (match === undefined) {
      return undefined;
    }
    return {
      surface: match.surface,
      headword: match.headword,
      isKnown: match.isKnown,
      isNPlusOneTarget: match.isNPlusOneTarget,
      frequencyRank: match.frequencyRank,
      jlptLevel: match.jlptLevel,
    };
  };

  // `ですよ` is stripped even though its headword is known and ranked.
  assert.deepEqual(summaryFor('ですよ'), {
    surface: 'ですよ',
    headword: 'です',
    isKnown: false,
    isNPlusOneTarget: false,
    frequencyRank: undefined,
    jlptLevel: undefined,
  });

  // The lexical noun remains the N+1 target with full metadata.
  assert.deepEqual(summaryFor('感じ'), {
    surface: '感じ',
    headword: '感じ',
    isKnown: false,
    isNPlusOneTarget: true,
    frequencyRank: 240,
    jlptLevel: 'N4',
  });
});
test('tokenizeSubtitle clears annotations for ことに while preserving lexical N+1 target', async () => {
const result = await tokenizeSubtitle(
'さっきの俺と違うことに気付かないのかい?',
@@ -4446,6 +4651,114 @@ test('tokenizeSubtitle clears annotations for ことに while preserving lexical
);
});
test('tokenizeSubtitle clears annotations for auxiliary inflection fragments while preserving lexical N+1 target', async () => {
  // Flags shared by every raw MeCab token in this fixture.
  const mecabFlags = { isMerged: false, isKnown: false, isNPlusOneTarget: false };
  const knownWords = new Set(['私', 'れる']);

  const result = await tokenizeSubtitle(
    '私れた猫',
    makeDepsFromYomitanTokens(
      [
        { surface: '私', reading: 'わたし', headword: '私' },
        { surface: 'れた', reading: 'れた', headword: 'れる' },
        { surface: '猫', reading: 'ねこ', headword: '猫' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        getFrequencyRank: (text) => {
          switch (text) {
            case '私':
              return 50;
            case 'れる':
              return 18;
            case '猫':
              return 900;
            default:
              return null;
          }
        },
        getJlptLevel: (text) => {
          switch (text) {
            case '私':
              return 'N5';
            case 'れる':
              return 'N4';
            case '猫':
              return 'N5';
            default:
              return null;
          }
        },
        isKnownWord: (text) => knownWords.has(text),
        getMinSentenceWordsForNPlusOne: () => 1,
        tokenizeWithMecab: async () => [
          {
            ...mecabFlags,
            headword: '私',
            surface: '私',
            reading: 'ワタシ',
            startPos: 0,
            endPos: 1,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '代名詞',
          },
          {
            ...mecabFlags,
            headword: 'れる',
            surface: 'れ',
            reading: 'レ',
            startPos: 1,
            endPos: 2,
            partOfSpeech: PartOfSpeech.verb,
            pos1: '動詞',
            pos2: '接尾',
          },
          {
            ...mecabFlags,
            headword: 'た',
            surface: 'た',
            reading: 'タ',
            startPos: 2,
            endPos: 3,
            partOfSpeech: PartOfSpeech.bound_auxiliary,
            pos1: '助動詞',
            pos2: '*',
          },
          {
            ...mecabFlags,
            headword: '猫',
            surface: '猫',
            reading: 'ネコ',
            startPos: 3,
            endPos: 4,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '一般',
          },
        ],
      },
    ),
  );

  // Annotation-relevant view of the first output token with the given surface.
  const summaryFor = (surface: string) => {
    const match = result.tokens?.find((token) => token.surface === surface);
    if (match === undefined) {
      return undefined;
    }
    return {
      surface: match.surface,
      headword: match.headword,
      isKnown: match.isKnown,
      isNPlusOneTarget: match.isNPlusOneTarget,
      frequencyRank: match.frequencyRank,
      jlptLevel: match.jlptLevel,
    };
  };

  // The `れた` fragment is plain even though `れる` is known and ranked.
  assert.deepEqual(summaryFor('れた'), {
    surface: 'れた',
    headword: 'れる',
    isKnown: false,
    isNPlusOneTarget: false,
    frequencyRank: undefined,
    jlptLevel: undefined,
  });

  // The only unknown lexical token is still picked as the N+1 target.
  assert.deepEqual(summaryFor('猫'), {
    surface: '猫',
    headword: '猫',
    isKnown: false,
    isNPlusOneTarget: true,
    frequencyRank: 900,
    jlptLevel: 'N5',
  });
});
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
let mecabCalls = 0;
const result = await tokenizeSubtitle(
@@ -258,6 +258,48 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory contrast en
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes ja-nai explanatory endings', () => {
  // Bare ending, tagged the way MeCab's merged POS looks for `じゃない`.
  const bareEnding = makeToken({
    surface: 'じゃない',
    headword: 'じゃない',
    reading: 'ジャナイ',
    partOfSpeech: PartOfSpeech.i_adjective,
    pos1: '接続詞|形容詞',
    pos2: '*|自立',
  });
  // Same ending with the polite question tail merged in.
  const politeQuestionEnding = makeToken({
    surface: 'じゃないですか',
    headword: 'じゃない',
    reading: 'ジャナイデスカ',
    partOfSpeech: PartOfSpeech.i_adjective,
    pos1: '接続詞|形容詞|助動詞|助詞',
    pos2: '*|自立|*|副助詞/並立助詞/終助詞',
  });

  [bareEnding, politeQuestionEnding].forEach((token) => {
    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
  });
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone polite copula suffix endings without POS tags', () => {
  // Empty pos1/pos2: exclusion cannot come from POS tags here.
  const untaggedEnding = makeToken({
    surface: 'ですよ',
    headword: 'です',
    reading: 'デスヨ',
    partOfSpeech: PartOfSpeech.other,
    pos1: '',
    pos2: '',
  });

  assert.equal(
    shouldExcludeTokenFromSubtitleAnnotations(untaggedEnding),
    true,
    untaggedEnding.surface,
  );
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => {
const token = makeToken({
surface: 'そうだ',
@@ -627,7 +669,7 @@ test('annotateTokens N+1 handoff marks expected target when threshold is satisfi
assert.equal(result[2]?.isNPlusOneTarget, false);
});
test('annotateTokens does not mark kana-only unknown target with subtitle punctuation as N+1', () => {
test('annotateTokens does not mark kana-only unknown target as N+1', () => {
const tokens = [
makeToken({
surface: '何やら',
@@ -646,12 +688,12 @@ test('annotateTokens does not mark kana-only unknown target with subtitle punctu
endPos: 6,
}),
makeToken({
surface: 'スイッチ…',
headword: 'スイッチ',
surface: 'すいっち',
headword: 'すいっち',
reading: 'スイッチ',
pos1: '名詞',
startPos: 6,
endPos: 11,
endPos: 10,
}),
];
@@ -1204,6 +1246,78 @@ test('annotateTokens clears all annotations for kana-only non-independent noun h
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for standalone auxiliary inflection fragments', () => {
  // Dictionary form: a lone 動詞/接尾 token.
  const dictionaryForm = makeToken({
    surface: 'れる',
    headword: 'れる',
    reading: 'レル',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '接尾',
    startPos: 0,
    endPos: 2,
    frequencyRank: 18,
  });
  // Past form: 動詞/接尾 followed by a trailing 助動詞.
  const pastForm = makeToken({
    surface: 'れた',
    headword: 'れる',
    reading: 'レタ',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞|助動詞',
    pos2: '接尾|*',
    startPos: 2,
    endPos: 4,
    frequencyRank: 19,
  });

  const deps = makeDeps({
    isKnownWord: (text) => text === 'れる',
    getJlptLevel: (text) => (text === 'れる' ? 'N4' : null),
  });
  const annotated = annotateTokens([dictionaryForm, pastForm], deps, {
    minSentenceWordsForNPlusOne: 1,
  });

  annotated.forEach((token) => {
    assert.equal(token.isKnown, false, token.surface);
    assert.equal(token.isNPlusOneTarget, false, token.surface);
    assert.equal(token.frequencyRank, undefined, token.surface);
    assert.equal(token.jlptLevel, undefined, token.surface);
  });
});
test('annotateTokens keeps lexical くれる forms eligible for annotation', () => {
  // 動詞/自立 marks this `くれ` as a lexical verb rather than a `れる` fragment.
  const lexicalKure = makeToken({
    surface: 'くれ',
    headword: 'くれる',
    reading: 'クレ',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '自立',
    startPos: 0,
    endPos: 2,
    frequencyRank: 20,
  });
  const deps = makeDeps({
    getJlptLevel: (text) => (text === 'くれる' ? 'N4' : null),
  });

  const [annotated] = annotateTokens([lexicalKure], deps, { minSentenceWordsForNPlusOne: 1 });

  assert.equal(annotated?.isKnown, false);
  assert.equal(annotated?.isNPlusOneTarget, false);
  assert.equal(annotated?.frequencyRank, 20);
  assert.equal(annotated?.jlptLevel, 'N4');
});
test('annotateTokens clears all annotations for standalone して helper fragments', () => {
const tokens = [
makeToken({
@@ -63,6 +63,24 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES = [
'かな',
'かね',
] as const;
// Suffixes appended to `です` to build SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.
// There is no empty suffix, so bare `です` is not part of the generated set.
const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES = [
'か',
'ね',
'よ',
'な',
] as const;
// Suffixes appended to `じゃない` to build SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.
// The empty suffix makes bare `じゃない` itself a member of the generated set.
const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES = [
'',
'か',
'ね',
'よ',
'な',
'です',
'ですか',
'ですよ',
'ですね',
'ですな',
] as const;
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
@@ -72,6 +90,12 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
),
),
);
// `です` + each polite-copula suffix (e.g. `ですよ`).
const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS = new Set(
  Array.from(SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES, (suffix) => 'です' + suffix),
);
// `じゃない` + each ja-nai suffix (e.g. `じゃないですか`).
const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS = new Set(
  Array.from(SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES, (suffix) => 'じゃない' + suffix),
);
const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
'って',
'ってよ',
@@ -83,6 +107,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
]);
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set(['助詞', '助動詞']);
// pos1 tags allowed after a leading `動詞` tag when isStandaloneAuxiliaryInflectionFragment
// classifies a `動詞/接尾` token as an auxiliary inflection fragment.
const AUXILIARY_INFLECTION_TRAILING_POS1 = new Set(['助動詞']);
const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
'か',
'が',
@@ -312,6 +337,44 @@ function isKanaOnlyText(text: string): boolean {
return normalized.length > 0 && [...normalized].every(isKanaChar);
}
/**
 * Reports whether the token is the stem `くれ` of the verb `くれる`, tagged with
 * exactly one pos1 part (`動詞`) and exactly one pos2 part (`自立`).
 */
function isLexicalKureruVerb(token: MergedToken): boolean {
  const surface = normalizeKana(token.surface);
  const headword = normalizeKana(token.headword);
  const pos1Tags = splitNormalizedTagParts(normalizePosTag(token.pos1));
  const pos2Tags = splitNormalizedTagParts(normalizePosTag(token.pos2));

  if (surface !== 'くれ' || headword !== 'くれる') {
    return false;
  }
  if (pos1Tags.length !== 1 || pos2Tags.length !== 1) {
    return false;
  }
  return pos1Tags[0] === '動詞' && pos2Tags[0] === '自立';
}
/**
 * Reports whether a kana-only token is a standalone auxiliary inflection fragment:
 * either every pos1 part is `助動詞`, or the token starts as `動詞/接尾` and every
 * remaining pos1 part is in AUXILIARY_INFLECTION_TRAILING_POS1.
 */
function isStandaloneAuxiliaryInflectionFragment(token: MergedToken): boolean {
  if (!isKanaOnlyText(normalizeKana(token.surface))) {
    return false;
  }

  const pos1Tags = splitNormalizedTagParts(normalizePosTag(token.pos1));
  if (pos1Tags.length === 0) {
    return false;
  }

  const hasNonAuxiliaryTag = pos1Tags.some((tag) => tag !== '助動詞');
  if (!hasNonAuxiliaryTag) {
    return true;
  }

  const pos2Tags = splitNormalizedTagParts(normalizePosTag(token.pos2));
  if (pos1Tags[0] !== '動詞' || pos2Tags[0] !== '接尾') {
    return false;
  }

  // Anything after the leading verb tag must be an allowed trailing tag.
  for (const trailingTag of pos1Tags.slice(1)) {
    if (!AUXILIARY_INFLECTION_TRAILING_POS1.has(trailingTag)) {
      return false;
    }
  }
  return true;
}
function isStandaloneSuruTeGrammarHelper(token: MergedToken): boolean {
const normalizedSurface = normalizeKana(token.surface);
const normalizedHeadword = normalizeKana(token.headword);
@@ -370,6 +433,10 @@ function isExcludedByTerm(token: MergedToken): boolean {
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) ||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) ||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) ||
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(trimmed) ||
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(normalized) ||
SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(trimmed) ||
SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(normalized) ||
shouldIgnoreJlptByTerm(trimmed) ||
shouldIgnoreJlptByTerm(normalized)
) {
@@ -426,6 +493,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
return true;
}
if (isStandaloneAuxiliaryInflectionFragment(token)) {
return true;
}
if (isStandaloneSuruTeGrammarHelper(token)) {
return true;
}
@@ -442,6 +513,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
return true;
}
if (isLexicalKureruVerb(token)) {
return false;
}
return isExcludedByTerm(token);
}