mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-28 00:55:16 -07:00
Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
756b368020
|
|||
|
c7996e3daa
|
|||
|
8a1ad6e4f3
|
|||
|
833afd451e
|
|||
|
d9a8636486
|
|||
|
35ba8778f3
|
|||
|
bdbacb1304
|
|||
|
dccc83f120
|
|||
|
c016ce0249
|
|||
|
901f65440d
|
|||
|
c289435251
|
|||
|
4b3ebd6ef6
|
|||
|
13e2b5f8c8
|
+14
-5
@@ -2,9 +2,10 @@
|
||||
id: TASK-305
|
||||
title: Use Yomitan word classes for subtitle token POS filtering
|
||||
status: Done
|
||||
assignee: []
|
||||
assignee:
|
||||
- Codex
|
||||
created_date: '2026-04-26 05:56'
|
||||
updated_date: '2026-04-26 05:59'
|
||||
updated_date: '2026-05-02 22:47'
|
||||
labels:
|
||||
- tokenizer
|
||||
- yomitan
|
||||
@@ -34,22 +35,30 @@ Subtitle annotation filtering currently uses Yomitan token spans, then enriches
|
||||
3. Map recognized Yomitan wordClasses to SubMiner coarse PartOfSpeech/POS metadata before annotation filtering.
|
||||
4. Keep MeCab whole-line enrichment as fallback/detail-fill for missing POS fields.
|
||||
5. Run focused tokenizer tests and typecheck.
|
||||
|
||||
2026-05-02 review follow-up: inspect latest CodeRabbit review on PR #57, classify each finding as actionable/not actionable, patch scoped issues, run focused verification, then update final notes. User request to address/assess the review is the approval for this follow-up.
|
||||
<!-- SECTION:PLAN:END -->
|
||||
|
||||
## Implementation Notes
|
||||
|
||||
<!-- SECTION:NOTES:BEGIN -->
|
||||
Implemented app-only wordClasses extraction from termsFind results; no vendored Yomitan changes required. Recognized classes currently map prt, aux, v*, adj-i/adj-ix, adj-na, and noun-like classes to SubMiner POS metadata. MeCab enrichment now skips only tokens with complete pos1/pos2/pos3 and otherwise fills missing fields while preserving existing coarse pos1. Verification: bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts; bun run typecheck.
|
||||
|
||||
2026-05-02 CodeRabbit latest review assessment: only current actionable finding was in src/core/services/tokenizer/annotation-stage.test.ts, where a kana-only regression fixture used mixed-script/punctuation surface text. Earlier CodeRabbit findings in this PR were already marked addressed by prior commits. Patched the fixture to use pure-kana surface/headword and renamed the test to match the exercised behavior. Verification: bun test src/core/services/tokenizer/annotation-stage.test.ts; bun run typecheck.
|
||||
<!-- SECTION:NOTES:END -->
|
||||
|
||||
## Final Summary
|
||||
|
||||
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||
Implemented app-only Yomitan wordClasses support for subtitle token annotation filtering. The scanner now carries matched headword wordClasses from termsFind results, tokenizer maps recognized classes into SubMiner coarse POS metadata before annotation, and MeCab whole-line enrichment continues to fill missing detailed POS fields without requiring vendored Yomitan changes.
|
||||
Implemented app-only Yomitan wordClasses support for subtitle token annotation filtering. The scanner carries matched headword wordClasses from termsFind results, tokenizer maps recognized classes into SubMiner coarse POS metadata before annotation, and MeCab whole-line enrichment continues to fill missing detailed POS fields without requiring vendored Yomitan changes.
|
||||
|
||||
2026-05-02 CodeRabbit follow-up:
|
||||
- Assessed the latest CodeRabbit review on PR #57. Only one new actionable finding remained: the kana-only N+1 regression test used a mixed/punctuated surface.
|
||||
- Updated the fixture in src/core/services/tokenizer/annotation-stage.test.ts to use a pure-kana unknown target and renamed the test accordingly.
|
||||
|
||||
Tests run:
|
||||
- bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts
|
||||
- bun test src/core/services/tokenizer/annotation-stage.test.ts
|
||||
- bun run typecheck
|
||||
|
||||
Note: the working tree already had unrelated tokenizer/annotation edits and task-304 before this work; those were left intact.
|
||||
Note: earlier CodeRabbit findings on this PR were already marked addressed in prior commits; no further latest-review issues were left unresolved in this pass.
|
||||
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||
|
||||
+43
@@ -0,0 +1,43 @@
|
||||
---
|
||||
id: TASK-311
|
||||
title: Suppress auxiliary inflection fragments from subtitle annotations
|
||||
status: Done
|
||||
assignee: []
|
||||
created_date: '2026-05-02 09:07'
|
||||
updated_date: '2026-05-02 09:10'
|
||||
labels:
|
||||
- tokenizer
|
||||
- annotations
|
||||
- bug
|
||||
dependencies: []
|
||||
priority: medium
|
||||
---
|
||||
|
||||
## Description
|
||||
|
||||
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||
Suppress standalone Japanese auxiliary/inflection subtitle fragments such as `れる` and `れた` from frequency/JLPT/N+1/known annotation styling while keeping lexical verbs such as `くれ` / `くれる` annotatable. Tokens must remain hoverable; only annotation metadata should be stripped.
|
||||
<!-- SECTION:DESCRIPTION:END -->
|
||||
|
||||
## Acceptance Criteria
|
||||
<!-- AC:BEGIN -->
|
||||
- [x] #1 `れる` and `れた`-style standalone helper fragments render as plain hoverable subtitle tokens.
|
||||
- [x] #2 Lexical verbs like `くれ` / `くれる` remain eligible for annotation.
|
||||
- [x] #3 Regression tests cover unit filter behavior and tokenizer integration.
|
||||
<!-- AC:END -->
|
||||
|
||||
## Implementation Notes
|
||||
|
||||
<!-- SECTION:NOTES:BEGIN -->
|
||||
Implemented with TDD. Added failing coverage first for standalone `れる`/`れた` auxiliary fragments and a lexical `くれ`/`くれる` guard. Updated the shared subtitle annotation filter to strip annotation metadata for kana-only auxiliary inflection fragments identified by MeCab POS (`助動詞` only, or `動詞/接尾` with optional trailing `助動詞`) while preserving lexical `くれ` as `くれる` when tagged `動詞/自立`. Added tokenizer integration coverage for `れた` and neighboring lexical N+1 behavior.
|
||||
<!-- SECTION:NOTES:END -->
|
||||
|
||||
## Final Summary
|
||||
|
||||
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||
Suppressed annotation metadata for standalone auxiliary inflection fragments such as `れる` and `れた` in subtitle tokens, leaving them hoverable but plain. Preserved lexical `くれ` -> `くれる` verb metadata when MeCab tags it as `動詞/自立`.
|
||||
|
||||
Added unit and tokenizer regression coverage, plus a release fragment in `changes/311-auxiliary-inflection-annotation-filter.md`.
|
||||
|
||||
Validation: targeted annotation/tokenizer tests passed; `bun run typecheck` passed; `bun run changelog:lint` passed. `bun run test:fast` was attempted twice and failed in unrelated `src/core/services/subsync.test.ts` cross-file state (`window.electronAPI` undefined), while `bun test src/core/services/subsync.test.ts` passes by itself.
|
||||
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||
@@ -0,0 +1,42 @@
|
||||
---
|
||||
id: TASK-312
|
||||
title: Suppress ja-nai explanatory ending subtitle annotations
|
||||
status: Done
|
||||
assignee: []
|
||||
created_date: '2026-05-02 09:55'
|
||||
updated_date: '2026-05-02 10:03'
|
||||
labels:
|
||||
- tokenizer
|
||||
- annotations
|
||||
- bug
|
||||
dependencies: []
|
||||
priority: medium
|
||||
---
|
||||
|
||||
## Description
|
||||
|
||||
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||
Suppress subtitle annotation styling for grammar-only explanatory endings like `じゃない` and `じゃないですか` while preserving nearby lexical content annotations.
|
||||
<!-- SECTION:DESCRIPTION:END -->
|
||||
|
||||
## Acceptance Criteria
|
||||
<!-- AC:BEGIN -->
|
||||
- [x] #1 `じゃない` and `じゃないですか`-style endings render as plain hoverable subtitle tokens.
|
||||
- [x] #2 The reported phrase `みたいなのあるじゃないですか` does not annotate `じゃない`/`じゃないですか` as lexical/frequency content.
|
||||
- [x] #3 Regression tests cover unit filter behavior and tokenizer integration without suppressing lexical content tokens.
|
||||
- [x] #4 Standalone polite copula suffix endings such as `ですか` / `ですよ` render as plain hoverable subtitle tokens even if POS metadata is missing or too lexical (bare `です` is still stripped only when it carries grammar POS tags).
|
||||
<!-- AC:END -->
|
||||
|
||||
## Implementation Notes
|
||||
|
||||
<!-- SECTION:NOTES:BEGIN -->
|
||||
Added failing coverage first for `じゃない` / `じゃないですか` and `ですよ` leaking annotation metadata when POS metadata is missing or too lexical. Implemented term-family exclusions in the shared subtitle annotation filter for the `じゃない` explanatory family and polite copula suffix endings (`ですか`, `ですね`, `ですよ`, `ですな`). Kept bare `です` term-only behavior unchanged to preserve existing no-POS frequency tests; POS-tagged `です` is already stripped by the grammar POS exclusion path.
|
||||
<!-- SECTION:NOTES:END -->
|
||||
|
||||
## Final Summary
|
||||
|
||||
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||
Suppressed subtitle annotation metadata for grammar-only endings like `じゃないですか` and `ですよ`, while preserving nearby lexical content annotations. Added unit and tokenizer regression coverage for the reported `みたいなのあるじゃないですか` and `感じですよ` shapes, plus changelog fragment `changes/312-grammar-ending-annotation-filter.md`.
|
||||
|
||||
Validation: `bun test src/core/services/tokenizer/annotation-stage.test.ts`; `bun test src/core/services/tokenizer.test.ts`; `bun run typecheck`; `bun run changelog:lint`; `git diff --check`.
|
||||
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||
@@ -0,0 +1,28 @@
|
||||
---
|
||||
id: TASK-313
|
||||
title: Fix mpv buffering reload overlay lifecycle
|
||||
status: To Do
|
||||
assignee: []
|
||||
created_date: '2026-05-02 22:12'
|
||||
labels:
|
||||
- bug
|
||||
- mpv
|
||||
- overlay
|
||||
dependencies: []
|
||||
priority: high
|
||||
---
|
||||
|
||||
## Description
|
||||
|
||||
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||
macOS local playback can emit an mpv reload/end-file/file-loaded sequence during buffering. SubMiner should treat same-media reload churn as a continuation, not a fresh playback session, so the visible overlay remains available and startup-only tokenization/AniSkip work is not repeated unnecessarily.
|
||||
<!-- SECTION:DESCRIPTION:END -->
|
||||
|
||||
## Acceptance Criteria
|
||||
<!-- AC:BEGIN -->
|
||||
- [ ] #1 Same-media mpv reload buffering does not hide the visible overlay.
|
||||
- [ ] #2 Same-media mpv reload buffering does not re-arm the pause-until-ready startup gate or wait for a second tokenization-ready signal.
|
||||
- [ ] #3 Same-media mpv reload buffering does not repeat AniSkip lookup work for the already-loaded media.
|
||||
- [ ] #4 Normal new-file playback still clears per-media state, applies managed subtitle defaults, auto-starts/updates the overlay, and runs needed startup checks.
|
||||
- [ ] #5 Regression coverage exercises the buffering reload/end-file/file-loaded sequence in the mpv plugin lifecycle.
|
||||
<!-- AC:END -->
|
||||
@@ -0,0 +1,4 @@
|
||||
type: fixed
|
||||
area: overlay
|
||||
|
||||
- Suppressed subtitle annotation styling for standalone auxiliary inflection fragments such as `れる` and `れた` while keeping lexical `くれる` forms eligible for lookup metadata.
|
||||
@@ -0,0 +1,4 @@
|
||||
type: fixed
|
||||
area: overlay
|
||||
|
||||
- Suppressed subtitle annotation styling for grammar-only endings such as `じゃないですか` and standalone polite copula tails like `ですか` / `ですよ`.
|
||||
@@ -0,0 +1,4 @@
|
||||
type: fixed
|
||||
area: mpv
|
||||
|
||||
- Kept the visible overlay alive across same-media mpv reloads during buffering, avoiding duplicate startup gates and AniSkip lookups.
|
||||
@@ -11,6 +11,29 @@ function M.create(ctx)
|
||||
local subminer_log = ctx.log.subminer_log
|
||||
local show_osd = ctx.log.show_osd
|
||||
|
||||
local function resolve_media_identity()
|
||||
local path = mp.get_property("path")
|
||||
if type(path) == "string" and path ~= "" then
|
||||
return path
|
||||
end
|
||||
|
||||
local filename = mp.get_property("filename")
|
||||
if type(filename) == "string" and filename ~= "" then
|
||||
return filename
|
||||
end
|
||||
|
||||
local media_title = mp.get_property("media-title")
|
||||
if type(media_title) == "string" and media_title ~= "" then
|
||||
return media_title
|
||||
end
|
||||
|
||||
return nil
|
||||
end
|
||||
|
||||
local function is_reload_end_file(reason)
|
||||
return reason == "reload" or reason == "redirect"
|
||||
end
|
||||
|
||||
local function schedule_aniskip_fetch(trigger_source, delay_seconds)
|
||||
local delay = tonumber(delay_seconds) or 0
|
||||
mp.add_timeout(delay, function()
|
||||
@@ -41,6 +64,25 @@ function M.create(ctx)
|
||||
end
|
||||
|
||||
local function on_file_loaded()
|
||||
local media_identity = resolve_media_identity()
|
||||
local same_media_reload = (
|
||||
media_identity ~= nil
|
||||
and state.pending_reload_media_identity ~= nil
|
||||
and media_identity == state.pending_reload_media_identity
|
||||
)
|
||||
state.pending_reload_media_identity = nil
|
||||
state.current_media_identity = media_identity
|
||||
|
||||
if same_media_reload then
|
||||
subminer_log("debug", "lifecycle", "Skipping startup lifecycle for same-media mpv reload")
|
||||
if state.overlay_running and resolve_auto_start_enabled() and process.has_matching_mpv_ipc_socket(opts.socket_path) then
|
||||
process.run_control_command_async("show-visible-overlay", {
|
||||
socket_path = opts.socket_path,
|
||||
})
|
||||
end
|
||||
return
|
||||
end
|
||||
|
||||
aniskip.clear_aniskip_state()
|
||||
process.disarm_auto_play_ready_gate()
|
||||
local has_matching_socket = rearm_managed_subtitle_defaults()
|
||||
@@ -73,6 +115,8 @@ function M.create(ctx)
|
||||
aniskip.clear_aniskip_state()
|
||||
hover.clear_hover_overlay()
|
||||
process.disarm_auto_play_ready_gate()
|
||||
state.current_media_identity = nil
|
||||
state.pending_reload_media_identity = nil
|
||||
end
|
||||
|
||||
local function register_lifecycle_hooks()
|
||||
@@ -85,6 +129,11 @@ function M.create(ctx)
|
||||
process.disarm_auto_play_ready_gate()
|
||||
hover.clear_hover_overlay()
|
||||
local reason = type(event) == "table" and event.reason or nil
|
||||
if is_reload_end_file(reason) then
|
||||
state.pending_reload_media_identity = state.current_media_identity or resolve_media_identity()
|
||||
return
|
||||
end
|
||||
state.pending_reload_media_identity = nil
|
||||
if state.overlay_running and reason ~= "quit" then
|
||||
process.hide_visible_overlay()
|
||||
end
|
||||
|
||||
@@ -33,6 +33,8 @@ function M.new()
|
||||
auto_play_ready_timeout = nil,
|
||||
auto_play_ready_osd_timer = nil,
|
||||
suppress_ready_overlay_restore = false,
|
||||
current_media_identity = nil,
|
||||
pending_reload_media_identity = nil,
|
||||
session_binding_generation = 0,
|
||||
session_binding_names = {},
|
||||
session_numeric_binding_names = {},
|
||||
|
||||
@@ -461,6 +461,20 @@ local function has_async_curl_for(async_calls, needle)
|
||||
return false
|
||||
end
|
||||
|
||||
local function count_async_curl_for(async_calls, needle)
|
||||
local count = 0
|
||||
for _, call in ipairs(async_calls) do
|
||||
local args = call.args or {}
|
||||
if args[1] == "curl" then
|
||||
local url = args[#args] or ""
|
||||
if type(url) == "string" and url:find(needle, 1, true) then
|
||||
count = count + 1
|
||||
end
|
||||
end
|
||||
end
|
||||
return count
|
||||
end
|
||||
|
||||
local function has_property_set(property_sets, name, value)
|
||||
for _, call in ipairs(property_sets) do
|
||||
if call.name == name and call.value == value then
|
||||
@@ -578,6 +592,45 @@ do
|
||||
)
|
||||
end
|
||||
|
||||
do
|
||||
local media_path = "/media/Sample Show S01E01.mkv"
|
||||
local recorded, err = run_plugin_scenario({
|
||||
process_list = "",
|
||||
option_overrides = {
|
||||
binary_path = binary_path,
|
||||
auto_start = "yes",
|
||||
auto_start_visible_overlay = "yes",
|
||||
auto_start_pause_until_ready = "yes",
|
||||
socket_path = "/tmp/subminer-socket",
|
||||
},
|
||||
input_ipc_server = "/tmp/subminer-socket",
|
||||
path = media_path,
|
||||
media_title = "Sample Show S01E01",
|
||||
mal_lookup_stdout = "__MAL_FOUND__",
|
||||
aniskip_stdout = "__ANISKIP_FOUND__",
|
||||
files = {
|
||||
[binary_path] = true,
|
||||
},
|
||||
})
|
||||
assert_true(recorded ~= nil, "plugin failed to load for same-media reload scenario: " .. tostring(err))
|
||||
fire_event(recorded, "file-loaded")
|
||||
recorded.script_messages["subminer-autoplay-ready"]()
|
||||
fire_event(recorded, "end-file", { reason = "reload" })
|
||||
fire_event(recorded, "file-loaded")
|
||||
assert_true(
|
||||
count_control_calls(recorded.async_calls, "--hide-visible-overlay") == 0,
|
||||
"same-media reload should not hide the visible overlay"
|
||||
)
|
||||
assert_true(
|
||||
count_property_set(recorded.property_sets, "pause", true) == 1,
|
||||
"same-media reload should not re-arm pause-until-ready"
|
||||
)
|
||||
assert_true(
|
||||
count_async_curl_for(recorded.async_calls, "api.aniskip.com") == 1,
|
||||
"same-media reload should not repeat AniSkip lookup"
|
||||
)
|
||||
end
|
||||
|
||||
do
|
||||
local recorded, err = run_plugin_scenario({
|
||||
process_list = "",
|
||||
|
||||
@@ -4227,6 +4227,211 @@ test('tokenizeSubtitle clears all annotations for explanatory contrast endings',
|
||||
);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while preserving lexical content', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'みたいなのあるじゃないですか',
|
||||
makeDepsFromYomitanTokens(
|
||||
[
|
||||
{ surface: 'みたいな', reading: 'みたいな', headword: 'みたい' },
|
||||
{ surface: 'の', reading: 'の', headword: 'の' },
|
||||
{ surface: 'ある', reading: 'ある', headword: 'ある' },
|
||||
{ surface: 'じゃないですか', reading: 'じゃないですか', headword: 'じゃない' },
|
||||
],
|
||||
{
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getFrequencyRank: (text) =>
|
||||
text === 'みたい' ? 320 : text === 'ある' ? 240 : text === 'じゃない' ? 80 : null,
|
||||
getJlptLevel: (text) =>
|
||||
text === 'みたい' ? 'N4' : text === 'ある' ? 'N5' : text === 'じゃない' ? 'N5' : null,
|
||||
isKnownWord: (text) => text === 'みたい' || text === 'の',
|
||||
getMinSentenceWordsForNPlusOne: () => 1,
|
||||
tokenizeWithMecab: async () => [
|
||||
{
|
||||
headword: 'みたい',
|
||||
surface: 'みたい',
|
||||
reading: 'ミタイ',
|
||||
startPos: 0,
|
||||
endPos: 3,
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
pos1: '名詞',
|
||||
pos2: '非自立',
|
||||
pos3: '形容動詞語幹',
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
{
|
||||
headword: 'だ',
|
||||
surface: 'な',
|
||||
reading: 'ナ',
|
||||
startPos: 3,
|
||||
endPos: 4,
|
||||
partOfSpeech: PartOfSpeech.bound_auxiliary,
|
||||
pos1: '助動詞',
|
||||
pos2: '*',
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
{
|
||||
headword: 'の',
|
||||
surface: 'の',
|
||||
reading: 'ノ',
|
||||
startPos: 4,
|
||||
endPos: 5,
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
pos1: '名詞',
|
||||
pos2: '非自立',
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
{
|
||||
headword: 'ある',
|
||||
surface: 'ある',
|
||||
reading: 'アル',
|
||||
startPos: 5,
|
||||
endPos: 7,
|
||||
partOfSpeech: PartOfSpeech.verb,
|
||||
pos1: '動詞',
|
||||
pos2: '自立',
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
{
|
||||
headword: 'じゃない',
|
||||
surface: 'じゃない',
|
||||
reading: 'ジャナイ',
|
||||
startPos: 7,
|
||||
endPos: 11,
|
||||
partOfSpeech: PartOfSpeech.i_adjective,
|
||||
pos1: '接続詞|形容詞',
|
||||
pos2: '*|自立',
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
{
|
||||
headword: 'です',
|
||||
surface: 'です',
|
||||
reading: 'デス',
|
||||
startPos: 11,
|
||||
endPos: 13,
|
||||
partOfSpeech: PartOfSpeech.bound_auxiliary,
|
||||
pos1: '助動詞',
|
||||
pos2: '*',
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
{
|
||||
headword: 'か',
|
||||
surface: 'か',
|
||||
reading: 'カ',
|
||||
startPos: 13,
|
||||
endPos: 14,
|
||||
partOfSpeech: PartOfSpeech.particle,
|
||||
pos1: '助詞',
|
||||
pos2: '副助詞/並立助詞/終助詞',
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
],
|
||||
},
|
||||
),
|
||||
);
|
||||
|
||||
const tokenSummary = result.tokens?.map((token) => ({
|
||||
surface: token.surface,
|
||||
headword: token.headword,
|
||||
isKnown: token.isKnown,
|
||||
isNPlusOneTarget: token.isNPlusOneTarget,
|
||||
frequencyRank: token.frequencyRank,
|
||||
jlptLevel: token.jlptLevel,
|
||||
}));
|
||||
|
||||
assert.deepEqual(
|
||||
tokenSummary?.find((token) => token.surface === 'じゃないですか'),
|
||||
{
|
||||
surface: 'じゃないですか',
|
||||
headword: 'じゃない',
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
frequencyRank: undefined,
|
||||
jlptLevel: undefined,
|
||||
},
|
||||
);
|
||||
assert.deepEqual(
|
||||
tokenSummary?.find((token) => token.surface === 'ある'),
|
||||
{
|
||||
surface: 'ある',
|
||||
headword: 'ある',
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
frequencyRank: 240,
|
||||
jlptLevel: 'N5',
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle clears annotations for standalone polite copula endings without POS metadata', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'現実は感じですよ',
|
||||
makeDepsFromYomitanTokens(
|
||||
[
|
||||
{ surface: '現実', reading: 'げんじつ', headword: '現実' },
|
||||
{ surface: 'は', reading: 'は', headword: 'は' },
|
||||
{ surface: '感じ', reading: 'かんじ', headword: '感じ' },
|
||||
{ surface: 'ですよ', reading: 'ですよ', headword: 'です' },
|
||||
],
|
||||
{
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getFrequencyRank: (text) =>
|
||||
text === '現実' ? 600 : text === '感じ' ? 240 : text === 'です' ? 50 : null,
|
||||
getJlptLevel: (text) =>
|
||||
text === '現実' ? 'N3' : text === '感じ' ? 'N4' : text === 'です' ? 'N5' : null,
|
||||
isKnownWord: (text) => text === '現実' || text === 'は' || text === 'です',
|
||||
getMinSentenceWordsForNPlusOne: () => 1,
|
||||
tokenizeWithMecab: async () => null,
|
||||
},
|
||||
),
|
||||
);
|
||||
|
||||
const tokenSummary = result.tokens?.map((token) => ({
|
||||
surface: token.surface,
|
||||
headword: token.headword,
|
||||
isKnown: token.isKnown,
|
||||
isNPlusOneTarget: token.isNPlusOneTarget,
|
||||
frequencyRank: token.frequencyRank,
|
||||
jlptLevel: token.jlptLevel,
|
||||
}));
|
||||
|
||||
assert.deepEqual(
|
||||
tokenSummary?.find((token) => token.surface === 'ですよ'),
|
||||
{
|
||||
surface: 'ですよ',
|
||||
headword: 'です',
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
frequencyRank: undefined,
|
||||
jlptLevel: undefined,
|
||||
},
|
||||
);
|
||||
assert.deepEqual(
|
||||
tokenSummary?.find((token) => token.surface === '感じ'),
|
||||
{
|
||||
surface: '感じ',
|
||||
headword: '感じ',
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: true,
|
||||
frequencyRank: 240,
|
||||
jlptLevel: 'N4',
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle clears annotations for ことに while preserving lexical N+1 target', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'さっきの俺と違うことに気付かないのかい?',
|
||||
@@ -4446,6 +4651,114 @@ test('tokenizeSubtitle clears annotations for ことに while preserving lexical
|
||||
);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle clears annotations for auxiliary inflection fragments while preserving lexical N+1 target', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'私れた猫',
|
||||
makeDepsFromYomitanTokens(
|
||||
[
|
||||
{ surface: '私', reading: 'わたし', headword: '私' },
|
||||
{ surface: 'れた', reading: 'れた', headword: 'れる' },
|
||||
{ surface: '猫', reading: 'ねこ', headword: '猫' },
|
||||
],
|
||||
{
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getFrequencyRank: (text) =>
|
||||
text === '私' ? 50 : text === 'れる' ? 18 : text === '猫' ? 900 : null,
|
||||
getJlptLevel: (text) =>
|
||||
text === '私' ? 'N5' : text === 'れる' ? 'N4' : text === '猫' ? 'N5' : null,
|
||||
isKnownWord: (text) => text === '私' || text === 'れる',
|
||||
getMinSentenceWordsForNPlusOne: () => 1,
|
||||
tokenizeWithMecab: async () => [
|
||||
{
|
||||
headword: '私',
|
||||
surface: '私',
|
||||
reading: 'ワタシ',
|
||||
startPos: 0,
|
||||
endPos: 1,
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
pos1: '名詞',
|
||||
pos2: '代名詞',
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
{
|
||||
headword: 'れる',
|
||||
surface: 'れ',
|
||||
reading: 'レ',
|
||||
startPos: 1,
|
||||
endPos: 2,
|
||||
partOfSpeech: PartOfSpeech.verb,
|
||||
pos1: '動詞',
|
||||
pos2: '接尾',
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
{
|
||||
headword: 'た',
|
||||
surface: 'た',
|
||||
reading: 'タ',
|
||||
startPos: 2,
|
||||
endPos: 3,
|
||||
partOfSpeech: PartOfSpeech.bound_auxiliary,
|
||||
pos1: '助動詞',
|
||||
pos2: '*',
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
{
|
||||
headword: '猫',
|
||||
surface: '猫',
|
||||
reading: 'ネコ',
|
||||
startPos: 3,
|
||||
endPos: 4,
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
pos1: '名詞',
|
||||
pos2: '一般',
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
],
|
||||
},
|
||||
),
|
||||
);
|
||||
|
||||
const tokenSummary = result.tokens?.map((token) => ({
|
||||
surface: token.surface,
|
||||
headword: token.headword,
|
||||
isKnown: token.isKnown,
|
||||
isNPlusOneTarget: token.isNPlusOneTarget,
|
||||
frequencyRank: token.frequencyRank,
|
||||
jlptLevel: token.jlptLevel,
|
||||
}));
|
||||
|
||||
assert.deepEqual(
|
||||
tokenSummary?.find((token) => token.surface === 'れた'),
|
||||
{
|
||||
surface: 'れた',
|
||||
headword: 'れる',
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
frequencyRank: undefined,
|
||||
jlptLevel: undefined,
|
||||
},
|
||||
);
|
||||
assert.deepEqual(
|
||||
tokenSummary?.find((token) => token.surface === '猫'),
|
||||
{
|
||||
surface: '猫',
|
||||
headword: '猫',
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: true,
|
||||
frequencyRank: 900,
|
||||
jlptLevel: 'N5',
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
|
||||
let mecabCalls = 0;
|
||||
const result = await tokenizeSubtitle(
|
||||
|
||||
@@ -258,6 +258,48 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory contrast en
|
||||
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
|
||||
});
|
||||
|
||||
test('shouldExcludeTokenFromSubtitleAnnotations excludes ja-nai explanatory endings', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
surface: 'じゃない',
|
||||
headword: 'じゃない',
|
||||
reading: 'ジャナイ',
|
||||
partOfSpeech: PartOfSpeech.i_adjective,
|
||||
pos1: '接続詞|形容詞',
|
||||
pos2: '*|自立',
|
||||
}),
|
||||
makeToken({
|
||||
surface: 'じゃないですか',
|
||||
headword: 'じゃない',
|
||||
reading: 'ジャナイデスカ',
|
||||
partOfSpeech: PartOfSpeech.i_adjective,
|
||||
pos1: '接続詞|形容詞|助動詞|助詞',
|
||||
pos2: '*|自立|*|副助詞/並立助詞/終助詞',
|
||||
}),
|
||||
];
|
||||
|
||||
for (const token of tokens) {
|
||||
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
|
||||
}
|
||||
});
|
||||
|
||||
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone polite copula suffix endings without POS tags', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
surface: 'ですよ',
|
||||
headword: 'です',
|
||||
reading: 'デスヨ',
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
pos1: '',
|
||||
pos2: '',
|
||||
}),
|
||||
];
|
||||
|
||||
for (const token of tokens) {
|
||||
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
|
||||
}
|
||||
});
|
||||
|
||||
test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => {
|
||||
const token = makeToken({
|
||||
surface: 'そうだ',
|
||||
@@ -627,7 +669,7 @@ test('annotateTokens N+1 handoff marks expected target when threshold is satisfi
|
||||
assert.equal(result[2]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test('annotateTokens does not mark kana-only unknown target with subtitle punctuation as N+1', () => {
|
||||
test('annotateTokens does not mark kana-only unknown target as N+1', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
surface: '何やら',
|
||||
@@ -646,12 +688,12 @@ test('annotateTokens does not mark kana-only unknown target with subtitle punctu
|
||||
endPos: 6,
|
||||
}),
|
||||
makeToken({
|
||||
surface: 'スイッチ…',
|
||||
headword: 'スイッチ',
|
||||
surface: 'すいっち',
|
||||
headword: 'すいっち',
|
||||
reading: 'スイッチ',
|
||||
pos1: '名詞',
|
||||
startPos: 6,
|
||||
endPos: 11,
|
||||
endPos: 10,
|
||||
}),
|
||||
];
|
||||
|
||||
@@ -1204,6 +1246,78 @@ test('annotateTokens clears all annotations for kana-only non-independent noun h
|
||||
assert.equal(result[0]?.jlptLevel, undefined);
|
||||
});
|
||||
|
||||
test('annotateTokens clears all annotations for standalone auxiliary inflection fragments', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
surface: 'れる',
|
||||
headword: 'れる',
|
||||
reading: 'レル',
|
||||
partOfSpeech: PartOfSpeech.verb,
|
||||
pos1: '動詞',
|
||||
pos2: '接尾',
|
||||
startPos: 0,
|
||||
endPos: 2,
|
||||
frequencyRank: 18,
|
||||
}),
|
||||
makeToken({
|
||||
surface: 'れた',
|
||||
headword: 'れる',
|
||||
reading: 'レタ',
|
||||
partOfSpeech: PartOfSpeech.verb,
|
||||
pos1: '動詞|助動詞',
|
||||
pos2: '接尾|*',
|
||||
startPos: 2,
|
||||
endPos: 4,
|
||||
frequencyRank: 19,
|
||||
}),
|
||||
];
|
||||
|
||||
const result = annotateTokens(
|
||||
tokens,
|
||||
makeDeps({
|
||||
isKnownWord: (text) => text === 'れる',
|
||||
getJlptLevel: (text) => (text === 'れる' ? 'N4' : null),
|
||||
}),
|
||||
{ minSentenceWordsForNPlusOne: 1 },
|
||||
);
|
||||
|
||||
for (const token of result) {
|
||||
assert.equal(token.isKnown, false, token.surface);
|
||||
assert.equal(token.isNPlusOneTarget, false, token.surface);
|
||||
assert.equal(token.frequencyRank, undefined, token.surface);
|
||||
assert.equal(token.jlptLevel, undefined, token.surface);
|
||||
}
|
||||
});
|
||||
|
||||
test('annotateTokens keeps lexical くれる forms eligible for annotation', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
surface: 'くれ',
|
||||
headword: 'くれる',
|
||||
reading: 'クレ',
|
||||
partOfSpeech: PartOfSpeech.verb,
|
||||
pos1: '動詞',
|
||||
pos2: '自立',
|
||||
startPos: 0,
|
||||
endPos: 2,
|
||||
frequencyRank: 20,
|
||||
}),
|
||||
];
|
||||
|
||||
const result = annotateTokens(
|
||||
tokens,
|
||||
makeDeps({
|
||||
getJlptLevel: (text) => (text === 'くれる' ? 'N4' : null),
|
||||
}),
|
||||
{ minSentenceWordsForNPlusOne: 1 },
|
||||
);
|
||||
|
||||
assert.equal(result[0]?.isKnown, false);
|
||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||
assert.equal(result[0]?.frequencyRank, 20);
|
||||
assert.equal(result[0]?.jlptLevel, 'N4');
|
||||
});
|
||||
|
||||
test('annotateTokens clears all annotations for standalone して helper fragments', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
|
||||
@@ -63,6 +63,24 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES = [
|
||||
'かな',
|
||||
'かね',
|
||||
] as const;
|
||||
const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES = [
|
||||
'か',
|
||||
'ね',
|
||||
'よ',
|
||||
'な',
|
||||
] as const;
|
||||
const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES = [
|
||||
'',
|
||||
'か',
|
||||
'ね',
|
||||
'よ',
|
||||
'な',
|
||||
'です',
|
||||
'ですか',
|
||||
'ですよ',
|
||||
'ですね',
|
||||
'ですな',
|
||||
] as const;
|
||||
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
|
||||
@@ -72,6 +90,12 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
|
||||
),
|
||||
),
|
||||
);
|
||||
const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS = new Set(
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES.map((suffix) => `です${suffix}`),
|
||||
);
|
||||
const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS = new Set(
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES.map((suffix) => `じゃない${suffix}`),
|
||||
);
|
||||
const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
|
||||
'って',
|
||||
'ってよ',
|
||||
@@ -83,6 +107,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
|
||||
]);
|
||||
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
|
||||
const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set(['助詞', '助動詞']);
|
||||
const AUXILIARY_INFLECTION_TRAILING_POS1 = new Set(['助動詞']);
|
||||
const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
|
||||
'か',
|
||||
'が',
|
||||
@@ -312,6 +337,44 @@ function isKanaOnlyText(text: string): boolean {
|
||||
return normalized.length > 0 && [...normalized].every(isKanaChar);
|
||||
}
|
||||
|
||||
function isLexicalKureruVerb(token: MergedToken): boolean {
|
||||
const normalizedSurface = normalizeKana(token.surface);
|
||||
const normalizedHeadword = normalizeKana(token.headword);
|
||||
const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
|
||||
const pos2Parts = splitNormalizedTagParts(normalizePosTag(token.pos2));
|
||||
return (
|
||||
normalizedSurface === 'くれ' &&
|
||||
normalizedHeadword === 'くれる' &&
|
||||
pos1Parts.length === 1 &&
|
||||
pos1Parts[0] === '動詞' &&
|
||||
pos2Parts.length === 1 &&
|
||||
pos2Parts[0] === '自立'
|
||||
);
|
||||
}
|
||||
|
||||
function isStandaloneAuxiliaryInflectionFragment(token: MergedToken): boolean {
|
||||
const normalizedSurface = normalizeKana(token.surface);
|
||||
if (!isKanaOnlyText(normalizedSurface)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
|
||||
if (pos1Parts.length === 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (pos1Parts.every((part) => part === '助動詞')) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const pos2Parts = splitNormalizedTagParts(normalizePosTag(token.pos2));
|
||||
return (
|
||||
pos1Parts[0] === '動詞' &&
|
||||
pos2Parts[0] === '接尾' &&
|
||||
pos1Parts.slice(1).every((part) => AUXILIARY_INFLECTION_TRAILING_POS1.has(part))
|
||||
);
|
||||
}
|
||||
|
||||
function isStandaloneSuruTeGrammarHelper(token: MergedToken): boolean {
|
||||
const normalizedSurface = normalizeKana(token.surface);
|
||||
const normalizedHeadword = normalizeKana(token.headword);
|
||||
@@ -370,6 +433,10 @@ function isExcludedByTerm(token: MergedToken): boolean {
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) ||
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) ||
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) ||
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(trimmed) ||
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(normalized) ||
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(trimmed) ||
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(normalized) ||
|
||||
shouldIgnoreJlptByTerm(trimmed) ||
|
||||
shouldIgnoreJlptByTerm(normalized)
|
||||
) {
|
||||
@@ -426,6 +493,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
|
||||
return true;
|
||||
}
|
||||
|
||||
if (isStandaloneAuxiliaryInflectionFragment(token)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (isStandaloneSuruTeGrammarHelper(token)) {
|
||||
return true;
|
||||
}
|
||||
@@ -442,6 +513,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
|
||||
return true;
|
||||
}
|
||||
|
||||
if (isLexicalKureruVerb(token)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return isExcludedByTerm(token);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user