Compare commits


17 Commits

SHA1 Message Date
f28821a8cb fix: correct session help subtitle binding labels 2026-04-25 21:37:22 -07:00
a75c83e25e feat: open session help from tray 2026-04-25 21:32:10 -07:00
e51bb74e1b fix(config): validate null hover background 2026-04-25 21:30:31 -07:00
5bccb55afc fix: exit managed playback on mpv socket close 2026-04-25 20:53:53 -07:00
ea8071f676 fix(overlay): show annotated subtitle hover affordance 2026-04-25 20:43:07 -07:00
ac93a5bd2e fix: address PR review follow-ups 2026-04-25 20:34:18 -07:00
ba48db6255 fix(character-dictionary): normalize fallback titles and wire close button
- Trim blank fallback titles in toAniListMediaCandidate; fall back to
  \"AniList <id>\" when all title fields and the fallback string are empty
- Add fetch unit test covering the trimmed-fallback path
- Extract wireDomEvents from createCharacterDictionaryModal so event
  listeners are bound explicitly after construction
- Call wireDomEvents in renderer init alongside other modal wiring
- Extend modal test to cover close-button click dismissing the modal
2026-04-25 20:03:51 -07:00
992856ac5e fix(launcher): reject --candidates and --select when used together
- Validate mutually exclusive dictionary CLI flags; exit 1 with clear error
- Add parseArgs test covering the conflicting-flags rejection path
- Fix test helper to overwrite capture file (>) instead of appending (>>)
2026-04-25 20:03:44 -07:00
de19c40118 refactor(character-dictionary): extract applyCharacterDictionarySelection helper
- Add `applyCharacterDictionarySelection` in its own module with injected deps
- Catches sync errors and emits a warning instead of propagating
- Remove duplicated inline logic from IPC and CLI startup handlers in main.ts
- Add unit test covering sync-failure resilience
2026-04-25 20:03:38 -07:00
a05a698774 fix(mpv): avoid crash notification on video close 2026-04-25 19:45:02 -07:00
7a08382c23 fix: overwrite manual subtitle audio fields 2026-04-25 19:42:33 -07:00
2c01baafc9 fix: exclude kana grammar helper annotations 2026-04-25 19:41:36 -07:00
76b546c8f6 fix: honor subtitle annotation style priority 2026-04-25 17:10:54 -07:00
c9df5b7624 feat: add primary subtitle bar toggle 2026-04-25 17:09:42 -07:00
055bd76718 feat: add manual AniList selection for character dictionaries 2026-04-25 15:53:20 -07:00
60435fee10 fix: exclude standalone interjection annotations 2026-04-25 15:52:31 -07:00
5b326978e9 fix: restore linux multi-copy digit capture 2026-04-25 15:49:54 -07:00
11 changed files with 39 additions and 926 deletions

View File

@@ -1,27 +0,0 @@
---
id: TASK-304
title: Fix N+1 sentence boundary counting across Yomitan punctuation gaps
status: In Progress
assignee: []
created_date: '2026-04-26 05:33'
labels:
- bug
- tokenizer
- annotations
dependencies: []
priority: medium
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
N+1 target selection should respect sentence-ending punctuation from the original subtitle text even when Yomitan token output omits punctuation tokens. Current behavior can treat multiple subtitle sentences as one token span and incorrectly satisfy the minimum content-token threshold.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [ ] #1 A subtitle like `てんめ!ふざけんなよ!` does not mark `ふざけん`/similar single-content-token second sentence as N+1 when the minimum sentence word count is 3.
- [ ] #2 N+1 sentence segmentation uses original subtitle text offsets or equivalent source-boundary data, not only punctuation tokens returned by Yomitan.
- [ ] #3 Existing annotation exclusion behavior for particles/grammar tokens remains unchanged.
- [ ] #4 Regression tests cover Yomitan-style token streams where punctuation is absent from the token list.
<!-- AC:END -->
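
To make AC #2 concrete, here is a minimal TypeScript sketch of the gap-scan idea: check the original subtitle text between adjacent token spans for sentence-ending punctuation that Yomitan dropped from its token output. The names and token shape are illustrative, not the project's actual API.

```ts
// Sketch only; assumes token offsets index into the original subtitle text.
interface TokenSpan {
  surface: string;
  startPos: number; // offset into the original subtitle text
  endPos: number;
}

const SENTENCE_BOUNDARY_CHARS = new Set(['。', '!', '?', '！', '？']);

// True when the untokenized gap between two adjacent spans contains
// sentence-ending punctuation absent from the token stream itself.
function gapHasSentenceBoundary(
  sourceText: string,
  previousEnd: number | null,
  nextStart: number,
): boolean {
  if (previousEnd === null || nextStart <= previousEnd) {
    return false;
  }
  const gap = sourceText.slice(previousEnd, nextStart);
  return [...gap].some((char) => SENTENCE_BOUNDARY_CHARS.has(char));
}

// With `てんめ!ふざけんなよ!`, the tokens for the second sentence start after
// the first `!`, so the gap check splits the line into two sentences even
// though no punctuation token appears in the token list.
```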

View File

@@ -1,55 +0,0 @@
---
id: TASK-305
title: Use Yomitan word classes for subtitle token POS filtering
status: Done
assignee: []
created_date: '2026-04-26 05:56'
updated_date: '2026-04-26 05:59'
labels:
- tokenizer
- yomitan
dependencies: []
priority: medium
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
Subtitle annotation filtering currently uses Yomitan token spans, then enriches those spans by running MeCab over the full normalized subtitle line. Add support for carrying Yomitan headword wordClasses from termsFind into SubMiner tokens so dictionary-backed tokens can provide coarse POS/tag metadata without vendored Yomitan changes. MeCab whole-line enrichment should remain a fallback/source of detailed POS data when Yomitan classes are absent.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [x] #1 Yomitan scanner tokens preserve matched headword wordClasses when termsFind returns them.
- [x] #2 Subtitle tokenization maps recognized Yomitan wordClasses to coarse PartOfSpeech/POS metadata before annotation filtering.
- [x] #3 Whole-line MeCab enrichment remains available for missing or more detailed POS metadata and does not break existing subtitle annotation behavior.
- [x] #4 Focused tokenizer tests cover wordClasses extraction and POS mapping.
<!-- AC:END -->
## Implementation Plan
<!-- SECTION:PLAN:BEGIN -->
1. Add focused regression coverage for Yomitan scanner wordClasses payload and subtitle POS mapping.
2. Extend the app-owned Yomitan scanner payload to carry matched headword wordClasses when present.
3. Map recognized Yomitan wordClasses to SubMiner coarse PartOfSpeech/POS metadata before annotation filtering.
4. Keep MeCab whole-line enrichment as fallback/detail-fill for missing POS fields.
5. Run focused tokenizer tests and typecheck.
<!-- SECTION:PLAN:END -->
## Implementation Notes
<!-- SECTION:NOTES:BEGIN -->
Implemented app-only wordClasses extraction from termsFind results; no vendored Yomitan changes were required. The mapping currently covers prt, aux, v*, adj-i/adj-ix, adj-na, and noun-like classes, translating each into SubMiner POS metadata. MeCab enrichment now skips only tokens with complete pos1/pos2/pos3 and otherwise fills missing fields while preserving the existing coarse pos1. Verification: bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts; bun run typecheck.
<!-- SECTION:NOTES:END -->
## Final Summary
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Implemented app-only Yomitan wordClasses support for subtitle token annotation filtering. The scanner now carries matched headword wordClasses from termsFind results, tokenizer maps recognized classes into SubMiner coarse POS metadata before annotation, and MeCab whole-line enrichment continues to fill missing detailed POS fields without requiring vendored Yomitan changes.
Tests run:
- bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts
- bun run typecheck
Note: the working tree already had unrelated tokenizer/annotation edits and task-304 before this work; those were left intact.
<!-- SECTION:FINAL_SUMMARY:END -->
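
For reference, a condensed TypeScript sketch of the wordClass mapping summarized above (the full helper appears in the tokenizer diff later in this compare); the type and enum names here are illustrative:

```ts
// Condensed from the mapping described in the notes; names are illustrative.
type CoarsePos =
  | 'particle'
  | 'auxiliary'
  | 'verb'
  | 'i_adjective'
  | 'na_adjective'
  | 'noun'
  | 'other';

// Maps Yomitan dictionary wordClasses (e.g. 'prt', 'aux', 'v5r', 'n') to a
// coarse part of speech; anything unrecognized falls through to 'other'.
function coarsePosFromWordClasses(wordClasses: string[]): CoarsePos {
  if (wordClasses.includes('prt')) return 'particle';
  if (wordClasses.includes('aux')) return 'auxiliary';
  if (wordClasses.some((c) => c.startsWith('v'))) return 'verb';
  if (wordClasses.includes('adj-i') || wordClasses.includes('adj-ix')) return 'i_adjective';
  if (wordClasses.includes('adj-na')) return 'na_adjective';
  if (
    wordClasses.some(
      (c) => c === 'n' || c === 'num' || c === 'ctr' || c === 'pn' || c.startsWith('n-'),
    )
  ) {
    return 'noun';
  }
  return 'other';
}

// coarsePosFromWordClasses(['prt']) === 'particle'; MeCab enrichment can then
// fill the finer pos2/pos3 fields that a dictionary match cannot supply.
```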

View File

@@ -25,7 +25,6 @@ interface YomitanTokenInput {
 reading?: string;
 headword?: string;
 isNameMatch?: boolean;
-wordClasses?: string[];
 }

 function makeDepsFromYomitanTokens(
@@ -56,7 +55,6 @@ function makeDepsFromYomitanTokens(
 startPos,
 endPos,
 isNameMatch: token.isNameMatch ?? false,
-wordClasses: token.wordClasses,
 };
 });
 },
@@ -1554,7 +1552,7 @@ test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => {
 assert.equal(result.tokens?.[0]?.jlptLevel, 'N4');
 });

-test('tokenizeSubtitle clears JLPT level from standalone Yomitan particle token', async () => {
+test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async () => {
 const result = await tokenizeSubtitle(
 'は',
 makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は' }], {
@@ -1563,7 +1561,7 @@ test('tokenizeSubtitle clears JLPT level from standalone Yomitan particle token'
 );
 assert.equal(result.tokens?.length, 1);
-assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
+assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
 });

 test('tokenizeSubtitle returns null tokens for empty normalized text', async () => {
@@ -3036,58 +3034,6 @@ test('tokenizeSubtitle skips all enrichment stages when disabled', async () => {
 assert.equal(frequencyCalls, 0);
 });

-test('tokenizeSubtitle uses Yomitan word classes to classify standalone particles', async () => {
-let mecabCalls = 0;
-const result = await tokenizeSubtitle(
-'は',
-makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
-getFrequencyDictionaryEnabled: () => true,
-getFrequencyRank: (text) => (text === 'は' ? 10 : null),
-getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
-tokenizeWithMecab: async () => {
-mecabCalls += 1;
-return null;
-},
-}),
-);
-assert.equal(mecabCalls, 1);
-assert.equal(result.tokens?.length, 1);
-assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
-assert.equal(result.tokens?.[0]?.pos1, '助詞');
-assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
-assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
-assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
-});
-
-test('tokenizeSubtitle fills detailed MeCab POS when Yomitan word class supplies coarse POS', async () => {
-const result = await tokenizeSubtitle(
-'は',
-makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
-tokenizeWithMecab: async () => [
-{
-headword: 'は',
-surface: 'は',
-reading: 'ハ',
-startPos: 0,
-endPos: 1,
-partOfSpeech: PartOfSpeech.particle,
-pos1: '助詞',
-pos2: '係助詞',
-pos3: '*',
-isMerged: false,
-isKnown: false,
-isNPlusOneTarget: false,
-},
-],
-}),
-);
-assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
-assert.equal(result.tokens?.[0]?.pos1, '助詞');
-assert.equal(result.tokens?.[0]?.pos2, '係助詞');
-});
-
 test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async () => {
 let knownCalls = 0;
 let mecabCalls = 0;
@@ -3164,60 +3110,6 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
 assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
 });

-test('tokenizeSubtitle preserves known-word highlight for exact non-independent kanji noun tokens', async () => {
-const result = await tokenizeSubtitle(
-'その点',
-makeDepsFromYomitanTokens(
-[
-{ surface: 'その', reading: 'その', headword: 'その' },
-{ surface: '点', reading: 'てん', headword: '点' },
-],
-{
-isKnownWord: (text) => text === '点' || text === 'てん',
-getFrequencyDictionaryEnabled: () => true,
-getFrequencyRank: (text) => (text === '点' ? 1384 : null),
-getJlptLevel: (text) => (text === '点' ? 'N3' : null),
-tokenizeWithMecab: async () => [
-{
-headword: 'その',
-surface: 'その',
-reading: 'ソノ',
-startPos: 0,
-endPos: 2,
-partOfSpeech: PartOfSpeech.other,
-pos1: '連体詞',
-isMerged: false,
-isKnown: false,
-isNPlusOneTarget: false,
-},
-{
-headword: '点',
-surface: '点',
-reading: 'テン',
-startPos: 2,
-endPos: 3,
-partOfSpeech: PartOfSpeech.noun,
-pos1: '名詞',
-pos2: '非自立',
-pos3: '一般',
-isMerged: false,
-isKnown: false,
-isNPlusOneTarget: false,
-},
-],
-},
-),
-);
-assert.equal(result.tokens?.length, 2);
-assert.equal(result.tokens?.[0]?.isKnown, false);
-assert.equal(result.tokens?.[1]?.surface, '点');
-assert.equal(result.tokens?.[1]?.isKnown, true);
-assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
-assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
-assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
-});
-
 test('tokenizeSubtitle keeps mecab-tagged interjections tokenized while clearing annotation metadata', async () => {
 const result = await tokenizeSubtitle(
 'ぐはっ',

View File

@@ -96,7 +96,6 @@ interface TokenizerAnnotationOptions {
 minSentenceWordsForNPlusOne: number | undefined;
 pos1Exclusions: ReadonlySet<string>;
 pos2Exclusions: ReadonlySet<string>;
-sourceText?: string;
 }

 let parserEnrichmentWorkerRuntimeModulePromise: Promise<
@@ -334,66 +333,6 @@ function normalizeSelectedYomitanTokens(tokens: MergedToken[]): MergedToken[] {
 }));
 }

-function normalizeYomitanWordClasses(wordClasses: unknown): string[] {
-if (!Array.isArray(wordClasses)) {
-return [];
-}
-const normalized: string[] = [];
-for (const wordClass of wordClasses) {
-if (typeof wordClass !== 'string') {
-continue;
-}
-const trimmed = wordClass.trim();
-if (trimmed && !normalized.includes(trimmed)) {
-normalized.push(trimmed);
-}
-}
-return normalized;
-}
-
-function resolvePartOfSpeechFromYomitanWordClasses(wordClasses: string[]): {
-partOfSpeech: PartOfSpeech;
-pos1?: string;
-} {
-if (wordClasses.includes('prt')) {
-return { partOfSpeech: PartOfSpeech.particle, pos1: '助詞' };
-}
-if (wordClasses.includes('aux')) {
-return { partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞' };
-}
-if (wordClasses.some((wordClass) => wordClass.startsWith('v'))) {
-return { partOfSpeech: PartOfSpeech.verb, pos1: '動詞' };
-}
-if (wordClasses.includes('adj-i') || wordClasses.includes('adj-ix')) {
-return { partOfSpeech: PartOfSpeech.i_adjective, pos1: '形容詞' };
-}
-if (wordClasses.includes('adj-na')) {
-return { partOfSpeech: PartOfSpeech.na_adjective, pos1: '名詞' };
-}
-if (
-wordClasses.some(
-(wordClass) =>
-wordClass === 'n' ||
-wordClass === 'num' ||
-wordClass === 'ctr' ||
-wordClass === 'pn' ||
-wordClass.startsWith('n-'),
-)
-) {
-return { partOfSpeech: PartOfSpeech.noun, pos1: '名詞' };
-}
-return { partOfSpeech: PartOfSpeech.other };
-}
-
-function getYomitanWordClassPosMetadata(wordClasses: unknown): {
-partOfSpeech: PartOfSpeech;
-pos1?: string;
-} {
-return resolvePartOfSpeechFromYomitanWordClasses(normalizeYomitanWordClasses(wordClasses));
-}
-
 function resolveFrequencyLookupText(
 token: MergedToken,
 matchMode: FrequencyDictionaryMatchMode,
@@ -684,23 +623,19 @@ async function parseWithYomitanInternalParser(
 }
 const normalizedSelectedTokens = normalizeSelectedYomitanTokens(
 selectedTokens.map(
-(token): MergedToken => {
-const posMetadata = getYomitanWordClassPosMetadata(token.wordClasses);
-return {
-surface: token.surface,
-reading: token.reading,
-headword: token.headword,
-startPos: token.startPos,
-endPos: token.endPos,
-partOfSpeech: posMetadata.partOfSpeech,
-pos1: posMetadata.pos1,
-isMerged: true,
-isKnown: false,
-isNPlusOneTarget: false,
-isNameMatch: token.isNameMatch ?? false,
-frequencyRank: token.frequencyRank,
-};
-},
+(token): MergedToken => ({
+surface: token.surface,
+reading: token.reading,
+headword: token.headword,
+startPos: token.startPos,
+endPos: token.endPos,
+partOfSpeech: PartOfSpeech.other,
+isMerged: true,
+isKnown: false,
+isNPlusOneTarget: false,
+isNameMatch: token.isNameMatch ?? false,
+frequencyRank: token.frequencyRank,
+}),
 ),
 );
@@ -781,11 +716,12 @@ export async function tokenizeSubtitle(
 .replace(/\s+/g, ' ')
 .trim();
 const annotationOptions = getAnnotationOptions(deps);
-annotationOptions.sourceText = tokenizeText;
 const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
 if (yomitanTokens && yomitanTokens.length > 0) {
-const annotatedTokens = await applyAnnotationStage(yomitanTokens, deps, annotationOptions);
+const annotatedTokens = await stripSubtitleAnnotationMetadata(
+await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
+);
 return {
 text: displayText,
 tokens: annotatedTokens.length > 0 ? annotatedTokens : null,

View File

@@ -366,132 +366,6 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only non-independe
 assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
 });

-test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone して grammar helper fragments', () => {
-const token = makeToken({
-surface: 'して',
-headword: 'する',
-reading: 'シテ',
-partOfSpeech: PartOfSpeech.verb,
-pos1: '動詞|助詞',
-pos2: '自立|接続助詞',
-});
-assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
-});
-
-test('shouldExcludeTokenFromSubtitleAnnotations excludes inflected standalone して grammar helper fragments', () => {
-const token = makeToken({
-surface: 'してる',
-headword: 'する',
-reading: 'シテル',
-partOfSpeech: PartOfSpeech.verb,
-pos1: '動詞|助動詞',
-pos2: '自立|非自立',
-});
-assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
-});
-
-test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone particle fragments without POS tags', () => {
-const token = makeToken({
-surface: 'と',
-headword: 'と',
-reading: 'ト',
-partOfSpeech: PartOfSpeech.other,
-pos1: '',
-pos2: '',
-});
-assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
-});
-
-test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone connective particle fragments without POS tags', () => {
-const token = makeToken({
-surface: 'たって',
-headword: 'たって',
-reading: 'タッテ',
-partOfSpeech: PartOfSpeech.other,
-pos1: '',
-pos2: '',
-});
-assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
-});
-
-test('shouldExcludeTokenFromSubtitleAnnotations excludes rhetorical もんか grammar particle phrases', () => {
-for (const surface of ['もんか', 'ものか']) {
-const token = makeToken({
-surface,
-headword: surface,
-reading: surface === 'もんか' ? 'モンカ' : 'モノカ',
-partOfSpeech: PartOfSpeech.noun,
-pos1: '名詞|助詞',
-pos2: '非自立|副助詞/並立助詞/終助詞',
-});
-assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, surface);
-}
-});
-
-test('shouldExcludeTokenFromSubtitleAnnotations excludes bare くれ auxiliary fragments', () => {
-const token = makeToken({
-surface: 'くれ',
-headword: '暮れ',
-reading: 'クレ',
-partOfSpeech: PartOfSpeech.noun,
-pos1: '名詞',
-pos2: '一般',
-});
-assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
-});
-
-test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone quote particle and auxiliary grammar terms', () => {
-for (const token of [
-makeToken({
-surface: 'って',
-headword: 'って',
-reading: 'ッテ',
-partOfSpeech: PartOfSpeech.other,
-pos1: '',
-pos2: '',
-}),
-makeToken({
-surface: 'べき',
-headword: 'べき',
-reading: 'ベキ',
-partOfSpeech: PartOfSpeech.other,
-pos1: '',
-pos2: '',
-}),
-]) {
-assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
-}
-});
-
-test('shouldExcludeTokenFromSubtitleAnnotations excludes single-kana surface fragments', () => {
-for (const token of [
-makeToken({
-surface: 'ふ',
-headword: '不',
-reading: 'フ',
-partOfSpeech: PartOfSpeech.other,
-pos1: '接頭詞',
-pos2: '',
-}),
-makeToken({
-surface: 'フ',
-headword: '負',
-reading: 'フ',
-partOfSpeech: PartOfSpeech.noun,
-pos1: '名詞',
-pos2: '一般',
-}),
-]) {
-assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
-}
-});
-
 test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
 const token = makeToken({
 surface: 'は',
@@ -662,57 +536,6 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
 assert.equal(result[0]?.isNPlusOneTarget, false);
 });

-test('annotateTokens N+1 sentence word count respects source punctuation gaps omitted by Yomitan', () => {
-const tokens = [
-makeToken({
-surface: '私',
-headword: '私',
-pos1: '名詞',
-startPos: 0,
-endPos: 1,
-}),
-makeToken({
-surface: '猫',
-headword: '猫',
-pos1: '名詞',
-startPos: 1,
-endPos: 2,
-}),
-makeToken({
-surface: '犬',
-headword: '犬',
-pos1: '名詞',
-startPos: 2,
-endPos: 3,
-}),
-makeToken({
-surface: 'ふざけん',
-headword: 'ふざける',
-partOfSpeech: PartOfSpeech.verb,
-pos1: '動詞',
-pos2: '自立',
-startPos: 4,
-endPos: 8,
-}),
-];
-const result = annotateTokens(
-tokens,
-makeDeps({
-isKnownWord: (text) => text === '私' || text === '猫' || text === '犬',
-}),
-{
-minSentenceWordsForNPlusOne: 3,
-sourceText: '私猫犬!ふざけんなよ!',
-},
-);
-assert.equal(result[0]?.isNPlusOneTarget, false);
-assert.equal(result[1]?.isNPlusOneTarget, false);
-assert.equal(result[2]?.isNPlusOneTarget, false);
-assert.equal(result[3]?.isNPlusOneTarget, false);
-});
-
 test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => {
 const tokens = [
 makeToken({
@@ -787,52 +610,14 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
 }),
 ];
-const result = annotateTokens(
-tokens,
-makeDeps({
-isKnownWord: (text) => text === 'た' || text === '負',
-getJlptLevel: (text) => (text === 'た' || text === '負' ? 'N3' : null),
-}),
-{
-minSentenceWordsForNPlusOne: 1,
-},
-);
+const result = annotateTokens(tokens, makeDeps(), {
+minSentenceWordsForNPlusOne: 1,
+});
 assert.equal(result[0]?.frequencyRank, undefined);
 assert.equal(result[0]?.isNPlusOneTarget, false);
 });

-test('annotateTokens preserves exact known-word status for non-independent kanji noun tokens', () => {
-const tokens = [
-makeToken({
-surface: '点',
-reading: 'てん',
-headword: '点',
-partOfSpeech: PartOfSpeech.other,
-pos1: '名詞',
-pos2: '非自立',
-pos3: '一般',
-startPos: 2,
-endPos: 3,
-frequencyRank: 1384,
-}),
-];
-const result = annotateTokens(
-tokens,
-makeDeps({
-isKnownWord: (text) => text === '点' || text === 'てん',
-getJlptLevel: (text) => (text === '点' ? 'N3' : null),
-}),
-{ minSentenceWordsForNPlusOne: 1 },
-);
-assert.equal(result[0]?.isKnown, true);
-assert.equal(result[0]?.isNPlusOneTarget, false);
-assert.equal(result[0]?.frequencyRank, undefined);
-assert.equal(result[0]?.jlptLevel, undefined);
-});
-
 test('annotateTokens clears all annotations for non-independent kanji noun tokens under unified gate', () => {
 const tokens = [
 makeToken({
@@ -880,7 +665,7 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
 assert.equal(result[0]?.frequencyRank, undefined);
 });

-test('annotateTokens clears all annotations from single hiragana and katakana surface fragments', () => {
+test('annotateTokens excludes single hiragana and katakana tokens from frequency when POS tags are missing', () => {
 const tokens = [
 makeToken({
 surface: 'た',
@@ -894,12 +679,12 @@ test('annotateTokens clears all annotations from single hiragana and katakana su
 endPos: 1,
 }),
 makeToken({
 surface: '',
 reading: '',
 headword: '',
-pos1: '名詞',
+pos1: '',
 pos2: '',
-partOfSpeech: PartOfSpeech.noun,
+partOfSpeech: PartOfSpeech.other,
 frequencyRank: 22,
 startPos: 1,
 endPos: 2,
@@ -921,14 +706,8 @@ test('annotateTokens clears all annotations from single hiragana and katakana su
 minSentenceWordsForNPlusOne: 1,
 });
-assert.equal(result[0]?.isKnown, false);
-assert.equal(result[0]?.isNPlusOneTarget, false);
 assert.equal(result[0]?.frequencyRank, undefined);
-assert.equal(result[0]?.jlptLevel, undefined);
-assert.equal(result[1]?.isKnown, false);
-assert.equal(result[1]?.isNPlusOneTarget, false);
 assert.equal(result[1]?.frequencyRank, undefined);
-assert.equal(result[1]?.jlptLevel, undefined);
 assert.equal(result[2]?.frequencyRank, 23);
 });
@@ -1077,219 +856,6 @@ test('annotateTokens clears all annotations for kana-only non-independent noun h
 assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens clears all annotations for standalone して helper fragments', () => {
-const tokens = [
-makeToken({
-surface: 'してる',
-headword: 'する',
-reading: 'シテル',
-partOfSpeech: PartOfSpeech.verb,
-pos1: '動詞|助動詞',
-pos2: '自立|非自立',
-startPos: 0,
-endPos: 3,
-frequencyRank: 22,
-}),
-];
-const result = annotateTokens(
-tokens,
-makeDeps({
-isKnownWord: (text) => text === 'する',
-getJlptLevel: (text) => (text === 'する' ? 'N5' : null),
-}),
-{ minSentenceWordsForNPlusOne: 1 },
-);
-assert.equal(result[0]?.isKnown, false);
-assert.equal(result[0]?.isNPlusOneTarget, false);
-assert.equal(result[0]?.frequencyRank, undefined);
-assert.equal(result[0]?.jlptLevel, undefined);
-});
-
-test('annotateTokens clears all annotations for standalone particle fragments without POS tags', () => {
-const tokens = [
-makeToken({
-surface: 'と',
-headword: 'と',
-reading: 'ト',
-partOfSpeech: PartOfSpeech.other,
-pos1: '',
-pos2: '',
-startPos: 0,
-endPos: 1,
-frequencyRank: 4,
-}),
-];
-const result = annotateTokens(
-tokens,
-makeDeps({
-isKnownWord: (text) => text === 'と',
-getJlptLevel: (text) => (text === 'と' ? 'N5' : null),
-}),
-{ minSentenceWordsForNPlusOne: 1 },
-);
-assert.equal(result[0]?.isKnown, false);
-assert.equal(result[0]?.isNPlusOneTarget, false);
-assert.equal(result[0]?.frequencyRank, undefined);
-assert.equal(result[0]?.jlptLevel, undefined);
-});
-
-test('annotateTokens does not mark standalone connective particles as N+1', () => {
-const tokens = [
-makeToken({
-surface: '逃げる',
-headword: '逃げる',
-reading: 'ニゲル',
-partOfSpeech: PartOfSpeech.verb,
-pos1: '動詞',
-pos2: '自立',
-startPos: 0,
-endPos: 3,
-}),
-makeToken({
-surface: 'たって',
-headword: 'たって',
-reading: 'タッテ',
-partOfSpeech: PartOfSpeech.other,
-pos1: '',
-pos2: '',
-startPos: 3,
-endPos: 6,
-frequencyRank: 28,
-}),
-makeToken({
-surface: '無駄',
-headword: '無駄',
-reading: 'ムダ',
-partOfSpeech: PartOfSpeech.noun,
-pos1: '名詞',
-pos2: '形容動詞語幹',
-startPos: 6,
-endPos: 8,
-}),
-];
-const result = annotateTokens(
-tokens,
-makeDeps({
-isKnownWord: (text) => text === '逃げる' || text === '無駄',
-getJlptLevel: (text) => (text === 'たって' ? 'N3' : null),
-}),
-{ minSentenceWordsForNPlusOne: 1 },
-);
-assert.equal(result[1]?.isKnown, false);
-assert.equal(result[1]?.isNPlusOneTarget, false);
-assert.equal(result[1]?.frequencyRank, undefined);
-assert.equal(result[1]?.jlptLevel, undefined);
-});
-
-test('annotateTokens clears all annotations for rhetorical もんか grammar particle phrases', () => {
-const tokens = [
-makeToken({
-surface: 'もんか',
-headword: 'もんか',
-reading: 'モンカ',
-partOfSpeech: PartOfSpeech.noun,
-pos1: '名詞|助詞',
-pos2: '非自立|副助詞/並立助詞/終助詞',
-startPos: 0,
-endPos: 3,
-frequencyRank: 69629,
-}),
-];
-const result = annotateTokens(
-tokens,
-makeDeps({
-isKnownWord: (text) => text === 'もんか',
-getJlptLevel: (text) => (text === 'もんか' ? 'N2' : null),
-}),
-{ minSentenceWordsForNPlusOne: 1 },
-);
-assert.equal(result[0]?.isKnown, false);
-assert.equal(result[0]?.isNPlusOneTarget, false);
-assert.equal(result[0]?.frequencyRank, undefined);
-assert.equal(result[0]?.jlptLevel, undefined);
-});
-
-test('annotateTokens clears all annotations for bare くれ auxiliary fragments', () => {
-const tokens = [
-makeToken({
-surface: 'くれ',
-headword: '暮れ',
-reading: 'クレ',
-partOfSpeech: PartOfSpeech.noun,
-pos1: '名詞',
-pos2: '一般',
-startPos: 0,
-endPos: 2,
-frequencyRank: 12877,
-}),
-];
-const result = annotateTokens(
-tokens,
-makeDeps({
-isKnownWord: (text) => text === '暮れ',
-getJlptLevel: (text) => (text === '暮れ' ? 'N3' : null),
-}),
-{ minSentenceWordsForNPlusOne: 1 },
-);
-assert.equal(result[0]?.isKnown, false);
-assert.equal(result[0]?.isNPlusOneTarget, false);
-assert.equal(result[0]?.frequencyRank, undefined);
-assert.equal(result[0]?.jlptLevel, undefined);
-});
-
-test('annotateTokens clears all annotations for standalone quote particle and auxiliary grammar terms', () => {
-const tokens = [
-makeToken({
-surface: 'って',
-headword: 'って',
-reading: 'ッテ',
-partOfSpeech: PartOfSpeech.other,
-pos1: '',
-pos2: '',
-startPos: 0,
-endPos: 2,
-frequencyRank: 28,
-}),
-makeToken({
-surface: 'べき',
-headword: 'べき',
-reading: 'ベキ',
-partOfSpeech: PartOfSpeech.other,
-pos1: '',
-pos2: '',
-startPos: 2,
-endPos: 4,
-frequencyRank: 268,
-}),
-];
-const result = annotateTokens(
-tokens,
-makeDeps({
-isKnownWord: (text) => text === 'って' || text === 'べき',
-getJlptLevel: (text) => (text === 'って' || text === 'べき' ? 'N3' : null),
-}),
-{ minSentenceWordsForNPlusOne: 1 },
-);
-for (const token of result) {
-assert.equal(token.isKnown, false, token.surface);
-assert.equal(token.isNPlusOneTarget, false, token.surface);
-assert.equal(token.frequencyRank, undefined, token.surface);
-assert.equal(token.jlptLevel, undefined, token.surface);
-}
-});
-
 test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
 const tokens = [
 makeToken({

View File

@@ -89,7 +89,6 @@ export interface AnnotationStageOptions {
 minSentenceWordsForNPlusOne?: number;
 pos1Exclusions?: ReadonlySet<string>;
 pos2Exclusions?: ReadonlySet<string>;
-sourceText?: string;
 }

 function resolveKnownWordText(
@@ -671,36 +670,6 @@
 return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
 }

-function computeExcludedTokenKnownStatus(
-token: MergedToken,
-isKnownWord: (text: string) => boolean,
-): boolean {
-const normalizedSurface = token.surface.trim();
-if (!hasKanjiChar(normalizedSurface)) {
-return false;
-}
-if (normalizedSurface && isKnownWord(normalizedSurface)) {
-return true;
-}
-const normalizedReading = token.reading.trim();
-if (
-normalizedReading &&
-normalizedReading !== normalizedSurface &&
-isKnownWord(normalizedReading)
-) {
-return true;
-}
-const normalizedHeadword = token.headword.trim();
-return (
-normalizedHeadword.length > 0 &&
-normalizedHeadword === normalizedSurface &&
-isKnownWord(normalizedHeadword)
-);
-}
-
 function filterTokenFrequencyRank(
 token: MergedToken,
 pos1Exclusions: ReadonlySet<string>,
@@ -763,16 +732,10 @@ export function annotateTokens(
 pos2Exclusions,
 })
 ) {
-const strippedToken = sharedStripSubtitleAnnotationMetadata(token, {
+return sharedStripSubtitleAnnotationMetadata(token, {
 pos1Exclusions,
 pos2Exclusions,
 });
-return {
-...strippedToken,
-isKnown:
-nPlusOneEnabled &&
-computeExcludedTokenKnownStatus(token, deps.isKnownWord),
-};
 }
 const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true;
@@ -816,7 +779,6 @@
 sanitizedMinSentenceWordsForNPlusOne,
 pos1Exclusions,
 pos2Exclusions,
-options.sourceText,
 );
 if (!nameMatchEnabled) {

View File

@@ -303,9 +303,7 @@ function fillMissingPos1BySurfaceSequence(
 let cursor = 0;
 return tokens.map((token) => {
-const hasCompletePosMetadata =
-token.pos1?.trim() && token.pos2?.trim() && token.pos3?.trim();
-if (hasCompletePosMetadata) {
+if (token.pos1 && token.pos1.trim().length > 0) {
 return token;
 }
@@ -329,9 +327,9 @@
 cursor = best.index + 1;
 return {
 ...token,
-pos1: token.pos1 ?? best.pos1,
-pos2: token.pos2 ?? best.pos2,
-pos3: token.pos3 ?? best.pos3,
+pos1: best.pos1,
+pos2: best.pos2,
+pos3: best.pos3,
 };
 });
 }
@@ -384,7 +382,7 @@ export function enrichTokensWithMecabPos1(
 const metadataByTokenIndex = new Map<number, MecabPosMetadata>();
 for (const [index, token] of tokens.entries()) {
-if (token.pos1?.trim() && token.pos2?.trim() && token.pos3?.trim()) {
+if (token.pos1) {
 continue;
 }
@@ -412,9 +410,9 @@
 return {
 ...token,
-pos1: token.pos1 ?? metadata.pos1,
-pos2: token.pos2 ?? metadata.pos2,
-pos3: token.pos3 ?? metadata.pos3,
+pos1: metadata.pos1,
+pos2: metadata.pos2,
+pos3: metadata.pos3,
 };
 });

View File

@@ -19,18 +19,11 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
 'ええ',
 'うう',
 'おお',
-'くれ',
-'たって',
-'って',
-'だって',
 'はあ',
 'はは',
-'べき',
 'へえ',
 'ふう',
 'ほう',
-'もんか',
-'ものか',
 ]);

 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
@@ -79,26 +72,6 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
 ]);

 const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
 const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set(['助詞', '助動詞']);
-const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
-'か',
-'が',
-'さ',
-'し',
-'ぞ',
-'ぜ',
-'と',
-'な',
-'に',
-'ね',
-'の',
-'は',
-'へ',
-'も',
-'や',
-'よ',
-'を',
-]);
-const STANDALONE_GRAMMAR_PARTICLE_PHRASES = new Set(['たって', 'だって']);

 export interface SubtitleAnnotationFilterOptions {
 pos1Exclusions?: ReadonlySet<string>;
@@ -305,38 +278,6 @@ function isKanaOnlyNonIndependentNounHelperMerge(token: MergedToken): boolean {
 return pos1Parts.slice(1).every((part) => NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1.has(part));
 }

-function isKanaOnlyText(text: string): boolean {
-const normalized = normalizeKana(text);
-return normalized.length > 0 && [...normalized].every(isKanaChar);
-}
-
-function isStandaloneSuruTeGrammarHelper(token: MergedToken): boolean {
-const normalizedSurface = normalizeKana(token.surface);
-const normalizedHeadword = normalizeKana(token.headword);
-if (!normalizedSurface.startsWith('して') || normalizedHeadword !== 'する') {
-return false;
-}
-const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
-return isKanaOnlyText(normalizedSurface) && (pos1Parts.length === 0 || pos1Parts.includes('動詞'));
-}
-
-function isStandaloneGrammarParticle(token: MergedToken): boolean {
-const normalizedSurface = normalizeKana(token.surface);
-const normalizedHeadword = normalizeKana(token.headword);
-return (
-normalizedSurface === normalizedHeadword &&
-(STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(normalizedSurface) ||
-STANDALONE_GRAMMAR_PARTICLE_PHRASES.has(normalizedSurface))
-);
-}
-
-function isSingleKanaSurfaceFragment(token: MergedToken): boolean {
-const normalizedSurface = normalizeKana(token.surface);
-const chars = [...normalizedSurface];
-return chars.length === 1 && chars.every(isKanaChar);
-}
-
 function isExcludedByTerm(token: MergedToken): boolean {
 const candidates = [token.surface, token.reading, token.headword].filter(
 (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
@@ -424,18 +365,6 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
 return true;
 }

-if (isStandaloneSuruTeGrammarHelper(token)) {
-return true;
-}
-
-if (isStandaloneGrammarParticle(token)) {
-return true;
-}
-
-if (isSingleKanaSurfaceFragment(token)) {
-return true;
-}
-
 if (isExcludedTrailingParticleMergedToken(token)) {
 return true;
 }

View File

@@ -1049,60 +1049,6 @@ test('requestYomitanScanTokens marks grouped entries when SubMiner dictionary al
 assert.equal((result as Array<{ isNameMatch?: boolean }>)[0]?.isNameMatch, true);
 });

-test('requestYomitanScanTokens preserves matched headword word classes', async () => {
-let scannerScript = '';
-const deps = createDeps(async (script) => {
-if (script.includes('termsFind')) {
-scannerScript = script;
-return [];
-}
-if (script.includes('optionsGetFull')) {
-return {
-profileCurrent: 0,
-profiles: [
-{
-options: {
-scanning: { length: 40 },
-},
-},
-],
-};
-}
-return null;
-});
-await requestYomitanScanTokens('は', deps, { error: () => undefined });
-const result = await runInjectedYomitanScript(scannerScript, (action, params) => {
-if (action !== 'termsFind') {
-throw new Error(`unexpected action: ${action}`);
-}
-const text = (params as { text?: string } | undefined)?.text;
-if (text !== 'は') {
-return { originalTextLength: 0, dictionaryEntries: [] };
-}
-return {
-originalTextLength: 1,
-dictionaryEntries: [
-{
-headwords: [
-{
-term: 'は',
-reading: 'は',
-wordClasses: ['prt'],
-sources: [{ originalText: 'は', isPrimary: true, matchType: 'exact' }],
-},
-],
-},
-],
-};
-});
-assert.deepEqual((result as Array<{ wordClasses?: string[] }>)[0]?.wordClasses, ['prt']);
-});
-
 test('requestYomitanScanTokens skips fallback fragments without exact primary source matches', async () => {
 const deps = createDeps(async (script) => {
 if (script.includes('optionsGetFull')) {

View File

@@ -53,7 +53,6 @@ export interface YomitanScanToken {
 endPos: number;
 isNameMatch?: boolean;
 frequencyRank?: number;
-wordClasses?: string[];
 }

 interface YomitanProfileMetadata {
@@ -92,10 +91,7 @@ function isScanTokenArray(value: unknown): value is YomitanScanToken[] {
 typeof entry.startPos === 'number' &&
 typeof entry.endPos === 'number' &&
 (entry.isNameMatch === undefined || typeof entry.isNameMatch === 'boolean') &&
-(entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number') &&
-(entry.wordClasses === undefined ||
-(Array.isArray(entry.wordClasses) &&
-entry.wordClasses.every((wordClass) => typeof wordClass === 'string'))),
+(entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number'),
 )
 );
 }
@@ -979,11 +975,6 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
 return best;
 }
 function getPreferredHeadword(dictionaryEntries, token, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
-function normalizeWordClasses(headword) {
-if (!Array.isArray(headword?.wordClasses)) { return undefined; }
-const classes = headword.wordClasses.filter((wordClass) => typeof wordClass === "string" && wordClass.trim().length > 0);
-return classes.length > 0 ? classes : undefined;
-}
 function appendDictionaryNames(target, value) {
 if (!value || typeof value !== 'object') {
 return;
@@ -1042,7 +1033,6 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
 return {
 term: preferredMatch.headword.term,
 reading: preferredMatch.headword.reading,
-wordClasses: normalizeWordClasses(preferredMatch.headword),
 isNameMatch: matchedNameDictionary || isNameDictionaryEntry(preferredMatch.dictionaryEntry),
 frequencyRank: getBestFrequencyRankForMatches(
 exactFrequencyMatches.length > 0 ? exactFrequencyMatches : exactPrimaryMatches,
@@ -1109,7 +1099,7 @@ ${YOMITAN_SCANNING_HELPERS}
 if (preferredHeadword && typeof preferredHeadword.term === "string") {
 const reading = typeof preferredHeadword.reading === "string" ? preferredHeadword.reading : "";
 const segments = distributeFuriganaInflected(preferredHeadword.term, reading, source);
-const tokenPayload = {
+tokens.push({
 surface: segments.map((segment) => segment.text).join("") || source,
 reading: segments.map((segment) => typeof segment.reading === "string" ? segment.reading : "").join(""),
 headword: preferredHeadword.term,
@@ -1120,11 +1110,7 @@ ${YOMITAN_SCANNING_HELPERS}
 typeof preferredHeadword.frequencyRank === "number" && Number.isFinite(preferredHeadword.frequencyRank)
 ? Math.max(1, Math.floor(preferredHeadword.frequencyRank))
 : undefined,
-};
-if (Array.isArray(preferredHeadword.wordClasses) && preferredHeadword.wordClasses.length > 0) {
-tokenPayload.wordClasses = preferredHeadword.wordClasses;
-}
-tokens.push(tokenPayload);
+});
 i += originalTextLength;
 continue;
 }

View File

@@ -347,25 +347,11 @@ function isSentenceBoundaryToken(token: MergedToken): boolean {
 return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
 }

-function hasSentenceBoundaryInSourceGap(
-sourceText: string | undefined,
-previousEnd: number | null,
-nextStart: number,
-): boolean {
-if (typeof sourceText !== 'string' || previousEnd === null || nextStart <= previousEnd) {
-return false;
-}
-const gap = sourceText.slice(previousEnd, nextStart);
-return [...gap].some((char) => SENTENCE_BOUNDARY_SURFACES.has(char));
-}
-
 export function markNPlusOneTargets(
 tokens: MergedToken[],
 minSentenceWords = 3,
 pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
 pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
-sourceText?: string,
 ): MergedToken[] {
 if (tokens.length === 0) {
 return [];
@@ -377,7 +363,6 @@
 }));
 let sentenceStart = 0;
-let previousTokenEnd: number | null = null;
 const minimumSentenceWords = Number.isInteger(minSentenceWords)
 ? Math.max(1, minSentenceWords)
 : 3;
@@ -408,15 +393,10 @@
 for (let i = 0; i < markedTokens.length; i++) {
 const token = markedTokens[i];
 if (!token) continue;
-if (hasSentenceBoundaryInSourceGap(sourceText, previousTokenEnd, token.startPos)) {
-markSentence(sentenceStart, i);
-sentenceStart = i;
-}
 if (isSentenceBoundaryToken(token)) {
 markSentence(sentenceStart, i);
 sentenceStart = i + 1;
 }
-previousTokenEnd = token.endPos;
 }

 if (sentenceStart < markedTokens.length) {