mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-04 00:41:33 -07:00
fix: preserve known highlighting for filtered tokens
This commit is contained in:
@@ -0,0 +1,53 @@
|
|||||||
|
---
|
||||||
|
id: TASK-333
|
||||||
|
title: Suppress aru subtitle annotations
|
||||||
|
status: Done
|
||||||
|
assignee: []
|
||||||
|
created_date: '2026-05-04 04:39'
|
||||||
|
updated_date: '2026-05-04 05:02'
|
||||||
|
labels:
|
||||||
|
- tokenizer
|
||||||
|
- annotations
|
||||||
|
- bug
|
||||||
|
dependencies: []
|
||||||
|
priority: medium
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
Add `ある` / `有る` / `在る` to the subtitle annotation suppression path so `aru` tokens remain hoverable but never receive N+1, JLPT, frequency, or name-match annotation metadata. Known-word highlighting is the one exception: if a filtered `aru` token is known and known highlighting is enabled, it should still render as known.
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
- [x] #1 `ある` and kanji headword/surface variants such as `有る` are excluded by the subtitle annotation filter.
|
||||||
|
- [x] #2 Annotation stripping clears N+1, JLPT, frequency, and name metadata for `aru` tokens while preserving token hover data.
|
||||||
|
- [x] #3 Known-word highlighting still applies to filtered tokens, including `aru`, when known-word lookup marks them known.
|
||||||
|
- [x] #4 Regression coverage fails before the fix and passes after.
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
<!-- SECTION:PLAN:BEGIN -->
|
||||||
|
1. Add `ある`/`有る`/`在る` to the shared subtitle annotation hard-exclusion terms.
|
||||||
|
2. Preserve/recompute known-word status for filtered tokens while stripping N+1, JLPT, frequency, and name metadata.
|
||||||
|
3. Add RED/GREEN unit and tokenizer regression coverage, plus a changelog fragment.
|
||||||
|
4. Run targeted tests and full handoff gate.
|
||||||
|
<!-- SECTION:PLAN:END -->
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
<!-- SECTION:NOTES:BEGIN -->
|
||||||
|
TDD path: added failing annotation-stage coverage first. Initial implementation made targeted tests pass, then broader tokenizer coverage revealed an older fixture expecting `ある` to remain lexical; updated that integration expectation to the new requested behavior. Follow-up correction: known-word highlighting is the lone annotation exception for filtered tokens, so the strip path now preserves known state and `annotateTokens` recomputes known status for filtered tokens while still clearing N+1/JLPT/frequency/name metadata.
|
||||||
|
<!-- SECTION:NOTES:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
Suppressed non-known subtitle annotations for `aru` existence verbs by adding `ある`, `有る`, and `在る` to the shared hard-exclusion list. Corrected the filtered-token path so known-word highlighting still applies whenever known highlighting is enabled; filtered tokens now keep/gain `isKnown` but still lose N+1, JLPT, frequency, and name metadata.
|
||||||
|
|
||||||
|
Added and updated annotation-stage and tokenizer regression coverage for `aru`, particles, helper fragments, interjections, and other filtered known tokens. Added `changes/333-aru-annotation-filter.md`.
|
||||||
|
|
||||||
|
Validation: RED failures were observed before the implementation and before the follow-up correction. After the fix, all of the following passed: `bun test src/core/services/tokenizer/annotation-stage.test.ts`; `bun test src/core/services/tokenizer.test.ts`; `bun run typecheck`; `bun run format:check:src`; `bun run changelog:lint`; `bun run test:fast`; `bun run test:env`; `bun run build`; `bun run test:smoke:dist`.
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
type: fixed
|
||||||
|
area: tokenizer
|
||||||
|
|
||||||
|
- Suppressed N+1, JLPT, frequency, and name styling for `ある` / `有る` / `在る` existence verbs while still allowing known-word highlighting.
|
||||||
@@ -129,7 +129,7 @@ test('tokenizeSubtitle splits same-line grammar endings before applying annotati
|
|||||||
assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
|
assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
|
||||||
assert.equal(result.tokens?.[0]?.frequencyRank, 40);
|
assert.equal(result.tokens?.[0]?.frequencyRank, 40);
|
||||||
assert.equal(result.tokens?.[1]?.surface, 'です');
|
assert.equal(result.tokens?.[1]?.surface, 'です');
|
||||||
assert.equal(result.tokens?.[1]?.isKnown, false);
|
assert.equal(result.tokens?.[1]?.isKnown, true);
|
||||||
assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
|
assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
|
||||||
assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
|
assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
|
||||||
assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
|
assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
|
||||||
@@ -3893,7 +3893,7 @@ test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper
|
|||||||
{
|
{
|
||||||
surface: 'これで',
|
surface: 'これで',
|
||||||
headword: 'これ',
|
headword: 'これ',
|
||||||
isKnown: false,
|
isKnown: true,
|
||||||
isNPlusOneTarget: false,
|
isNPlusOneTarget: false,
|
||||||
frequencyRank: undefined,
|
frequencyRank: undefined,
|
||||||
jlptLevel: undefined,
|
jlptLevel: undefined,
|
||||||
@@ -4008,7 +4008,7 @@ test('tokenizeSubtitle clears all annotations for explanatory pondering endings'
|
|||||||
{
|
{
|
||||||
surface: 'のかな',
|
surface: 'のかな',
|
||||||
headword: 'の',
|
headword: 'の',
|
||||||
isKnown: false,
|
isKnown: true,
|
||||||
isNPlusOneTarget: false,
|
isNPlusOneTarget: false,
|
||||||
frequencyRank: undefined,
|
frequencyRank: undefined,
|
||||||
jlptLevel: undefined,
|
jlptLevel: undefined,
|
||||||
@@ -4306,7 +4306,7 @@ test('tokenizeSubtitle clears all annotations for explanatory contrast endings',
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while preserving lexical content', async () => {
|
test('tokenizeSubtitle clears annotations for ja-nai explanatory endings and aru verbs', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'みたいなのあるじゃないですか',
|
'みたいなのあるじゃないですか',
|
||||||
makeDepsFromYomitanTokens(
|
makeDepsFromYomitanTokens(
|
||||||
@@ -4322,7 +4322,7 @@ test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while p
|
|||||||
text === 'みたい' ? 320 : text === 'ある' ? 240 : text === 'じゃない' ? 80 : null,
|
text === 'みたい' ? 320 : text === 'ある' ? 240 : text === 'じゃない' ? 80 : null,
|
||||||
getJlptLevel: (text) =>
|
getJlptLevel: (text) =>
|
||||||
text === 'みたい' ? 'N4' : text === 'ある' ? 'N5' : text === 'じゃない' ? 'N5' : null,
|
text === 'みたい' ? 'N4' : text === 'ある' ? 'N5' : text === 'じゃない' ? 'N5' : null,
|
||||||
isKnownWord: (text) => text === 'みたい' || text === 'の',
|
isKnownWord: (text) => text === 'みたい' || text === 'の' || text === 'ある',
|
||||||
getMinSentenceWordsForNPlusOne: () => 1,
|
getMinSentenceWordsForNPlusOne: () => 1,
|
||||||
tokenizeWithMecab: async () => [
|
tokenizeWithMecab: async () => [
|
||||||
{
|
{
|
||||||
@@ -4447,10 +4447,10 @@ test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while p
|
|||||||
{
|
{
|
||||||
surface: 'ある',
|
surface: 'ある',
|
||||||
headword: 'ある',
|
headword: 'ある',
|
||||||
isKnown: false,
|
isKnown: true,
|
||||||
isNPlusOneTarget: false,
|
isNPlusOneTarget: false,
|
||||||
frequencyRank: 240,
|
frequencyRank: undefined,
|
||||||
jlptLevel: 'N5',
|
jlptLevel: undefined,
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
@@ -4492,7 +4492,7 @@ test('tokenizeSubtitle clears annotations for standalone polite copula endings w
|
|||||||
{
|
{
|
||||||
surface: 'ですよ',
|
surface: 'ですよ',
|
||||||
headword: 'です',
|
headword: 'です',
|
||||||
isKnown: false,
|
isKnown: true,
|
||||||
isNPlusOneTarget: false,
|
isNPlusOneTarget: false,
|
||||||
frequencyRank: undefined,
|
frequencyRank: undefined,
|
||||||
jlptLevel: undefined,
|
jlptLevel: undefined,
|
||||||
@@ -4819,7 +4819,7 @@ test('tokenizeSubtitle clears annotations for auxiliary inflection fragments whi
|
|||||||
{
|
{
|
||||||
surface: 'れた',
|
surface: 'れた',
|
||||||
headword: 'れる',
|
headword: 'れる',
|
||||||
isKnown: false,
|
isKnown: true,
|
||||||
isNPlusOneTarget: false,
|
isNPlusOneTarget: false,
|
||||||
frequencyRank: undefined,
|
frequencyRank: undefined,
|
||||||
jlptLevel: undefined,
|
jlptLevel: undefined,
|
||||||
@@ -4956,7 +4956,7 @@ test('tokenizeSubtitle clears annotations for te-kureru auxiliary helper spans',
|
|||||||
{
|
{
|
||||||
surface: 'てく',
|
surface: 'てく',
|
||||||
headword: 'てく',
|
headword: 'てく',
|
||||||
isKnown: false,
|
isKnown: true,
|
||||||
isNPlusOneTarget: false,
|
isNPlusOneTarget: false,
|
||||||
frequencyRank: undefined,
|
frequencyRank: undefined,
|
||||||
jlptLevel: undefined,
|
jlptLevel: undefined,
|
||||||
@@ -4967,7 +4967,7 @@ test('tokenizeSubtitle clears annotations for te-kureru auxiliary helper spans',
|
|||||||
{
|
{
|
||||||
surface: 'れた',
|
surface: 'れた',
|
||||||
headword: 'れる',
|
headword: 'れる',
|
||||||
isKnown: false,
|
isKnown: true,
|
||||||
isNPlusOneTarget: false,
|
isNPlusOneTarget: false,
|
||||||
frequencyRank: undefined,
|
frequencyRank: undefined,
|
||||||
jlptLevel: undefined,
|
jlptLevel: undefined,
|
||||||
|
|||||||
@@ -608,6 +608,29 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes bare くれ auxiliary f
|
|||||||
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
|
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('shouldExcludeTokenFromSubtitleAnnotations excludes aru existence verbs', () => {
|
||||||
|
for (const token of [
|
||||||
|
makeToken({
|
||||||
|
surface: 'ある',
|
||||||
|
headword: 'ある',
|
||||||
|
reading: 'アル',
|
||||||
|
partOfSpeech: PartOfSpeech.verb,
|
||||||
|
pos1: '動詞',
|
||||||
|
pos2: '自立',
|
||||||
|
}),
|
||||||
|
makeToken({
|
||||||
|
surface: '有る',
|
||||||
|
headword: '有る',
|
||||||
|
reading: 'アル',
|
||||||
|
partOfSpeech: PartOfSpeech.verb,
|
||||||
|
pos1: '動詞',
|
||||||
|
pos2: '自立',
|
||||||
|
}),
|
||||||
|
]) {
|
||||||
|
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone quote particle and auxiliary grammar terms', () => {
|
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone quote particle and auxiliary grammar terms', () => {
|
||||||
for (const token of [
|
for (const token of [
|
||||||
makeToken({
|
makeToken({
|
||||||
@@ -654,7 +677,7 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes single-kana surface fra
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
|
test('stripSubtitleAnnotationMetadata keeps known hover data while clearing non-known annotation fields', () => {
|
||||||
const token = makeToken({
|
const token = makeToken({
|
||||||
surface: 'は',
|
surface: 'は',
|
||||||
headword: 'は',
|
headword: 'は',
|
||||||
@@ -670,7 +693,6 @@ test('stripSubtitleAnnotationMetadata keeps token hover data while clearing anno
|
|||||||
|
|
||||||
assert.deepEqual(stripSubtitleAnnotationMetadata(token), {
|
assert.deepEqual(stripSubtitleAnnotationMetadata(token), {
|
||||||
...token,
|
...token,
|
||||||
isKnown: false,
|
|
||||||
isNPlusOneTarget: false,
|
isNPlusOneTarget: false,
|
||||||
isNameMatch: false,
|
isNameMatch: false,
|
||||||
jlptLevel: undefined,
|
jlptLevel: undefined,
|
||||||
@@ -876,8 +898,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
|
|||||||
);
|
);
|
||||||
|
|
||||||
assert.equal(result[0]?.isKnown, false);
|
assert.equal(result[0]?.isKnown, false);
|
||||||
assert.equal(result[1]?.isKnown, false);
|
assert.equal(result[1]?.isKnown, true);
|
||||||
assert.equal(result[2]?.isKnown, false);
|
assert.equal(result[2]?.isKnown, true);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -1330,13 +1352,13 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
|
|||||||
{ minSentenceWordsForNPlusOne: 1 },
|
{ minSentenceWordsForNPlusOne: 1 },
|
||||||
);
|
);
|
||||||
|
|
||||||
assert.equal(result[0]?.isKnown, false);
|
assert.equal(result[0]?.isKnown, true);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.jlptLevel, undefined);
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens clears all annotations for kana-only non-independent noun helper merges', () => {
|
test('annotateTokens keeps known status while clearing other annotations for kana-only non-independent noun helper merges', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'ことに',
|
surface: 'ことに',
|
||||||
@@ -1360,13 +1382,13 @@ test('annotateTokens clears all annotations for kana-only non-independent noun h
|
|||||||
{ minSentenceWordsForNPlusOne: 1 },
|
{ minSentenceWordsForNPlusOne: 1 },
|
||||||
);
|
);
|
||||||
|
|
||||||
assert.equal(result[0]?.isKnown, false);
|
assert.equal(result[0]?.isKnown, true);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.jlptLevel, undefined);
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens clears all annotations for standalone auxiliary inflection fragments', () => {
|
test('annotateTokens keeps known status while clearing other annotations for standalone auxiliary inflection fragments', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'れる',
|
surface: 'れる',
|
||||||
@@ -1402,14 +1424,14 @@ test('annotateTokens clears all annotations for standalone auxiliary inflection
|
|||||||
);
|
);
|
||||||
|
|
||||||
for (const token of result) {
|
for (const token of result) {
|
||||||
assert.equal(token.isKnown, false, token.surface);
|
assert.equal(token.isKnown, true, token.surface);
|
||||||
assert.equal(token.isNPlusOneTarget, false, token.surface);
|
assert.equal(token.isNPlusOneTarget, false, token.surface);
|
||||||
assert.equal(token.frequencyRank, undefined, token.surface);
|
assert.equal(token.frequencyRank, undefined, token.surface);
|
||||||
assert.equal(token.jlptLevel, undefined, token.surface);
|
assert.equal(token.jlptLevel, undefined, token.surface);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens clears all annotations for auxiliary-only te-kureru helper spans', () => {
|
test('annotateTokens keeps known status while clearing other annotations for auxiliary-only te-kureru helper spans', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'てく',
|
surface: 'てく',
|
||||||
@@ -1445,7 +1467,7 @@ test('annotateTokens clears all annotations for auxiliary-only te-kureru helper
|
|||||||
);
|
);
|
||||||
|
|
||||||
for (const token of result) {
|
for (const token of result) {
|
||||||
assert.equal(token.isKnown, false, token.surface);
|
assert.equal(token.isKnown, true, token.surface);
|
||||||
assert.equal(token.isNPlusOneTarget, false, token.surface);
|
assert.equal(token.isNPlusOneTarget, false, token.surface);
|
||||||
assert.equal(token.frequencyRank, undefined, token.surface);
|
assert.equal(token.frequencyRank, undefined, token.surface);
|
||||||
assert.equal(token.jlptLevel, undefined, token.surface);
|
assert.equal(token.jlptLevel, undefined, token.surface);
|
||||||
@@ -1481,7 +1503,7 @@ test('annotateTokens keeps lexical くれる forms eligible for annotation', ()
|
|||||||
assert.equal(result[0]?.jlptLevel, 'N4');
|
assert.equal(result[0]?.jlptLevel, 'N4');
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens clears all annotations for standalone して helper fragments', () => {
|
test('annotateTokens keeps known status while clearing other annotations for standalone して helper fragments', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'してる',
|
surface: 'してる',
|
||||||
@@ -1505,13 +1527,13 @@ test('annotateTokens clears all annotations for standalone して helper fragmen
|
|||||||
{ minSentenceWordsForNPlusOne: 1 },
|
{ minSentenceWordsForNPlusOne: 1 },
|
||||||
);
|
);
|
||||||
|
|
||||||
assert.equal(result[0]?.isKnown, false);
|
assert.equal(result[0]?.isKnown, true);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.jlptLevel, undefined);
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens clears all annotations for standalone particle fragments without POS tags', () => {
|
test('annotateTokens keeps known status while clearing other annotations for standalone particle fragments without POS tags', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'と',
|
surface: 'と',
|
||||||
@@ -1535,7 +1557,7 @@ test('annotateTokens clears all annotations for standalone particle fragments wi
|
|||||||
{ minSentenceWordsForNPlusOne: 1 },
|
{ minSentenceWordsForNPlusOne: 1 },
|
||||||
);
|
);
|
||||||
|
|
||||||
assert.equal(result[0]?.isKnown, false);
|
assert.equal(result[0]?.isKnown, true);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.jlptLevel, undefined);
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
@@ -1591,7 +1613,7 @@ test('annotateTokens does not mark standalone connective particles as N+1', () =
|
|||||||
assert.equal(result[1]?.jlptLevel, undefined);
|
assert.equal(result[1]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens clears all annotations for rhetorical もんか grammar particle phrases', () => {
|
test('annotateTokens keeps known status while clearing other annotations for rhetorical もんか grammar particle phrases', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'もんか',
|
surface: 'もんか',
|
||||||
@@ -1615,13 +1637,13 @@ test('annotateTokens clears all annotations for rhetorical もんか grammar par
|
|||||||
{ minSentenceWordsForNPlusOne: 1 },
|
{ minSentenceWordsForNPlusOne: 1 },
|
||||||
);
|
);
|
||||||
|
|
||||||
assert.equal(result[0]?.isKnown, false);
|
assert.equal(result[0]?.isKnown, true);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.jlptLevel, undefined);
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens clears all annotations for bare くれ auxiliary fragments', () => {
|
test('annotateTokens keeps known status while clearing other annotations for bare くれ auxiliary fragments', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'くれ',
|
surface: 'くれ',
|
||||||
@@ -1645,13 +1667,50 @@ test('annotateTokens clears all annotations for bare くれ auxiliary fragments'
|
|||||||
{ minSentenceWordsForNPlusOne: 1 },
|
{ minSentenceWordsForNPlusOne: 1 },
|
||||||
);
|
);
|
||||||
|
|
||||||
assert.equal(result[0]?.isKnown, false);
|
assert.equal(result[0]?.isKnown, true);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.jlptLevel, undefined);
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens clears all annotations for standalone quote particle and auxiliary grammar terms', () => {
|
test('annotateTokens keeps known status while clearing other annotations for aru existence verbs', () => {
|
||||||
|
const tokens = [
|
||||||
|
makeToken({
|
||||||
|
surface: '有る',
|
||||||
|
headword: '有る',
|
||||||
|
reading: 'アル',
|
||||||
|
partOfSpeech: PartOfSpeech.verb,
|
||||||
|
pos1: '動詞',
|
||||||
|
pos2: '自立',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 2,
|
||||||
|
frequencyRank: 8447,
|
||||||
|
isKnown: true,
|
||||||
|
isNPlusOneTarget: true,
|
||||||
|
isNameMatch: true,
|
||||||
|
jlptLevel: 'N5',
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = annotateTokens(
|
||||||
|
tokens,
|
||||||
|
makeDeps({
|
||||||
|
isKnownWord: (text) => text === '有る' || text === 'ある',
|
||||||
|
getJlptLevel: (text) => (text === '有る' || text === 'ある' ? 'N5' : null),
|
||||||
|
}),
|
||||||
|
{ minSentenceWordsForNPlusOne: 1 },
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.equal(result[0]?.surface, '有る');
|
||||||
|
assert.equal(result[0]?.headword, '有る');
|
||||||
|
assert.equal(result[0]?.isKnown, true);
|
||||||
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
|
assert.equal(result[0]?.isNameMatch, false);
|
||||||
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('annotateTokens keeps known status while clearing other annotations for standalone quote particle and auxiliary grammar terms', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'って',
|
surface: 'って',
|
||||||
@@ -1687,14 +1746,14 @@ test('annotateTokens clears all annotations for standalone quote particle and au
|
|||||||
);
|
);
|
||||||
|
|
||||||
for (const token of result) {
|
for (const token of result) {
|
||||||
assert.equal(token.isKnown, false, token.surface);
|
assert.equal(token.isKnown, true, token.surface);
|
||||||
assert.equal(token.isNPlusOneTarget, false, token.surface);
|
assert.equal(token.isNPlusOneTarget, false, token.surface);
|
||||||
assert.equal(token.frequencyRank, undefined, token.surface);
|
assert.equal(token.frequencyRank, undefined, token.surface);
|
||||||
assert.equal(token.jlptLevel, undefined, token.surface);
|
assert.equal(token.jlptLevel, undefined, token.surface);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
|
test('annotateTokens keeps known status while clearing other annotations from standalone あ interjections without POS tags', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'あ',
|
surface: 'あ',
|
||||||
@@ -1724,7 +1783,7 @@ test('annotateTokens clears all annotations from standalone あ interjections wi
|
|||||||
assert.equal(result[0]?.surface, 'あ');
|
assert.equal(result[0]?.surface, 'あ');
|
||||||
assert.equal(result[0]?.headword, 'あ');
|
assert.equal(result[0]?.headword, 'あ');
|
||||||
assert.equal(result[0]?.reading, 'あ');
|
assert.equal(result[0]?.reading, 'あ');
|
||||||
assert.equal(result[0]?.isKnown, false);
|
assert.equal(result[0]?.isKnown, true);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.jlptLevel, undefined);
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
@@ -1786,7 +1845,7 @@ test('annotateTokens clears all annotations from expressive subtitle interjectio
|
|||||||
);
|
);
|
||||||
|
|
||||||
for (const token of result.slice(0, 2)) {
|
for (const token of result.slice(0, 2)) {
|
||||||
assert.equal(token.isKnown, false, token.surface);
|
assert.equal(token.isKnown, true, token.surface);
|
||||||
assert.equal(token.isNPlusOneTarget, false, token.surface);
|
assert.equal(token.isNPlusOneTarget, false, token.surface);
|
||||||
assert.equal(token.frequencyRank, undefined, token.surface);
|
assert.equal(token.frequencyRank, undefined, token.surface);
|
||||||
assert.equal(token.jlptLevel, undefined, token.surface);
|
assert.equal(token.jlptLevel, undefined, token.surface);
|
||||||
|
|||||||
@@ -559,36 +559,6 @@ function computeTokenKnownStatus(
|
|||||||
return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
|
return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
|
||||||
}
|
}
|
||||||
|
|
||||||
function computeExcludedTokenKnownStatus(
|
|
||||||
token: MergedToken,
|
|
||||||
isKnownWord: (text: string) => boolean,
|
|
||||||
): boolean {
|
|
||||||
const normalizedSurface = token.surface.trim();
|
|
||||||
if (!hasKanjiChar(normalizedSurface)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (normalizedSurface && isKnownWord(normalizedSurface)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
const normalizedReading = token.reading.trim();
|
|
||||||
if (
|
|
||||||
normalizedReading &&
|
|
||||||
normalizedReading !== normalizedSurface &&
|
|
||||||
isKnownWord(normalizedReading)
|
|
||||||
) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
const normalizedHeadword = token.headword.trim();
|
|
||||||
return (
|
|
||||||
normalizedHeadword.length > 0 &&
|
|
||||||
normalizedHeadword === normalizedSurface &&
|
|
||||||
isKnownWord(normalizedHeadword)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
function filterTokenFrequencyRank(
|
function filterTokenFrequencyRank(
|
||||||
token: MergedToken,
|
token: MergedToken,
|
||||||
pos1Exclusions: ReadonlySet<string>,
|
pos1Exclusions: ReadonlySet<string>,
|
||||||
@@ -657,7 +627,9 @@ export function annotateTokens(
|
|||||||
});
|
});
|
||||||
return {
|
return {
|
||||||
...strippedToken,
|
...strippedToken,
|
||||||
isKnown: nPlusOneEnabled && computeExcludedTokenKnownStatus(token, deps.isKnownWord),
|
isKnown: nPlusOneEnabled
|
||||||
|
? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
|
||||||
|
: false,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ const STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET: ReadonlySet<string> = new Set(
|
|||||||
export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
||||||
'あ',
|
'あ',
|
||||||
'ああ',
|
'ああ',
|
||||||
|
'ある',
|
||||||
'あなた',
|
'あなた',
|
||||||
'あんた',
|
'あんた',
|
||||||
'ええ',
|
'ええ',
|
||||||
@@ -51,6 +52,8 @@ export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
|||||||
'何だ',
|
'何だ',
|
||||||
'何も',
|
'何も',
|
||||||
'如何した',
|
'如何した',
|
||||||
|
'有る',
|
||||||
|
'在る',
|
||||||
'様',
|
'様',
|
||||||
'確かに',
|
'確かに',
|
||||||
'誰も',
|
'誰も',
|
||||||
@@ -507,7 +510,6 @@ export function stripSubtitleAnnotationMetadata(
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
...token,
|
...token,
|
||||||
isKnown: false,
|
|
||||||
isNPlusOneTarget: false,
|
isNPlusOneTarget: false,
|
||||||
isNameMatch: false,
|
isNameMatch: false,
|
||||||
jlptLevel: undefined,
|
jlptLevel: undefined,
|
||||||
|
|||||||
Reference in New Issue
Block a user