mirror of https://github.com/ksyasuda/SubMiner.git (synced 2026-05-04 00:41:33 -07:00)
fix: suppress known highlights for subtitle particles
@@ -0,0 +1,72 @@
---
id: TASK-338
title: Fix known-word highlight on standalone subtitle particles
status: Done
assignee:
  - codex
created_date: '2026-05-04 05:52'
updated_date: '2026-05-04 05:57'
labels:
  - bug
  - subtitle
  - tokenizer
dependencies: []
references:
  - src/core/services/tokenizer/annotation-stage.ts
  - src/core/services/tokenizer/subtitle-annotation-filter.ts
  - src/renderer/subtitle-render.ts
priority: medium
---

## Description

<!-- SECTION:DESCRIPTION:BEGIN -->
Standalone grammar particles such as に should not render in known-word green when they appear in the known-word cache as readings for other words. Keep known-word coloring for lexical tokens, but prevent grammar-excluded subtitle tokens from receiving the known-green highlight.
<!-- SECTION:DESCRIPTION:END -->
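
To make the false positive concrete, here is a minimal sketch of how a reading-indexed known-word cache can report a bare particle as known. The deck entries and cache shape are hypothetical illustrations, not the actual SubMiner implementation:

```ts
// Hypothetical known-word cache that indexes both headwords and readings
// taken from known-word decks (the entries below are invented examples).
const knownWordCache = new Set<string>();

for (const entry of [
  { headword: '似る', reading: 'にる' },
  { headword: '荷', reading: 'に' },
]) {
  knownWordCache.add(entry.headword);
  knownWordCache.add(entry.reading);
}

const isKnownWord = (text: string): boolean => knownWordCache.has(text);

// The standalone particle に matches via the reading of 荷, so before this
// fix it was rendered in known-word green.
console.log(isKnownWord('に')); // true — the false positive this task removes
```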

## Acceptance Criteria

<!-- AC:BEGIN -->
- [x] #1 Standalone grammar particles like に do not retain isKnown after subtitle annotation filtering.
- [x] #2 Lexical known-word tokens still render as known when not grammar-excluded.
- [x] #3 Focused regression test covers the particle false-positive path.
<!-- AC:END -->
## Implementation Plan

<!-- SECTION:PLAN:BEGIN -->
1. Add a focused regression in `src/core/services/tokenizer/annotation-stage.test.ts` showing that the standalone particle `に` is grammar-excluded and does not retain `isKnown` even when `isKnownWord('に')` returns true.
2. Run the focused tokenizer annotation test and confirm the new test fails against the current behavior.
3. Patch `src/core/services/tokenizer/annotation-stage.ts` so grammar-excluded tokens clear known status while still stripping N+1/frequency/JLPT/name metadata; a sketch of this branch follows the plan.
4. Run the focused test file, then inspect the diff and update the task acceptance criteria.
<!-- SECTION:PLAN:END -->
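
The sketch below shows the shape of the exclusion branch from step 3. The `Token` interface and the `clearGrammarExcludedAnnotations` name are simplified stand-ins for the real module internals, but the key move matches the production hunk at the end of this commit: grammar-excluded tokens now return `isKnown: false` instead of recomputing known status from the cache.

```ts
// Simplified stand-in for the token shape used by the annotation stage.
interface Token {
  surface: string;
  isKnown: boolean;
  isNPlusOneTarget: boolean;
  isNameMatch: boolean;
  frequencyRank?: number;
  jlptLevel?: string;
}

// Hypothetical helper mirroring the grammar-excluded branch of annotateTokens.
function clearGrammarExcludedAnnotations(token: Token): Token {
  // Strip N+1/frequency/JLPT/name metadata, as the stage already did...
  const strippedToken: Token = {
    ...token,
    isNPlusOneTarget: false,
    isNameMatch: false,
    frequencyRank: undefined,
    jlptLevel: undefined,
  };
  // ...and now also clear known status, so a known-word-cache hit on a bare
  // particle surface or reading can no longer paint it known-green.
  return { ...strippedToken, isKnown: false };
}
```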

## Implementation Notes

<!-- SECTION:NOTES:BEGIN -->
Implemented tokenizer annotation filtering so grammar-excluded subtitle tokens clear known-word status instead of retaining the green known coloring. Added a focused regression for the known-word-cache particle false positive and updated existing expectations for unified annotation clearing. Verification: `bun test src/core/services/tokenizer/annotation-stage.test.ts --test-name-pattern "clears known status from standalone particles"` failed before the production patch; after the patch, `bun test src/core/services/tokenizer/annotation-stage.test.ts`, `bun test src/core/services/tokenizer.test.ts`, the combined tokenizer tests, `bun run typecheck`, `bun run changelog:lint`, and `bun run test:fast` all passed.

Full handoff gate follow-up: `bun run test:env` and `bun run build` passed. `bun run test:smoke:dist` fails outside this tokenizer change, in `dist/core/services/overlay-manager.test.js`, because current dirty overlay-window code calls `window.getTitle()` on a test mock that does not provide it (see the sketch below).
<!-- SECTION:NOTES:END -->
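
A minimal sketch of that unrelated failure mode, with an invented mock, for whoever picks it up: the overlay code calls `window.getTitle()`, but the test double only stubs the methods earlier tests needed:

```ts
// Hypothetical test double missing getTitle(); calling it throws at runtime.
type OverlayWindow = { show(): void; hide(): void; getTitle(): string };

const mockWindow = {
  show: () => {},
  hide: () => {},
} as OverlayWindow; // the cast hides the missing method from the compiler

mockWindow.getTitle(); // TypeError: mockWindow.getTitle is not a function
```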

## Final Summary

<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Summary:

- Cleared `isKnown` for grammar-excluded subtitle tokens in the tokenizer annotation stage, preventing standalone particles such as `に` from rendering as known just because a known-word deck contains a matching reading.
- Added a focused regression test for the known-word-cache false positive and updated tokenizer expectations so helper/grammar spans consistently clear all subtitle annotations.
- Added changelog fragment `changes/338-known-word-particle-highlights.md`.

Verification:

- `bun test src/core/services/tokenizer/annotation-stage.test.ts --test-name-pattern "clears known status from standalone particles"` failed before the production patch.
- `bun test src/core/services/tokenizer/annotation-stage.test.ts`
- `bun test src/core/services/tokenizer.test.ts`
- `bun test src/core/services/tokenizer/annotation-stage.test.ts src/core/services/tokenizer.test.ts`
- `bun run typecheck`
- `bun run changelog:lint`
- `bun run test:fast`
- `bun run test:env`
- `bun run build`

Blocked/External:

- `bun run test:smoke:dist` currently fails outside this tokenizer change in `dist/core/services/overlay-manager.test.js`: dirty overlay-window code calls `window.getTitle()` on a test mock without that method.
<!-- SECTION:FINAL_SUMMARY:END -->

changes/338-known-word-particle-highlights.md
@@ -0,0 +1,4 @@
type: fixed
area: tokenizer

- Prevented standalone grammar and helper tokens such as `に` from being colored as known words when readings from known-word decks match them.

src/core/services/tokenizer.test.ts
@@ -129,7 +129,7 @@ test('tokenizeSubtitle splits same-line grammar endings before applying annotati
 assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
 assert.equal(result.tokens?.[0]?.frequencyRank, 40);
 assert.equal(result.tokens?.[1]?.surface, 'です');
-assert.equal(result.tokens?.[1]?.isKnown, true);
+assert.equal(result.tokens?.[1]?.isKnown, false);
 assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
 assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
 assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
@@ -3230,7 +3230,7 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
 assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
 });

-test('tokenizeSubtitle preserves known-word highlight for exact non-independent kanji noun tokens', async () => {
+test('tokenizeSubtitle clears known-word highlight for exact non-independent kanji noun tokens', async () => {
 const result = await tokenizeSubtitle(
 'その点',
 makeDepsFromYomitanTokens(
@@ -3278,7 +3278,7 @@ test('tokenizeSubtitle preserves known-word highlight for exact non-independent
 assert.equal(result.tokens?.length, 2);
 assert.equal(result.tokens?.[0]?.isKnown, false);
 assert.equal(result.tokens?.[1]?.surface, '点');
-assert.equal(result.tokens?.[1]?.isKnown, true);
+assert.equal(result.tokens?.[1]?.isKnown, false);
 assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
 assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
 assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
@@ -3335,7 +3335,7 @@ test('tokenizeSubtitle keeps mecab-tagged interjections tokenized while clearing
 );
 });

-test('tokenizeSubtitle keeps excluded interjections hoverable while clearing only their annotation metadata', async () => {
+test('tokenizeSubtitle keeps excluded interjections hoverable while clearing annotation metadata', async () => {
 const result = await tokenizeSubtitle(
 'ぐはっ 猫',
 makeDeps({
@@ -3409,7 +3409,7 @@ test('tokenizeSubtitle keeps excluded interjections hoverable while clearing onl
 );
 });

-test('tokenizeSubtitle keeps explanatory ending variants hoverable while clearing only their annotation metadata', async () => {
+test('tokenizeSubtitle keeps explanatory ending variants hoverable while clearing annotation metadata', async () => {
 const result = await tokenizeSubtitle(
 '猫んです',
 makeDepsFromYomitanTokens(
@@ -3480,7 +3480,7 @@ test('tokenizeSubtitle keeps explanatory ending variants hoverable while clearin
 );
 });

-test('tokenizeSubtitle keeps standalone grammar-only tokens hoverable while clearing only their annotation metadata', async () => {
+test('tokenizeSubtitle keeps standalone grammar-only tokens hoverable while clearing annotation metadata', async () => {
 const result = await tokenizeSubtitle(
 '私はこの猫です',
 makeDeps({
@@ -3599,7 +3599,7 @@ test('tokenizeSubtitle keeps standalone grammar-only tokens hoverable while clea
 );
 });

-test('tokenizeSubtitle keeps trailing quote-particle merged tokens hoverable while clearing only their annotation metadata', async () => {
+test('tokenizeSubtitle keeps trailing quote-particle merged tokens hoverable while clearing annotation metadata', async () => {
 const result = await tokenizeSubtitle(
 'どうしてもって',
 makeDepsFromYomitanTokens(
@@ -3893,7 +3893,7 @@ test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper
 {
 surface: 'これで',
 headword: 'これ',
-isKnown: true,
+isKnown: false,
 isNPlusOneTarget: false,
 frequencyRank: undefined,
 jlptLevel: undefined,
@@ -4008,7 +4008,7 @@ test('tokenizeSubtitle clears all annotations for explanatory pondering endings'
 {
 surface: 'のかな',
 headword: 'の',
-isKnown: true,
+isKnown: false,
 isNPlusOneTarget: false,
 frequencyRank: undefined,
 jlptLevel: undefined,
@@ -4447,7 +4447,7 @@ test('tokenizeSubtitle clears annotations for ja-nai explanatory endings and aru
 {
 surface: 'ある',
 headword: 'ある',
-isKnown: true,
+isKnown: false,
 isNPlusOneTarget: false,
 frequencyRank: undefined,
 jlptLevel: undefined,
@@ -4492,7 +4492,7 @@ test('tokenizeSubtitle clears annotations for standalone polite copula endings w
 {
 surface: 'ですよ',
 headword: 'です',
-isKnown: true,
+isKnown: false,
 isNPlusOneTarget: false,
 frequencyRank: undefined,
 jlptLevel: undefined,
@@ -4819,7 +4819,7 @@ test('tokenizeSubtitle clears annotations for auxiliary inflection fragments whi
 {
 surface: 'れた',
 headword: 'れる',
-isKnown: true,
+isKnown: false,
 isNPlusOneTarget: false,
 frequencyRank: undefined,
 jlptLevel: undefined,
@@ -4956,7 +4956,7 @@ test('tokenizeSubtitle clears annotations for te-kureru auxiliary helper spans',
 {
 surface: 'てく',
 headword: 'てく',
-isKnown: true,
+isKnown: false,
 isNPlusOneTarget: false,
 frequencyRank: undefined,
 jlptLevel: undefined,
@@ -4967,7 +4967,7 @@ test('tokenizeSubtitle clears annotations for te-kureru auxiliary helper spans',
 {
 surface: 'れた',
 headword: 'れる',
-isKnown: true,
+isKnown: false,
 isNPlusOneTarget: false,
 frequencyRank: undefined,
 jlptLevel: undefined,

src/core/services/tokenizer/annotation-stage.test.ts
@@ -898,8 +898,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
 );

 assert.equal(result[0]?.isKnown, false);
-assert.equal(result[1]?.isKnown, true);
-assert.equal(result[2]?.isKnown, true);
+assert.equal(result[1]?.isKnown, false);
+assert.equal(result[2]?.isKnown, false);
 assert.equal(result[0]?.isNPlusOneTarget, false);
 });

@@ -1113,7 +1113,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
 assert.equal(result[0]?.isNPlusOneTarget, false);
 });

-test('annotateTokens preserves exact known-word status for non-independent kanji noun tokens', () => {
+test('annotateTokens clears known-word status for non-independent kanji noun tokens', () => {
 const tokens = [
 makeToken({
 surface: '点',
@@ -1138,7 +1138,7 @@ test('annotateTokens preserves exact known-word status for non-independent kanji
 { minSentenceWordsForNPlusOne: 1 },
 );

-assert.equal(result[0]?.isKnown, true);
+assert.equal(result[0]?.isKnown, false);
 assert.equal(result[0]?.isNPlusOneTarget, false);
 assert.equal(result[0]?.frequencyRank, undefined);
 assert.equal(result[0]?.jlptLevel, undefined);
@@ -1352,13 +1352,13 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
 { minSentenceWordsForNPlusOne: 1 },
 );

-assert.equal(result[0]?.isKnown, true);
+assert.equal(result[0]?.isKnown, false);
 assert.equal(result[0]?.isNPlusOneTarget, false);
 assert.equal(result[0]?.frequencyRank, undefined);
 assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens keeps known status while clearing other annotations for kana-only non-independent noun helper merges', () => {
+test('annotateTokens clears known status and other annotations for kana-only non-independent noun helper merges', () => {
 const tokens = [
 makeToken({
 surface: 'ことに',
@@ -1382,13 +1382,13 @@ test('annotateTokens keeps known status while clearing other annotations for kan
 { minSentenceWordsForNPlusOne: 1 },
 );

-assert.equal(result[0]?.isKnown, true);
+assert.equal(result[0]?.isKnown, false);
 assert.equal(result[0]?.isNPlusOneTarget, false);
 assert.equal(result[0]?.frequencyRank, undefined);
 assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens keeps known status while clearing other annotations for standalone auxiliary inflection fragments', () => {
+test('annotateTokens clears known status and other annotations for standalone auxiliary inflection fragments', () => {
 const tokens = [
 makeToken({
 surface: 'れる',
@@ -1424,14 +1424,14 @@ test('annotateTokens keeps known status while clearing other annotations for sta
 );

 for (const token of result) {
-assert.equal(token.isKnown, true, token.surface);
+assert.equal(token.isKnown, false, token.surface);
 assert.equal(token.isNPlusOneTarget, false, token.surface);
 assert.equal(token.frequencyRank, undefined, token.surface);
 assert.equal(token.jlptLevel, undefined, token.surface);
 }
 });

-test('annotateTokens keeps known status while clearing other annotations for auxiliary-only te-kureru helper spans', () => {
+test('annotateTokens clears known status and other annotations for auxiliary-only te-kureru helper spans', () => {
 const tokens = [
 makeToken({
 surface: 'てく',
@@ -1467,7 +1467,7 @@ test('annotateTokens keeps known status while clearing other annotations for aux
 );

 for (const token of result) {
-assert.equal(token.isKnown, true, token.surface);
+assert.equal(token.isKnown, false, token.surface);
 assert.equal(token.isNPlusOneTarget, false, token.surface);
 assert.equal(token.frequencyRank, undefined, token.surface);
 assert.equal(token.jlptLevel, undefined, token.surface);
@@ -1503,7 +1503,7 @@ test('annotateTokens keeps lexical くれる forms eligible for annotation', ()
 assert.equal(result[0]?.jlptLevel, 'N4');
 });

-test('annotateTokens keeps known status while clearing other annotations for standalone して helper fragments', () => {
+test('annotateTokens clears known status and other annotations for standalone して helper fragments', () => {
 const tokens = [
 makeToken({
 surface: 'してる',
@@ -1527,13 +1527,13 @@ test('annotateTokens keeps known status while clearing other annotations for sta
 { minSentenceWordsForNPlusOne: 1 },
 );

-assert.equal(result[0]?.isKnown, true);
+assert.equal(result[0]?.isKnown, false);
 assert.equal(result[0]?.isNPlusOneTarget, false);
 assert.equal(result[0]?.frequencyRank, undefined);
 assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens keeps known status while clearing other annotations for standalone particle fragments without POS tags', () => {
+test('annotateTokens clears known status and other annotations for standalone particle fragments without POS tags', () => {
 const tokens = [
 makeToken({
 surface: 'と',
@@ -1557,12 +1557,54 @@ test('annotateTokens keeps known status while clearing other annotations for sta
 { minSentenceWordsForNPlusOne: 1 },
 );

-assert.equal(result[0]?.isKnown, true);
+assert.equal(result[0]?.isKnown, false);
 assert.equal(result[0]?.isNPlusOneTarget, false);
 assert.equal(result[0]?.frequencyRank, undefined);
 assert.equal(result[0]?.jlptLevel, undefined);
 });

+test('annotateTokens clears known status from standalone particles even when the known-word cache contains them', () => {
+const tokens = [
+makeToken({
+surface: 'に',
+headword: 'に',
+reading: 'ニ',
+partOfSpeech: PartOfSpeech.particle,
+pos1: '助詞',
+pos2: '格助詞',
+startPos: 0,
+endPos: 1,
+frequencyRank: 2,
+}),
+makeToken({
+surface: '泉',
+headword: '泉',
+reading: 'イズミ',
+partOfSpeech: PartOfSpeech.noun,
+pos1: '名詞',
+pos2: '一般',
+startPos: 1,
+endPos: 2,
+frequencyRank: 50,
+}),
+];
+
+const result = annotateTokens(
+tokens,
+makeDeps({
+isKnownWord: (text) => text === 'に' || text === '泉',
+getJlptLevel: (text) => (text === 'に' ? 'N5' : null),
+}),
+{ minSentenceWordsForNPlusOne: 1 },
+);
+
+assert.equal(result[0]?.isKnown, false);
+assert.equal(result[0]?.isNPlusOneTarget, false);
+assert.equal(result[0]?.frequencyRank, undefined);
+assert.equal(result[0]?.jlptLevel, undefined);
+assert.equal(result[1]?.isKnown, true);
+});
+
 test('annotateTokens does not mark standalone connective particles as N+1', () => {
 const tokens = [
 makeToken({
@@ -1613,7 +1655,7 @@ test('annotateTokens does not mark standalone connective particles as N+1', () =
 assert.equal(result[1]?.jlptLevel, undefined);
 });

-test('annotateTokens keeps known status while clearing other annotations for rhetorical もんか grammar particle phrases', () => {
+test('annotateTokens clears known status and other annotations for rhetorical もんか grammar particle phrases', () => {
 const tokens = [
 makeToken({
 surface: 'もんか',
@@ -1637,13 +1679,13 @@ test('annotateTokens keeps known status while clearing other annotations for rhe
 { minSentenceWordsForNPlusOne: 1 },
 );

-assert.equal(result[0]?.isKnown, true);
+assert.equal(result[0]?.isKnown, false);
 assert.equal(result[0]?.isNPlusOneTarget, false);
 assert.equal(result[0]?.frequencyRank, undefined);
 assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens keeps known status while clearing other annotations for bare くれ auxiliary fragments', () => {
+test('annotateTokens clears known status and other annotations for bare くれ auxiliary fragments', () => {
 const tokens = [
 makeToken({
 surface: 'くれ',
@@ -1667,13 +1709,13 @@ test('annotateTokens keeps known status while clearing other annotations for bar
 { minSentenceWordsForNPlusOne: 1 },
 );

-assert.equal(result[0]?.isKnown, true);
+assert.equal(result[0]?.isKnown, false);
 assert.equal(result[0]?.isNPlusOneTarget, false);
 assert.equal(result[0]?.frequencyRank, undefined);
 assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens keeps known status while clearing other annotations for aru existence verbs', () => {
+test('annotateTokens clears known status and other annotations for aru existence verbs', () => {
 const tokens = [
 makeToken({
 surface: '有る',
@@ -1703,14 +1745,14 @@ test('annotateTokens keeps known status while clearing other annotations for aru

 assert.equal(result[0]?.surface, '有る');
 assert.equal(result[0]?.headword, '有る');
-assert.equal(result[0]?.isKnown, true);
+assert.equal(result[0]?.isKnown, false);
 assert.equal(result[0]?.isNPlusOneTarget, false);
 assert.equal(result[0]?.isNameMatch, false);
 assert.equal(result[0]?.frequencyRank, undefined);
 assert.equal(result[0]?.jlptLevel, undefined);
 });

-test('annotateTokens keeps known status while clearing other annotations for standalone quote particle and auxiliary grammar terms', () => {
+test('annotateTokens clears known status and other annotations for standalone quote particle and auxiliary grammar terms', () => {
 const tokens = [
 makeToken({
 surface: 'って',
@@ -1746,14 +1788,14 @@ test('annotateTokens keeps known status while clearing other annotations for sta
 );

 for (const token of result) {
-assert.equal(token.isKnown, true, token.surface);
+assert.equal(token.isKnown, false, token.surface);
 assert.equal(token.isNPlusOneTarget, false, token.surface);
 assert.equal(token.frequencyRank, undefined, token.surface);
 assert.equal(token.jlptLevel, undefined, token.surface);
 }
 });

-test('annotateTokens keeps known status while clearing other annotations from standalone あ interjections without POS tags', () => {
+test('annotateTokens clears known status and other annotations from standalone あ interjections without POS tags', () => {
 const tokens = [
 makeToken({
 surface: 'あ',
@@ -1783,7 +1825,7 @@ test('annotateTokens keeps known status while clearing other annotations from st
 assert.equal(result[0]?.surface, 'あ');
 assert.equal(result[0]?.headword, 'あ');
 assert.equal(result[0]?.reading, 'あ');
-assert.equal(result[0]?.isKnown, true);
+assert.equal(result[0]?.isKnown, false);
 assert.equal(result[0]?.isNPlusOneTarget, false);
 assert.equal(result[0]?.frequencyRank, undefined);
 assert.equal(result[0]?.jlptLevel, undefined);
@@ -1845,7 +1887,7 @@ test('annotateTokens clears all annotations from expressive subtitle interjectio
 );

 for (const token of result.slice(0, 2)) {
-assert.equal(token.isKnown, true, token.surface);
+assert.equal(token.isKnown, false, token.surface);
 assert.equal(token.isNPlusOneTarget, false, token.surface);
 assert.equal(token.frequencyRank, undefined, token.surface);
 assert.equal(token.jlptLevel, undefined, token.surface);

src/core/services/tokenizer/annotation-stage.ts
@@ -627,9 +627,7 @@ export function annotateTokens(
 });
 return {
 ...strippedToken,
-isKnown: nPlusOneEnabled
-? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
-: false,
+isKnown: false,
 };
 }