mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-20 12:11:28 -07:00
Decouple stats daemon and preserve final mine OSD status
- Run `subminer stats -b` as a dedicated daemon process, independent from the overlay app
- Stop Anki progress spinner before showing final `✓`/`x` mine result so it is not overwritten
- Keep grammar/noise subtitle tokens hoverable while stripping annotation metadata
This commit is contained in:
@@ -130,6 +130,30 @@ test('serializeSubtitleMarkup preserves tooltip attrs and name-match precedence'
|
||||
assert.doesNotMatch(markup, /data-frequency-rank="12"|data-jlpt-level="N5"|word-jlpt-n5/);
|
||||
});
|
||||
|
||||
// A bare particle token with every annotation flag off must still be emitted
// as a `word` span that carries only its reading and headword attributes.
test('serializeSubtitleMarkup keeps filtered tokens hoverable without annotation attrs', () => {
  const payload: SubtitleData = {
    text: 'は',
    tokens: [
      {
        surface: 'は',
        reading: 'は',
        headword: 'は',
        startPos: 0,
        endPos: 1,
        partOfSpeech: PartOfSpeech.particle,
        pos1: '助詞',
        isMerged: false,
        isKnown: false,
        isNPlusOneTarget: false,
        isNameMatch: false,
      },
    ],
  };

  const markup = serializeSubtitleMarkup(payload, frequencyOptions);

  // Exact string match: any extra attribute or class on the span fails the test.
  assert.equal(markup, '<span class="word" data-reading="は" data-headword="は">は</span>');
});
|
||||
|
||||
test('serializeSubtitleWebsocketMessage emits sentence payload', () => {
|
||||
const payload: SubtitleData = {
|
||||
text: '字幕',
|
||||
|
||||
@@ -1305,7 +1305,7 @@ test('tokenizeSubtitle ignores frequency lookup failures', async () => {
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle skips frequency rank when Yomitan token is enriched as particle by mecab pos1', async () => {
|
||||
test('tokenizeSubtitle keeps standalone particle token hoverable while clearing annotation metadata', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'は',
|
||||
makeDeps({
|
||||
@@ -1350,9 +1350,33 @@ test('tokenizeSubtitle skips frequency rank when Yomitan token is enriched as pa
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.pos1, '助詞');
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||
assert.equal(result.text, 'は');
|
||||
assert.deepEqual(
|
||||
result.tokens?.map((token) => ({
|
||||
surface: token.surface,
|
||||
reading: token.reading,
|
||||
headword: token.headword,
|
||||
pos1: token.pos1,
|
||||
isKnown: token.isKnown,
|
||||
isNPlusOneTarget: token.isNPlusOneTarget,
|
||||
isNameMatch: token.isNameMatch,
|
||||
jlptLevel: token.jlptLevel,
|
||||
frequencyRank: token.frequencyRank,
|
||||
})),
|
||||
[
|
||||
{
|
||||
surface: 'は',
|
||||
reading: 'は',
|
||||
headword: 'は',
|
||||
pos1: '助詞',
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
isNameMatch: false,
|
||||
jlptLevel: undefined,
|
||||
frequencyRank: undefined,
|
||||
},
|
||||
],
|
||||
);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle keeps frequency rank when mecab tags classify token as content-bearing', async () => {
|
||||
@@ -1460,7 +1484,7 @@ test('tokenizeSubtitle skips JLPT level for excluded demonstratives', async () =
|
||||
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle excludes repeated kana interjections from annotation payloads entirely', async () => {
|
||||
test('tokenizeSubtitle keeps repeated kana interjections tokenized while clearing annotation metadata', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'ああ',
|
||||
makeDeps({
|
||||
@@ -1491,7 +1515,29 @@ test('tokenizeSubtitle excludes repeated kana interjections from annotation payl
|
||||
}),
|
||||
);
|
||||
|
||||
assert.deepEqual(result, { text: 'ああ', tokens: null });
|
||||
assert.equal(result.text, 'ああ');
|
||||
assert.deepEqual(
|
||||
result.tokens?.map((token) => ({
|
||||
surface: token.surface,
|
||||
headword: token.headword,
|
||||
reading: token.reading,
|
||||
jlptLevel: token.jlptLevel,
|
||||
frequencyRank: token.frequencyRank,
|
||||
isKnown: token.isKnown,
|
||||
isNPlusOneTarget: token.isNPlusOneTarget,
|
||||
})),
|
||||
[
|
||||
{
|
||||
surface: 'ああ',
|
||||
headword: 'ああ',
|
||||
reading: 'ああ',
|
||||
jlptLevel: undefined,
|
||||
frequencyRank: undefined,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
],
|
||||
);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => {
|
||||
@@ -2578,7 +2624,15 @@ test('tokenizeSubtitle keeps correct MeCab pos1 enrichment when Yomitan offsets
|
||||
const gaToken = result.tokens?.find((token) => token.surface === 'が');
|
||||
const desuToken = result.tokens?.find((token) => token.surface === 'です');
|
||||
assert.equal(gaToken?.pos1, '助詞');
|
||||
assert.equal(gaToken?.isKnown, false);
|
||||
assert.equal(gaToken?.isNPlusOneTarget, false);
|
||||
assert.equal(gaToken?.jlptLevel, undefined);
|
||||
assert.equal(gaToken?.frequencyRank, undefined);
|
||||
assert.equal(desuToken?.pos1, '助動詞');
|
||||
assert.equal(desuToken?.isKnown, false);
|
||||
assert.equal(desuToken?.isNPlusOneTarget, false);
|
||||
assert.equal(desuToken?.jlptLevel, undefined);
|
||||
assert.equal(desuToken?.frequencyRank, undefined);
|
||||
assert.equal(targets.length, 1);
|
||||
assert.equal(targets[0]?.surface, '仮面');
|
||||
});
|
||||
@@ -3056,7 +3110,7 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
|
||||
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle excludes mecab-tagged interjections from annotation payloads entirely', async () => {
|
||||
test('tokenizeSubtitle keeps mecab-tagged interjections tokenized while clearing annotation metadata', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'ぐはっ',
|
||||
makeDepsFromYomitanTokens([{ surface: 'ぐはっ', reading: 'ぐはっ', headword: 'ぐはっ' }], {
|
||||
@@ -3080,10 +3134,34 @@ test('tokenizeSubtitle excludes mecab-tagged interjections from annotation paylo
|
||||
}),
|
||||
);
|
||||
|
||||
assert.deepEqual(result, { text: 'ぐはっ', tokens: null });
|
||||
assert.equal(result.text, 'ぐはっ');
|
||||
assert.deepEqual(
|
||||
result.tokens?.map((token) => ({
|
||||
surface: token.surface,
|
||||
headword: token.headword,
|
||||
reading: token.reading,
|
||||
pos1: token.pos1,
|
||||
jlptLevel: token.jlptLevel,
|
||||
frequencyRank: token.frequencyRank,
|
||||
isKnown: token.isKnown,
|
||||
isNPlusOneTarget: token.isNPlusOneTarget,
|
||||
})),
|
||||
[
|
||||
{
|
||||
surface: 'ぐはっ',
|
||||
headword: 'ぐはっ',
|
||||
reading: 'ぐはっ',
|
||||
pos1: '感動詞',
|
||||
jlptLevel: undefined,
|
||||
frequencyRank: undefined,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
],
|
||||
);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle keeps visible text while excluding interjections from mixed annotation payloads', async () => {
|
||||
test('tokenizeSubtitle keeps excluded interjections hoverable while clearing only their annotation metadata', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'ぐはっ 猫',
|
||||
makeDeps({
|
||||
@@ -3147,8 +3225,261 @@ test('tokenizeSubtitle keeps visible text while excluding interjections from mix
|
||||
result.tokens?.map((token) => ({
|
||||
surface: token.surface,
|
||||
headword: token.headword,
|
||||
frequencyRank: token.frequencyRank,
|
||||
jlptLevel: token.jlptLevel,
|
||||
})),
|
||||
[{ surface: '猫', headword: '猫' }],
|
||||
[
|
||||
{ surface: 'ぐはっ', headword: 'ぐはっ', frequencyRank: undefined, jlptLevel: undefined },
|
||||
{ surface: '猫', headword: '猫', frequencyRank: 11, jlptLevel: 'N5' },
|
||||
],
|
||||
);
|
||||
});
|
||||
|
||||
// The Yomitan token `んです` (headword `ん`) must stay in the token list, but its
// JLPT level and frequency rank are expected to be cleared; the content word
// `猫` keeps both.
test('tokenizeSubtitle keeps explanatory ending variants hoverable while clearing only their annotation metadata', async () => {
  const result = await tokenizeSubtitle(
    '猫んです',
    makeDepsFromYomitanTokens(
      [
        { surface: '猫', reading: 'ねこ', headword: '猫' },
        { surface: 'んです', reading: 'んです', headword: 'ん' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        // Non-`猫` lookups return a rank too, so a cleared rank proves stripping happened.
        getFrequencyRank: (text) => (text === '猫' ? 11 : 500),
        getJlptLevel: (text) => (text === '猫' ? 'N5' : null),
        tokenizeWithMecab: async () => [
          {
            headword: '猫',
            surface: '猫',
            reading: 'ネコ',
            startPos: 0,
            endPos: 1,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '一般',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'ん',
            surface: 'ん',
            reading: 'ン',
            startPos: 1,
            endPos: 2,
            partOfSpeech: PartOfSpeech.other,
            pos1: '名詞',
            pos2: '非自立',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'です',
            surface: 'です',
            reading: 'デス',
            startPos: 2,
            endPos: 4,
            partOfSpeech: PartOfSpeech.bound_auxiliary,
            pos1: '助動詞',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
        ],
      },
    ),
  );

  assert.equal(result.text, '猫んです');
  assert.deepEqual(
    result.tokens?.map((token) => ({
      surface: token.surface,
      headword: token.headword,
      jlptLevel: token.jlptLevel,
      frequencyRank: token.frequencyRank,
    })),
    [
      { surface: '猫', headword: '猫', jlptLevel: 'N5', frequencyRank: 11 },
      { surface: 'んです', headword: 'ん', jlptLevel: undefined, frequencyRank: undefined },
    ],
  );
});
|
||||
|
||||
// In `私はこの猫です`, all five tokens must remain in the result. The particle
// `は`, the adnominal `この` and the auxiliary `です` are expected to lose their
// frequency rank and JLPT level, while `私` and `猫` keep theirs.
test('tokenizeSubtitle keeps standalone grammar-only tokens hoverable while clearing only their annotation metadata', async () => {
  const result = await tokenizeSubtitle(
    '私はこの猫です',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      // Grammar tokens would get rank 500 from this lookup, so `undefined` in the
      // expectations below means the rank was actively cleared.
      getFrequencyRank: (text) => (text === '私' ? 50 : text === '猫' ? 11 : 500),
      getJlptLevel: (text) => (text === '私' ? 'N5' : text === '猫' ? 'N5' : null),
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            // Stub parser window: frequency queries get no data, every other
            // script gets a fixed scanning-parser segmentation.
            executeJavaScript: async (script: string) => {
              if (script.includes('getTermFrequencies')) {
                return [];
              }

              return [
                {
                  source: 'scanning-parser',
                  index: 0,
                  content: [
                    [{ text: '私', reading: 'わたし', headwords: [[{ term: '私' }]] }],
                    [{ text: 'は', reading: 'は', headwords: [[{ term: 'は' }]] }],
                    [{ text: 'この', reading: 'この', headwords: [[{ term: 'この' }]] }],
                    [{ text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]] }],
                    [{ text: 'です', reading: 'です', headwords: [[{ term: 'です' }]] }],
                  ],
                },
              ];
            },
          },
        }) as unknown as Electron.BrowserWindow,
      tokenizeWithMecab: async () => [
        {
          headword: '私',
          surface: '私',
          reading: 'ワタシ',
          startPos: 0,
          endPos: 1,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          pos2: '代名詞',
          isMerged: true,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          headword: 'は',
          surface: 'は',
          reading: 'ハ',
          startPos: 1,
          endPos: 2,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          pos2: '係助詞',
          isMerged: true,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          headword: 'この',
          surface: 'この',
          reading: 'コノ',
          startPos: 2,
          endPos: 4,
          partOfSpeech: PartOfSpeech.other,
          pos1: '連体詞',
          isMerged: true,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          headword: '猫',
          surface: '猫',
          reading: 'ネコ',
          startPos: 4,
          endPos: 5,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          pos2: '一般',
          isMerged: true,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          headword: 'です',
          surface: 'です',
          reading: 'デス',
          startPos: 5,
          endPos: 7,
          partOfSpeech: PartOfSpeech.bound_auxiliary,
          pos1: '助動詞',
          isMerged: true,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
    }),
  );

  assert.equal(result.text, '私はこの猫です');
  assert.deepEqual(
    result.tokens?.map((token) => ({
      surface: token.surface,
      headword: token.headword,
      frequencyRank: token.frequencyRank,
      jlptLevel: token.jlptLevel,
    })),
    [
      { surface: '私', headword: '私', frequencyRank: 50, jlptLevel: 'N5' },
      { surface: 'は', headword: 'は', frequencyRank: undefined, jlptLevel: undefined },
      { surface: 'この', headword: 'この', frequencyRank: undefined, jlptLevel: undefined },
      { surface: '猫', headword: '猫', frequencyRank: 11, jlptLevel: 'N5' },
      { surface: 'です', headword: 'です', frequencyRank: undefined, jlptLevel: undefined },
    ],
  );
});
|
||||
|
||||
// A single Yomitan token `どうしてもって` (headword `どうしても`) that MeCab splits into
// an adverb plus the particle `って` must stay in the result, with the JLPT
// level and frequency rank of the headword cleared.
test('tokenizeSubtitle keeps trailing quote-particle merged tokens hoverable while clearing only their annotation metadata', async () => {
  const result = await tokenizeSubtitle(
    'どうしてもって',
    makeDepsFromYomitanTokens(
      [{ surface: 'どうしてもって', reading: 'どうしてもって', headword: 'どうしても' }],
      {
        getFrequencyDictionaryEnabled: () => true,
        // The headword has both a rank and a JLPT level available, so the
        // `undefined` expectations below can only come from stripping.
        getFrequencyRank: (text) => (text === 'どうしても' ? 123 : null),
        getJlptLevel: (text) => (text === 'どうしても' ? 'N3' : null),
        tokenizeWithMecab: async () => [
          {
            headword: 'どうしても',
            surface: 'どうしても',
            reading: 'ドウシテモ',
            startPos: 0,
            endPos: 5,
            partOfSpeech: PartOfSpeech.other,
            pos1: '副詞',
            pos2: '一般',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'って',
            surface: 'って',
            reading: 'ッテ',
            startPos: 5,
            endPos: 7,
            partOfSpeech: PartOfSpeech.particle,
            pos1: '助詞',
            pos2: '格助詞',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
        ],
        getMinSentenceWordsForNPlusOne: () => 1,
      },
    ),
  );

  assert.equal(result.text, 'どうしてもって');
  assert.deepEqual(
    result.tokens?.map((token) => ({
      surface: token.surface,
      headword: token.headword,
      jlptLevel: token.jlptLevel,
      frequencyRank: token.frequencyRank,
    })),
    [
      {
        surface: 'どうしてもって',
        headword: 'どうしても',
        jlptLevel: undefined,
        frequencyRank: undefined,
      },
    ],
  );
});
|
||||
|
||||
|
||||
@@ -178,7 +178,7 @@ async function applyAnnotationStage(
|
||||
);
|
||||
}
|
||||
|
||||
async function filterSubtitleAnnotationTokens(tokens: MergedToken[]): Promise<MergedToken[]> {
|
||||
async function stripSubtitleAnnotationMetadata(tokens: MergedToken[]): Promise<MergedToken[]> {
|
||||
if (tokens.length === 0) {
|
||||
return tokens;
|
||||
}
|
||||
@@ -188,9 +188,7 @@ async function filterSubtitleAnnotationTokens(tokens: MergedToken[]): Promise<Me
|
||||
}
|
||||
|
||||
const annotationStage = await annotationStageModulePromise;
|
||||
return tokens.filter(
|
||||
(token) => !annotationStage.shouldExcludeTokenFromSubtitleAnnotations(token),
|
||||
);
|
||||
return tokens.map((token) => annotationStage.stripSubtitleAnnotationMetadata(token));
|
||||
}
|
||||
|
||||
export function createTokenizerDepsRuntime(
|
||||
@@ -721,12 +719,12 @@ export async function tokenizeSubtitle(
|
||||
|
||||
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
|
||||
if (yomitanTokens && yomitanTokens.length > 0) {
|
||||
const filteredTokens = await filterSubtitleAnnotationTokens(
|
||||
const annotatedTokens = await stripSubtitleAnnotationMetadata(
|
||||
await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
|
||||
);
|
||||
return {
|
||||
text: displayText,
|
||||
tokens: filteredTokens.length > 0 ? filteredTokens : null,
|
||||
tokens: annotatedTokens.length > 0 ? annotatedTokens : null,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,12 @@
|
||||
import assert from 'node:assert/strict';
|
||||
import test from 'node:test';
|
||||
import { MergedToken, PartOfSpeech } from '../../../types';
|
||||
import { annotateTokens, AnnotationStageDeps } from './annotation-stage';
|
||||
import {
|
||||
annotateTokens,
|
||||
AnnotationStageDeps,
|
||||
shouldExcludeTokenFromSubtitleAnnotations,
|
||||
stripSubtitleAnnotationMetadata,
|
||||
} from './annotation-stage';
|
||||
|
||||
function makeToken(overrides: Partial<MergedToken> = {}): MergedToken {
|
||||
return {
|
||||
@@ -150,6 +155,170 @@ test('annotateTokens handles JLPT disabled and eligibility exclusion paths', ()
|
||||
assert.equal(excludedLookupCalls, 0);
|
||||
});
|
||||
|
||||
// Each sample is an explanatory-ending surface form (んです, のだ, …) paired with
// the merged pos1/pos2 tags used for it; every one must be reported as excluded.
test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory ending variants', () => {
  const tokens = [
    makeToken({
      surface: 'んです',
      headword: 'ん',
      reading: 'ンデス',
      pos1: '名詞|助動詞',
      pos2: '非自立',
    }),
    makeToken({
      surface: 'のだ',
      headword: 'の',
      reading: 'ノダ',
      pos1: '名詞|助動詞',
      pos2: '非自立',
    }),
    makeToken({
      surface: 'んだ',
      headword: 'ん',
      reading: 'ンダ',
      pos1: '名詞|助動詞',
      pos2: '非自立',
    }),
    makeToken({
      surface: 'のです',
      headword: 'の',
      reading: 'ノデス',
      pos1: '名詞|助動詞',
      pos2: '非自立',
    }),
    makeToken({
      surface: 'なんです',
      headword: 'だ',
      reading: 'ナンデス',
      pos1: '助動詞|名詞|助動詞',
      pos2: '|非自立',
    }),
    makeToken({
      surface: 'んでした',
      headword: 'ん',
      reading: 'ンデシタ',
      pos1: '助動詞|助動詞|助動詞',
    }),
    makeToken({
      surface: 'のでは',
      headword: 'の',
      reading: 'ノデハ',
      pos1: '助詞|接続詞',
    }),
  ];

  for (const token of tokens) {
    // The surface is passed as the assertion message so a failure names the variant.
    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
  }
});
|
||||
|
||||
// Counter-case to the explanatory-ending test: an ordinary noun must not be excluded.
test('shouldExcludeTokenFromSubtitleAnnotations keeps lexical tokens outside explanatory ending family', () => {
  const token = makeToken({
    surface: '問題',
    headword: '問題',
    reading: 'モンダイ',
    partOfSpeech: PartOfSpeech.noun,
    pos1: '名詞',
    pos2: '一般',
  });

  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), false);
});
|
||||
|
||||
// Tokens whose only pos1 tag is particle (助詞), auxiliary (助動詞) or adnominal
// (連体詞) must each be reported as excluded.
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone particles auxiliaries and adnominals', () => {
  const tokens = [
    makeToken({
      surface: 'は',
      headword: 'は',
      reading: 'ハ',
      partOfSpeech: PartOfSpeech.particle,
      pos1: '助詞',
    }),
    makeToken({
      surface: 'です',
      headword: 'です',
      reading: 'デス',
      partOfSpeech: PartOfSpeech.bound_auxiliary,
      pos1: '助動詞',
    }),
    makeToken({
      surface: 'この',
      headword: 'この',
      reading: 'コノ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '連体詞',
    }),
  ];

  for (const token of tokens) {
    // The surface is passed as the assertion message so a failure names the token.
    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
  }
});
|
||||
|
||||
// A verb merged with a trailing auxiliary (pos1 `動詞|助動詞`) still carries
// content, so it must not be excluded.
test('shouldExcludeTokenFromSubtitleAnnotations keeps mixed content tokens with trailing helpers', () => {
  const token = makeToken({
    surface: '行きます',
    headword: '行く',
    reading: 'イキマス',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞|助動詞',
    pos2: '自立',
  });

  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), false);
});
|
||||
|
||||
// An adverb merged with the trailing particle `って` (pos1 `副詞|助詞`) must be excluded.
test('shouldExcludeTokenFromSubtitleAnnotations excludes merged lexical tokens with trailing quote particles', () => {
  const token = makeToken({
    surface: 'どうしてもって',
    headword: 'どうしても',
    reading: 'ドウシテモッテ',
    partOfSpeech: PartOfSpeech.other,
    pos1: '副詞|助詞',
    pos2: '一般|格助詞',
  });

  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
|
||||
|
||||
// Starts from a particle token with every annotation field populated and
// expects a copy in which only those five fields are reset.
test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
  const token = makeToken({
    surface: 'は',
    headword: 'は',
    reading: 'ハ',
    partOfSpeech: PartOfSpeech.particle,
    pos1: '助詞',
    isKnown: true,
    isNPlusOneTarget: true,
    isNameMatch: true,
    jlptLevel: 'N5',
    frequencyRank: 12,
  });

  // Spreading `token` first asserts that every non-annotation field is preserved.
  assert.deepEqual(stripSubtitleAnnotationMetadata(token), {
    ...token,
    isKnown: false,
    isNPlusOneTarget: false,
    isNameMatch: false,
    jlptLevel: undefined,
    frequencyRank: undefined,
  });
});
|
||||
|
||||
// A content noun must come back as the very same object, not a copy.
test('stripSubtitleAnnotationMetadata leaves content tokens unchanged', () => {
  const token = makeToken({
    surface: '猫',
    headword: '猫',
    reading: 'ネコ',
    partOfSpeech: PartOfSpeech.noun,
    pos1: '名詞',
    isKnown: true,
    jlptLevel: 'N5',
    frequencyRank: 42,
  });

  // strictEqual checks reference identity here.
  assert.strictEqual(stripSubtitleAnnotationMetadata(token), token);
});
|
||||
|
||||
test('annotateTokens prioritizes name matches over n+1, frequency, and JLPT when enabled', () => {
|
||||
let jlptLookupCalls = 0;
|
||||
const tokens = [
|
||||
|
||||
@@ -25,6 +25,45 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
||||
'ふう',
|
||||
'ほう',
|
||||
]);
|
||||
// Building blocks for explanatory sentence endings such as んです / のだ / なんですか.
// An ending is spelled as prefix + core + optional trailing particle.

/** Leading part of an explanatory ending. */
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'] as const;

/** Copula-like middle part of an explanatory ending. */
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
  'だ',
  'です',
  'でした',
  'だった',
  'では',
  'じゃ',
  'でしょう',
  'だろう',
] as const;

/** Optional trailing particle(s); the empty string covers the bare prefix + core form. */
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [
  '',
  'か',
  'ね',
  'よ',
  'な',
  'よね',
  'かな',
  'かね',
] as const;

/**
 * Every prefix × core × trailing-particle combination, precomputed once at
 * module load so callers can test a candidate string with a single `has`.
 */
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
  SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
    SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES.map(
        (particle) => `${prefix}${core}${particle}`,
      ),
    ),
  ),
);
|
||||
/**
 * Trailing `って` suffixes (bare, or followed by one further kana) that are
 * looked up against the part of a token's surface that follows its headword.
 */
const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
  'って',
  'ってよ',
  'ってね',
  'ってな',
  'ってさ',
  'ってか',
  'ってば',
]);
|
||||
|
||||
const jlptLevelLookupCaches = new WeakMap<
|
||||
(text: string) => JlptLevel | null,
|
||||
@@ -60,6 +99,7 @@ function normalizePos1Tag(pos1: string | undefined): string {
|
||||
}
|
||||
|
||||
/** pos1 tags that exclude a token as soon as any one of its parts matches (感動詞 = interjection). */
const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set(['感動詞']);

/**
 * pos1 tags that mark a part as grammar-only:
 * 助詞 (particle), 助動詞 (auxiliary) and 連体詞 (adnominal).
 */
const SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1 = new Set(['助詞', '助動詞', '連体詞']);
|
||||
|
||||
function splitNormalizedTagParts(normalizedTag: string): string[] {
|
||||
if (!normalizedTag) {
|
||||
@@ -84,7 +124,36 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<strin
|
||||
|
||||
/**
 * Decides from the pos1 tag alone whether a token is excluded from subtitle annotations.
 *
 * @param normalizedPos1 - pos1 tag string, split into parts by `splitNormalizedTagParts`.
 * @returns `true` when any part is in `SUBTITLE_ANNOTATION_EXCLUDED_POS1`, or when
 *   there is at least one part and every part is in
 *   `SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1`; otherwise `false`.
 */
function isExcludedFromSubtitleAnnotationsByPos1(normalizedPos1: string): boolean {
  const parts = splitNormalizedTagParts(normalizedPos1);

  // A single hard-excluded part is enough, whatever the other parts are.
  if (parts.some((part) => SUBTITLE_ANNOTATION_EXCLUDED_POS1.has(part))) {
    return true;
  }

  // The length guard matters: `every` is vacuously true on an empty list, and
  // an untagged token must not count as grammar-only.
  return parts.length > 0 && parts.every((part) => SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1.has(part));
}
|
||||
|
||||
/**
 * Detects a merged token made of a content word with a trailing `って`-style
 * suffix attached (for example surface `どうしてもって` with headword `どうしても`).
 *
 * All of the following must hold:
 * - the normalized surface and headword are non-empty and the surface starts with the headword;
 * - the rest of the surface is in `SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES`;
 * - pos1 has at least two parts, the first is not grammar-only, and every later part is `助詞`.
 *
 * @param token - Merged token to inspect; reads `surface`, `headword` and `pos1`.
 * @returns `true` when the token matches the pattern above.
 */
function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean {
  const normalizedSurface = normalizeJlptTextForExclusion(token.surface);
  const normalizedHeadword = normalizeJlptTextForExclusion(token.headword);
  if (!normalizedSurface || !normalizedHeadword || !normalizedSurface.startsWith(normalizedHeadword)) {
    return false;
  }

  // Whatever follows the headword inside the surface must be a known suffix.
  const suffix = normalizedSurface.slice(normalizedHeadword.length);
  if (!SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES.has(suffix)) {
    return false;
  }

  // A merged token needs a leading part plus at least one trailing part.
  const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
  if (pos1Parts.length < 2) {
    return false;
  }

  const [leadingPos1, ...trailingPos1] = pos1Parts;
  if (!leadingPos1 || SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1.has(leadingPos1)) {
    return false;
  }

  // `trailingPos1` is non-empty here because of the length check above.
  return trailingPos1.every((part) => part === '助詞');
}
|
||||
|
||||
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
|
||||
@@ -520,12 +589,7 @@ function isJlptEligibleToken(token: MergedToken): boolean {
|
||||
}
|
||||
|
||||
function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean {
|
||||
const candidates = [
|
||||
resolveJlptLookupText(token),
|
||||
token.surface,
|
||||
token.headword,
|
||||
token.reading,
|
||||
].filter(
|
||||
const candidates = [token.surface, token.reading, resolveJlptLookupText(token)].filter(
|
||||
(candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
|
||||
);
|
||||
|
||||
@@ -542,7 +606,9 @@ function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean {
|
||||
|
||||
if (
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmedCandidate) ||
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalizedCandidate)
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalizedCandidate) ||
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmedCandidate) ||
|
||||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalizedCandidate)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
@@ -565,9 +631,28 @@ export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): b
|
||||
return true;
|
||||
}
|
||||
|
||||
if (isExcludedTrailingParticleMergedToken(token)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return isExcludedFromSubtitleAnnotationsByTerm(token);
|
||||
}
|
||||
|
||||
/**
 * Clears annotation metadata from a token that is excluded from subtitle
 * annotations, while keeping the token itself.
 *
 * @param token - Token to inspect; it is never mutated.
 * @returns The same `token` reference when
 *   `shouldExcludeTokenFromSubtitleAnnotations(token)` is false. Otherwise a
 *   shallow copy with `isKnown`, `isNPlusOneTarget` and `isNameMatch` set to
 *   `false` and `jlptLevel` and `frequencyRank` set to `undefined`.
 */
export function stripSubtitleAnnotationMetadata(token: MergedToken): MergedToken {
  if (!shouldExcludeTokenFromSubtitleAnnotations(token)) {
    return token;
  }

  // All other fields are carried over by the spread.
  return {
    ...token,
    isKnown: false,
    isNPlusOneTarget: false,
    isNameMatch: false,
    jlptLevel: undefined,
    frequencyRank: undefined,
  };
}
|
||||
|
||||
function computeTokenKnownStatus(
|
||||
token: MergedToken,
|
||||
isKnownWord: (text: string) => boolean,
|
||||
|
||||
Reference in New Issue
Block a user