fix: preserve ordinal frequency annotations

This commit is contained in:
2026-05-03 21:07:46 -07:00
parent 4bd8fc3db4
commit 3284c40ab5
6 changed files with 258 additions and 2 deletions
+63
View File
@@ -4077,6 +4077,69 @@ test('tokenizeSubtitle keeps frequency for content-led merged token with trailin
assert.equal(result.tokens?.[0]?.frequencyRank, 5468);
});
// Yomitan yields 第二 as one token while the MeCab stub splits it into 第 + 二;
// the assertions check that the resulting token carries the combined POS tags
// and still reports the frequency rank supplied for 第二.
test('tokenizeSubtitle keeps frequency for ordinal prefix-noun tokens', async () => {
  // Flags shared by every MeCab token in this fixture.
  const plainTokenFlags = {
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  };

  // Frequency ranks served by the stubbed dictionary lookup.
  const rankByText = new Map([
    ['第二', 1820],
    ['走者', 41555],
  ]);

  const yomitanTokens = [
    { surface: '第二', reading: 'だいに', headword: '第二' },
    { surface: '走者', reading: 'そうしゃ', headword: '走者' },
  ];

  const deps = makeDepsFromYomitanTokens(yomitanTokens, {
    getFrequencyDictionaryEnabled: () => true,
    getFrequencyRank: (text) => rankByText.get(text) ?? null,
    tokenizeWithMecab: async () => [
      {
        headword: '第',
        surface: '第',
        reading: 'ダイ',
        startPos: 0,
        endPos: 1,
        partOfSpeech: PartOfSpeech.other,
        pos1: '接頭詞',
        pos2: '数接続',
        ...plainTokenFlags,
      },
      {
        headword: '二',
        surface: '二',
        reading: 'ニ',
        startPos: 1,
        endPos: 2,
        partOfSpeech: PartOfSpeech.noun,
        pos1: '名詞',
        pos2: '数',
        ...plainTokenFlags,
      },
      {
        headword: '走者',
        surface: '走者',
        reading: 'ソウシャ',
        startPos: 2,
        endPos: 4,
        partOfSpeech: PartOfSpeech.noun,
        pos1: '名詞',
        pos2: '一般',
        ...plainTokenFlags,
      },
    ],
    getMinSentenceWordsForNPlusOne: () => 1,
  });

  const tokenized = await tokenizeSubtitle('第二走者', deps);
  const leadingToken = tokenized.tokens?.[0];

  assert.equal(leadingToken?.surface, '第二');
  assert.equal(leadingToken?.pos1, '接頭詞|名詞');
  assert.equal(leadingToken?.pos2, '数接続|数');
  assert.equal(leadingToken?.frequencyRank, 1820);
});
test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => {
const result = await tokenizeSubtitle(
'最近辛いものが続いとるんですけど',
@@ -149,6 +149,24 @@ function shouldAllowContentLedMergedTokenFrequency(
return true;
}
/**
 * Reports whether a token looks like an ordinal "第" + numeral compound.
 *
 * The token qualifies when its trimmed surface or headword begins with "第"
 * and its normalized part-of-speech tags have this shape:
 * - pos1: first part is 接頭詞, and some later part is 名詞;
 * - pos2: first part is 数接続, and some later part is 数.
 *
 * @param token - Token whose surface, headword, pos1 and pos2 are inspected.
 * @returns `true` only when every condition above holds.
 */
function shouldAllowOrdinalPrefixNounFrequency(token: MergedToken): boolean {
  const surfaceText = token.surface.trim();
  const headwordText = token.headword.trim();
  const hasOrdinalMarker = surfaceText.startsWith('第') || headwordText.startsWith('第');
  if (!hasOrdinalMarker) {
    return false;
  }

  const pos1Tags = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
  const pos2Tags = splitNormalizedTagParts(normalizePos2Tag(token.pos2));

  // A single pos1 part cannot be a prefix followed by a noun.
  if (pos1Tags.length < 2) {
    return false;
  }

  // The leading parts must be the prefix / number-connecting tags.
  if (pos1Tags[0] !== '接頭詞' || pos2Tags[0] !== '数接続') {
    return false;
  }

  // The noun / numeral tags must appear somewhere after the leading part.
  return pos1Tags.includes('名詞', 1) && pos2Tags.includes('数', 1);
}
function isFrequencyExcludedByPos(
token: MergedToken,
pos1Exclusions: ReadonlySet<string>,
@@ -168,12 +186,21 @@ function isFrequencyExcludedByPos(
pos1Exclusions,
pos2Exclusions,
);
const allowOrdinalPrefixNounToken = shouldAllowOrdinalPrefixNounFrequency(token);
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions) && !allowContentLedMergedToken) {
if (
isExcludedByTagSet(normalizedPos1, pos1Exclusions) &&
!allowContentLedMergedToken &&
!allowOrdinalPrefixNounToken
) {
return true;
}
if (isExcludedByTagSet(normalizedPos2, pos2Exclusions) && !allowContentLedMergedToken) {
if (
isExcludedByTagSet(normalizedPos2, pos2Exclusions) &&
!allowContentLedMergedToken &&
!allowOrdinalPrefixNounToken
) {
return true;
}
@@ -891,6 +891,105 @@ test('requestYomitanScanTokens can use frequency from later exact secondary-matc
]);
});
// Regression test: the selected (primary) dictionary entry for 第二 has the
// reading だいに but no frequency data, while a secondary exact match with an
// empty reading carries the only frequency entry. The scan result is expected
// to keep the primary reading and still report the rank from that entry.
test('requestYomitanScanTokens uses exact frequency entry when selected reading differs', async () => {
// Filled in by the fake executor below once the scanner script is seen.
let scannerScript = '';
const deps = createDeps(async (script) => {
// The script containing 'termsFind' is captured for replay and answered with
// an empty result; this check runs before the 'optionsGetFull' check.
if (script.includes('termsFind')) {
scannerScript = script;
return [];
}
// Canned options payload: three enabled dictionaries, all flagged
// 'rank-based', with JPDBv2㋕ given priority 0.
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profileIndex: 0,
scanLength: 40,
dictionaries: ['JPDBv2㋕', 'Jiten', 'CC100'],
dictionaryPriorityByName: {
'JPDBv2㋕': 0,
Jiten: 1,
CC100: 2,
},
dictionaryFrequencyModeByName: {
'JPDBv2㋕': 'rank-based',
Jiten: 'rank-based',
CC100: 'rank-based',
},
profiles: [
{
options: {
scanning: { length: 40 },
dictionaries: [
{ name: 'JPDBv2㋕', enabled: true, id: 0 },
{ name: 'Jiten', enabled: true, id: 1 },
{ name: 'CC100', enabled: true, id: 2 },
],
},
},
],
};
}
return null;
});
// This call's return value is discarded; it is made so the fake executor can
// capture the generated scanner script. The error logger is a no-op.
await requestYomitanScanTokens('第二走者', deps, {
error: () => undefined,
});
// Replay the captured script with a handler that only accepts 'termsFind'.
// NOTE(review): presumably runInjectedYomitanScript routes the script's
// backend calls to this handler — confirm against the helper.
const result = (await runInjectedYomitanScript(scannerScript, (action, params) => {
if (action !== 'termsFind') {
throw new Error(`unexpected action: ${action}`);
}
const text = (params as { text?: string } | undefined)?.text ?? '';
// Only lookups whose text starts with 第二 produce entries.
if (!text.startsWith('第二')) {
return { originalTextLength: 0, dictionaryEntries: [] };
}
return {
originalTextLength: 2,
dictionaryEntries: [
// Primary exact match: has the reading だいに but no frequencies.
{
headwords: [
{
term: '第二',
reading: 'だいに',
sources: [{ originalText: '第二', isPrimary: true, matchType: 'exact' }],
},
],
frequencies: [],
},
// Secondary exact match: same term, empty reading, and the only
// frequency entry in this response.
{
headwords: [
{
term: '第二',
reading: '',
sources: [{ originalText: '第二', isPrimary: false, matchType: 'exact' }],
},
],
frequencies: [
{
headwordIndex: 0,
dictionary: 'JPDBv2㋕',
frequency: 189513,
displayValue: '1820,189513句',
},
],
},
],
};
})) as Array<Record<string, unknown>>;
// The first token keeps the primary reading; the expected rank 1820 equals the
// leading number of displayValue, not the raw frequency value 189513.
assert.deepEqual(result?.[0], {
surface: '第二',
reading: 'だいに',
headword: '第二',
startPos: 0,
endPos: 2,
isNameMatch: false,
frequencyRank: 1820,
});
});
test('requestYomitanScanTokens marks tokens backed by SubMiner character dictionary entries', async () => {
const deps = createDeps(async (script) => {
if (script.includes('optionsGetFull')) {
@@ -960,6 +960,9 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
const matchReading = typeof match.headword?.reading === 'string' ? match.headword.reading : '';
const preferredReading =
typeof preferredMatch.headword?.reading === 'string' ? preferredMatch.headword.reading : '';
if (!matchReading || !preferredReading) {
return true;
}
return matchReading === preferredReading;
}
function getBestFrequencyRankForMatches(matches, dictionaryPriorityByName, dictionaryFrequencyModeByName) {