mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-23 00:11:28 -07:00
feat(stats): add v1 immersion stats dashboard (#19)
This commit is contained in:
@@ -19,6 +19,7 @@
 import { PartOfSpeech, Token, MergedToken } from './types';
 import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
 import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
+import { shouldExcludeTokenFromSubtitleAnnotations } from './core/services/tokenizer/subtitle-annotation-filter';
 
 export function isNoun(tok: Token): boolean {
   return tok.partOfSpeech === PartOfSpeech.noun;
@@ -169,13 +170,17 @@ export function mergeTokens(
   isKnownWord: (text: string) => boolean = () => false,
   knownWordMatchMode: 'headword' | 'surface' = 'headword',
   shouldLookupKnownWords = true,
+  sourceText?: string,
 ): MergedToken[] {
   if (!tokens || tokens.length === 0) {
     return [];
   }
 
   const result: MergedToken[] = [];
+  const normalizedSourceText =
+    typeof sourceText === 'string' ? sourceText.replace(/\r?\n/g, ' ').trim() : null;
   let charOffset = 0;
+  let sourceCursor = 0;
   let lastStandaloneToken: Token | null = null;
   const resolveKnownMatch = (text: string | undefined): boolean => {
     if (!shouldLookupKnownWords || !text) {
@@ -185,9 +190,12 @@ export function mergeTokens(
   };
 
   for (const token of tokens) {
-    const start = charOffset;
-    const end = charOffset + token.word.length;
+    const matchedStart =
+      normalizedSourceText !== null ? normalizedSourceText.indexOf(token.word, sourceCursor) : -1;
+    const start = matchedStart >= sourceCursor ? matchedStart : charOffset;
+    const end = start + token.word.length;
     charOffset = end;
+    sourceCursor = end;
 
     let shouldMergeToken = false;
 
@@ -290,6 +298,10 @@ function isNPlusOneWordCountToken(
   pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
   pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
 ): boolean {
+  if (shouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions })) {
+    return false;
+  }
+
   const normalizedPos1 = normalizePos1Tag(token.pos1);
   const hasPos1 = normalizedPos1.length > 0;
   if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
Reference in New Issue
Block a user