Merge paragraphs that are very close/overlapping in the other direction

This commit is contained in:
AuroraWright
2025-12-04 02:09:38 +01:00
parent 4e497ac3c3
commit 9c288a4770

View File

@@ -688,11 +688,18 @@ class TextFiltering:
if not all_lines:
return ocr_result
if self.debug_filtering:
for p in ocr_result.paragraphs:
logger.opt(colors=True).debug("<red>Engine paragraph: '{}' writing_direction: '{}'</>", [self.get_line_text(line) for line in p.lines], p.writing_direction)
# Create new paragraphs
new_paragraphs = self._create_paragraphs_from_lines(all_lines)
# Merge very close paragraphs
merged_paragraphs = self._merge_close_paragraphs(new_paragraphs)
# Group paragraphs into rows
rows = self._group_paragraphs_into_rows(new_paragraphs)
rows = self._group_paragraphs_into_rows(merged_paragraphs)
# Reorder paragraphs in each row
reordered_rows = self._reorder_paragraphs_in_rows(rows)
@@ -717,11 +724,11 @@ class TextFiltering:
return
if is_vertical:
get_start = lambda line: line['line_obj'].bounding_box.top
get_end = lambda line: line['line_obj'].bounding_box.bottom
get_start = lambda l: l['line_obj'].bounding_box.top
get_end = lambda l: l['line_obj'].bounding_box.bottom
else:
get_start = lambda line: line['line_obj'].bounding_box.left
get_end = lambda line: line['line_obj'].bounding_box.right
get_start = lambda l: l['line_obj'].bounding_box.left
get_end = lambda l: l['line_obj'].bounding_box.right
components = self._find_connected_components(
items=[lines[i] for i in indices],
@@ -734,7 +741,7 @@ class TextFiltering:
if len(component) > 1:
original_indices = [indices[i] for i in component]
paragraph_lines = [lines[i] for i in original_indices]
new_paragraph = self._create_paragraph_from_lines(paragraph_lines, is_vertical)
new_paragraph = self._create_paragraph_from_lines(paragraph_lines, is_vertical, False)
all_paragraphs.append(new_paragraph)
grouped.update(original_indices)
@@ -744,12 +751,12 @@ class TextFiltering:
# Create paragraphs out of ungrouped lines
ungrouped_lines = [line for i, line in enumerate(lines) if i not in grouped]
for line in ungrouped_lines:
new_paragraph = self._create_paragraph_from_lines([line], None)
new_paragraph = self._create_paragraph_from_lines([line], None, False)
all_paragraphs.append(new_paragraph)
return all_paragraphs
def _create_paragraph_from_lines(self, lines, is_vertical):
def _create_paragraph_from_lines(self, lines, is_vertical, merging_step):
if len(lines) > 1:
if is_vertical:
lines = sorted(lines, key=lambda x: x['line_obj'].bounding_box.right, reverse=True)
@@ -758,15 +765,15 @@ class TextFiltering:
lines = self._merge_overlapping_lines(lines, is_vertical)
if self.furigana_filter:
if not merging_step and self.furigana_filter:
lines = self._furigana_filter(lines, is_vertical)
line_objs = [l['line_obj'] for l in lines]
left = min(line.bounding_box.left for line in line_objs)
right = max(line.bounding_box.right for line in line_objs)
top = min(line.bounding_box.top for line in line_objs)
bottom = max(line.bounding_box.bottom for line in line_objs)
left = min(l.bounding_box.left for l in line_objs)
right = max(l.bounding_box.right for l in line_objs)
top = min(l.bounding_box.top for l in line_objs)
bottom = max(l.bounding_box.bottom for l in line_objs)
new_bbox = BoundingBox(
center_x=(left + right) / 2,
@@ -787,8 +794,31 @@ class TextFiltering:
writing_direction=writing_direction
)
if not merging_step:
character_size = self._calculate_character_size(lines, is_vertical)
return {
'paragraph_obj': paragraph,
'character_size': character_size
}
return paragraph
def _calculate_character_size(self, lines, is_vertical):
if is_vertical:
largest_line = max(lines, key=lambda x: x['line_obj'].bounding_box.width)
line_dimension = largest_line['line_obj'].bounding_box.height
else:
largest_line = max(lines, key=lambda x: x['line_obj'].bounding_box.height)
line_dimension = largest_line['line_obj'].bounding_box.width
char_count = len(self.get_line_text(largest_line['line_obj']))
if char_count == 0:
return 0.0
return line_dimension / char_count
def _should_group_in_same_paragraph(self, line1, line2, is_vertical):
bbox1 = line1['line_obj'].bounding_box
bbox2 = line2['line_obj'].bounding_box
@@ -986,6 +1016,80 @@ class TextFiltering:
return filtered_lines
def _merge_close_paragraphs(self, paragraphs):
if len(paragraphs) < 2:
return [p['paragraph_obj'] for p in paragraphs]
merged_paragraphs = []
def _merge_paragraphs(is_vertical):
indices = [i for i, paragraph in enumerate(paragraphs) if ((paragraph['paragraph_obj'].writing_direction == 'TOP_TO_BOTTOM') == is_vertical)]
if len(indices) == 0:
return
if len(indices) == 1:
merged_paragraphs.append(paragraphs[indices[0]]['paragraph_obj'])
return
if is_vertical:
get_start = lambda p: p['paragraph_obj'].bounding_box.left
get_end = lambda p: p['paragraph_obj'].bounding_box.right
else:
get_start = lambda p: p['paragraph_obj'].bounding_box.top
get_end = lambda p: p['paragraph_obj'].bounding_box.bottom
components = self._find_connected_components(
items=[paragraphs[i] for i in indices],
should_connect=lambda p1, p2: self._should_merge_close_paragraphs(p1, p2, is_vertical),
get_start_coord=get_start,
get_end_coord=get_end
)
for component in components:
if len(component) == 1:
merged_paragraphs.append(paragraphs[component[0]]['paragraph_obj'])
else:
component_paragraphs = [paragraphs[i] for i in component]
if self.debug_filtering:
logger.opt(colors=True).debug("<green>Merged paragraphs vertical: '{}'</>", is_vertical)
for p in component_paragraphs:
logger.opt(colors=True).debug("<green> Paragraph: '{}'</>", [self.get_line_text(line) for line in p['paragraph_obj'].lines])
merged_paragraph = self._merge_multiple_paragraphs(component_paragraphs, is_vertical)
merged_paragraphs.append(merged_paragraph)
_merge_paragraphs(True)
_merge_paragraphs(False)
return merged_paragraphs
def _should_merge_close_paragraphs(self, paragraph1, paragraph2, is_vertical):
bbox1 = paragraph1['paragraph_obj'].bounding_box
bbox2 = paragraph2['paragraph_obj'].bounding_box
character_size = max(paragraph1['character_size'], paragraph2['character_size'])
if is_vertical:
vertical_distance = self._calculate_vertical_distance(bbox1, bbox2)
horizontal_overlap = self._check_horizontal_overlap(bbox1, bbox2)
return (vertical_distance <= 3 * character_size and horizontal_overlap > 0.4)
else:
horizontal_distance = self._calculate_horizontal_distance(bbox1, bbox2)
vertical_overlap = self._check_vertical_overlap(bbox1, bbox2)
return (horizontal_distance <= 3 * character_size and vertical_overlap > 0.4)
def _merge_multiple_paragraphs(self, paragraphs, is_vertical):
merged_lines = []
for p in paragraphs:
for line in p['paragraph_obj'].lines:
merged_lines.append({
'line_obj': line,
'is_vertical': is_vertical
})
return self._create_paragraph_from_lines(merged_lines, is_vertical, True)
def _group_paragraphs_into_rows(self, paragraphs):
if len(paragraphs) < 2:
return [{'paragraphs': paragraphs, 'is_vertical': False}]