Preserve text block splits (for engines detecting them)

This commit is contained in:
AuroraWright
2024-02-03 11:42:02 +01:00
parent 42303faa3e
commit 73d57a58ed

View File

@@ -78,7 +78,7 @@ def empty_post_process(text):
def post_process(text): def post_process(text):
text = ''.join(text.split()) text = ' '.join([''.join(i.split()) for i in text.splitlines()])
text = text.replace('…', '...') text = text.replace('…', '...')
text = re.sub('[・.]{2,}', lambda x: (x.end() - x.start()) * '.', text) text = re.sub('[・.]{2,}', lambda x: (x.end() - x.start()) * '.', text)
text = jaconv.h2z(text, ascii=True, digit=True) text = jaconv.h2z(text, ascii=True, digit=True)
@@ -224,7 +224,7 @@ class GoogleLens:
if len(text) > 0: if len(text) > 0:
lines = text[0] lines = text[0]
for line in lines: for line in lines:
res += line + ' ' res += line + '\n'
x = (True, res) x = (True, res)
return x return x
@@ -277,7 +277,7 @@ class AppleVision:
res = '' res = ''
if success[0]: if success[0]:
for result in req.results(): for result in req.results():
res += result.text() + ' ' res += result.text() + '\n'
req.dealloc() req.dealloc()
x = (True, res) x = (True, res)
else: else:
@@ -382,7 +382,7 @@ class AzureImageAnalysis:
if read_result.read: if read_result.read:
for block in read_result.read.blocks: for block in read_result.read.blocks:
for line in block.lines: for line in block.lines:
res += line.text + ' ' res += line.text + '\n'
else: else:
return (False, 'Unknown error!') return (False, 'Unknown error!')
@@ -418,7 +418,7 @@ class EasyOCR:
res = '' res = ''
read_result = self.model.readtext(self._preprocess(img), detail=0) read_result = self.model.readtext(self._preprocess(img), detail=0)
for text in read_result: for text in read_result:
res += text + ' ' res += text + '\n'
x = (True, res) x = (True, res)
return x return x
@@ -466,7 +466,7 @@ class RapidOCR:
read_results, elapsed = self.model(self._preprocess(img)) read_results, elapsed = self.model(self._preprocess(img))
if read_results: if read_results:
for read_result in read_results: for read_result in read_results:
res += read_result[1] + ' ' res += read_result[1] + '\n'
x = (True, res) x = (True, res)
return x return x