Implemented window screen capture mode (only tested on macOS so far)

This commit is contained in:
AuroraWright
2024-01-27 08:36:49 +01:00
parent 181cbececa
commit 39b98d5edd
5 changed files with 59 additions and 7 deletions

View File

@@ -25,7 +25,7 @@ This has been tested with Python 3.11. Newer/older versions might work. It can b
It mostly functions like Manga OCR: https://github.com/kha-white/manga-ocr?tab=readme-ov-file#running-in-the-background It mostly functions like Manga OCR: https://github.com/kha-white/manga-ocr?tab=readme-ov-file#running-in-the-background
However: However:
- it supports reading images and/or writing text to a websocket when the -r=websocket and/or -w=websocket parameters are specified (port 7331 by default, configurable in the config file) - it supports reading images and/or writing text to a websocket when the -r=websocket and/or -w=websocket parameters are specified (port 7331 by default, configurable in the config file)
- it supports capturing the screen directly with -r screencapture. It will default to the entire first screen every 3 seconds, but a different screen/coordinates/delay can be specified in the config file - it supports capturing the screen directly with -r screencapture. It will default to the entire first screen every 3 seconds, but a different screen/coordinates/window/delay can be specified in the config file
- you can pause/unpause the image processing by pressing "p" or terminate the script with "t" or "q" - you can pause/unpause the image processing by pressing "p" or terminate the script with "t" or "q"
- you can switch OCR provider with its corresponding keyboard key (refer to the list above). You can also start the script paused with the -p option or with a specific provider with the -e option (refer to `owocr -h` for the list) - you can switch OCR provider with its corresponding keyboard key (refer to the list above). You can also start the script paused with the -p option or with a specific provider with the -e option (refer to `owocr -h` for the list)
- holding ctrl or cmd at any time will pause image processing temporarily - holding ctrl or cmd at any time will pause image processing temporarily

View File

@@ -7,6 +7,7 @@ import fire
import numpy as np import numpy as np
import pyperclipfix import pyperclipfix
import mss import mss
import pywinctl
import asyncio import asyncio
import websockets import websockets
import queue import queue
@@ -196,6 +197,23 @@ def on_key_release(key):
first_pressed = None first_pressed = None
def on_window_activated(active):
    """Record whether the captured window is currently the active one.

    Registered as the ``isActiveCB`` callback of the target window's
    watchdog. The screen capture loop consults the module-level
    ``screencapture_window_active`` flag before grabbing a frame.

    Args:
        active: value handed over by the watchdog; stored as-is.
    """
    # Rebinding a module-level name requires the explicit global declaration.
    global screencapture_window_active
    screencapture_window_active = active
def on_window_resized(size):
    """Update the capture region's dimensions after the window is resized.

    Registered as the ``resizedCB`` callback of the target window's
    watchdog.

    Args:
        size: indexable whose item 0 is written to ``'width'`` and whose
            item 1 is written to ``'height'`` of ``sct_params``.
    """
    # The shared dict is mutated in place (never rebound), so the region
    # passed to the next grab picks up the new dimensions.
    region = sct_params
    region['width'] = size[0]
    region['height'] = size[1]
def on_window_moved(pos):
    """Update the capture region's origin after the window is moved.

    Registered as the ``movedCB`` callback of the target window's
    watchdog.

    Args:
        pos: indexable whose item 0 is written to ``'left'`` and whose
            item 1 is written to ``'top'`` of ``sct_params``.
    """
    # The shared dict is mutated in place (never rebound), so the region
    # passed to the next grab picks up the new origin.
    region = sct_params
    region['left'] = pos[0]
    region['top'] = pos[1]
def are_images_identical(img1, img2): def are_images_identical(img1, img2):
if None in (img1, img2): if None in (img1, img2):
return img1 == img2 return img1 == img2
@@ -273,7 +291,7 @@ def run(read_from='clipboard',
websocket_port = 7331 websocket_port = 7331
notifications = False notifications = False
screen_capture_monitor = 1 screen_capture_monitor = 1
screen_capture_coords = 'whole' screen_capture_coords = ''
screen_capture_delay_secs = 3 screen_capture_delay_secs = 3
if not config: if not config:
@@ -306,7 +324,7 @@ def run(read_from='clipboard',
screen_capture_delay_secs = config.get_general('screen_capture_delay_secs') screen_capture_delay_secs = config.get_general('screen_capture_delay_secs')
if config.get_general('screen_capture_coords'): if config.get_general('screen_capture_coords'):
screen_capture_coords = config.get_general('screen_capture_coords').lower() screen_capture_coords = config.get_general('screen_capture_coords')
logger.configure(handlers=[{'sink': sys.stderr, 'format': logger_format}]) logger.configure(handlers=[{'sink': sys.stderr, 'format': logger_format}])
@@ -385,20 +403,48 @@ def run(read_from='clipboard',
else: else:
generic_clipboard_polling = True generic_clipboard_polling = True
elif read_from == 'screencapture': elif read_from == 'screencapture':
global screencapture_window_active
screencapture_window_mode = False
screencapture_window_active = True
with mss.mss() as sct: with mss.mss() as sct:
mon = sct.monitors mon = sct.monitors
if len(mon) <= screen_capture_monitor: if len(mon) <= screen_capture_monitor:
msg = '"screen_capture_monitor" has to be a valid monitor number!' msg = '"screen_capture_monitor" has to be a valid monitor number!'
raise ValueError(msg) raise ValueError(msg)
if screen_capture_coords == 'whole': if screen_capture_coords == '':
coord_left = mon[screen_capture_monitor]["left"] coord_left = mon[screen_capture_monitor]["left"]
coord_top = mon[screen_capture_monitor]["top"] coord_top = mon[screen_capture_monitor]["top"]
coord_width = mon[screen_capture_monitor]["width"] coord_width = mon[screen_capture_monitor]["width"]
coord_height = mon[screen_capture_monitor]["height"] coord_height = mon[screen_capture_monitor]["height"]
else: elif len(screen_capture_coords.split(',')) == 4:
x, y, coord_width, coord_height = [int(c.strip()) for c in screen_capture_coords.split(',')] x, y, coord_width, coord_height = [int(c.strip()) for c in screen_capture_coords.split(',')]
coord_left = mon[screen_capture_monitor]["left"] + x coord_left = mon[screen_capture_monitor]["left"] + x
coord_top = mon[screen_capture_monitor]["top"] + y coord_top = mon[screen_capture_monitor]["top"] + y
else:
window_titles = pywinctl.getAllTitles()
if screen_capture_coords in window_titles:
window_title = screen_capture_coords
else:
for window_title in window_titles:
if screen_capture_coords in window_title:
break
windows = pywinctl.getWindowsWithTitle(window_title)
if len(windows) == 0:
msg = '"screen_capture_coords" has to be empty (for the whole screen), a valid set of coordinates, or a valid window name!'
raise ValueError(msg)
screencapture_window_mode = True
target_window = windows[0]
coord_top = target_window.top
coord_left = target_window.left
coord_width = target_window.width
coord_height = target_window.height
screencapture_window_active = target_window.isActive
target_window.watchdog.start(isActiveCB=on_window_activated, resizedCB=on_window_resized, movedCB=on_window_moved)
target_window.watchdog.setTryToFind(True)
global sct_params
sct_params = {'top': coord_top, 'left': coord_left, 'width': coord_width, 'height': coord_height, 'mon': screen_capture_monitor} sct_params = {'top': coord_top, 'left': coord_left, 'width': coord_width, 'height': coord_height, 'mon': screen_capture_monitor}
logger.opt(ansi=True).info(f"Reading with screen capture using <{engine_color}>{engine_instances[engine_index].readable_name}</{engine_color}>{' (paused)' if paused else ''}") logger.opt(ansi=True).info(f"Reading with screen capture using <{engine_color}>{engine_instances[engine_index].readable_name}</{engine_color}>{' (paused)' if paused else ''}")
@@ -423,6 +469,8 @@ def run(read_from='clipboard',
if read_from == 'clipboard' and windows_clipboard_polling: if read_from == 'clipboard' and windows_clipboard_polling:
win32api.PostThreadMessage(windows_clipboard_thread.thread_id, win32con.WM_QUIT, 0, 0) win32api.PostThreadMessage(windows_clipboard_thread.thread_id, win32con.WM_QUIT, 0, 0)
windows_clipboard_thread.join() windows_clipboard_thread.join()
if read_from == 'screencapture' and screencapture_window_mode:
target_window.watchdog.stop()
user_input_thread.join() user_input_thread.join()
tmp_paused_listener.stop() tmp_paused_listener.stop()
break break
@@ -470,7 +518,7 @@ def run(read_from='clipboard',
if not windows_clipboard_polling: if not windows_clipboard_polling:
time.sleep(delay_secs) time.sleep(delay_secs)
elif read_from == 'screencapture': elif read_from == 'screencapture':
if not paused and not tmp_paused: if screencapture_window_active and not paused and not tmp_paused:
with mss.mss() as sct: with mss.mss() as sct:
sct_img = sct.grab(sct_params) sct_img = sct.grab(sct_params)
img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX") img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX")

View File

@@ -12,8 +12,10 @@
;ignore_flag = False ;ignore_flag = False
;delete_images = False ;delete_images = False
;screen_capture_monitor = 2 ;screen_capture_monitor = 2
;note: screen_capture_coords can be empty (whole screen), a set of coordinates (x,y,width,height), or a window name (the first matching window title will be used)
;screen_capture_coords =
;screen_capture_coords = 400,200,1500,600 ;screen_capture_coords = 400,200,1500,600
;screen_capture_coords = whole ;screen_capture_coords = OBS
;screen_capture_delay_secs = 3 ;screen_capture_delay_secs = 3
[winrtocr] [winrtocr]
;url = http://aaa.xxx.yyy.zzz:8000 ;url = http://aaa.xxx.yyy.zzz:8000

View File

@@ -8,5 +8,6 @@ pynput
websockets websockets
notify-py notify-py
mss mss
pywinctl
pywin32;platform_system=='Windows' pywin32;platform_system=='Windows'
pyobjc;platform_system=='Darwin' pyobjc;platform_system=='Darwin'

View File

@@ -29,6 +29,7 @@ setup(
"websockets", "websockets",
"notify-py", "notify-py",
"mss", "mss",
"pywinctl",
"pywin32;platform_system=='Windows'", "pywin32;platform_system=='Windows'",
"pyobjc;platform_system=='Darwin'" "pyobjc;platform_system=='Darwin'"
], ],