Implemented window screen capture mode (only tested on macOS so far)

2024-01-27 08:36:49 +01:00
parent 181cbececa
commit 39b98d5edd
5 changed files with 59 additions and 7 deletions
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ This has been tested with Python 3.11. Newer/older versions might work. It can b
 It mostly functions like Manga OCR: https://github.com/kha-white/manga-ocr?tab=readme-ov-file#running-in-the-background
 However:
 - it supports reading images and/or writing text to a websocket when the -r=websocket and/or -w=websocket parameters are specified (port 7331 by default, configurable in the config file)
- it supports capturing the screen directly with -r screencapture. It will default to the entire first screen every 3 seconds, but a different screen/coordinates/delay can be specified in the config file
+- it supports capturing the screen directly with -r screencapture. It will default to the entire first screen every 3 seconds, but a different screen/coordinates/window/delay can be specified in the config file
 - you can pause/unpause the image processing by pressing "p" or terminate the script with "t" or "q"
 - you can switch OCR provider with its corresponding keyboard key (refer to the list above). You can also start the script paused with the -p option or with a specific provider with the -e option (refer to `owocr -h` for the list)
 - holding ctrl or cmd at any time will pause image processing temporarily
--- a/owocr/run.py
+++ b/owocr/run.py
@@ -7,6 +7,7 @@ import fire
 import numpy as np
 import pyperclipfix
 import mss
 import pywinctl
 import asyncio
 import websockets
 import queue
@@ -196,6 +197,23 @@ def on_key_release(key):
        first_pressed = None
 def on_window_activated(active):
    """Watchdog callback: remember whether the captured window is active.

    The value is stored in the module-level ``screencapture_window_active``
    flag, which the screen-capture branch of the main loop checks before
    grabbing a frame.
    """
    global screencapture_window_active
    screencapture_window_active = active
 def on_window_resized(size):
    """Watchdog callback: copy the window's new size into ``sct_params``.

    ``size[0]`` becomes the capture width and ``size[1]`` the capture
    height. The module-level dict is mutated in place (never rebound), so
    no ``global`` declaration is required.
    """
    sct_params.update(width=size[0], height=size[1])
 def on_window_moved(pos):
    """Watchdog callback: copy the window's new position into ``sct_params``.

    ``pos[0]`` becomes the capture region's left edge and ``pos[1]`` its
    top edge. The module-level dict is mutated in place (never rebound), so
    no ``global`` declaration is required.
    """
    sct_params.update(left=pos[0], top=pos[1])
 def are_images_identical(img1, img2):
    if None in (img1, img2):
        return img1 == img2
@@ -273,7 +291,7 @@ def run(read_from='clipboard',
    websocket_port = 7331
    notifications = False
    screen_capture_monitor = 1
-    screen_capture_coords = 'whole'
+    screen_capture_coords = ''
    screen_capture_delay_secs = 3
    if not config:
@@ -306,7 +324,7 @@ def run(read_from='clipboard',
            screen_capture_delay_secs = config.get_general('screen_capture_delay_secs')
        if config.get_general('screen_capture_coords'):
-            screen_capture_coords = config.get_general('screen_capture_coords').lower()
+            screen_capture_coords = config.get_general('screen_capture_coords')
    logger.configure(handlers=[{'sink': sys.stderr, 'format': logger_format}])
@@ -385,20 +403,48 @@ def run(read_from='clipboard',
        else:
            generic_clipboard_polling = True
    elif read_from == 'screencapture':
        global screencapture_window_active
        screencapture_window_mode = False
        screencapture_window_active = True
        with mss.mss() as sct:
            mon = sct.monitors
        if len(mon) <= screen_capture_monitor:
            msg = '"screen_capture_monitor" has to be a valid monitor number!'
            raise ValueError(msg)
-        if screen_capture_coords == 'whole':
+        if screen_capture_coords == '':
            coord_left = mon[screen_capture_monitor]["left"]
            coord_top = mon[screen_capture_monitor]["top"]
            coord_width = mon[screen_capture_monitor]["width"]
            coord_height = mon[screen_capture_monitor]["height"]
-        else:
+        elif len(screen_capture_coords.split(',')) == 4:
            x, y, coord_width, coord_height = [int(c.strip()) for c in screen_capture_coords.split(',')]
            coord_left = mon[screen_capture_monitor]["left"] + x
            coord_top = mon[screen_capture_monitor]["top"] + y
        else:
            window_titles = pywinctl.getAllTitles()
            if screen_capture_coords in window_titles:
                window_title = screen_capture_coords
            else:
                for window_title in window_titles:
                    if screen_capture_coords in window_title:
                        break
            windows = pywinctl.getWindowsWithTitle(window_title)
            if len(windows) == 0:
                msg = '"screen_capture_coords" has to be empty (for the whole screen), a valid set of coordinates, or a valid window name!'
                raise ValueError(msg)
            screencapture_window_mode = True
            target_window = windows[0]
            coord_top = target_window.top
            coord_left = target_window.left
            coord_width = target_window.width
            coord_height = target_window.height
            screencapture_window_active = target_window.isActive
            target_window.watchdog.start(isActiveCB=on_window_activated, resizedCB=on_window_resized, movedCB=on_window_moved)
            target_window.watchdog.setTryToFind(True)
        global sct_params
        sct_params = {'top': coord_top, 'left': coord_left, 'width': coord_width, 'height': coord_height, 'mon': screen_capture_monitor}
        logger.opt(ansi=True).info(f"Reading with screen capture using <{engine_color}>{engine_instances[engine_index].readable_name}</{engine_color}>{' (paused)' if paused else ''}")
@@ -423,6 +469,8 @@ def run(read_from='clipboard',
            if read_from == 'clipboard' and windows_clipboard_polling:
                win32api.PostThreadMessage(windows_clipboard_thread.thread_id, win32con.WM_QUIT, 0, 0)
                windows_clipboard_thread.join()
            if read_from == 'screencapture' and screencapture_window_mode:
                target_window.watchdog.stop()
            user_input_thread.join()
            tmp_paused_listener.stop()
            break
@@ -470,7 +518,7 @@ def run(read_from='clipboard',
            if not windows_clipboard_polling:
                time.sleep(delay_secs)
        elif read_from == 'screencapture':
-            if not paused and not tmp_paused:
+            if screencapture_window_active and not paused and not tmp_paused:
                with mss.mss() as sct:
                    sct_img = sct.grab(sct_params)
                img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX")
--- a/owocr_config.ini
+++ b/owocr_config.ini
@@ -12,8 +12,10 @@
 ;ignore_flag = False
 ;delete_images = False
 ;screen_capture_monitor = 2
 ;note: screen_capture_coords can be empty (whole screen), a set of coordinates (x,y,width,height), or a window name (an exact title match is preferred; otherwise the first window whose title contains the name is used)
 ;screen_capture_coords =
 ;screen_capture_coords = 400,200,1500,600
-;screen_capture_coords = whole
+;screen_capture_coords = OBS
 ;screen_capture_delay_secs = 3
 [winrtocr]
 ;url = http://aaa.xxx.yyy.zzz:8000
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,5 +8,6 @@ pynput
 websockets
 notify-py
 mss
 pywinctl
 pywin32;platform_system=='Windows'
 pyobjc;platform_system=='Darwin'
--- a/setup.py
+++ b/setup.py
@@ -29,6 +29,7 @@ setup(
        "websockets",
        "notify-py",
        "mss",
        "pywinctl",
        "pywin32;platform_system=='Windows'",
        "pyobjc;platform_system=='Darwin'"
    ],