From fe188ca14b532d79964cf5e34a32e618bf8608ad Mon Sep 17 00:00:00 2001 From: AuroraWright Date: Tue, 1 Apr 2025 00:21:19 +0200 Subject: [PATCH] Screen coordinate picker (https://github.com/AuroraWright/owocr/issues/1) --- README.md | 2 +- owocr/config.py | 3 +- owocr/run.py | 110 ++++++++++++++++++------------ owocr/screen_coordinate_picker.py | 100 +++++++++++++++++++++++++++ owocr_config.ini | 12 ++-- pyproject.toml | 2 +- 6 files changed, 176 insertions(+), 53 deletions(-) create mode 100644 owocr/screen_coordinate_picker.py diff --git a/README.md b/README.md index c4a219b..802e284 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Additionally: - Scanning the clipboard takes basically zero system resources on macOS and Windows - Supports reading images and/or writing text to a websocket with the `-r=websocket` and/or `-w=websocket` parameters (the port is 7331 by default, and is configurable in the config file) - Supports reading images from a Unix domain socket (`/tmp/owocr.sock`) on macOS and Linux with `-r=unixsocket` -- Supports capturing the screen directly, or a portion of the screen or a specific window with `-r=screencapture`. By default it will read from the entire main screen every 3 seconds, but you can change it to screenshot a different screen or a portion of a screen (with a set of screen coordinates `x,y,width,height`) or just a specific window (with the window title). You can also change the delay between screenshots or specify a keyboard combo if you don't want screenshots to be taken periodically. Refer to the config file or to `owocr --help` for more details about the screen capture settings +- Supports capturing from the screen directly or from a specific window with `-r=screencapture`. 
By default it will open a coordinate picker so you can select an area of the screen and then read from it every 3 seconds, but you can change it to screenshot the whole screen, a manual set of coordinates `x,y,width,height` or just a specific window (with the window title). You can also change the delay between screenshots or specify a keyboard combo if you don't want screenshots to be taken periodically. Refer to the config file or to `owocr --help` for more details about the screen capture settings - You can pause/unpause the image processing by pressing "p" or terminate the script with "t" or "q" inside the terminal window - You can switch between OCR providers pressing their corresponding keyboard key inside the terminal window (refer to the list of keys in the providers list below) - You can start the script paused with the `-p` option or with a specific provider with the `-e` option (refer to `owocr -h` for the list) diff --git a/owocr/config.py b/owocr/config.py index ef5d5cd..8b14ab7 100644 --- a/owocr/config.py +++ b/owocr/config.py @@ -25,8 +25,7 @@ class Config: 'notifications': False, 'combo_pause': '', 'combo_engine_switch': '', - 'screen_capture_monitor': 1, - 'screen_capture_coords': '', + 'screen_capture_area': '', 'screen_capture_delay_secs': 3, 'screen_capture_only_active_windows': True, 'screen_capture_combo': '' diff --git a/owocr/run.py b/owocr/run.py index 3729675..b4f2f5c 100644 --- a/owocr/run.py +++ b/owocr/run.py @@ -23,13 +23,14 @@ from desktop_notifier import DesktopNotifierSync import psutil import inspect -from owocr.ocr import * -from owocr.config import Config +from .ocr import * +from .config import Config +from .screen_coordinate_picker import get_screen_selection try: import win32gui import win32ui - import win32api + import win32api import win32con import win32process import win32clipboard @@ -279,22 +280,23 @@ def capture_macos_window_screenshot(window_id): def get_windows_window_handle(window_title): def callback(hwnd, 
window_title_part): - if window_title_part in win32gui.GetWindowText(hwnd): - handles.append(hwnd) + window_title = win32gui.GetWindowText(hwnd) + if window_title_part in window_title: + handles.append((hwnd, window_title)) return True handle = win32gui.FindWindow(None, window_title) if handle: - return handle + return (handle, window_title) handles = [] win32gui.EnumWindows(callback, window_title) for handle in handles: - _, pid = win32process.GetWindowThreadProcessId(handle) + _, pid = win32process.GetWindowThreadProcessId(handle[0]) if psutil.Process(pid).name().lower() not in ('cmd.exe', 'powershell.exe', 'windowsterminal.exe'): return handle - return 0 + return (None, None) class TextFiltering: @@ -616,8 +618,7 @@ def run(read_from=None, auto_pause=None, combo_pause=None, combo_engine_switch=None, - screen_capture_monitor=None, - screen_capture_coords=None, + screen_capture_area=None, screen_capture_delay_secs=None, screen_capture_only_active_windows=None, screen_capture_combo=None @@ -640,10 +641,9 @@ def run(read_from=None, :param auto_pause: Automatically pause the program after the specified amount of seconds since the last successful text recognition. Will be ignored when reading with screen capture. 0 to disable. :param combo_pause: Specifies a combo to wait on for pausing the program. As an example: "<ctrl>+<shift>+p". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key :param combo_engine_switch: Specifies a combo to wait on for switching the OCR engine. As an example: "<ctrl>+<shift>+a". To be used with combo_pause. The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key - :param screen_capture_monitor: Specifies monitor to target when reading with screen capture. Will be ignored when screen_capture_coords is a window name. - :param screen_capture_coords: Specifies area to target when reading with screen capture. 
Can be either empty (whole screen), a set of coordinates (x,y,width,height) or a window name (the first matching window title will be used). + :param screen_capture_area: Specifies area to target when reading with screen capture. Can be either empty (automatic selector), a set of coordinates (x,y,width,height), "screen_N" (captures a whole screen, where N is the screen number starting from 1) or a window name (the first matching window title will be used). :param screen_capture_delay_secs: Specifies the delay (in seconds) between screenshots when reading with screen capture. - :param screen_capture_only_active_windows: When reading with screen capture and screen_capture_coords is a window name, specifies whether to only target the window while it's active. + :param screen_capture_only_active_windows: When reading with screen capture and screen_capture_area is a window name, specifies whether to only target the window while it's active. :param screen_capture_combo: When reading with screen capture, specifies a combo to wait on for taking a screenshot instead of using the delay. As an example: "<ctrl>+<shift>+s". 
The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key """ @@ -768,8 +768,8 @@ def run(read_from=None, global screenshot_event screenshot_event = threading.Event() key_combos[screen_capture_combo] = on_screenshot_combo - if type(screen_capture_coords) == tuple: - screen_capture_coords = ','.join(map(str, screen_capture_coords)) + if type(screen_capture_area) == tuple: + screen_capture_area = ','.join(map(str, screen_capture_area)) global screencapture_window_active global screencapture_window_visible global sct_params @@ -777,32 +777,53 @@ def run(read_from=None, screencapture_window_active = True screencapture_window_visible = True last_result = ([], engine_index) - if screen_capture_coords == '': + if screen_capture_area == '': screencapture_mode = 0 - elif len(screen_capture_coords.split(',')) == 4: + elif screen_capture_area.startswith('screen_'): + parts = screen_capture_area.split('_') + if len(parts) != 2 or not parts[1].isdigit(): + raise ValueError('Invalid screen_capture_area') + screen_capture_monitor = int(parts[1]) screencapture_mode = 1 + elif len(screen_capture_area.split(',')) == 4: + screencapture_mode = 3 else: screencapture_mode = 2 if screencapture_mode != 2: sct = mss.mss() - mon = sct.monitors - if len(mon) <= screen_capture_monitor: - msg = '"screen_capture_monitor" must be a valid monitor number' - raise ValueError(msg) - if screencapture_mode == 0: + if screencapture_mode == 1: + mon = sct.monitors + if len(mon) <= screen_capture_monitor: + raise ValueError('Invalid monitor number in screen_capture_area') coord_left = mon[screen_capture_monitor]['left'] coord_top = mon[screen_capture_monitor]['top'] coord_width = mon[screen_capture_monitor]['width'] coord_height = mon[screen_capture_monitor]['height'] + elif screencapture_mode == 3: + coord_left, coord_top, coord_width, coord_height = [int(c.strip()) for c in screen_capture_area.split(',')] else: - x, y, coord_width, coord_height = 
[int(c.strip()) for c in screen_capture_coords.split(',')] - coord_left = mon[screen_capture_monitor]['left'] + x - coord_top = mon[screen_capture_monitor]['top'] + y - - sct_params = {'top': coord_top, 'left': coord_left, 'width': coord_width, 'height': coord_height, 'mon': screen_capture_monitor} + logger.opt(ansi=True).info('Launching screen coordinate picker') + screen_selection = get_screen_selection() + if not screen_selection: + raise ValueError('Picker window was closed or an error occurred') + screen_capture_monitor = screen_selection['monitor'] + x, y, coord_width, coord_height = screen_selection['coordinates'] + if coord_width > 0 and coord_height > 0: + coord_top = screen_capture_monitor['top'] + y + coord_left = screen_capture_monitor['left'] + x + else: + logger.opt(ansi=True).info('Selection is empty, selecting whole screen') + coord_left = screen_capture_monitor['left'] + coord_top = screen_capture_monitor['top'] + coord_width = screen_capture_monitor['width'] + coord_height = screen_capture_monitor['height'] + + sct_params = {'top': coord_top, 'left': coord_left, 'width': coord_width, 'height': coord_height} + logger.opt(ansi=True).info(f'Selected coordinates: {coord_left},{coord_top},{coord_width},{coord_height}') else: + area_invalid_error = '"screen_capture_area" must be empty, "screen_N" where N is a screen number starting from 1, a valid set of coordinates, or a valid window name' if sys.platform == 'darwin': if int(platform.mac_ver()[0].split('.')[0]) < 14: old_macos_screenshot_api = True @@ -815,35 +836,37 @@ def run(read_from=None, window_list = CGWindowListCopyWindowInfo(kCGWindowListExcludeDesktopElements, kCGNullWindowID) window_titles = [] window_ids = [] - window_id = 0 + window_index = None for i, window in enumerate(window_list): window_title = window.get(kCGWindowName, '') if psutil.Process(window['kCGWindowOwnerPID']).name() not in ('Terminal', 'iTerm2'): window_titles.append(window_title) 
window_ids.append(window['kCGWindowNumber']) - if screen_capture_coords in window_titles: - window_id = window_ids[window_titles.index(screen_capture_coords)] + if screen_capture_area in window_titles: + window_index = window_titles.index(screen_capture_area) else: for t in window_titles: - if screen_capture_coords in t: - window_id = window_ids[window_titles.index(t)] + if screen_capture_area in t: + window_index = window_titles.index(t) break - if not window_id: - msg = '"screen_capture_coords" must be empty (for the whole screen), a valid set of coordinates, or a valid window name' - raise ValueError(msg) + if not window_index: + raise ValueError(area_invalid_error) + + window_id = window_ids[window_index] + window_title = window_titles[window_index] if screen_capture_only_active_windows: screencapture_window_active = False macos_window_tracker = MacOSWindowTracker(window_id) macos_window_tracker.start() + logger.opt(ansi=True).info(f'Selected window: {window_title}') elif sys.platform == 'win32': - window_handle = get_windows_window_handle(screen_capture_coords) + window_handle, window_title = get_windows_window_handle(screen_capture_area) if not window_handle: - msg = '"screen_capture_coords" must be empty (for the whole screen), a valid set of coordinates, or a valid window name' - raise ValueError(msg) + raise ValueError(area_invalid_error) ctypes.windll.shcore.SetProcessDpiAwareness(1) @@ -851,21 +874,21 @@ def run(read_from=None, screencapture_window_active = False windows_window_tracker = WindowsWindowTracker(window_handle, screen_capture_only_active_windows) windows_window_tracker.start() + logger.opt(ansi=True).info(f'Selected window: {window_title}') else: sct = mss.mss() window_title = None window_titles = pywinctl.getAllTitles() - if screen_capture_coords in window_titles: - window_title = screen_capture_coords + if screen_capture_area in window_titles: + window_title = screen_capture_area else: for t in window_titles: - if screen_capture_coords in t 
and t != active_window_name: + if screen_capture_area in t and t != active_window_name: window_title = t break if not window_title: - msg = '"screen_capture_coords" must be empty (for the whole screen), a valid set of coordinates, or a valid window name' - raise ValueError(msg) + raise ValueError(area_invalid_error) target_window = pywinctl.getWindowsWithTitle(window_title)[0] coord_top = target_window.top @@ -881,6 +904,7 @@ def run(read_from=None, target_window.watchdog.start(isAliveCB=on_window_closed, isMinimizedCB=on_window_minimized, resizedCB=on_window_resized, movedCB=on_window_moved) sct_params = {'top': coord_top, 'left': coord_left, 'width': coord_width, 'height': coord_height} + logger.opt(ansi=True).info(f'Selected window: {window_title}') filtering = TextFiltering() read_from_readable = 'screen capture' diff --git a/owocr/screen_coordinate_picker.py b/owocr/screen_coordinate_picker.py new file mode 100644 index 0000000..be8b6ce --- /dev/null +++ b/owocr/screen_coordinate_picker.py @@ -0,0 +1,100 @@ +from multiprocessing import Process, Manager +import mss +from PIL import Image, ImageTk + +try: + import tkinter as tk + selector_available = True +except: + selector_available = False + + +class ScreenSelector: + def __init__(self, result): + self.sct = mss.mss() + self.monitors = self.sct.monitors[1:] + self.root = None + self.result = result + + def on_select(self, monitor, coordinates): + self.result['monitor'] = monitor + self.result['coordinates'] = coordinates + self.root.destroy() + + def create_window(self, monitor): + screenshot = self.sct.grab(monitor) + img = Image.frombytes('RGB', screenshot.size, screenshot.rgb) + + if img.width != monitor['width']: + img = img.resize((monitor['width'], monitor['height']), Image.Resampling.LANCZOS) + + window = tk.Toplevel(self.root) + window.geometry(f"{monitor['width']}x{monitor['height']}+{monitor['left']}+{monitor['top']}") + window.overrideredirect(1) + window.attributes('-topmost', 1) + + img_tk = 
ImageTk.PhotoImage(img) + + canvas = tk.Canvas(window, cursor='cross', highlightthickness=0) + canvas.pack(fill=tk.BOTH, expand=True) + canvas.image = img_tk + canvas.create_image(0, 0, image=img_tk, anchor=tk.NW) + + start_x, start_y, rect = None, None, None + + def on_click(event): + nonlocal start_x, start_y, rect + start_x, start_y = event.x, event.y + rect = canvas.create_rectangle(start_x, start_y, start_x, start_y, outline='red') + + def on_drag(event): + nonlocal rect, start_x, start_y + if rect: + canvas.coords(rect, start_x, start_y, event.x, event.y) + + def on_release(event): + nonlocal start_x, start_y + end_x, end_y = event.x, event.y + + x1 = min(start_x, end_x) + y1 = min(start_y, end_y) + x2 = max(start_x, end_x) + y2 = max(start_y, end_y) + + self.on_select(monitor, (x1, y1, x2 - x1, y2 - y1)) + + canvas.bind('<ButtonPress-1>', on_click) + canvas.bind('<B1-Motion>', on_drag) + canvas.bind('<ButtonRelease-1>', on_release) + + def start(self): + self.root = tk.Tk() + self.root.withdraw() + + for monitor in self.monitors: + self.create_window(monitor) + + self.root.mainloop() + self.root.update() + + +def run_screen_selector(result): + selector = ScreenSelector(result) + selector.start() + + +def get_screen_selection(): + if not selector_available: + raise ValueError('tkinter is not installed, unable to open picker') + + with Manager() as manager: + res = manager.dict() + process = Process(target=run_screen_selector, args=(res,)) + + process.start() + process.join() + + if 'monitor' in res and 'coordinates' in res: + return res.copy() + else: + return False diff --git a/owocr_config.ini b/owocr_config.ini index b85dc84..1c5208e 100644 --- a/owocr_config.ini +++ b/owocr_config.ini @@ -17,12 +17,12 @@ ;combo_pause = <ctrl>+<shift>+p ;note: this specifies a combo to wait on for switching the OCR engine. As an example: <ctrl>+<shift>+a. To be used with combo_pause. 
The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key ;combo_engine_switch = <ctrl>+<shift>+a -;screen_capture_monitor = 2 -;note: screen_capture_coords can be empty (whole screen), have a set of coordinates (x,y,width,height) or a window name (the first matching window title will be used) -;screen_capture_coords = -;screen_capture_coords = 400,200,1500,600 -;screen_capture_coords = OBS -;note: if screen_capture_coords is a window name, this can be changed to capture inactive windows too. On Linux, the window must then not be covered by other windows! +;note: screen_capture_area can be empty for the coordinate picker, "screen_N" (where N is the screen number starting from 1) for an entire screen, have a manual set of coordinates (x,y,width,height) or a window name (the first matching window title will be used) +;screen_capture_area = +;screen_capture_area = screen_1 +;screen_capture_area = 400,200,1500,600 +;screen_capture_area = OBS +;note: if screen_capture_area is a window name, this can be changed to capture inactive windows too. On Linux, the window must then not be covered by other windows! ;screen_capture_only_active_windows = True ;screen_capture_delay_secs = 3 ;note: this specifies a combo to wait on for taking a screenshot instead of using the delay. As an example: <ctrl>+<shift>+s. The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key diff --git a/pyproject.toml b/pyproject.toml index b643397..c7c28ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "owocr" -version = "1.10" +version = "1.11" description = "Japanese OCR" readme = "README.md" requires-python = ">=3.11"