Add window area selection, combo to re-select screen/window area at runtime

2025-10-12 09:07:02 +02:00
parent 6ada579b19
commit e262231a1d
3 changed files with 282 additions and 132 deletions
--- a/owocr/config.py
+++ b/owocr/config.py
@@ -44,6 +44,8 @@ parser.add_argument('-cs', '--combo_engine_switch', type=str, default=argparse.S
                    help='Combo to wait on for switching the OCR engine. As an example: "<ctrl>+<shift>+a". To be used with combo_pause. The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
 parser.add_argument('-sa', '--screen_capture_area', type=str, default=argparse.SUPPRESS,
                    help='Area to target when reading with screen capture. Can be either empty (automatic selector), a set of coordinates (x,y,width,height), "screen_N" (captures a whole screen, where N is the screen number starting from 1) or a window name (the first matching window title will be used).')
+parser.add_argument('-swa', '--screen_capture_window_area', type=str, default=argparse.SUPPRESS,
+                    help='If capturing with screen capture, subsection of the selected window. Can be either empty (automatic selector), a set of coordinates (x1,y1,x2,y2: top-left and bottom-right corners, relative to the window) or "window" to use the whole window.')
 parser.add_argument('-sd', '--screen_capture_delay_secs', type=float, default=argparse.SUPPRESS,
                    help='Delay (in seconds) between screenshots when reading with screen capture. -1 to disable periodic screenshots.')
 parser.add_argument('-sw', '--screen_capture_only_active_windows', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
@@ -56,6 +58,8 @@ parser.add_argument('-sff', '--screen_capture_furigana_filter', type=str2bool, n
                    help="When reading with screen capture, try to filter furigana lines.")
 parser.add_argument('-sc', '--screen_capture_combo', type=str, default=argparse.SUPPRESS,
                    help='When reading with screen capture, combo to wait on for taking a screenshot. If periodic screenshots are also enabled, any screenshot taken this way bypasses the filtering. Example value: "<ctrl>+<shift>+s". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
+parser.add_argument('-scc', '--coordinate_selector_combo', type=str, default=argparse.SUPPRESS,
+                    help='When reading with screen capture, combo to wait on for invoking the coordinate picker to change the screen/window area. Example value: "<ctrl>+<shift>+c". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
 parser.add_argument('-l', '--language', type=str, default=argparse.SUPPRESS,
                    help='Two letter language code for filtering screencapture OCR results. Ex. "ja" for Japanese, "zh" for Chinese, "ko" for Korean, "ar" for Arabic, "ru" for Russian, "el" for Greek, "he" for Hebrew, "th" for Thai. Any other value will use Latin Extended (for most European languages and English).')
 parser.add_argument('-of', '--output_format', type=str, default=argparse.SUPPRESS,
@@ -89,12 +93,14 @@ class Config:
        'combo_pause': '',
        'combo_engine_switch': '',
        'screen_capture_area': '',
+        'screen_capture_window_area': 'window',
        'screen_capture_delay_secs': 0,
        'screen_capture_only_active_windows': True,
        'screen_capture_frame_stabilization': -1,
        'screen_capture_line_recovery': True,
        'screen_capture_furigana_filter': True,
        'screen_capture_combo': '',
+        'coordinate_selector_combo': '',
        'screen_capture_old_macos_api': False,
        'language': 'ja',
        'output_format': 'text',
--- a/owocr/run.py
+++ b/owocr/run.py
@@ -48,7 +48,7 @@ try:
    from AppKit import NSData, NSImage, NSBitmapImageRep, NSDeviceRGBColorSpace, NSGraphicsContext, NSZeroPoint, NSZeroRect, NSCompositingOperationCopy
    from Quartz import CGWindowListCreateImageFromArray, kCGWindowImageBoundsIgnoreFraming, CGRectMake, CGRectNull, CGMainDisplayID, CGWindowListCopyWindowInfo, \
                       CGWindowListCreateDescriptionFromArray, kCGWindowListOptionOnScreenOnly, kCGWindowListExcludeDesktopElements, kCGWindowName, kCGNullWindowID, \
-                       CGImageGetWidth, CGImageGetHeight, CGDataProviderCopyData, CGImageGetDataProvider, CGImageGetBytesPerRow
+                       CGImageGetWidth, CGImageGetHeight, CGDataProviderCopyData, CGImageGetDataProvider, CGImageGetBytesPerRow, kCGWindowImageNominalResolution
    from ScreenCaptureKit import SCContentFilter, SCScreenshotManager, SCShareableContent, SCStreamConfiguration, SCCaptureResolutionBest
 except ImportError:
    pass
@@ -312,7 +312,7 @@ class TextFiltering:
        self.stable_frame_data = None
        self.last_frame_text = []
        self.last_last_frame_text = []
-        self.stable_frame_text = None
+        self.stable_frame_text = []
        self.processed_stable_frame = False
        self.frame_stabilization_timestamp = 0
        self.cj_regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
@@ -388,12 +388,6 @@ class TextFiltering:
        return filtered_text

    def _find_changed_lines(self, pil_image, current_result):
-        if (self.last_frame_data != [None, None] and (current_result.image_properties.width != self.last_frame_data[1].image_properties.width or
-            current_result.image_properties.height != self.last_frame_data[1].image_properties.height)):
-            self.stable_frame_data = None
-            self.last_frame_data = [None, None]
-            self.last_last_frame_data = [None, None]
-
        if self.frame_stabilization == 0:
            changed_lines = self._find_changed_lines_impl(current_result, self.last_frame_data[1])
            if changed_lines == None:
@@ -598,6 +592,11 @@ class TextFiltering:
                    self.recovered_lines_count -= 1
                    continue

+            changed_line = current_result[i]
+
+            if next_result != None:
+                logger.opt(ansi=True).debug(f"<red>Recovered line: '{changed_line}'</red>")
+
            if current_lines_ocr:
                current_line_bbox = current_lines_ocr[i].bounding_box
                # Check if line contains only kana (no kanji)
@@ -642,11 +641,6 @@ class TextFiltering:
                    if is_furigana:
                        continue

-            changed_line = current_result[i]
-
-            if next_result != None:
-                logger.opt(ansi=True).debug(f"<red>Recovered line: '{changed_line}'</red>")
-
            if first and len(current_text) > 3:
                first = False
                # For the first line, check if it contains the end of previous text
@@ -695,10 +689,6 @@ class TextFiltering:
        return current_line

    def _check_horizontal_overlap(self, bbox1, bbox2):
-        """
-        Calculate the horizontal overlap ratio between two bounding boxes.
-        Returns a value between 0.0 (no overlap) and 1.0 (complete overlap).
-        """
        # Calculate left and right boundaries for both boxes
        left1 = bbox1.center_x - bbox1.width / 2
        right1 = bbox1.center_x + bbox1.width / 2
@@ -790,6 +780,7 @@ class ScreenshotThread(threading.Thread):
    def __init__(self):
        super().__init__(daemon=True)
        screen_capture_area = config.get_general('screen_capture_area')
+        self.is_combo_screenshot = False
        self.macos_window_tracker_instance = None
        self.windows_window_tracker_instance = None
        self.screencapture_window_active = True
@@ -821,27 +812,16 @@ class ScreenshotThread(threading.Thread):
            elif self.screencapture_mode == 3:
                coord_left, coord_top, coord_width, coord_height = [int(c.strip()) for c in screen_capture_area.split(',')]
            else:
-                logger.opt(ansi=True).info('Launching screen coordinate picker')
-                screen_selection = get_screen_selection()
-                if not screen_selection:
-                    raise ValueError('Picker window was closed or an error occurred')
-                screen_capture_monitor = screen_selection['monitor']
-                x, y, coord_width, coord_height = screen_selection['coordinates']
-                if coord_width > 0 and coord_height > 0:
-                    coord_top = screen_capture_monitor['top'] + y
-                    coord_left = screen_capture_monitor['left'] + x
-                else:
-                    logger.opt(ansi=True).info('Selection is empty, selecting whole screen')
-                    coord_left = screen_capture_monitor['left']
-                    coord_top = screen_capture_monitor['top']
-                    coord_width = screen_capture_monitor['width']
-                    coord_height = screen_capture_monitor['height']
+                self.launch_coordinate_picker(True)

+            if self.screencapture_mode != 0:
                self.sct_params = {'top': coord_top, 'left': coord_left, 'width': coord_width, 'height': coord_height}
                logger.opt(ansi=True).info(f'Selected coordinates: {coord_left},{coord_top},{coord_width},{coord_height}')
        else:
            self.screen_capture_only_active_windows = config.get_general('screen_capture_only_active_windows')
+            self.window_area_coordinates = None
            area_invalid_error = '"screen_capture_area" must be empty, "screen_N" where N is a screen number starting from 1, a valid set of coordinates, or a valid window name'
+
            if sys.platform == 'darwin':
                if config.get_general('screen_capture_old_macos_api') or int(platform.mac_ver()[0].split('.')[0]) < 14:
                    self.old_macos_screenshot_api = True
@@ -890,7 +870,17 @@ class ScreenshotThread(threading.Thread):
                logger.opt(ansi=True).info(f'Selected window: {window_title}')
            else:
                raise ValueError('Window capture is only currently supported on Windows and macOS')
-        self.is_combo_screenshot = False
+
+            screen_capture_window_area = config.get_general('screen_capture_window_area')
+            if screen_capture_window_area != 'window':    
+                if len(screen_capture_window_area.split(',')) == 4:
+                    x, y, x2, y2 = [int(c.strip()) for c in screen_capture_window_area.split(',')]
+                    logger.opt(ansi=True).info(f'Selected window coordinates: {x},{y},{x2},{y2}')
+                    self.window_area_coordinates = (img.size, (x, y, x2, y2))
+                elif screen_capture_window_area == '':
+                    self.launch_coordinate_picker(True)
+                else:
+                    raise ValueError('"screen_capture_window_area" must be empty, "window" for the whole window, or a valid set of coordinates')

    def get_windows_window_handle(self, window_title):
        def callback(hwnd, window_title_part):
@@ -998,24 +988,12 @@ class ScreenshotThread(threading.Thread):
        if not found:
            on_window_closed(False)

-    def write_result(self, result):
-        if self.is_combo_screenshot:
-            self.is_combo_screenshot = False
-            image_queue.put((result, True))
-        else:
-            periodic_screenshot_queue.put(result)
-
-    def run(self):
-        if self.screencapture_mode != 2:
-            sct = mss.mss()
-        while not terminated:
-            if not screenshot_event.wait(timeout=0.1):
-                continue
+    def take_screenshot(self):
        if self.screencapture_mode == 2:
            if sys.platform == 'darwin':
                with objc.autorelease_pool():
                    if self.old_macos_screenshot_api:
-                            cg_image = CGWindowListCreateImageFromArray(CGRectNull, [self.window_id], kCGWindowImageBoundsIgnoreFraming)
+                        cg_image = CGWindowListCreateImageFromArray(CGRectNull, [self.window_id], kCGWindowImageBoundsIgnoreFraming | kCGWindowImageNominalResolution)
                    else:
                        self.capture_macos_window_screenshot(self.window_id)
                        try:
@@ -1023,8 +1001,7 @@ class ScreenshotThread(threading.Thread):
                        except queue.Empty:
                            cg_image = None
                    if not cg_image:
-                            self.write_result(0)
-                            break
+                        return None
                    width = CGImageGetWidth(cg_image)
                    height = CGImageGetHeight(cg_image)
                    raw_data = CGDataProviderCopyData(CGImageGetDataProvider(cg_image))
@@ -1049,8 +1026,7 @@ class ScreenshotThread(threading.Thread):
                    bmpinfo = save_bitmap.GetInfo()
                    bmpstr = save_bitmap.GetBitmapBits(True)
                except pywintypes.error:
-                        self.write_result(0)
-                        break
+                    return None
                img = Image.frombuffer('RGB', (bmpinfo['bmWidth'], bmpinfo['bmHeight']), bmpstr, 'raw', 'BGRX', 0, 1)
                try:
                    win32gui.DeleteObject(save_bitmap.GetHandle())
@@ -1068,10 +1044,80 @@ class ScreenshotThread(threading.Thread):
                    win32gui.ReleaseDC(self.window_handle, hwnd_dc)
                except:
                    pass
+            if self.window_area_coordinates:
+                if img.size != self.window_area_coordinates[0]:
+                    self.window_area_coordinates = None
+                    logger.opt(ansi=True).warning('Window size changed, discarding area selection')
+                else:
+                    img = img.crop(self.window_area_coordinates[1])
        else:
            sct_img = sct.grab(self.sct_params)
            img = Image.frombytes('RGB', sct_img.size, sct_img.bgra, 'raw', 'BGRX')

+        return img
+
+    def write_result(self, result):
+        """Hand a capture result to the queue matching how it was requested.
+
+        A capture flagged through self.is_combo_screenshot goes to image_queue
+        as a (result, True) tuple and the flag is reset; any other capture goes
+        to periodic_screenshot_queue unchanged.
+        """
+        if not self.is_combo_screenshot:
+            periodic_screenshot_queue.put(result)
+            return
+
+        # One-shot flag: consume it before queueing the combo screenshot.
+        self.is_combo_screenshot = False
+        image_queue.put((result, True))
+
+    def launch_coordinate_picker(self, on_init):
+        """Run the interactive picker and store the newly selected capture area.
+
+        When screencapture_mode is not 2, the screen picker is shown and
+        self.sct_params is replaced with the selection; an empty selection
+        falls back to the whole monitor. When screencapture_mode is 2 (window
+        capture), the current window contents are captured and shown in the
+        picker, and the selection is stored in self.window_area_coordinates as
+        (window image size, (left, top, right, bottom)). A closed picker, an
+        empty selection or a failed capture leaves the whole window selected.
+
+        Args:
+            on_init: True when called during construction. In screen mode a
+                closed or failed picker then raises instead of keeping the
+                previous settings.
+
+        Raises:
+            ValueError: in screen mode, when on_init is true and the picker
+                returned no selection.
+        """
+        if self.screencapture_mode != 2:
+            logger.opt(ansi=True).info('Launching screen coordinate picker')
+            screen_selection = get_screen_selection()
+            if not screen_selection:
+                if on_init:
+                    raise ValueError('Picker window was closed or an error occurred')
+                logger.opt(ansi=True).warning('Picker window was closed or an error occurred, leaving settings unchanged')
+                return
+            screen_capture_monitor = screen_selection['monitor']
+            x, y, coord_width, coord_height = screen_selection['coordinates']
+            if coord_width > 0 and coord_height > 0:
+                # Picker coordinates are relative to the monitor; convert to absolute.
+                coord_top = screen_capture_monitor['top'] + y
+                coord_left = screen_capture_monitor['left'] + x
+            else:
+                logger.opt(ansi=True).info('Selection is empty, selecting whole screen')
+                coord_left = screen_capture_monitor['left']
+                coord_top = screen_capture_monitor['top']
+                coord_width = screen_capture_monitor['width']
+                coord_height = screen_capture_monitor['height']
+            self.sct_params = {'top': coord_top, 'left': coord_left, 'width': coord_width, 'height': coord_height}
+            logger.opt(ansi=True).info(f'Selected coordinates: {coord_left},{coord_top},{coord_width},{coord_height}')
+            return
+
+        # Drop any previous crop first so take_screenshot() returns the full window.
+        self.window_area_coordinates = None
+        img = self.take_screenshot()
+        if img is None:
+            # take_screenshot() returns None when the window could not be captured.
+            # Passing None on would open the whole-screen picker instead and then
+            # fail on img.size, so bail out with the whole window selected.
+            logger.opt(ansi=True).warning('Unable to capture the window, selecting whole window')
+            return
+
+        logger.opt(ansi=True).info('Launching window coordinate picker')
+        window_selection = get_screen_selection(img)
+        if not window_selection:
+            logger.opt(ansi=True).warning('Picker window was closed or an error occurred, selecting whole window')
+            return
+
+        x, y, coord_width, coord_height = window_selection['coordinates']
+        if coord_width > 0 and coord_height > 0:
+            # Stored as a (left, top, right, bottom) box, the form Image.crop() expects.
+            x2 = x + coord_width
+            y2 = y + coord_height
+            logger.opt(ansi=True).info(f'Selected window coordinates: {x},{y},{x2},{y2}')
+            # The image size is kept so a later window resize can invalidate the selection.
+            self.window_area_coordinates = (img.size, (x, y, x2, y2))
+        else:
+            logger.opt(ansi=True).info('Selection is empty, selecting whole window')
+
+    def run(self):
+        if self.screencapture_mode != 2:
+            sct = mss.mss()
+        while not terminated:
+            if not screenshot_event.wait(timeout=0.1):
+                if coordinate_selector_event.is_set():
+                    self.launch_coordinate_picker(False)
+                    coordinate_selector_event.clear()
+                continue
+
+            img = self.take_screenshot()
+            if not img:
+                self.write_result(0)
+                break
+
            self.write_result(img)
            screenshot_event.clear()

@@ -1275,8 +1321,12 @@ def user_input_thread_run():
    if sys.platform == 'win32':
        import msvcrt
        while not terminated:
-            user_input_bytes = msvcrt.getch()
+            if coordinate_selector_event.is_set():
+                while coordinate_selector_event.is_set():
+                    time.sleep(0.1)
+            if msvcrt.kbhit():
                try:
+                    user_input_bytes = msvcrt.getch()
                    user_input = user_input_bytes.decode()
                    if user_input.lower() in 'tq':
                        _terminate_handler()
@@ -1287,12 +1337,20 @@ def user_input_thread_run():
                except UnicodeDecodeError:
                    pass
            else:
-        import tty, termios
+                time.sleep(0.1)
+    else:
+        import tty, termios, select
        fd = sys.stdin.fileno()
        old_settings = termios.tcgetattr(fd)
        try:
-            tty.setcbreak(sys.stdin.fileno())
+            tty.setcbreak(fd)
            while not terminated:
+                if coordinate_selector_event.is_set():
+                    while coordinate_selector_event.is_set():
+                        time.sleep(0.1)
+                    tty.setcbreak(fd)
+                rlist, _, _ = select.select([sys.stdin], [], [], 0.1)
+                if rlist:
                    user_input = sys.stdin.read(1)
                    if user_input.lower() in 'tq':
                        _terminate_handler()
@@ -1322,6 +1380,10 @@ def on_screenshot_combo():
    screenshot_event.set()


+def on_coordinate_selector_combo():
+    coordinate_selector_event.set()
+
+
 def run():
    logger_level = 'DEBUG' if config.get_general('uwu') else 'INFO'
    logger.configure(handlers=[{'sink': sys.stderr, 'format': config.get_general('logger_format'), 'level': logger_level}])
@@ -1379,6 +1441,7 @@ def run():
    global websocket_server_thread
    global screenshot_thread
    global image_queue
+    global coordinate_selector_event
    non_path_inputs = ('screencapture', 'clipboard', 'websocket', 'unixsocket')
    read_from = config.get_general('read_from')
    read_from_secondary = config.get_general('read_from_secondary')
@@ -1403,6 +1466,7 @@ def run():
    combo_engine_switch = config.get_general('combo_engine_switch')
    screen_capture_periodic = False
    screen_capture_on_combo = False
+    coordinate_selector_event = threading.Event()
    notifier = DesktopNotifierSync()
    image_queue = queue.Queue()
    key_combos = {}
@@ -1422,10 +1486,13 @@ def run():
        global screenshot_event
        screen_capture_delay_secs = config.get_general('screen_capture_delay_secs')
        screen_capture_combo = config.get_general('screen_capture_combo')
+        coordinate_selector_combo = config.get_general('coordinate_selector_combo')
        last_screenshot_time = 0
        if screen_capture_combo != '':
            screen_capture_on_combo = True
            key_combos[screen_capture_combo] = on_screenshot_combo
+        if coordinate_selector_combo != '':
+            key_combos[coordinate_selector_combo] = on_coordinate_selector_combo
        if screen_capture_delay_secs != -1:
            global periodic_screenshot_queue
            periodic_screenshot_queue = queue.Queue()
@@ -1547,3 +1614,4 @@ def run():
        screenshot_thread.join()
    if key_combo_listener:
        key_combo_listener.stop()
+    user_input_thread.join()
--- a/owocr/screen_coordinate_picker.py
+++ b/owocr/screen_coordinate_picker.py
@@ -11,17 +11,90 @@ except:


 class ScreenSelector:
-    def __init__(self, result):
+    def __init__(self, result, input_image=None):
        self.sct = mss.mss()
        self.monitors = self.sct.monitors[1:]
        self.root = None
        self.result = result
+        self.input_image = input_image

    def on_select(self, monitor, coordinates):
        self.result['monitor'] = monitor
        self.result['coordinates'] = coordinates
        self.root.destroy()

+    def create_window_from_image(self, img):
+        """Show img in a borderless, topmost window and let the user drag a rectangle on it.
+
+        The window is centred on the first monitor large enough to hold the
+        image at full size. If there is none, the first monitor is used and the
+        image is scaled down to fit it. On mouse release the selection is
+        reported through on_select() as (x, y, width, height) in the coordinate
+        space of the original, unscaled image, with None in place of the
+        monitor.
+        """
+        original_width, original_height = img.size
+
+        display_monitor = next(
+            (monitor for monitor in self.monitors
+             if monitor['width'] >= original_width and monitor['height'] >= original_height),
+            None)
+        if not display_monitor:
+            display_monitor = self.monitors[0]
+
+        window_width = min(original_width, display_monitor['width'])
+        window_height = min(original_height, display_monitor['height'])
+        left = display_monitor['left'] + (display_monitor['width'] - window_width) // 2
+        top = display_monitor['top'] + (display_monitor['height'] - window_height) // 2
+
+        window = tk.Toplevel(self.root)
+        window.geometry(f"{window_width}x{window_height}+{left}+{top}")
+        window.overrideredirect(1)
+        window.attributes('-topmost', 1)
+
+        # Resize image if it's larger than the window, remembering the factors
+        # needed to map canvas coordinates back onto the original image.
+        if img.width > window_width or img.height > window_height:
+            img = img.resize((window_width, window_height), Image.Resampling.LANCZOS)
+            scale_x = original_width / window_width
+            scale_y = original_height / window_height
+        else:
+            scale_x = 1
+            scale_y = 1
+
+        img_tk = ImageTk.PhotoImage(img)
+
+        canvas = tk.Canvas(window, cursor='cross', highlightthickness=0)
+        canvas.pack(fill=tk.BOTH, expand=True)
+        # Keep a reference on the canvas so the PhotoImage is not garbage collected.
+        canvas.image = img_tk
+        canvas.create_image(0, 0, image=img_tk, anchor=tk.NW)
+
+        start_x, start_y, rect = None, None, None
+
+        def on_click(event):
+            nonlocal start_x, start_y, rect
+            start_x, start_y = event.x, event.y
+            rect = canvas.create_rectangle(start_x, start_y, start_x, start_y, outline='red')
+
+        def on_drag(event):
+            if rect:
+                canvas.coords(rect, start_x, start_y, event.x, event.y)
+
+        def on_release(event):
+            if start_x is None or start_y is None:
+                # Released without a recorded press on this canvas: nothing to report.
+                return
+
+            # A drag may end outside the canvas; clamp to the displayed image so
+            # the reported rectangle never extends beyond the original image.
+            end_x = min(max(event.x, 0), window_width)
+            end_y = min(max(event.y, 0), window_height)
+
+            x1 = int(min(start_x, end_x) * scale_x)
+            y1 = int(min(start_y, end_y) * scale_y)
+            x2 = int(max(start_x, end_x) * scale_x)
+            y2 = int(max(start_y, end_y) * scale_y)
+
+            # Return None for monitor when using input image
+            self.on_select(None, (x1, y1, x2 - x1, y2 - y1))
+
+        canvas.bind('<ButtonPress-1>', on_click)
+        canvas.bind('<B1-Motion>', on_drag)
+        canvas.bind('<ButtonRelease-1>', on_release)
+
    def create_window(self, monitor):
        screenshot = self.sct.grab(monitor)
        img = Image.frombytes('RGB', screenshot.size, screenshot.rgb)
@@ -72,6 +145,9 @@ class ScreenSelector:
        self.root = tk.Tk()
        self.root.withdraw()

+        if self.input_image:
+            self.create_window_from_image(self.input_image)
+        else:
            for monitor in self.monitors:
                self.create_window(monitor)

@@ -79,18 +155,18 @@ class ScreenSelector:
        self.root.update()


-def run_screen_selector(result):
-    selector = ScreenSelector(result)
+def run_screen_selector(result, input_image=None):
+    selector = ScreenSelector(result, input_image)
    selector.start()


-def get_screen_selection():
+def get_screen_selection(pil_image = None):
    if not selector_available:
        raise ValueError('tkinter or PIL with tkinter support are not installed, unable to open picker')

    with Manager() as manager:
        res = manager.dict()
-        process = Process(target=run_screen_selector, args=(res,))
+        process = Process(target=run_screen_selector, args=(res, pil_image))
        
        process.start()    
        process.join()