Add window area selection, combo to re-select screen/window area at runtime

This commit is contained in:
AuroraWright
2025-10-12 09:07:02 +02:00
parent 6ada579b19
commit e262231a1d
3 changed files with 282 additions and 132 deletions

View File

@@ -44,6 +44,8 @@ parser.add_argument('-cs', '--combo_engine_switch', type=str, default=argparse.S
help='Combo to wait on for switching the OCR engine. As an example: "<ctrl>+<shift>+a". To be used with combo_pause. The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
parser.add_argument('-sa', '--screen_capture_area', type=str, default=argparse.SUPPRESS,
help='Area to target when reading with screen capture. Can be either empty (automatic selector), a set of coordinates (x,y,width,height), "screen_N" (captures a whole screen, where N is the screen number starting from 1) or a window name (the first matching window title will be used).')
parser.add_argument('-swa', '--screen_capture_window_area', type=str, default=argparse.SUPPRESS,
help='If capturing with screen capture, subsection of the selected window. Can be either empty (automatic selector), a set of coordinates (x,y,width,height), "window" to use the whole window.')
parser.add_argument('-sd', '--screen_capture_delay_secs', type=float, default=argparse.SUPPRESS,
help='Delay (in seconds) between screenshots when reading with screen capture. -1 to disable periodic screenshots.')
parser.add_argument('-sw', '--screen_capture_only_active_windows', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
@@ -56,6 +58,8 @@ parser.add_argument('-sff', '--screen_capture_furigana_filter', type=str2bool, n
help="When reading with screen capture, try to filter furigana lines.")
parser.add_argument('-sc', '--screen_capture_combo', type=str, default=argparse.SUPPRESS,
help='When reading with screen capture, combo to wait on for taking a screenshot. If periodic screenshots are also enabled, any screenshot taken this way bypasses the filtering. Example value: "<ctrl>+<shift>+s". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
parser.add_argument('-scc', '--coordinate_selector_combo', type=str, default=argparse.SUPPRESS,
help='When reading with screen capture, combo to wait on for invoking the coordinate picker to change the screen/window area. Example value: "<ctrl>+<shift>+c". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
parser.add_argument('-l', '--language', type=str, default=argparse.SUPPRESS,
help='Two letter language code for filtering screencapture OCR results. Ex. "ja" for Japanese, "zh" for Chinese, "ko" for Korean, "ar" for Arabic, "ru" for Russian, "el" for Greek, "he" for Hebrew, "th" for Thai. Any other value will use Latin Extended (for most European languages and English).')
parser.add_argument('-of', '--output_format', type=str, default=argparse.SUPPRESS,
@@ -89,12 +93,14 @@ class Config:
'combo_pause': '',
'combo_engine_switch': '',
'screen_capture_area': '',
'screen_capture_window_area': 'window',
'screen_capture_delay_secs': 0,
'screen_capture_only_active_windows': True,
'screen_capture_frame_stabilization': -1,
'screen_capture_line_recovery': True,
'screen_capture_furigana_filter': True,
'screen_capture_combo': '',
'coordinate_selector_combo': '',
'screen_capture_old_macos_api': False,
'language': 'ja',
'output_format': 'text',

View File

@@ -48,7 +48,7 @@ try:
from AppKit import NSData, NSImage, NSBitmapImageRep, NSDeviceRGBColorSpace, NSGraphicsContext, NSZeroPoint, NSZeroRect, NSCompositingOperationCopy
from Quartz import CGWindowListCreateImageFromArray, kCGWindowImageBoundsIgnoreFraming, CGRectMake, CGRectNull, CGMainDisplayID, CGWindowListCopyWindowInfo, \
CGWindowListCreateDescriptionFromArray, kCGWindowListOptionOnScreenOnly, kCGWindowListExcludeDesktopElements, kCGWindowName, kCGNullWindowID, \
CGImageGetWidth, CGImageGetHeight, CGDataProviderCopyData, CGImageGetDataProvider, CGImageGetBytesPerRow
CGImageGetWidth, CGImageGetHeight, CGDataProviderCopyData, CGImageGetDataProvider, CGImageGetBytesPerRow, kCGWindowImageNominalResolution
from ScreenCaptureKit import SCContentFilter, SCScreenshotManager, SCShareableContent, SCStreamConfiguration, SCCaptureResolutionBest
except ImportError:
pass
@@ -312,7 +312,7 @@ class TextFiltering:
self.stable_frame_data = None
self.last_frame_text = []
self.last_last_frame_text = []
self.stable_frame_text = None
self.stable_frame_text = []
self.processed_stable_frame = False
self.frame_stabilization_timestamp = 0
self.cj_regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
@@ -388,12 +388,6 @@ class TextFiltering:
return filtered_text
def _find_changed_lines(self, pil_image, current_result):
if (self.last_frame_data != [None, None] and (current_result.image_properties.width != self.last_frame_data[1].image_properties.width or
current_result.image_properties.height != self.last_frame_data[1].image_properties.height)):
self.stable_frame_data = None
self.last_frame_data = [None, None]
self.last_last_frame_data = [None, None]
if self.frame_stabilization == 0:
changed_lines = self._find_changed_lines_impl(current_result, self.last_frame_data[1])
if changed_lines == None:
@@ -598,6 +592,11 @@ class TextFiltering:
self.recovered_lines_count -= 1
continue
changed_line = current_result[i]
if next_result != None:
logger.opt(ansi=True).debug(f"<red>Recovered line: '{changed_line}'</red>")
if current_lines_ocr:
current_line_bbox = current_lines_ocr[i].bounding_box
# Check if line contains only kana (no kanji)
@@ -642,11 +641,6 @@ class TextFiltering:
if is_furigana:
continue
changed_line = current_result[i]
if next_result != None:
logger.opt(ansi=True).debug(f"<red>Recovered line: '{changed_line}'</red>")
if first and len(current_text) > 3:
first = False
# For the first line, check if it contains the end of previous text
@@ -695,10 +689,6 @@ class TextFiltering:
return current_line
def _check_horizontal_overlap(self, bbox1, bbox2):
"""
Calculate the horizontal overlap ratio between two bounding boxes.
Returns a value between 0.0 (no overlap) and 1.0 (complete overlap).
"""
# Calculate left and right boundaries for both boxes
left1 = bbox1.center_x - bbox1.width / 2
right1 = bbox1.center_x + bbox1.width / 2
@@ -790,6 +780,7 @@ class ScreenshotThread(threading.Thread):
def __init__(self):
super().__init__(daemon=True)
screen_capture_area = config.get_general('screen_capture_area')
self.is_combo_screenshot = False
self.macos_window_tracker_instance = None
self.windows_window_tracker_instance = None
self.screencapture_window_active = True
@@ -821,27 +812,16 @@ class ScreenshotThread(threading.Thread):
elif self.screencapture_mode == 3:
coord_left, coord_top, coord_width, coord_height = [int(c.strip()) for c in screen_capture_area.split(',')]
else:
logger.opt(ansi=True).info('Launching screen coordinate picker')
screen_selection = get_screen_selection()
if not screen_selection:
raise ValueError('Picker window was closed or an error occurred')
screen_capture_monitor = screen_selection['monitor']
x, y, coord_width, coord_height = screen_selection['coordinates']
if coord_width > 0 and coord_height > 0:
coord_top = screen_capture_monitor['top'] + y
coord_left = screen_capture_monitor['left'] + x
else:
logger.opt(ansi=True).info('Selection is empty, selecting whole screen')
coord_left = screen_capture_monitor['left']
coord_top = screen_capture_monitor['top']
coord_width = screen_capture_monitor['width']
coord_height = screen_capture_monitor['height']
self.launch_coordinate_picker(True)
if self.screencapture_mode != 0:
self.sct_params = {'top': coord_top, 'left': coord_left, 'width': coord_width, 'height': coord_height}
logger.opt(ansi=True).info(f'Selected coordinates: {coord_left},{coord_top},{coord_width},{coord_height}')
else:
self.screen_capture_only_active_windows = config.get_general('screen_capture_only_active_windows')
self.window_area_coordinates = None
area_invalid_error = '"screen_capture_area" must be empty, "screen_N" where N is a screen number starting from 1, a valid set of coordinates, or a valid window name'
if sys.platform == 'darwin':
if config.get_general('screen_capture_old_macos_api') or int(platform.mac_ver()[0].split('.')[0]) < 14:
self.old_macos_screenshot_api = True
@@ -890,7 +870,17 @@ class ScreenshotThread(threading.Thread):
logger.opt(ansi=True).info(f'Selected window: {window_title}')
else:
raise ValueError('Window capture is only currently supported on Windows and macOS')
self.is_combo_screenshot = False
screen_capture_window_area = config.get_general('screen_capture_window_area')
if screen_capture_window_area != 'window':
if len(screen_capture_window_area.split(',')) == 4:
x, y, x2, y2 = [int(c.strip()) for c in screen_capture_window_area.split(',')]
logger.opt(ansi=True).info(f'Selected window coordinates: {x},{y},{x2},{y2}')
self.window_area_coordinates = (img.size, (x, y, x2, y2))
elif screen_capture_window_area == '':
self.launch_coordinate_picker(True)
else:
raise ValueError('"screen_capture_window_area" must be empty, "window" for the whole window, or a valid set of coordinates')
def get_windows_window_handle(self, window_title):
def callback(hwnd, window_title_part):
@@ -998,24 +988,12 @@ class ScreenshotThread(threading.Thread):
if not found:
on_window_closed(False)
def write_result(self, result):
if self.is_combo_screenshot:
self.is_combo_screenshot = False
image_queue.put((result, True))
else:
periodic_screenshot_queue.put(result)
def run(self):
if self.screencapture_mode != 2:
sct = mss.mss()
while not terminated:
if not screenshot_event.wait(timeout=0.1):
continue
def take_screenshot(self):
if self.screencapture_mode == 2:
if sys.platform == 'darwin':
with objc.autorelease_pool():
if self.old_macos_screenshot_api:
cg_image = CGWindowListCreateImageFromArray(CGRectNull, [self.window_id], kCGWindowImageBoundsIgnoreFraming)
cg_image = CGWindowListCreateImageFromArray(CGRectNull, [self.window_id], kCGWindowImageBoundsIgnoreFraming | kCGWindowImageNominalResolution)
else:
self.capture_macos_window_screenshot(self.window_id)
try:
@@ -1023,8 +1001,7 @@ class ScreenshotThread(threading.Thread):
except queue.Empty:
cg_image = None
if not cg_image:
self.write_result(0)
break
return None
width = CGImageGetWidth(cg_image)
height = CGImageGetHeight(cg_image)
raw_data = CGDataProviderCopyData(CGImageGetDataProvider(cg_image))
@@ -1049,8 +1026,7 @@ class ScreenshotThread(threading.Thread):
bmpinfo = save_bitmap.GetInfo()
bmpstr = save_bitmap.GetBitmapBits(True)
except pywintypes.error:
self.write_result(0)
break
return None
img = Image.frombuffer('RGB', (bmpinfo['bmWidth'], bmpinfo['bmHeight']), bmpstr, 'raw', 'BGRX', 0, 1)
try:
win32gui.DeleteObject(save_bitmap.GetHandle())
@@ -1068,10 +1044,80 @@ class ScreenshotThread(threading.Thread):
win32gui.ReleaseDC(self.window_handle, hwnd_dc)
except:
pass
if self.window_area_coordinates:
if img.size != self.window_area_coordinates[0]:
self.window_area_coordinates = None
logger.opt(ansi=True).warning('Window size changed, discarding area selection')
else:
img = img.crop(self.window_area_coordinates[1])
else:
sct_img = sct.grab(self.sct_params)
img = Image.frombytes('RGB', sct_img.size, sct_img.bgra, 'raw', 'BGRX')
return img
def write_result(self, result):
    """Hand a capture result to the queue that matches how it was requested.

    A screenshot requested through the screenshot combo is put on
    ``image_queue`` as ``(result, True)`` and the combo flag is reset; any
    other result is put on ``periodic_screenshot_queue`` unchanged.
    """
    if not self.is_combo_screenshot:
        periodic_screenshot_queue.put(result)
        return
    # One-shot flag: consume it before publishing the combo screenshot.
    self.is_combo_screenshot = False
    image_queue.put((result, True))
def launch_coordinate_picker(self, on_init):
    """Open the interactive picker and store the newly selected capture area.

    In screen modes (``self.screencapture_mode != 2``) the picker is shown
    over the monitors and the selection replaces ``self.sct_params``. In
    window mode (``self.screencapture_mode == 2``) the picker is shown over a
    fresh screenshot of the tracked window and the selection is stored in
    ``self.window_area_coordinates`` as ``(window_size, (x, y, x2, y2))``;
    ``None`` means the whole window is used.

    Args:
        on_init: True when called while the thread is being set up. A
            cancelled screen picker then raises; when False it only logs a
            warning and keeps the current settings.

    Raises:
        ValueError: the screen picker returned nothing and ``on_init`` is
            true.
    """
    if self.screencapture_mode != 2:
        logger.opt(ansi=True).info('Launching screen coordinate picker')
        screen_selection = get_screen_selection()
        if not screen_selection:
            if on_init:
                raise ValueError('Picker window was closed or an error occurred')
            logger.opt(ansi=True).warning('Picker window was closed or an error occurred, leaving settings unchanged')
            return

        screen_capture_monitor = screen_selection['monitor']
        x, y, coord_width, coord_height = screen_selection['coordinates']
        if coord_width > 0 and coord_height > 0:
            # Picker coordinates are relative to the monitor it was drawn on.
            coord_top = screen_capture_monitor['top'] + y
            coord_left = screen_capture_monitor['left'] + x
        else:
            logger.opt(ansi=True).info('Selection is empty, selecting whole screen')
            coord_left = screen_capture_monitor['left']
            coord_top = screen_capture_monitor['top']
            coord_width = screen_capture_monitor['width']
            coord_height = screen_capture_monitor['height']

        self.sct_params = {'top': coord_top, 'left': coord_left, 'width': coord_width, 'height': coord_height}
        logger.opt(ansi=True).info(f'Selected coordinates: {coord_left},{coord_top},{coord_width},{coord_height}')
        return

    # Window mode. Clear the stored area first so take_screenshot() returns
    # the full, uncropped window for the picker to display.
    self.window_area_coordinates = None
    img = self.take_screenshot()
    if img is None:
        # take_screenshot() returns None when the window cannot be captured.
        # Without an image the picker would fall back to the full-screen
        # selector and img.size below would fail, so keep the whole window.
        logger.opt(ansi=True).warning('Unable to capture the window, selecting whole window')
        return

    logger.opt(ansi=True).info('Launching window coordinate picker')
    window_selection = get_screen_selection(img)
    if not window_selection:
        logger.opt(ansi=True).warning('Picker window was closed or an error occurred, selecting whole window')
        return

    x, y, coord_width, coord_height = window_selection['coordinates']
    if coord_width > 0 and coord_height > 0:
        x2 = x + coord_width
        y2 = y + coord_height
        logger.opt(ansi=True).info(f'Selected window coordinates: {x},{y},{x2},{y2}')
        # The window size is stored next to the crop box so a later resize
        # of the window can be detected and the selection discarded.
        self.window_area_coordinates = (img.size, (x, y, x2, y2))
    else:
        logger.opt(ansi=True).info('Selection is empty, selecting whole window')
def run(self):
if self.screencapture_mode != 2:
sct = mss.mss()
while not terminated:
if not screenshot_event.wait(timeout=0.1):
if coordinate_selector_event.is_set():
self.launch_coordinate_picker(False)
coordinate_selector_event.clear()
continue
img = self.take_screenshot()
if not img:
self.write_result(0)
break
self.write_result(img)
screenshot_event.clear()
@@ -1275,8 +1321,12 @@ def user_input_thread_run():
if sys.platform == 'win32':
import msvcrt
while not terminated:
user_input_bytes = msvcrt.getch()
if coordinate_selector_event.is_set():
while coordinate_selector_event.is_set():
time.sleep(0.1)
if msvcrt.kbhit():
try:
user_input_bytes = msvcrt.getch()
user_input = user_input_bytes.decode()
if user_input.lower() in 'tq':
_terminate_handler()
@@ -1287,12 +1337,20 @@ def user_input_thread_run():
except UnicodeDecodeError:
pass
else:
import tty, termios
time.sleep(0.1)
else:
import tty, termios, select
fd = sys.stdin.fileno()
old_settings = termios.tcgetattr(fd)
try:
tty.setcbreak(sys.stdin.fileno())
tty.setcbreak(fd)
while not terminated:
if coordinate_selector_event.is_set():
while coordinate_selector_event.is_set():
time.sleep(0.1)
tty.setcbreak(fd)
rlist, _, _ = select.select([sys.stdin], [], [], 0.1)
if rlist:
user_input = sys.stdin.read(1)
if user_input.lower() in 'tq':
_terminate_handler()
@@ -1322,6 +1380,10 @@ def on_screenshot_combo():
screenshot_event.set()
# Callback registered in key_combos for the coordinate selector combo: sets
# coordinate_selector_event to flag that the coordinate picker was requested.
def on_coordinate_selector_combo():
coordinate_selector_event.set()
def run():
logger_level = 'DEBUG' if config.get_general('uwu') else 'INFO'
logger.configure(handlers=[{'sink': sys.stderr, 'format': config.get_general('logger_format'), 'level': logger_level}])
@@ -1379,6 +1441,7 @@ def run():
global websocket_server_thread
global screenshot_thread
global image_queue
global coordinate_selector_event
non_path_inputs = ('screencapture', 'clipboard', 'websocket', 'unixsocket')
read_from = config.get_general('read_from')
read_from_secondary = config.get_general('read_from_secondary')
@@ -1403,6 +1466,7 @@ def run():
combo_engine_switch = config.get_general('combo_engine_switch')
screen_capture_periodic = False
screen_capture_on_combo = False
coordinate_selector_event = threading.Event()
notifier = DesktopNotifierSync()
image_queue = queue.Queue()
key_combos = {}
@@ -1422,10 +1486,13 @@ def run():
global screenshot_event
screen_capture_delay_secs = config.get_general('screen_capture_delay_secs')
screen_capture_combo = config.get_general('screen_capture_combo')
coordinate_selector_combo = config.get_general('coordinate_selector_combo')
last_screenshot_time = 0
if screen_capture_combo != '':
screen_capture_on_combo = True
key_combos[screen_capture_combo] = on_screenshot_combo
if coordinate_selector_combo != '':
key_combos[coordinate_selector_combo] = on_coordinate_selector_combo
if screen_capture_delay_secs != -1:
global periodic_screenshot_queue
periodic_screenshot_queue = queue.Queue()
@@ -1547,3 +1614,4 @@ def run():
screenshot_thread.join()
if key_combo_listener:
key_combo_listener.stop()
user_input_thread.join()

View File

@@ -11,17 +11,90 @@ except:
class ScreenSelector:
def __init__(self, result):
def __init__(self, result, input_image=None):
self.sct = mss.mss()
self.monitors = self.sct.monitors[1:]
self.root = None
self.result = result
self.input_image = input_image
def on_select(self, monitor, coordinates):
self.result['monitor'] = monitor
self.result['coordinates'] = coordinates
self.root.destroy()
def create_window_from_image(self, img):
    """Show ``img`` in a borderless topmost window and let the user drag a rectangle on it.

    The window is centred on the first monitor large enough to show the
    image at its original size; if none is, the first monitor is used and
    the image is resized to fit. On mouse release the selection is reported
    through ``self.on_select(None, (x, y, width, height))``, with the
    coordinates scaled back to the original image's pixels.

    Args:
        img: PIL image to select a region from.
    """
    original_width, original_height = img.size

    # First monitor that fits the image unscaled, else the first monitor.
    display_monitor = next(
        (monitor for monitor in self.monitors
         if monitor['width'] >= original_width and monitor['height'] >= original_height),
        self.monitors[0],
    )

    window_width = min(original_width, display_monitor['width'])
    window_height = min(original_height, display_monitor['height'])
    left = display_monitor['left'] + (display_monitor['width'] - window_width) // 2
    top = display_monitor['top'] + (display_monitor['height'] - window_height) // 2

    window = tk.Toplevel(self.root)
    window.geometry(f"{window_width}x{window_height}+{left}+{top}")
    window.overrideredirect(1)
    window.attributes('-topmost', 1)

    # Resize image if it's larger than the window, remembering the factors
    # needed to map window coordinates back to original image pixels.
    scale_x = 1
    scale_y = 1
    if img.width > window_width or img.height > window_height:
        img = img.resize((window_width, window_height), Image.Resampling.LANCZOS)
        scale_x = original_width / window_width
        scale_y = original_height / window_height

    img_tk = ImageTk.PhotoImage(img)

    canvas = tk.Canvas(window, cursor='cross', highlightthickness=0)
    canvas.pack(fill=tk.BOTH, expand=True)
    # Keep a reference on the canvas so the PhotoImage is not garbage collected.
    canvas.image = img_tk
    canvas.create_image(0, 0, image=img_tk, anchor=tk.NW)

    start_x, start_y, rect = None, None, None

    def clamp_to_image(event):
        # Pointer events keep arriving while the button is held, even outside
        # the window; keep the selection inside the displayed image.
        return (min(max(event.x, 0), window_width),
                min(max(event.y, 0), window_height))

    def on_click(event):
        nonlocal start_x, start_y, rect
        start_x, start_y = clamp_to_image(event)
        rect = canvas.create_rectangle(start_x, start_y, start_x, start_y, outline='red')

    def on_drag(event):
        if rect:
            end_x, end_y = clamp_to_image(event)
            canvas.coords(rect, start_x, start_y, end_x, end_y)

    def on_release(event):
        if start_x is None or start_y is None:
            # Release without a recorded press: nothing was selected.
            return
        end_x, end_y = clamp_to_image(event)

        x1 = int(min(start_x, end_x) * scale_x)
        y1 = int(min(start_y, end_y) * scale_y)
        x2 = int(max(start_x, end_x) * scale_x)
        y2 = int(max(start_y, end_y) * scale_y)

        # Return None for monitor when using input image
        self.on_select(None, (x1, y1, x2 - x1, y2 - y1))

    canvas.bind('<ButtonPress-1>', on_click)
    canvas.bind('<B1-Motion>', on_drag)
    canvas.bind('<ButtonRelease-1>', on_release)
def create_window(self, monitor):
screenshot = self.sct.grab(monitor)
img = Image.frombytes('RGB', screenshot.size, screenshot.rgb)
@@ -72,6 +145,9 @@ class ScreenSelector:
self.root = tk.Tk()
self.root.withdraw()
if self.input_image:
self.create_window_from_image(self.input_image)
else:
for monitor in self.monitors:
self.create_window(monitor)
@@ -79,18 +155,18 @@ class ScreenSelector:
self.root.update()
def run_screen_selector(result):
selector = ScreenSelector(result)
# Process entry point used by get_screen_selection: builds a ScreenSelector
# that writes into `result` (with the optional `input_image`) and starts it.
def run_screen_selector(result, input_image=None):
selector = ScreenSelector(result, input_image)
selector.start()
def get_screen_selection():
def get_screen_selection(pil_image = None):
if not selector_available:
raise ValueError('tkinter or PIL with tkinter support are not installed, unable to open picker')
with Manager() as manager:
res = manager.dict()
process = Process(target=run_screen_selector, args=(res,))
process = Process(target=run_screen_selector, args=(res, pil_image))
process.start()
process.join()