Live OpenCV window capture (screenshot) on macOS (Darwin) using Python

Question:

I am following a tutorial on OpenCV and trying to rewrite the following code:
https://github.com/learncodebygaming/opencv_tutorials/tree/master/005_real_time

(specifically, the windowcapture.py file)

This file uses win32gui, win32ui, win32con to capture a given open window by window name and take a screenshot of it for cv2 processing later down the line.

I have attempted to recreate this functionality using Quartz for macOS using the following example:
https://stackoverflow.com/a/48030215/14649706

So my own version of windowcapture.py looks like this:

import numpy as np
from Quartz import CGWindowListCopyWindowInfo, kCGNullWindowID, kCGWindowListOptionAll, CGRectNull, CGWindowListCreateImage, kCGWindowImageBoundsIgnoreFraming, kCGWindowListExcludeDesktopElements, CGImageGetDataProvider, CGDataProviderCopyData, CFDataGetBytePtr, CFDataGetLength
import os
from PIL import Image
import cv2 as cv

class WindowCapture:
    """Locate a macOS window by title and capture it as an OpenCV image.

    The window is looked up once, at construction time, in the list returned
    by ``CGWindowListCopyWindowInfo``; its id and bounds are cached on the
    instance. Constructing the class without a window name yields an
    unbound instance whose attributes keep their class-level defaults.
    """

    # properties
    window_name = None   # title fragment used to find the window
    window = None        # window-info mapping returned by Quartz
    window_id = None     # value of the 'kCGWindowNumber' entry
    window_width = 0
    window_height = 0
    # Declared here as well so an instance created without a window name
    # still exposes them instead of raising AttributeError.
    window_x = 0
    window_y = 0

    # constructor
    def __init__(self, given_window_name=None):
        """Bind to the first window whose title contains *given_window_name*.

        Raises:
            Exception: if a name was given but no matching window exists.
        """
        if given_window_name is not None:

            self.window_name = given_window_name
            self.window = self.get_window()

            if self.window is None:
                raise Exception('Unable to find window: {}'.format(given_window_name))

            self.window_id = self.get_window_id()

            self.window_width = self.get_window_width()
            self.window_height = self.get_window_height()

            self.window_x = self.get_window_pos_x()
            self.window_y = self.get_window_pos_y()

    # determine the window we want to capture
    def get_window(self):
        """Return the first window-info entry whose name contains
        ``self.window_name``, or ``None`` when nothing matches."""
        # Without a name there is nothing to match; `None in name` would
        # otherwise raise TypeError.
        if self.window_name is None:
            return None

        windows = CGWindowListCopyWindowInfo(kCGWindowListOptionAll, kCGNullWindowID)
        for window in windows:
            name = window.get('kCGWindowName', 'Unknown')
            # entries may carry an empty/None name, hence the truthiness test
            if name and self.window_name in name:
                return window
        return None

    def get_window_id(self):
        """Return the window number stored in the cached window info."""
        return self.window['kCGWindowNumber']

    def get_window_width(self):
        """Return the window width from the cached bounds, truncated to int."""
        return int(self.window['kCGWindowBounds']['Width'])

    def get_window_height(self):
        """Return the window height from the cached bounds, truncated to int."""
        return int(self.window['kCGWindowBounds']['Height'])

    def get_window_pos_x(self):
        """Return the window X position from the cached bounds, truncated to int."""
        return int(self.window['kCGWindowBounds']['X'])

    def get_window_pos_y(self):
        """Return the window Y position from the cached bounds, truncated to int."""
        return int(self.window['kCGWindowBounds']['Y'])

    def get_image_from_window(self):
        """Capture the window with the ``screencapture`` tool and return it
        as a 3-channel NumPy array with the first and third channels swapped
        for OpenCV.

        The screenshot is written to a private temporary directory that is
        removed even when loading fails, so nothing is left behind in the
        working directory and concurrent captures cannot overwrite each other.
        """
        # Local imports keep this method self-contained.
        import subprocess
        import tempfile

        with tempfile.TemporaryDirectory() as capture_dir:
            image_filename = os.path.join(capture_dir, 'test-img.png')
            # -x mutes sound and -l specifies windowId.
            # An argument list avoids routing the command through a shell.
            subprocess.run(
                ['screencapture', '-x', '-l', str(self.window_id), image_filename]
            )
            # If screencapture produced no file, Image.open raises here,
            # exactly as it did before.
            with Image.open(image_filename) as pil_image:
                image_as_numpy_array = np.array(pil_image)

        # Same conversion code as COLOR_BGR2RGB (a red/blue swap); the name
        # reflects the direction, presumably RGB(A) from PIL to BGR for OpenCV.
        image = cv.cvtColor(image_as_numpy_array, cv.COLOR_RGB2BGR)
        return image

My get_image_from_window method here works fine; I am able to use cv.imshow('cv', screenshot) to view the result:

import cv2 as cv
from time import time
from windowcapture import WindowCapture

# bind the capture helper to the game window
wincap = WindowCapture('Blue Box Clicker')

# reference timestamp for the FPS readout
loop_time = time()
quit_key = ord('q')

while True:
    # grab the current frame of the game and show it
    screenshot = wincap.get_image_from_window()
    cv.imshow('cv', screenshot)

    # debug the loop rate: time elapsed since the previous iteration
    elapsed = time() - loop_time
    print('FPS {}'.format(1 / elapsed))
    loop_time = time()

    # waits 1 ms every loop to process key presses;
    # any key other than 'q' keeps the loop going
    if cv.waitKey(1) != quit_key:
        continue

    # 'q' pressed with the output window focused: clean up and leave
    cv.destroyAllWindows()
    break

print('Done.')

But I don’t want to save the image locally to then load it again.
I believe this is very inefficient and I would like to achieve the same functionality without actually saving the image file and then opening it.

Similarly to how it is done here (in the GitHub link above):

    def get_screenshot(self):
        """Copy the window's current contents into a NumPy image.

        Blits a ``self.w`` x ``self.h`` region of the window identified by
        ``self.hwnd``, starting at (``self.cropped_x``, ``self.cropped_y``),
        into an off-screen bitmap and converts the bitmap bytes to an array.

        Returns:
            A C-contiguous, writable ``uint8`` array of shape
            ``(self.h, self.w, 3)`` (the fourth channel is dropped).
        """

        # get the window image data
        wDC = win32gui.GetWindowDC(self.hwnd)
        dcObj = win32ui.CreateDCFromHandle(wDC)
        cDC = dcObj.CreateCompatibleDC()
        dataBitMap = win32ui.CreateBitmap()
        dataBitMap.CreateCompatibleBitmap(dcObj, self.w, self.h)
        cDC.SelectObject(dataBitMap)
        cDC.BitBlt((0, 0), (self.w, self.h), dcObj, (self.cropped_x, self.cropped_y), win32con.SRCCOPY)

        # convert the raw data into a format opencv can read
        #dataBitMap.SaveBitmapFile(cDC, 'debug.bmp')
        signedIntsArray = dataBitMap.GetBitmapBits(True)
        # np.fromstring is deprecated for binary input; np.frombuffer is the
        # documented replacement. It returns a read-only view of the bytes,
        # which is fine here because a fresh copy is made further down.
        img = np.frombuffer(signedIntsArray, dtype='uint8').reshape((self.h, self.w, 4))

        # free resources
        dcObj.DeleteDC()
        cDC.DeleteDC()
        win32gui.ReleaseDC(self.hwnd, wDC)
        win32gui.DeleteObject(dataBitMap.GetHandle())

        # drop the alpha channel, or cv.matchTemplate() will throw an error like:
        #   error: (-215:Assertion failed) (depth == CV_8U || depth == CV_32F) && type == _templ.type() 
        #   && _img.dims() <= 2 in function 'cv::matchTemplate'
        #
        # and make the image C_CONTIGUOUS to avoid errors that look like:
        #   File ... in draw_rectangles
        #   TypeError: an integer is required (got type tuple)
        # see the discussion here:
        # https://github.com/opencv/opencv/issues/14866#issuecomment-580207109
        #
        # np.array always copies, so the result is both contiguous and
        # writable regardless of the read-only source buffer.
        img = np.array(img[..., :3], order='C')

        return img

How can I achieve this using Quartz?

I am on macOS (M1 Pro) and would really like to get this working.

At the moment, this program runs at around 12fps.

The program it is trying to capture is another Python program (a simple pygame game):

import pygame
import random

# Create the 640x480 game window and give it the title the capture
# script searches for
pygame.init()
window_width, window_height = 640, 480
window = pygame.display.set_mode((window_width, window_height))
pygame.display.set_caption("Blue Box Clicker")

# Clock used to cap the frame rate
clock = pygame.time.Clock()

# Colours, box size and the box's starting position (top-left corner)
background_color = (0, 0, 0)
box_color = (0, 0, 255)
box_width, box_height = 50, 50
box_x, box_y = 0, 0

# Main loop: runs until the window is closed
running = True
while running:

    # Handle events
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False
            continue
        if event.type != pygame.MOUSEBUTTONDOWN:
            continue

        # A click counts as a hit when it lands on the box (edges included)
        mouse_x, mouse_y = pygame.mouse.get_pos()
        hit_x = box_x <= mouse_x <= box_x + box_width
        hit_y = box_y <= mouse_y <= box_y + box_height
        if hit_x and hit_y:
            # Correct click: move the box to a random spot that keeps it
            # fully inside the window
            box_x = random.randint(0, window_width - box_width)
            box_y = random.randint(0, window_height - box_height)
        # Incorrect click: nothing happens

    # Redraw the frame: background first, then the box on top
    window.fill(background_color)
    pygame.draw.rect(window, box_color, (box_x, box_y, box_width, box_height))

    # Push the frame to the screen
    pygame.display.update()

    # Limit the frame rate
    clock.tick(60)

# Clean up
pygame.quit()
Asked By: nopassport1

||

Answers:

I fixed this using the following code for my image capture:

def get_image_from_window(self):
    """Capture the window identified by ``self.window_id`` via Quartz and
    return it as a contiguous ``uint8`` array of shape (height, width, 3).

    The image bytes are read straight from the CGImage's data provider;
    no file is written.

    Raises:
        RuntimeError: if Quartz returns no image (for example because the
            window no longer exists).
        ValueError: if the pixel buffer is smaller than the image
            dimensions require.
    """
    core_graphics_image = QZ.CGWindowListCreateImage(
        QZ.CGRectNull,
        QZ.kCGWindowListOptionIncludingWindow,
        self.window_id,
        QZ.kCGWindowImageBoundsIgnoreFraming | QZ.kCGWindowImageNominalResolution
    )
    # Fail with a clear message instead of an obscure error further down.
    if core_graphics_image is None:
        raise RuntimeError('Unable to capture window id: {}'.format(self.window_id))

    bytes_per_row = QZ.CGImageGetBytesPerRow(core_graphics_image)
    width = QZ.CGImageGetWidth(core_graphics_image)
    height = QZ.CGImageGetHeight(core_graphics_image)

    core_graphics_data_provider = QZ.CGImageGetDataProvider(core_graphics_image)
    core_graphics_data = QZ.CGDataProviderCopyData(core_graphics_data_provider)

    np_raw_data = np.frombuffer(core_graphics_data, dtype=np.uint8)

    # as_strided performs no bounds checking, so verify that the last byte
    # it will touch really lies inside the buffer.
    if width > 0 and height > 0:
        bytes_needed = (height - 1) * bytes_per_row + (width - 1) * 4 + 3
        if np_raw_data.size < bytes_needed:
            raise ValueError(
                'Pixel buffer too small: need {} bytes, got {}'.format(
                    bytes_needed, np_raw_data.size))

    # Rows are bytes_per_row apart (which may include padding) and pixels are
    # 4 bytes apart; only the first 3 bytes of each pixel are kept.
    # NOTE(review): assumes 4 bytes per pixel with the colour channels first
    # (presumably BGRA, since the result displays correctly in OpenCV) - confirm.
    numpy_data = np.lib.stride_tricks.as_strided(np_raw_data,
                                            shape=(height, width, 3),
                                            strides=(bytes_per_row, 4, 1),
                                            writeable=False)

    # Copy into a fresh contiguous array that no longer aliases the buffer.
    final_output = np.ascontiguousarray(numpy_data, dtype=np.uint8)

    return final_output

This method returns the captured CGImage in a format that cv2 can recognise and use for matchTemplate.

My full code looks like this:

#main.py

import cv2 as cv
from time import time
from windowcapture import WindowCapture

# set up capturing of the game window
wincap = WindowCapture('Blue Box Clicker')

# timestamp the FPS readout is measured against
loop_time = time()

running = True
while running:
    # get an updated image of the game and display it
    screenshot = wincap.get_image_from_window()
    cv.imshow('Computer Vision', screenshot)

    # debug the loop rate
    seconds_per_loop = time() - loop_time
    print('FPS {}'.format(1 / seconds_per_loop))
    loop_time = time()

    # press 'q' with the output window focused to exit.
    # waits 1 ms every loop to process key presses
    pressed_key = cv.waitKey(1)
    if pressed_key == ord('q'):
        cv.destroyAllWindows()
        running = False

print('Done.')
#windowcapture.py

import numpy as np
import Quartz as QZ

class WindowCapture:
    """Capture frames of one macOS window through Quartz for use with OpenCV.

    The window is looked up once, at construction time, in the list returned
    by ``CGWindowListCopyWindowInfo``; its id and bounds are cached on the
    instance. Frames are read directly from memory, without writing a file.
    """

    # properties
    window_name = None   # title fragment used to find the window
    window = None        # window-info mapping returned by Quartz
    window_id = None     # value of the 'kCGWindowNumber' entry
    window_width = 0
    window_height = 0
    window_x = 0
    window_y = 0

    # constructor
    def __init__(self, given_window_name=None):
        """Bind to the first window whose title contains *given_window_name*.

        Raises:
            Exception: if no name is given or no matching window exists.
        """
        if given_window_name is None:
            raise Exception('No window name given')

        self.window_name = given_window_name
        self.window = self.get_window()

        if self.window is None:
            raise Exception('Unable to find window: {}'.format(given_window_name))

        self.window_id = self.get_window_id()

        self.window_width = self.get_window_width()
        self.window_height = self.get_window_height()

        self.window_x = self.get_window_pos_x()
        self.window_y = self.get_window_pos_y()

    def get_window(self):
        """Return the first window-info entry whose name contains
        ``self.window_name``, or ``None`` when nothing matches."""
        windows = QZ.CGWindowListCopyWindowInfo(QZ.kCGWindowListOptionAll, QZ.kCGNullWindowID)
        for window in windows:
            name = window.get('kCGWindowName', 'Unknown')
            # entries may carry an empty/None name, hence the truthiness test
            if name and self.window_name in name:
                return window
        return None

    def get_window_id(self):
        """Return the window number stored in the cached window info."""
        return self.window['kCGWindowNumber']

    def get_window_width(self):
        """Return the window width from the cached bounds, truncated to int."""
        return int(self.window['kCGWindowBounds']['Width'])

    def get_window_height(self):
        """Return the window height from the cached bounds, truncated to int."""
        return int(self.window['kCGWindowBounds']['Height'])

    def get_window_pos_x(self):
        """Return the window X position from the cached bounds, truncated to int."""
        return int(self.window['kCGWindowBounds']['X'])

    def get_window_pos_y(self):
        """Return the window Y position from the cached bounds, truncated to int."""
        return int(self.window['kCGWindowBounds']['Y'])

    @staticmethod
    def _pixels_from_raw(np_raw_data, width, height, bytes_per_row):
        """Turn a flat 4-bytes-per-pixel buffer into a (height, width, 3) array.

        Args:
            np_raw_data: one-dimensional ``uint8`` array holding the image bytes.
            width: image width in pixels.
            height: image height in pixels.
            bytes_per_row: distance in bytes between the starts of two
                consecutive rows (may exceed ``width * 4`` when rows are padded).

        Returns:
            A C-contiguous ``uint8`` array with the first three bytes of
            every pixel; it does not alias ``np_raw_data``.

        Raises:
            ValueError: if the buffer is too small for the given dimensions.
        """
        # as_strided performs no bounds checking, so verify that the last
        # byte it will touch really lies inside the buffer.
        if width > 0 and height > 0:
            bytes_needed = (height - 1) * bytes_per_row + (width - 1) * 4 + 3
            if np_raw_data.size < bytes_needed:
                raise ValueError(
                    'Pixel buffer too small: need {} bytes, got {}'.format(
                        bytes_needed, np_raw_data.size))

        # Row stride skips any padding; pixel stride of 4 with a depth of 3
        # drops the fourth byte of each pixel.
        # NOTE(review): presumably the buffer is BGRA, since the result
        # displays correctly in OpenCV - confirm.
        strided_view = np.lib.stride_tricks.as_strided(np_raw_data,
                                                shape=(height, width, 3),
                                                strides=(bytes_per_row, 4, 1),
                                                writeable=False)

        return np.ascontiguousarray(strided_view, dtype=np.uint8)

    def get_image_from_window(self):
        """Capture the bound window via Quartz and return it as a contiguous
        ``uint8`` array of shape (height, width, 3).

        Raises:
            RuntimeError: if Quartz returns no image (for example because the
                window no longer exists).
            ValueError: if the pixel buffer is smaller than the image
                dimensions require.
        """
        core_graphics_image = QZ.CGWindowListCreateImage(
            QZ.CGRectNull,
            QZ.kCGWindowListOptionIncludingWindow,
            self.window_id,
            QZ.kCGWindowImageBoundsIgnoreFraming | QZ.kCGWindowImageNominalResolution
        )
        # Fail with a clear message instead of an obscure error further down.
        if core_graphics_image is None:
            raise RuntimeError('Unable to capture window: {}'.format(self.window_name))

        bytes_per_row = QZ.CGImageGetBytesPerRow(core_graphics_image)
        width = QZ.CGImageGetWidth(core_graphics_image)
        height = QZ.CGImageGetHeight(core_graphics_image)

        core_graphics_data_provider = QZ.CGImageGetDataProvider(core_graphics_image)
        core_graphics_data = QZ.CGDataProviderCopyData(core_graphics_data_provider)

        np_raw_data = np.frombuffer(core_graphics_data, dtype=np.uint8)

        return self._pixels_from_raw(np_raw_data, width, height, bytes_per_row)

And this runs at an average of 60 fps on a MacBook Pro (with an M1 Pro processor).

Answered By: nopassport1