How to create a continuous stream of Python's concurrent.futures.ProcessPoolExecutor.submit()?

Question:

I am able to submit batches of concurrent.futures.ProcessPoolExecutor.submit() calls, where each batch may contain several submit() calls. However, I noticed that if each batch of submissions consumes a significant amount of RAM, there can be quite a bit of RAM usage inefficiency: all futures in a batch must complete before another batch of submit() calls can be issued.

How does one create a continuous stream of Python’s concurrent.futures.ProcessPoolExecutor.submit() until some condition is satisfied?

Test Script:

#!/usr/bin/env python3

import numpy as np
from numpy.random import default_rng, SeedSequence
import concurrent.futures as cf
from itertools import count


def dojob( process, iterations, samples, rg ):
    # Do some tasks
    result = []
    for i in range( iterations ):
        a = rg.standard_normal( samples )
        b = rg.integers( -3, 3, samples )
        mean = np.mean( a + b )
        result.append( ( i, mean ) )
    return { process : result }

if __name__ == '__main__':

    cpus = 2
    iterations = 10000
    samples = 1000

    # Setup NumPy Random Generator
    ss = SeedSequence( 1234567890 )
    child_seeds = ss.spawn( cpus )
    rg_streams = [ default_rng(s) for s in child_seeds ]

    # Perform concurrent analysis by batches
    counter = count( start=0, step=1 )

    # Serial Run of dojob
    for cpu in range( cpus ):
        process = next( counter )
        rg = rg_streams[ cpu ]
        rdict = dojob( process, iterations, samples, rg )
    print( 'rdict', rdict )

    
    # Concurrent Run of dojob
    futures = []
    results = []
    with cf.ProcessPoolExecutor( max_workers=cpus ) as executor:

        while True:
            
            for cpu in range( cpus ):
                process = next( counter )
                rg = rg_streams[ cpu ]
                futures.append( executor.submit( dojob, process, iterations, samples, rg ) )
                
            for future in cf.as_completed( futures ):
                # Do some post processing
                r = future.result()
                for k, v in r.items():
                    if len( results ) < 5000:
                        results.append( np.std( v ) )
                        print( k, len(results) )

            if len(results) <= 100: # Put a huge number to simulate continuous streaming
                futures = []
                child_seeds = child_seeds[0].spawn( cpus )
                rg_streams = [ default_rng(s) for s in child_seeds ]
            else:
                break

    print( '\n*** Concurrent Analyses Ended ***' )
Asked By: Sun Bear


Answers:

To expand on my comment, how about something like this, using the completion callback and a threading.Condition? I took the liberty of adding a progress indicator too.

EDIT: I refactored this into a neat function that takes your desired concurrency and queue depth, as well as a function that generates new jobs, and another function that processes a result and lets the executor know whether you've had enough.

import concurrent.futures as cf
import threading
import time
from itertools import count

import numpy as np
from numpy.random import SeedSequence, default_rng


def dojob(process, iterations, samples, rg):
    # Do some tasks
    result = []
    for i in range(iterations):
        a = rg.standard_normal(samples)
        b = rg.integers(-3, 3, samples)
        mean = np.mean(a + b)
        result.append((i, mean))
    return {process: result}


def execute_concurrently(cpus, max_queue_length, get_job_fn, process_result_fn):
    running_futures = set()
    jobs_complete = 0
    job_cond = threading.Condition()        # signalled whenever a job completes
    all_complete_event = threading.Event()  # set when process_result_fn says we're done

    def on_complete(future):
        # Runs in a callback thread each time a submitted job finishes.
        nonlocal jobs_complete
        if process_result_fn(future.result()):
            all_complete_event.set()
        running_futures.discard(future)
        jobs_complete += 1
        with job_cond:
            job_cond.notify_all()

    time_since_last_status = 0
    start_time = time.time()
    with cf.ProcessPoolExecutor(cpus) as executor:
        while True:
            # Top up the queue with new jobs up to the desired depth.
            while len(running_futures) < max_queue_length:
                fn, args = get_job_fn()
                fut = executor.submit(fn, *args)
                fut.add_done_callback(on_complete)
                running_futures.add(fut)

            # Block until at least one job completes.
            with job_cond:
                job_cond.wait()

            if all_complete_event.is_set():
                break

            # Print a status line at most once per second.
            if time.time() - time_since_last_status > 1.0:
                rps = jobs_complete / (time.time() - start_time)
                print(
                    f"{len(running_futures)} running futures on {cpus} CPUs, "
                    f"{jobs_complete} complete. RPS: {rps:.2f}"
                )
                time_since_last_status = time.time()


def main():
    ss = SeedSequence(1234567890)
    counter = count(start=0, step=1)
    iterations = 10000
    samples = 1000
    results = []

    def get_job():
        seed = ss.spawn(1)[0]
        rg = default_rng(seed)
        process = next(counter)
        return dojob, (process, iterations, samples, rg)

    def process_result(result):
        for k, v in result.items():
            results.append(np.std(v))
        if len(results) >= 10000:
            return True  # signal we're complete

    execute_concurrently(
        cpus=16,
        max_queue_length=20,
        get_job_fn=get_job,
        process_result_fn=process_result,
    )


if __name__ == "__main__":
    main()
Answered By: AKX

The Answer posted by @AKX works. Kudos to him. After testing it, I would like to recommend two amendments that I believe are worth considering and implementing.

Amendment 1: To prematurely cancel the execution of the Python script, Ctrl+C has to be used. Unfortunately, doing that would not terminate the concurrent.futures.ProcessPoolExecutor() processes that are executing the function dojob(). This issue becomes more pronounced when the time taken to complete dojob() is long; the situation can be simulated by making the sample size in the script large (e.g. samples = 100000). The issue can be seen when the terminal command ps -ef | grep python is executed. Also, if dojob() consumes a significant amount of RAM, the memory used by these concurrent processes does not get released until they are manually killed (e.g. kill -9 [PID]). To address these issues, the following amendment is needed.

with job_cond:
    job_cond.wait()

should be changed to:

try:
    with job_cond:
        job_cond.wait()
except KeyboardInterrupt:
    # Cancel running futures
    for future in running_futures:
        _ = future.cancel()
    # Ensure concurrent.futures.executor jobs really do finish.
    _ = cf.wait(running_futures, timeout=None)

So when Ctrl+C has to be used, just press it once first. Next, give the futures in running_futures some time to be cancelled. This could take several seconds; it depends on the resource requirements of dojob(). You can see the CPU activity in your task manager or system monitor drop to zero, or hear the high revving sound from your CPU cooling fan subside. Note that the RAM used will not be released yet. Thereafter, press Ctrl+C again, and that should allow a clean exit of all the concurrent processes, whereby the used RAM is also released.
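As an aside, and not part of this amendment: on Python 3.9+, Executor.shutdown() accepts a cancel_futures flag that cancels all pending (not-yet-started) futures in one call, so the except block could be sketched like this instead:

try:
    with job_cond:
        job_cond.wait()
except KeyboardInterrupt:
    # Python 3.9+ only: cancel futures that have not started running yet,
    # wait for the ones already running, then propagate the interrupt.
    executor.shutdown(wait=True, cancel_futures=True)
    raise

The re-raise is needed because the executor can no longer accept submissions after shutdown(), so the outer loop must not resume.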

Amendment 2: Presently, the inner while-loop dictates that jobs must be submitted continuously, as fast as the "MainThread" can allow. Realistically, there is no benefit to submitting more jobs than there are available CPUs in the pool. Doing so only unnecessarily consumes CPU resources in the main process. To regulate the continuous job submission, a new submit_job threading.Event() object can be used.

Firstly, define such an object and set its value to True with:

submit_job = threading.Event()
submit_job.set()

Next, at the end of the inner while-loop, add this condition and .wait() call:

with cf.ProcessPoolExecutor(cpus) as executor:
    while True:
        while len(running_futures) < max_queue_length:
            fn, args = get_job_fn()
            fut = executor.submit(fn, *args)
            fut.add_done_callback(on_complete)
            running_futures.add(fut)
            if len(running_futures) >= cpus: # Add this line
                submit_job.clear()           # Add this line
            submit_job.wait()                # Add this line

Finally change the on_complete(future) callback to:

def on_complete(future):
    nonlocal jobs_complete
    if process_result_fn(future.result()):
        all_complete_event.set()
    running_futures.discard(future)
    if len(running_futures) < cpus: # add this conditional setting
        submit_job.set()            # add this conditional setting
    jobs_complete += 1
    with job_cond:
        job_cond.notify_all()
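
A further alternative, not part of the amendments above but a common throttling pattern, is a threading.BoundedSemaphore: acquire a slot before each submit() and release it in the done callback. A minimal, hypothetical sketch (execute_throttled and its parameters are illustrative names, not from the answer above):

import concurrent.futures as cf
import threading

def execute_throttled(cpus, fn, args_iter):
    # Hypothetical helper: keeps at most `cpus` jobs in flight at any time.
    slots = threading.BoundedSemaphore(cpus)
    with cf.ProcessPoolExecutor(cpus) as executor:
        for args in args_iter:
            slots.acquire()  # blocks until a worker slot frees up
            fut = executor.submit(fn, *args)
            # Release the slot when the job finishes, successfully or not.
            fut.add_done_callback(lambda f: slots.release())

Result handling could be added to the same done callback, as in on_complete() above.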
Answered By: Sun Bear

There is a library called Pypeln that does this beautifully. It allows for streaming tasks between stages, and each stage can be run in a process, thread, or asyncio pool, depending on what is optimal for your use case.

Sample code:

import pypeln as pl
import time
from random import random

def slow_add1(x):
    time.sleep(random()) # <= some slow computation
    return x + 1

def slow_gt3(x):
    time.sleep(random()) # <= some slow computation
    return x > 3

data = range(10) # [0, 1, 2, ..., 9] 

stage = pl.process.map(slow_add1, data, workers=3, maxsize=4)
stage = pl.process.filter(slow_gt3, stage, workers=2)

data = list(stage) # e.g. [5, 6, 9, 4, 8, 10, 7]
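
To mirror the original question's goal of streaming until a condition is satisfied, the input to the first stage can be an unbounded generator. The sketch below is an illustration under stated assumptions, not from the answer above: it assumes Pypeln consumes its source lazily (with maxsize bounding the in-flight work) and tolerates the output iterator being abandoned early; job and the stopping condition are hypothetical.

import itertools
import pypeln as pl

def job(x):
    return x * x  # hypothetical stand-in for a slow computation

# Unbounded source; maxsize keeps only a few jobs in flight at a time.
stage = pl.process.map(job, itertools.count(), workers=3, maxsize=4)

# Pull results until some condition is satisfied, then stop consuming.
# Note: results may arrive out of order, as in the sample above.
results = list(itertools.takewhile(lambda r: r < 1000, stage))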
Answered By: Nick Crews