Automatically restarting killed Python process executed within a multiprocessing.Pool
Question:
I have the following code:
import multiprocessing
import urllib.request
from time import sleep
connected = False
def check_internet_connection() -> None:
while True:
print(f'inet', flush=True)
try:
urllib.request.urlopen('https://www.google.com', timeout=1)
connected = True
except urllib.request.URLError:
connected = False
finally:
sleep(5 - time.time() % 5)
def loop(delay: float = 0.5) -> None:
while True:
print(f'loop with delay {delay}', flush=True)
sleep(delay - time.time() % delay)
if __name__ == '__main__':
with multiprocessing.Pool(3, maxtasksperchild=1) as pool:
while True:
pool.apply_async(func=check_internet_connection)
pool.map_async(func=loop, iterable=[0.5, 1.0])
pool.close()
pool.join()
Current behavior: If I kill one of the Python processes spawned by multiprocessing, it does not get restarted.
Wanted behavior: The process should be restarted immediately.
If I leave out the pool.close()
and pool.join()
, I get my desired behavior. However, the memory consumption increases at an incredible rate and causes my PC to crash within a few minutes.
My alternative approach was this:
import multiprocessing
import time
def loop(delay: float) -> None:
while True:
print("Function with delay", delay)
time.sleep(delay - time.time() % delay)
if __name__ == '__main__':
while True:
p1 = multiprocessing.Process(target=loop, args=(0.5,))
p2 = multiprocessing.Process(target=loop, args=(1.0,))
p1.start()
p2.start()
p1.join()
p2.join()
However, this does also not restart the killed process until both processes are killed.
Edit: My overall goal is to be able to execute a number of functions that each poll sensor values with different delays and write them to separate files. If there is a working internet connection, I also want to send the sensor data to my server. Since I do not want to lose any data, the processes should restart as fast as possible in case they die. Maybe the approach via multiprocessing is not suited at all for this problem. If you have any suggestions, thank you.
Answers:
The standard multiprocessing.Pool
does not support workers termination.
The pebble
library has been developed to cover these limitations. It automatically handles failed jobs and restart workers. Yet it is a process pool designed to handle a list of tasks and if a task fails it will move on to the next one.
Therefore, you need to find means to track which task fail and re-submit it. Here is a simple example on how you can do it.
import time
import pebble
from concurrent.futures import wait, FIRST_EXCEPTION
def loop(delay: float = 0.5) -> None:
while True:
print(f'loop with delay {delay}', flush=True)
time.sleep(delay - time.time() % delay)
jobs = []
pool = pebble.ProcessPool()
for delay in [0.5, 1, 1.5, 2]:
future = pool.schedule(loop, args=[delay])
# Add delay to the Future object
future.delay = delay
jobs.append(future)
while True:
futures = []
# Wait for any job to fail
wait(jobs, return_when=FIRST_EXCEPTION)
# Retrieve failed and running jobs
running_jobs = [f for f in jobs if f.running()]
failed_jobs = [f for f in jobs if not f.running()]
# Re-schedule failed jobs
for future in failed_jobs:
future = pool.schedule(loop, args=[future.delay])
futures.append(future)
jobs = running_jobs + futures
I have the following code:
import multiprocessing
import urllib.request
from time import sleep
connected = False
def check_internet_connection() -> None:
while True:
print(f'inet', flush=True)
try:
urllib.request.urlopen('https://www.google.com', timeout=1)
connected = True
except urllib.request.URLError:
connected = False
finally:
sleep(5 - time.time() % 5)
def loop(delay: float = 0.5) -> None:
while True:
print(f'loop with delay {delay}', flush=True)
sleep(delay - time.time() % delay)
if __name__ == '__main__':
with multiprocessing.Pool(3, maxtasksperchild=1) as pool:
while True:
pool.apply_async(func=check_internet_connection)
pool.map_async(func=loop, iterable=[0.5, 1.0])
pool.close()
pool.join()
Current behavior: If I kill one of the Python processes spawned by multiprocessing, it does not get restarted.
Wanted behavior: The process should be restarted immediately.
If I leave out the pool.close()
and pool.join()
, I get my desired behavior. However, the memory consumption increases at an incredible rate and causes my PC to crash within a few minutes.
My alternative approach was this:
import multiprocessing
import time
def loop(delay: float) -> None:
while True:
print("Function with delay", delay)
time.sleep(delay - time.time() % delay)
if __name__ == '__main__':
while True:
p1 = multiprocessing.Process(target=loop, args=(0.5,))
p2 = multiprocessing.Process(target=loop, args=(1.0,))
p1.start()
p2.start()
p1.join()
p2.join()
However, this does also not restart the killed process until both processes are killed.
Edit: My overall goal is to be able to execute a number of functions that each poll sensor values with different delays and write them to separate files. If there is a working internet connection, I also want to send the sensor data to my server. Since I do not want to lose any data, the processes should restart as fast as possible in case they die. Maybe the approach via multiprocessing is not suited at all for this problem. If you have any suggestions, thank you.
The standard multiprocessing.Pool
does not support workers termination.
The pebble
library has been developed to cover these limitations. It automatically handles failed jobs and restart workers. Yet it is a process pool designed to handle a list of tasks and if a task fails it will move on to the next one.
Therefore, you need to find means to track which task fail and re-submit it. Here is a simple example on how you can do it.
import time
import pebble
from concurrent.futures import wait, FIRST_EXCEPTION
def loop(delay: float = 0.5) -> None:
while True:
print(f'loop with delay {delay}', flush=True)
time.sleep(delay - time.time() % delay)
jobs = []
pool = pebble.ProcessPool()
for delay in [0.5, 1, 1.5, 2]:
future = pool.schedule(loop, args=[delay])
# Add delay to the Future object
future.delay = delay
jobs.append(future)
while True:
futures = []
# Wait for any job to fail
wait(jobs, return_when=FIRST_EXCEPTION)
# Retrieve failed and running jobs
running_jobs = [f for f in jobs if f.running()]
failed_jobs = [f for f in jobs if not f.running()]
# Re-schedule failed jobs
for future in failed_jobs:
future = pool.schedule(loop, args=[future.delay])
futures.append(future)
jobs = running_jobs + futures