Updating Prometheus metrics Http server: [48] address already in use error
Question:
I’m working on a Dockerized project and monitoring the system state with Prometheus_client package for python.
I’ve a module named train.py which runs every time a request is sent to the container. in this file I use the logger below to collect model’s metrics and expose them
class PrometheusLogger(BaseLogger):
def __init__(self):
self.registry = pc.CollectorRegistry() #pc is for prometheus_client
self.training_state = pc.Enum('training_state','returns state of training',
states=['not started','running','done'], registry=self.registry)
self.coverage = pc.Gauge('dataset_coverage','shows the coverage of the datasets',
labelnames=['dataset'], registry=self.registry)
self.diversity = pc.Gauge('model_diversity','shows the diversity parameter', registry=self.registry)
self.personalization = pc.Gauge('model_personalization','shows the personalization parameter', registry=self.registry)
self.scores = pc.Gauge('model_scores','shows the scores of the model',
labelnames=['name'], registry=self.registry)
def log_metrics(self, metrics):
self.coverage.labels(dataset='catalog').set(float(metrics['catalog_coverage']))
self.coverage.labels(dataset='user').set(float(metrics['user_coverage']))
self.diversity.set(float(metrics['diversity']))
self.personalization.set(float(metrics['personalization']))
self.scores.labels(name='rmse').set(float(metrics['test_RMSE']))
self.scores.labels(name='recall').set(float(metrics['test_recall']))
self.scores.labels(name='f1').set(float(metrics['test_f1']))
self.scores.labels(name='ndcg').set(float(metrics['test_nDCG_score']))
self.scores.labels(name='precision').set(float(metrics['test_precision_k']))
def expose_metrics(self):
pc.start_http_server(8003, registry=self.registry)
and the train function works as follows:
def train(model, logger=PrometheusLogger):
model.fit()
metrics = model.get_metrics()
logger.log_metrics(metrics)
logger.expose_metrics()
note that metrics are exposed in port 8003 inside the docker network. another container reaches this port and makes an API to the localhost.
this code works well for the first time, but when I recall the train function for the second time, I get Error[48]: Address already in use.
I’ve tried many ways like get_pid() and killing it with os.kill but they don’t work. what should I do?
Answers:
This problem happens because the start_http_server()
command doesn’t automatically update on the port specified and it’s actually the prometheus-client
package problem.
I figured something out and it’s not really a solution but it works for this scenario.
you can replace the start_http_server
with this function to answer a limited number of requests if you don’t need real time answering and use a While True
if you need so.
import prometheus_client as pc
from wsgiref.simple_server import make_server
registry = pc.CollectorRegistry()
def expose_metrics(self):
app = pc.make_wsgi_app(registry=registry)
httpd = make_server('', 8003, app)
for _ in range(5): #or While True
httpd.handle_request()
I’m working on a Dockerized project and monitoring the system state with Prometheus_client package for python.
I’ve a module named train.py which runs every time a request is sent to the container. in this file I use the logger below to collect model’s metrics and expose them
class PrometheusLogger(BaseLogger):
def __init__(self):
self.registry = pc.CollectorRegistry() #pc is for prometheus_client
self.training_state = pc.Enum('training_state','returns state of training',
states=['not started','running','done'], registry=self.registry)
self.coverage = pc.Gauge('dataset_coverage','shows the coverage of the datasets',
labelnames=['dataset'], registry=self.registry)
self.diversity = pc.Gauge('model_diversity','shows the diversity parameter', registry=self.registry)
self.personalization = pc.Gauge('model_personalization','shows the personalization parameter', registry=self.registry)
self.scores = pc.Gauge('model_scores','shows the scores of the model',
labelnames=['name'], registry=self.registry)
def log_metrics(self, metrics):
self.coverage.labels(dataset='catalog').set(float(metrics['catalog_coverage']))
self.coverage.labels(dataset='user').set(float(metrics['user_coverage']))
self.diversity.set(float(metrics['diversity']))
self.personalization.set(float(metrics['personalization']))
self.scores.labels(name='rmse').set(float(metrics['test_RMSE']))
self.scores.labels(name='recall').set(float(metrics['test_recall']))
self.scores.labels(name='f1').set(float(metrics['test_f1']))
self.scores.labels(name='ndcg').set(float(metrics['test_nDCG_score']))
self.scores.labels(name='precision').set(float(metrics['test_precision_k']))
def expose_metrics(self):
pc.start_http_server(8003, registry=self.registry)
and the train function works as follows:
def train(model, logger=PrometheusLogger):
model.fit()
metrics = model.get_metrics()
logger.log_metrics(metrics)
logger.expose_metrics()
note that metrics are exposed in port 8003 inside the docker network. another container reaches this port and makes an API to the localhost.
this code works well for the first time, but when I recall the train function for the second time, I get Error[48]: Address already in use.
I’ve tried many ways like get_pid() and killing it with os.kill but they don’t work. what should I do?
This problem happens because the start_http_server()
command doesn’t automatically update on the port specified and it’s actually the prometheus-client
package problem.
I figured something out and it’s not really a solution but it works for this scenario.
you can replace the start_http_server
with this function to answer a limited number of requests if you don’t need real time answering and use a While True
if you need so.
import prometheus_client as pc
from wsgiref.simple_server import make_server
registry = pc.CollectorRegistry()
def expose_metrics(self):
app = pc.make_wsgi_app(registry=registry)
httpd = make_server('', 8003, app)
for _ in range(5): #or While True
httpd.handle_request()