How do I create a custom handler in torchserve?

Question

I am trying to create a custom handler on Torchserve.
The custom handler has been modified as follows

# custom handler file

# model_handler.py

"""
ModelHandler defines a custom model handler.
"""
import os
import soundfile
from espnet2.bin.enh_inference import *

from ts.torch_handler.base_handler import BaseHandler

class ModelHandler(BaseHandler):
    """
    A custom model handler implementation.
    """

    def __init__(self):
        self._context = None
        self.initialized = False
        self.model = None
        self.device = None

    def initialize(self, context):
        """
        Invoke by torchserve for loading a model
        :param context: context contains model server system properties
        :return:
        """

        #  load the model
        self.manifest = context.manifest

        properties = context.system_properties
        model_dir = properties.get("model_dir")
        self.device = torch.device("cuda:" + str(properties.get("gpu_id")) if torch.cuda.is_available() else "cpu")

        # Read model serialize/pt file
        serialized_file = self.manifest['model']['serializedFile']
        model_pt_path = os.path.join(model_dir, serialized_file)

        if not os.path.isfile(model_pt_path):
            raise RuntimeError("Missing the model.pt file")

        self.model = SeparateSpeech("train_enh_transformer_tf.yaml", "valid.loss.best.pth")

        self.initialized = True

    def preprocess(self,data):
        audio_data, rate  = soundfile.read(data)
        preprocessed_data = audio_data[np.newaxis, :]

        return preprocessed_data

    def inference(self, model_input):
        model_output = self.model(model_input)
        return model_output
    def postprocess(self, inference_output):
        """
        Return inference result.
        :param inference_output: list of inference output
        :return: list of predict results
        """
        # Take output from network and post-process to desired format
        postprocess_output = inference_output
        #wav ni suru
        return postprocess_output

    def handle(self, data, context):
        model_input = self.preprocess(data)
        model_output = self.inference(model_input)
        return self.postprocess(model_output)

The torchserve appears to be working.

The Torchserve logs are as follows

root@5c780ba74916:~/files# torchserve --start --ncs --model-store model_store --models denoise_transformer=denoise_transformer.mar
root@5c780ba74916:~/files# WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.
2022-08-24T14:06:06,662 [INFO ] main org.pytorch.serve.servingsdk.impl.PluginsManager - Initializing plugins manager...
2022-08-24T14:06:06,796 [INFO ] main org.pytorch.serve.ModelServer -
Torchserve version: 0.6.0
TS Home: /usr/local/lib/python3.8/dist-packages
Current directory: /root/files
Temp directory: /tmp
Number of GPUs: 1
Number of CPUs: 16
Max heap size: 4002 M
Python executable: /usr/bin/python3
Config file: N/A
Inference address: http://127.0.0.1:8080
Management address: http://127.0.0.1:8081
Metrics address: http://127.0.0.1:8082
Model Store: /root/files/model_store
Initial Models: denoise_transformer=denoise_transformer.mar
Log dir: /root/files/logs
Metrics dir: /root/files/logs
Netty threads: 0
Netty client threads: 0
Default workers per model: 1
Blacklist Regex: N/A
Maximum Response Size: 6553500
Maximum Request Size: 6553500
Limit Maximum Image Pixels: true
Prefer direct buffer: false
Allowed Urls: [file://.*|http(s)?://.*]
Custom python dependency for model allowed: false
Metrics report format: prometheus
Enable metrics API: true
Workflow Store: /root/files/model_store
Model config: N/A
2022-08-24T14:06:06,805 [INFO ] main org.pytorch.serve.servingsdk.impl.PluginsManager -  Loading snapshot serializer plugin...
2022-08-24T14:06:06,817 [INFO ] main org.pytorch.serve.ModelServer - Loading initial models: denoise_transformer.mar
2022-08-24T14:06:07,006 [DEBUG] main org.pytorch.serve.wlm.ModelVersionedRefs - Adding new version 1.0 for model denoise_transformer
2022-08-24T14:06:07,007 [DEBUG] main org.pytorch.serve.wlm.ModelVersionedRefs - Setting default version to 1.0 for model denoise_transformer
2022-08-24T14:06:07,007 [INFO ] main org.pytorch.serve.wlm.ModelManager - Model denoise_transformer loaded.
2022-08-24T14:06:07,007 [DEBUG] main org.pytorch.serve.wlm.ModelManager - updateModel: denoise_transformer, count: 1
2022-08-24T14:06:07,015 [DEBUG] W-9000-denoise_transformer_1.0 org.pytorch.serve.wlm.WorkerLifeCycle - Worker cmdline: [/usr/bin/python3, /usr/local/lib/python3.8/dist-packages/ts/model_service_worker.py, --sock-type, unix, --sock-name, /tmp/.ts.sock.9000]
2022-08-24T14:06:07,016 [INFO ] main org.pytorch.serve.ModelServer - Initialize Inference server with: EpollServerSocketChannel.
2022-08-24T14:06:07,059 [INFO ] main org.pytorch.serve.ModelServer - Inference API bind to: http://127.0.0.1:8080
2022-08-24T14:06:07,059 [INFO ] main org.pytorch.serve.ModelServer - Initialize Management server with: EpollServerSocketChannel.
2022-08-24T14:06:07,060 [INFO ] main org.pytorch.serve.ModelServer - Management API bind to: http://127.0.0.1:8081
2022-08-24T14:06:07,060 [INFO ] main org.pytorch.serve.ModelServer - Initialize Metrics server with: EpollServerSocketChannel.
2022-08-24T14:06:07,062 [INFO ] main org.pytorch.serve.ModelServer - Metrics API bind to: http://127.0.0.1:8082
Model server started.
2022-08-24T14:06:07,258 [WARN ] pool-3-thread-1 org.pytorch.serve.metrics.MetricCollector - worker pid is not available yet.
2022-08-24T14:06:07,363 [INFO ] W-9000-denoise_transformer_1.0-stdout MODEL_LOG - Listening on port: /tmp/.ts.sock.9000
2022-08-24T14:06:07,364 [INFO ] W-9000-denoise_transformer_1.0-stdout MODEL_LOG - [PID]6258
2022-08-24T14:06:07,364 [INFO ] W-9000-denoise_transformer_1.0-stdout MODEL_LOG - Torch worker started.
2022-08-24T14:06:07,365 [INFO ] W-9000-denoise_transformer_1.0-stdout MODEL_LOG - Python runtime: 3.8.10
2022-08-24T14:06:07,365 [DEBUG] W-9000-denoise_transformer_1.0 org.pytorch.serve.wlm.WorkerThread - W-9000-denoise_transformer_1.0 State change null -> WORKER_STARTED
2022-08-24T14:06:07,368 [INFO ] W-9000-denoise_transformer_1.0 org.pytorch.serve.wlm.WorkerThread - Connecting to: /tmp/.ts.sock.9000
2022-08-24T14:06:07,374 [INFO ] W-9000-denoise_transformer_1.0-stdout MODEL_LOG - Connection accepted: /tmp/.ts.sock.9000.
2022-08-24T14:06:07,376 [INFO ] W-9000-denoise_transformer_1.0 org.pytorch.serve.wlm.WorkerThread - Flushing req. to backend at: 1661317567376
2022-08-24T14:06:07,398 [INFO ] W-9000-denoise_transformer_1.0-stdout MODEL_LOG - model_name: denoise_transformer, batchSize: 1
2022-08-24T14:06:07,596 [INFO ] pool-3-thread-1 TS_METRICS - CPUUtilization.Percent:0.0|#Level:Host|#hostname:5c780ba74916,timestamp:1661317567
2022-08-24T14:06:07,596 [INFO ] pool-3-thread-1 TS_METRICS - DiskAvailable.Gigabytes:220.49971389770508|#Level:Host|#hostname:5c780ba74916,timestamp:1661317567
2022-08-24T14:06:07,597 [INFO ] pool-3-thread-1 TS_METRICS - DiskUsage.Gigabytes:17.66714859008789|#Level:Host|#hostname:5c780ba74916,timestamp:1661317567
2022-08-24T14:06:07,597 [INFO ] pool-3-thread-1 TS_METRICS - DiskUtilization.Percent:7.4|#Level:Host|#hostname:5c780ba74916,timestamp:1661317567
2022-08-24T14:06:07,597 [INFO ] pool-3-thread-1 TS_METRICS - GPUMemoryUtilization.Percent:17.9931640625|#Level:Host,device_id:0|#hostname:5c780ba74916,timestamp:1661317567
2022-08-24T14:06:07,597 [INFO ] pool-3-thread-1 TS_METRICS - GPUMemoryUsed.Megabytes:1474|#Level:Host,device_id:0|#hostname:5c780ba74916,timestamp:1661317567
2022-08-24T14:06:07,598 [INFO ] pool-3-thread-1 TS_METRICS - GPUUtilization.Percent:8|#Level:Host,device_id:0|#hostname:5c780ba74916,timestamp:1661317567
2022-08-24T14:06:07,598 [INFO ] pool-3-thread-1 TS_METRICS - MemoryAvailable.Megabytes:14307.53515625|#Level:Host|#hostname:5c780ba74916,timestamp:1661317567
2022-08-24T14:06:07,598 [INFO ] pool-3-thread-1 TS_METRICS - MemoryUsed.Megabytes:1372.1640625|#Level:Host|#hostname:5c780ba74916,timestamp:1661317567
2022-08-24T14:06:07,598 [INFO ] pool-3-thread-1 TS_METRICS - MemoryUtilization.Percent:10.6|#Level:Host|#hostname:5c780ba74916,timestamp:1661317567
2022-08-24T14:06:08,306 [INFO ] W-9000-denoise_transformer_1.0-stdout MODEL_LOG - encoder self-attention layer type = self-attention
2022-08-24T14:06:08,328 [INFO ] W-9000-denoise_transformer_1.0-stdout MODEL_LOG - Eps is deprecated in si_snr loss, set clamp_db instead.
2022-08-24T14:06:08,373 [INFO ] W-9000-denoise_transformer_1.0-stdout MODEL_LOG - Perform direct speech enhancement on the input
2022-08-24T14:06:08,390 [INFO ] W-9000-denoise_transformer_1.0 org.pytorch.serve.wlm.WorkerThread - Backend response time: 992
2022-08-24T14:06:08,391 [DEBUG] W-9000-denoise_transformer_1.0 org.pytorch.serve.wlm.WorkerThread - W-9000-denoise_transformer_1.0 State change WORKER_STARTED -> WORKER_MODEL_LOADED
2022-08-24T14:06:08,391 [INFO ] W-9000-denoise_transformer_1.0 TS_METRICS - W-9000-denoise_transformer_1.0.ms:1380|#Level:Host|#hostname:5c780ba74916,timestamp:1661317568
2022-08-24T14:06:08,391 [INFO ] W-9000-denoise_transformer_1.0 TS_METRICS - WorkerThreadTime.ms:23|#Level:Host|#hostname:5c780ba74916,timestamp:1661317568

I sent the wav file to torchserve with the following command

curl http://127.0.0.1:8080/predictions/denoise_transformer -T Mix.wav

However, the following error was returned

<HTML><HEAD>
<TITLE>Request Error</TITLE>
</HEAD>
<BODY>
<FONT face="Helvetica">
<big><strong></strong></big><BR>
</FONT>
<blockquote>
<TABLE border=0 cellPadding=1 width="80%">
<TR><TD>
<FONT face="Helvetica">
<big>Request Error (invalid_request)</big>
<BR>
<BR>
</FONT>
</TD></TR>
<TR><TD>
<FONT face="Helvetica">
Your request could not be processed. Request could not be handled
</FONT>
</TD></TR>
<TR><TD>
<FONT face="Helvetica">
This could be caused by a misconfiguration, or possibly a malformed request.
</FONT>
</TD></TR>
<TR><TD>
<FONT face="Helvetica" SIZE=2>
<BR>
For assistance, contact your network support team.
</FONT>
</TD></TR>
</TABLE>
</blockquote>
</FONT>
</BODY></HTML>

Is there somewhere wrong?
Best regards.

Asked By: Takayama-Shin

||

Source

Answer 1

Hi here’s an example but it seems the problem should be how you receive parse the data

from typing import Dict, List, Tuple
import numpy as np
import soundfile as sf
from io import BytesIO
from ts.torch_handler.base_handler import BaseHandler

import torch
from model import Model

class SoundModelHandler(BaseHandler):
    def __init__(self):
        super(SoundModelHandler, self).__init__()
        self.initialized = False

    def preproc_one_wav(self, req: Dict[str, bytearray]) -> Tuple[np.ndarray, int]:
        """
        Function to convert req data to image
        :param req:
        :return: np array of wav form
        """
        wav_byte = req.get("data")
        
        if wav_byte is None:
            wav_byte = req.get('body')
        
        # create a stream from the encoded image
        wav, sr = sf.read(BytesIO(wav_byte))
        return wav, sr
        
    def initialize(self, context):
        """
        Invoke by torchserve for loading a model
        :param context: context contains model server system properties
        :return:
        """
        self.manifest = context.manifest
        
        properties = context.system_properties
        model_dir = properties.get("model_dir")
        self._context = context

        self.model = Model()
        self.model.load_state_dict(torch.load(model_dir + 'weights.pt'))
        
        self.initialize = True
        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() and properties.get("gpu_id") is not None
            else "cpu"
        )

        
    def preprocess(self, requests: List[Dict[str, bytearray]]):
        """
        Function to prepare data from the model
        
        :param requests:
        :return: tensor of the processed shape specified by the model
        """
        batch_crop = [self.preproc_one_wav(req) for req in requests]

        # Do something here if you want return as torch tensor
        # You can apply torch cat here as well
        batch = torch.cat(batch_crop)
        
        return batch
        
    def inference(self, model_input: torch.Tensor):
        """
        Given the data from .preprocess, perform inference using the model.
        
        :param reqeuest:
        :return: Logits or predictions given by the model
        """
        with torch.no_grad():
            generated_ids = self.model.generate(model_input.to(self.device))
        
        return generated_ids

Btw heres an example you can do to make a request to your model in python

import json
import requests

sample = {'data': wav_in_byte}
results = requests.post('ip:port', data=json.dumps(sample))
# parse results
get_results = results.json()

Answered By: Edwin Cheong

How do I create a custom handler in torchserve?

Question:

Answers: