Description
When serving a multi-input DALI model, the Triton server crashes as soon as an inference request is sent from a Python client. The server does not report any errors before exiting.
The number of outputs does not affect the issue.
Triton Information
What version of Triton are you using?
20.11 and 20.12
Are you using the Triton container or did you build it yourself?
NGC Container.
The server is run using the following command: tritonserver --model-store=/Workdir/Models --strict-model-config=false --exit-on-error=false --model-control-mode=poll --log-verbose=2
To Reproduce
Baseline + model creation
The following code confirms that the pipeline itself is valid and creates the serialized model. It was tested in the NGC TensorFlow 20.11 and 20.12 images.
import nvidia.dali as dali
import nvidia.dali.types as types
import numpy as np
batch_size = 1
pipe = dali.pipeline.Pipeline(batch_size=batch_size, num_threads=1, device_id=0)
# Create the pipeline
with pipe:
    x = dali.fn.external_source(device="cpu", name="DALI_X_INPUT")
    y = dali.fn.external_source(device="cpu", name="DALI_Y_INPUT")
    pipe.set_outputs(x, y)
# Build the pipeline and save to file for testing in TRITON
pipe.build()
pipe.serialize(filename="./model_multi_input.dali")
# Input the test data
pipe.feed_input("DALI_X_INPUT", np.array([1200.0]*batch_size, dtype=np.float32))
pipe.feed_input("DALI_Y_INPUT", np.array([1920.0]*batch_size, dtype=np.float32))
# Run the pipeline
pipe_out = pipe.run()
# Check the output data
print(f"X Input: {np.array(pipe_out[0][0])}, Y Input: {np.array(pipe_out[1][0])}")
# Outputs: "X Input: 1200.0, Y Input: 1920.0"
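For serving, the serialized file is then placed in the Triton model repository. The exact layout below is an assumption, based on the --model-store path above and the default_model_filename in the config that follows:
/Workdir/Models/
    dali_multi_input/
        config.pbtxt
        1/
            model_multi_input.dali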
Tests using Triton
Using the following config.pbtxt:
name: "dali_multi_input"
backend: "dali"
max_batch_size: 1
default_model_filename: "model_multi_input.dali"
input [
  {
    name: "DALI_X_INPUT"
    data_type: TYPE_UINT8
    dims: [ -1 ]
  },
  {
    name: "DALI_Y_INPUT"
    data_type: TYPE_UINT8
    dims: [ -1 ]
  }
]
output [
  {
    name: "DALI_OUTPUT_X"
    data_type: TYPE_UINT8
    dims: [ -1 ]
  },
  {
    name: "DALI_OUTPUT_Y"
    data_type: TYPE_UINT8
    dims: [ -1 ]
  }
]
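Before sending the failing request, it is worth confirming that the model has actually loaded. A minimal sketch using the HTTP client (the URL is taken from the client defaults below; these calls do not crash the server, as the metadata and config requests in the log show):
import tritonclient.http as httpclient

# Sanity check that the DALI model is loaded before sending the inference request.
# URL taken from the client defaults used in this report (localhost:7000).
client = httpclient.InferenceServerClient(url="localhost:7000")
print("server ready:", client.is_server_ready())
print("model ready:", client.is_model_ready("dali_multi_input"))
print(client.get_model_metadata("dali_multi_input"))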
The following Client.py is run with the Python client wheels from 20.11 and 20.12:
import argparse
import numpy as np
import os
from builtins import range
import sys
import tritonclient.grpc as grpcclient
import tritonclient.grpc.model_config_pb2 as model_config
import tritonclient.http as httpclient
from tritonclient.utils import triton_to_np_dtype
from tritonclient.utils import InferenceServerException
FLAGS = None
def parse_model_grpc(model_metadata, model_config):
    """
    Check the configuration of the model to make sure it meets the
    requirements of this client (two inputs).
    """
    if len(model_metadata.inputs) != 2:
        raise Exception("expecting 2 inputs, got {}".format(len(model_metadata.inputs)))
    if len(model_config.input) != 2:
        raise Exception(
            "expecting 2 inputs in model configuration, got {}".format(len(model_config.input)))
    input_metadata = model_metadata.inputs
    output_metadata = model_metadata.outputs
    return (input_metadata, output_metadata, model_config.max_batch_size)


def parse_model_http(model_metadata, model_config):
    """
    Check the configuration of the model to make sure it meets the
    requirements of this client (two inputs).
    """
    if len(model_metadata['inputs']) != 2:
        raise Exception("expecting 2 inputs, got {}".format(len(model_metadata['inputs'])))
    if len(model_config['input']) != 2:
        raise Exception(
            "expecting 2 inputs in model configuration, got {}".format(len(model_config['input'])))
    input_metadata = model_metadata['inputs']
    output_metadata = model_metadata['outputs']
    return (input_metadata, output_metadata, model_config['max_batch_size'])
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-v',
                        '--verbose',
                        action="store_true",
                        required=False,
                        default=True,
                        help='Enable verbose output')
    parser.add_argument('-m',
                        '--model-name',
                        type=str,
                        required=False,
                        default='dali_multi_input',
                        help='Name of model. Default is dali_multi_input.')
    parser.add_argument('-u',
                        '--url',
                        type=str,
                        required=False,
                        default='localhost:7000',
                        help='Inference server URL. Default is localhost:7000.')
    parser.add_argument('-i',
                        '--protocol',
                        type=str,
                        required=False,
                        default='HTTP',
                        help='Protocol (HTTP/gRPC) used to ' +
                        'communicate with inference service. Default is HTTP.')
    FLAGS = parser.parse_args()
    protocol = FLAGS.protocol.lower()

    try:
        if protocol == "grpc":
            # Create gRPC client for communicating with the server
            triton_client = grpcclient.InferenceServerClient(url=FLAGS.url, verbose=FLAGS.verbose)
        else:
            # Create HTTP client for communicating with the server
            triton_client = httpclient.InferenceServerClient(url=FLAGS.url, verbose=FLAGS.verbose)
    except Exception as e:
        print("client creation failed: " + str(e))
        sys.exit(1)

    model_name = FLAGS.model_name

    # Make sure the model matches our requirements, and get some
    # properties of the model that we need for building the request
    try:
        model_metadata = triton_client.get_model_metadata(model_name=model_name)
    except InferenceServerException as e:
        print("failed to retrieve the metadata: " + str(e))
        sys.exit(1)

    try:
        model_config = triton_client.get_model_config(model_name=model_name)
    except InferenceServerException as e:
        print("failed to retrieve the config: " + str(e))
        sys.exit(1)

    if FLAGS.protocol.lower() == "grpc":
        input_metadata, output_metadata, batch_size = parse_model_grpc(model_metadata, model_config.config)
    else:
        input_metadata, output_metadata, batch_size = parse_model_http(model_metadata, model_config)

    # Note: 400 and 640 do not fit in uint8; NumPy wraps them modulo 256 here.
    in_x = np.stack([np.array([400]*batch_size, dtype=np.uint8)], axis=0)
    in_y = np.stack([np.array([640]*batch_size, dtype=np.uint8)], axis=0)

    # Set the input data
    inputs = []
    if FLAGS.protocol.lower() == "grpc":
        inputs.append(grpcclient.InferInput(input_metadata[0].name, in_x.shape, "UINT8"))
        inputs[0].set_data_from_numpy(in_x)
        inputs.append(grpcclient.InferInput(input_metadata[1].name, in_y.shape, "UINT8"))
        inputs[1].set_data_from_numpy(in_y)
    else:
        inputs.append(httpclient.InferInput(input_metadata[0]['name'], in_x.shape, "UINT8"))
        inputs[0].set_data_from_numpy(in_x)
        inputs.append(httpclient.InferInput(input_metadata[1]['name'], in_y.shape, "UINT8"))
        inputs[1].set_data_from_numpy(in_y)

    output_names = [output.name if FLAGS.protocol.lower() == "grpc" else output['name']
                    for output in output_metadata]
    outputs = []
    for output_name in output_names:
        if FLAGS.protocol.lower() == "grpc":
            outputs.append(grpcclient.InferRequestedOutput(output_name))
        else:
            outputs.append(httpclient.InferRequestedOutput(output_name))

    # Send request - this is where the client fails because the server crashes
    result = triton_client.infer(model_name, inputs, outputs=outputs)

    output_x = result.as_numpy('DALI_OUTPUT_X')
    output_y = result.as_numpy('DALI_OUTPUT_Y')
    print("PASS")
When the client runs in HTTP mode, it reports the following error:
**Exception has occurred: HTTPConnectionClosed**
connection closed.
When the client runs in gRPC mode, it reports the following error:
**Exception has occurred: InferenceServerException**
[StatusCode.UNAVAILABLE] Socket closed
The server prints the following (gRPC client) and then simply exits:
I1231 11:50:04.685638 1 grpc_server.cc:270] Process for ModelMetadata, rpc_ok=1, 0 step START
I1231 11:50:04.685677 1 grpc_server.cc:225] Ready for RPC 'ModelMetadata', 1
I1231 11:50:04.685684 1 model_repository_manager.cc:496] GetInferenceBackend() 'dali_multi_input' version -1
I1231 11:50:04.685690 1 model_repository_manager.cc:452] VersionStates() 'dali_multi_input'
I1231 11:50:04.685756 1 grpc_server.cc:270] Process for ModelMetadata, rpc_ok=1, 0 step COMPLETE
I1231 11:50:04.685763 1 grpc_server.cc:408] Done for ModelMetadata, 0
I1231 11:50:04.686291 1 grpc_server.cc:270] Process for ModelConfig, rpc_ok=1, 0 step START
I1231 11:50:04.686317 1 grpc_server.cc:225] Ready for RPC 'ModelConfig', 1
I1231 11:50:04.686323 1 model_repository_manager.cc:496] GetInferenceBackend() 'dali_multi_input' version -1
I1231 11:50:04.689018 1 grpc_server.cc:270] Process for ModelConfig, rpc_ok=1, 0 step COMPLETE
I1231 11:50:04.689028 1 grpc_server.cc:408] Done for ModelConfig, 0
I1231 11:50:04.690189 1 grpc_server.cc:3089] Process for ModelInferHandler, rpc_ok=1, 1 step START
I1231 11:50:04.690210 1 grpc_server.cc:3082] New request handler for ModelInferHandler, 4
I1231 11:50:04.690218 1 model_repository_manager.cc:496] GetInferenceBackend() 'dali_multi_input' version -1
I1231 11:50:04.690227 1 model_repository_manager.cc:496] GetInferenceBackend() 'dali_multi_input' version -1
I1231 11:50:04.690246 1 infer_request.cc:502] prepared: [0x0x7fd11c0035a0] request id: , model: dali_multi_input, requested version: -1, actual version: 1, flags: 0x0, correlation id: 0, batch size: 1, priority: 0, timeout (us): 0
original inputs:
[0x0x7fd11c003908] input: DALI_Y_INPUT, type: UINT8, original shape: [1,1], batch + shape: [1,1], shape: [1]
[0x0x7fd11c003828] input: DALI_X_INPUT, type: UINT8, original shape: [1,1], batch + shape: [1,1], shape: [1]
override inputs:
inputs:
[0x0x7fd11c003828] input: DALI_X_INPUT, type: UINT8, original shape: [1,1], batch + shape: [1,1], shape: [1]
[0x0x7fd11c003908] input: DALI_Y_INPUT, type: UINT8, original shape: [1,1], batch + shape: [1,1], shape: [1]
original requested outputs:
DALI_OUTPUT_X
DALI_OUTPUT_Y
requested outputs:
DALI_OUTPUT_X
DALI_OUTPUT_Y
The full TRITON server log: TRITON.log.
Expected behavior
The inference request should return the same output as the standalone DALI script, without crashing the server.
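Concretely, since the pipeline only passes its external_source inputs through to its outputs, a successful request should echo the request data back. A hypothetical check, continuing from the client above (result is the return value of triton_client.infer, in_x/in_y the arrays that were sent):
# Hypothetical success criterion for the pass-through pipeline: outputs echo inputs.
np.testing.assert_array_equal(result.as_numpy('DALI_OUTPUT_X'), in_x)
np.testing.assert_array_equal(result.as_numpy('DALI_OUTPUT_Y'), in_y)
print("PASS")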