# Streaming generation from Triton's generate extension.
# The /generate_stream endpoint returns Server-Sent Events (hence the
# 'Accept: text/event-stream' header), so "stream" must be true here —
# the original request had it set to false, which defeats the purpose
# of hitting the streaming endpoint.
curl --location 'http://localhost:8000/v2/models/tensorrt_llm_bls/generate_stream' \
--header 'Accept: text/event-stream' \
--header 'Content-Type: application/json' \
--data '{
"text_input": "What is machine learning?",
"parameters": {
"stream": true,
"temperature": 0,
"max_tokens": 20
}
}'
# Non-streaming generation from Triton's generate extension.
# The /generate endpoint returns a single JSON response, so "stream"
# must be false — the original request had it set to true, which was
# swapped with the streaming example above.
curl --location 'http://localhost:8000/v2/models/tensorrt_llm_bls/generate' \
--header 'Content-Type: application/json' \
--data '{
"text_input": "What is machine learning?",
"parameters": {
"stream": false,
"temperature": 0,
"max_tokens": 20
}
}'
# Chat completion via the OpenAI-compatible proxy in front of Triton.
# Uses curl's short options: -L follows redirects (same as --location),
# -H sets a request header, -d sends the request body.
curl -L 'http://localhost:3000/v1/chat/completions' \
  -H 'Content-Type: application/json' \
  -d '{
"messages": [
{
"role": "user",
"content": "Who is Jensen Huang"
}
],
"stream": true,
"model": "tensorrt_llm_bls",
"max_tokens": 2048,
"temperature": 0.7
}'