I wrote some C++ code, but the inference results are different from Python. Can you help me take a look?
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstring>
#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <numeric>
#include <stdexcept>
#include <string>
#include <vector>

#include <opencv2/opencv.hpp>
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <NvInferRuntimeCommon.h>
#include <Eigen/Dense>
#include <unsupported/Eigen/Splines>
using namespace nvinfer1;
// Drawing palette: imshow_lanes picks COLORS[i % COLORS.size()] for lane i.
const std::vector<cv::Scalar> COLORS = {
    cv::Scalar(255, 0, 0),
    cv::Scalar(0, 255, 0),
    cv::Scalar(0, 0, 255),
    cv::Scalar(255, 255, 0),
    cv::Scalar(255, 0, 255),
    cv::Scalar(0, 255, 255),
    cv::Scalar(128, 255, 0),
    cv::Scalar(255, 128, 0),
    cv::Scalar(128, 0, 255),
    cv::Scalar(255, 0, 128),
    cv::Scalar(0, 128, 255),
    cv::Scalar(0, 255, 128),
    cv::Scalar(128, 255, 255),
    cv::Scalar(255, 128, 255),
    cv::Scalar(255, 255, 128),
    cv::Scalar(60, 180, 0),
    cv::Scalar(180, 60, 0),
    cv::Scalar(0, 60, 180),
    cv::Scalar(0, 180, 60),
    cv::Scalar(60, 0, 180),
    cv::Scalar(180, 0, 60)};
// A single detected lane, represented as an interpolating spline x = f(y)
// through a set of sample points.
class Lane
{
public:
    // Fits the spline through `points`.
    //
    // points        - lane samples (x, y) in pixel coordinates of the original
    //                 frame, in any order. At least two points with distinct y
    //                 are required; otherwise the lane stays empty and
    //                 to_array() returns no points.
    // invalid_value - sentinel kept for interface compatibility; it is stored
    //                 but not otherwise used by this class.
    Lane(const std::vector<cv::Point2f> &points, float invalid_value = -2.0f)
        : points(points), invalid_value(invalid_value), min_y(0.0), max_y(0.0),
          y_origin(0.0), y_span(0.0), fitted(false)
    {
        // Guard: `points.size() - 1` underflows for an empty input and a
        // single point cannot define a curve.
        if (points.size() < 2)
        {
            return;
        }

        // Eigen builds the knot vector by averaging the chord parameters and
        // pins the end knots to 0 and 1, so the parameters must be ascending
        // and normalised to [0, 1]. Sort by y and rescale accordingly.
        std::vector<cv::Point2f> sorted(points);
        std::sort(sorted.begin(), sorted.end(),
                  [](const cv::Point2f &a, const cv::Point2f &b)
                  { return a.y < b.y; });

        y_origin = sorted.front().y;
        y_span = static_cast<double>(sorted.back().y) - y_origin;
        if (!(y_span > 0.0))
        {
            return; // all samples share one y: nothing to interpolate
        }

        const int count = static_cast<int>(sorted.size());
        Eigen::RowVectorXd params(count), xs(count);
        for (int i = 0; i < count; ++i)
        {
            params[i] = (sorted[i].y - y_origin) / y_span;
            xs[i] = sorted[i].x;
        }

        // Cubic when possible, lower degree for very short lanes.
        spline = Eigen::SplineFitting<Eigen::Spline<double, 1>>::Interpolate(
            xs, std::min<int>(3, count - 1), params);

        // Allow a small amount of extrapolation beyond the outermost samples.
        // NOTE(review): the original margin of 0.01 was presumably meant for
        // coordinates normalised by the 720 px image height; it is converted
        // to pixels here - confirm against the reference implementation.
        const double margin = 0.01 * 720.0;
        min_y = y_origin - margin;
        max_y = static_cast<double>(sorted.back().y) + margin;
        fitted = true;
    }

    // Samples the lane every 10 pixel rows from y = 710 up to y = 150.
    //
    // Rows outside [min_y, max_y] are skipped, as are rows whose interpolated
    // x falls outside [0, 1280). Returns the remaining (x, y) points in pixel
    // coordinates, ordered from the bottom of the image upwards.
    std::vector<cv::Point2f> to_array() const
    {
        const double image_width = 1280.0;

        std::vector<cv::Point2f> lane;
        if (!fitted)
        {
            return lane;
        }
        for (int y = 710; y >= 150; y -= 10)
        {
            if (y < min_y || y > max_y)
            {
                continue; // outside the fitted range: no valid x here
            }
            // The spline lives on the normalised parameter, so map y first.
            const double x = spline((y - y_origin) / y_span)(0);
            if (x >= 0.0 && x < image_width)
            {
                lane.emplace_back(static_cast<float>(x), static_cast<float>(y));
            }
        }
        return lane;
    }

private:
    std::vector<cv::Point2f> points;
    float invalid_value;
    Eigen::Spline<double, 1> spline;
    double min_y, max_y; // valid y range in pixels, including the margin
    double y_origin;     // smallest sample y; maps to spline parameter 0
    double y_span;       // largest minus smallest sample y; maps to parameter 1
    bool fitted;         // false when the input could not be interpolated
};
// TensorRT logger that forwards messages of severity kINFO or more severe to
// std::cerr and drops anything more verbose.
class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char *msg) noexcept override
    {
        // Severity values grow as messages become less important.
        if (severity > Severity::kINFO)
        {
            return;
        }
        std::cerr << msg << std::endl;
    }
};
class CLRNetDemo
{
public:
CLRNetDemo(const std::string &engine_path)
{
// Load TensorRT engine
std::ifstream engine_file(engine_path, std::ios::binary);
std::vector<char> engine_data((std::istreambuf_iterator<char>(engine_file)), std::istreambuf_iterator<char>());
runtime = createInferRuntime(logger);
engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
context = engine->createExecutionContext();
// Initialize input and output bindings
for (int i = 0; i < engine->getNbBindings(); ++i)
{
if (engine->bindingIsInput(i))
{
input_binding = i;
}
else
{
output_binding = i;
}
}
// Allocate memory for input and output
auto input_dims = engine->getBindingDimensions(input_binding);
auto output_dims = engine->getBindingDimensions(output_binding);
size_t input_size = 1;
size_t output_size = 1;
for (int i = 0; i < input_dims.nbDims; ++i)
{
input_size *= input_dims.d[i];
}
for (int i = 0; i < output_dims.nbDims; ++i)
{
output_size *= output_dims.d[i];
}
cudaMalloc(&buffers[input_binding], input_size * sizeof(float));
cudaMalloc(&buffers[output_binding], output_size * sizeof(float));
cudaStreamCreate(&stream);
}
~CLRNetDemo()
{
cudaFree(buffers[input_binding]);
cudaFree(buffers[output_binding]);
cudaStreamDestroy(stream);
context->destroy();
engine->destroy();
runtime->destroy();
}
cv::Mat forward(const cv::Mat &img)
{
// Preprocess input image
cv::Mat input_img = img(cv::Rect(0, 160, img.cols, img.rows - 160));
cv::resize(input_img, input_img, cv::Size(800, 320), cv::INTER_CUBIC);
input_img.convertTo(input_img, CV_32FC3, 1.0 / 255.0);
// Transpose the image to match the model input
cv::Mat input_img_transposed;
cv::dnn::blobFromImage(input_img, input_img_transposed);
// Allocate memory for input and output
std::vector<float> input_data(input_img_transposed.total() * input_img_transposed.channels());
std::memcpy(input_data.data(), input_img_transposed.data, input_data.size() * sizeof(float));
auto output_dims = engine->getBindingDimensions(output_binding);
size_t output_size = 1;
for (int i = 0; i < output_dims.nbDims; ++i)
{
output_size *= output_dims.d[i];
}
std::vector<float> output_data(output_size);
// Execute inference
cudaMemcpyAsync(buffers[input_binding], input_data.data(), input_data.size() * sizeof(float), cudaMemcpyHostToDevice, stream);
context->enqueueV2(buffers, stream, nullptr);
cudaMemcpyAsync(output_data.data(), buffers[output_binding], output_data.size() * sizeof(float), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// Postprocess output
auto lanes = get_lanes(output_data);
return imshow_lanes(img, lanes);
}
private:
IRuntime *runtime;
ICudaEngine *engine;
IExecutionContext *context;
int input_binding, output_binding;
void *buffers[2];
cudaStream_t stream;
Logger logger;
std::vector<Lane> get_lanes(const std::vector<float> &output)
{
std::vector<Lane> decoded;
std::vector<std::vector<float>> predictions(output.size() / 78, std::vector<float>(78));
for (size_t i = 0; i < predictions.size(); ++i)
{
std::copy(output.begin() + i * 78, output.begin() + (i + 1) * 78, predictions[i].begin());
}
for (auto &prediction : predictions)
{
std::vector<float> scores = softmax({prediction[0], prediction[1]});
std::cout << "scores: " << scores[0] << ", " << scores[1] << "\n";
if (scores[1] < 0.4)
{
continue;
}
std::vector<std::vector<float>> nms_predictions;
for (size_t i = 0; i < prediction.size(); ++i)
{
if (i < 4 || i >= 5)
{
nms_predictions.push_back(prediction);
}
}
std::cout << "nms_predictions: " << nms_predictions.size() << "\n";
for (auto &nms_prediction : nms_predictions)
{
nms_prediction[4] *= 71;
for (size_t i = 5; i < nms_prediction.size(); ++i)
{
nms_prediction[i] *= 1279;
}
}
auto keep = Lane_nms(nms_predictions, scores, 50, 5);
std::vector<std::vector<float>> filtered_predictions;
for (auto idx : keep)
{
filtered_predictions.push_back(predictions[idx]);
}
for (auto &filtered_prediction : filtered_predictions)
{
filtered_prediction[5] = std::round(filtered_prediction[5] * 71);
}
std::cout << "filtered_predictions: " << filtered_predictions.size() << "\n";
auto pred = predictions_to_pred(filtered_predictions);
decoded.insert(decoded.end(), pred.begin(), pred.end());
}
return decoded;
}
cv::Mat imshow_lanes(const cv::Mat &img, const std::vector<Lane> &lanes)
{
cv::Mat output_img = img.clone();
for (size_t i = 0; i < lanes.size(); ++i)
{
auto lane_points = lanes[i].to_array();
for (const auto &point : lane_points)
{
if (point.x > 0 && point.y > 0)
{
cv::circle(output_img, point, 5, COLORS[i % COLORS.size()], -1);
}
}
for (size_t j = 1; j < lane_points.size(); ++j)
{
if (lane_points[j - 1].x > 0 && lane_points[j - 1].y > 0 && lane_points[j].x > 0 && lane_points[j].y > 0)
{
cv::line(output_img, lane_points[j - 1], lane_points[j], COLORS[i % COLORS.size()], 4);
}
}
}
return output_img;
}
std::vector<float> softmax(const std::vector<float> &x)
{
std::vector<float> y(x.size());
float max_val = *std::max_element(x.begin(), x.end());
float sum = 0.0f;
for (size_t i = 0; i < x.size(); ++i)
{
y[i] = std::exp(x[i] - max_val);
sum += y[i];
}
for (size_t i = 0; i < x.size(); ++i)
{
y[i] /= sum;
}
return y;
}
bool Lane_IOU(const std::vector<float> &parent_box, const std::vector<float> &compared_box, float threshold)
{
int n_offsets = 72;
int n_strips = n_offsets - 1;
int start_a = static_cast<int>(parent_box[2] * n_strips + 0.5);
int start_b = static_cast<int>(compared_box[2] * n_strips + 0.5);
int start = std::max(start_a, start_b);
int end_a = start_a + static_cast<int>(parent_box[4] - 1 + 0.5 - ((parent_box[4] - 1) < 0));
int end_b = start_b + static_cast<int>(compared_box[4] - 1 + 0.5 - ((compared_box[4] - 1) < 0));
int end = std::min({end_a, end_b, 71});
if ((end - start) < 0)
{
return false;
}
float dist = 0.0f;
for (int i = 5 + start; i <= 5 + end; ++i)
{
if (parent_box[i] < compared_box[i])
{
dist += compared_box[i] - parent_box[i];
}
else
{
dist += parent_box[i] - compared_box[i];
}
}
return dist < (threshold * (end - start + 1));
}
std::vector<int> Lane_nms(const std::vector<std::vector<float>> &proposals, const std::vector<float> &scores, float overlap, int top_k)
{
std::vector<int> keep_index;
std::vector<int> indices(scores.size());
std::iota(indices.begin(), indices.end(), 0);
std::sort(indices.begin(), indices.end(), [&scores](int a, int b)
{ return scores[a] > scores[b]; });
std::vector<int> r_filters(scores.size(), 0);
for (size_t i = 0; i < indices.size(); ++i)
{
if (r_filters[i] == 1)
{
continue;
}
keep_index.push_back(indices[i]);
if (keep_index.size() > static_cast<size_t>(top_k))
{
break;
}
if (i == indices.size() - 1)
{
break;
}
for (size_t j = i + 1; j < indices.size(); ++j)
{
if (Lane_IOU(proposals[indices[i]], proposals[indices[j]], overlap))
{
r_filters[j] = 1;
}
}
}
return keep_index;
}
std::vector<Lane> predictions_to_pred(const std::vector<std::vector<float>> &predictions)
{
std::vector<Lane> lanes;
for (const auto &lane : predictions)
{
std::vector<float> lane_xs(lane.begin() + 6, lane.end());
int start = std::min(std::max(0, static_cast<int>(std::round(lane[2] * 71))), 71);
int length = static_cast<int>(std::round(lane[5]));
int end = start + length - 1;
end = std::min(end, 71);
std::vector<bool> mask(start, false);
for (int i = 0; i < start; ++i)
{
if (lane_xs[i] >= 0 && lane_xs[i] <= 1)
{
mask[i] = true;
}
}
for (int i = 0; i < start; ++i)
{
if (!mask[i])
{
lane_xs[i] = -2;
}
}
for (int i = end + 1; i < lane_xs.size(); ++i)
{
lane_xs[i] = -2;
}
std::vector<float> lane_ys;
for (int i = 0; i < lane_xs.size(); ++i)
{
if (lane_xs[i] >= 0)
{
lane_ys.push_back(1.0f - static_cast<float>(i) / 71.0f);
}
}
std::vector<cv::Point2f> points;
for (int i = 0; i < lane_xs.size(); ++i)
{
if (lane_xs[i] >= 0)
{
points.emplace_back(lane_xs[i] * 1280, lane_ys[i] * (720 - 160) + 160);
}
}
if (points.size() > 1)
{
lanes.emplace_back(points);
}
}
return lanes;
}
};
// Command-line entry point: <program> <engine> <image>.
// Loads the engine, runs it on the image and writes the annotated result to
// "output_trt.png". Returns 0 on success, 1 on a usage error and -1 on any
// runtime failure.
int main(int argc, char *argv[])
{
    if (argc != 3)
    {
        // A usage error is a failure: report it on stderr with a non-zero code.
        std::cerr << (argc > 0 ? argv[0] : "clrnet_demo") << ": <engine> <image>" << std::endl;
        return 1;
    }
    try
    {
        CLRNetDemo isnet(argv[1]);
        cv::Mat image = cv::imread(argv[2]);
        if (image.empty())
        {
            std::cerr << "Error: Could not open or find the image!" << std::endl;
            return -1;
        }
        cv::Mat output = isnet.forward(image);
        if (!cv::imwrite("output_trt.png", output))
        {
            std::cerr << "Error: Could not write output_trt.png" << std::endl;
            return -1;
        }
    }
    catch (const std::exception &e)
    {
        // Covers cv::Exception as well as errors raised while loading or
        // running the engine.
        std::cerr << "Error: " << e.what() << std::endl;
        return -1;
    }
    return 0;
}