yolo_preprocess.cu 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // Part of the following code in this file refs to
  16. // https://github.com/wang-xinyu/tensorrtx/blob/yolov5-v6.0/yolov5/preprocess.cu
  17. //
  18. // Copyright (c) 2022 tensorrtx
  19. // Licensed under The MIT License
  20. // \file preprocess.cu
  21. // \brief
  22. // \author Qi Liu, Xinyu Wang
  23. #ifdef WITH_GPU
  24. #include <opencv2/opencv.hpp>
  25. #include "ultra_infer/vision/utils/cuda_utils.h"
  26. namespace ultra_infer {
  27. namespace vision {
  28. namespace utils {
  /// Coefficients of a 2x3 affine transform stored row by row:
  ///   x' = value[0] * x + value[1] * y + value[2]
  ///   y' = value[3] * x + value[4] * y + value[5]
  /// Plain aggregate so it can be passed to a kernel by value.
  struct AffineMatrix {
    float value[2 * 3];
  };
  /// Affine-warps a packed 8-bit BGR image into a planar, normalized RGB float
  /// tensor using bilinear interpolation.
  ///
  /// Launch layout: 1-D grid of 1-D blocks, one thread per destination pixel;
  /// threads whose flat index is >= `edge` exit immediately. No shared memory
  /// is used.
  ///
  /// \param src              Source pixels, 3 bytes per pixel in B,G,R order;
  ///                         must be readable from device code.
  /// \param src_line_size    Distance in bytes between consecutive source rows.
  /// \param src_width        Source width in pixels.
  /// \param src_height       Source height in pixels.
  /// \param dst              Output, three consecutive planes (R, then G, then
  ///                         B) of dst_width * dst_height floats each; must be
  ///                         writable from device code.
  /// \param dst_width        Destination width in pixels.
  /// \param dst_height       Destination height in pixels.
  /// \param padding_color_b  Blue value used where no source pixel exists.
  /// \param padding_color_g  Green value used where no source pixel exists.
  /// \param padding_color_r  Red value used where no source pixel exists.
  /// \param d2s              Affine transform mapping destination coordinates
  ///                         to source coordinates.
  /// \param edge             Number of destination pixels to process.
  __global__ void
  YoloPreprocessCudaKernel(uint8_t *src, int src_line_size, int src_width,
                           int src_height, float *dst, int dst_width,
                           int dst_height, uint8_t padding_color_b,
                           uint8_t padding_color_g, uint8_t padding_color_r,
                           AffineMatrix d2s, int edge) {
    const int pixel = blockDim.x * blockIdx.x + threadIdx.x;
    if (pixel >= edge) {
      return;
    }

    // Destination coordinates handled by this thread.
    const int dx = pixel % dst_width;
    const int dy = pixel / dst_width;

    // Map the destination pixel back into the source image.
    const float src_x =
        d2s.value[0] * dx + d2s.value[1] * dy + d2s.value[2] + 0.5f;
    const float src_y =
        d2s.value[3] * dx + d2s.value[4] * dy + d2s.value[5] + 0.5f;

    float blue, green, red;
    if (src_x <= -1 || src_x >= src_width || src_y <= -1 ||
        src_y >= src_height) {
      // Entirely outside the source image: emit the padding colour.
      blue = padding_color_b;
      green = padding_color_g;
      red = padding_color_r;
    } else {
      // Integer corners of the 2x2 neighbourhood surrounding (src_x, src_y).
      const int x0 = floorf(src_x);
      const int y0 = floorf(src_y);
      const int x1 = x0 + 1;
      const int y1 = y0 + 1;

      // Bilinear weights derived from the fractional offsets.
      const float frac_x = src_x - x0;
      const float frac_y = src_y - y0;
      const float rest_x = 1 - frac_x;
      const float rest_y = 1 - frac_y;
      const float w_tl = rest_y * rest_x;
      const float w_tr = rest_y * frac_x;
      const float w_bl = frac_y * rest_x;
      const float w_br = frac_y * frac_x;

      // Taps that fall outside the image read the padding colour instead.
      uint8_t border[] = {padding_color_b, padding_color_g, padding_color_r};
      const bool top_ok = y0 >= 0;
      const bool bottom_ok = y1 < src_height;
      const bool left_ok = x0 >= 0;
      const bool right_ok = x1 < src_width;

      uint8_t *tl =
          (top_ok && left_ok) ? src + y0 * src_line_size + x0 * 3 : border;
      uint8_t *tr =
          (top_ok && right_ok) ? src + y0 * src_line_size + x1 * 3 : border;
      uint8_t *bl =
          (bottom_ok && left_ok) ? src + y1 * src_line_size + x0 * 3 : border;
      uint8_t *br =
          (bottom_ok && right_ok) ? src + y1 * src_line_size + x1 * 3 : border;

      blue = w_tl * tl[0] + w_tr * tr[0] + w_bl * bl[0] + w_br * br[0];
      green = w_tl * tl[1] + w_tr * tr[1] + w_bl * bl[1] + w_br * br[1];
      red = w_tl * tl[2] + w_tr * tr[2] + w_bl * bl[2] + w_br * br[2];
    }

    // Interleaved BGR in, planar RGB out: plane 0 = R, plane 1 = G,
    // plane 2 = B, each scaled from [0, 255] down by a division by 255.
    const int plane = dst_width * dst_height;
    float *out_r = dst + dy * dst_width + dx;
    float *out_g = out_r + plane;
    float *out_b = out_g + plane;
    *out_r = red / 255.0f;
    *out_g = green / 255.0f;
    *out_b = blue / 255.0f;
  }
  /// Enqueues the YOLO preprocessing kernel on `stream`: the source image is
  /// uniformly scaled to fit inside the destination, centred, padded, converted
  /// from packed BGR bytes to planar RGB floats and divided by 255.
  ///
  /// The call only launches the kernel; it performs no stream synchronization
  /// and no launch-error check, so callers that need either must do it
  /// themselves.
  ///
  /// \param src            Packed 3-byte-per-pixel source image with rows of
  ///                       src_width * 3 bytes; handed to the kernel as is, so
  ///                       it must be device-accessible.
  /// \param src_width      Source width in pixels (used as a divisor, so it
  ///                       must be non-zero).
  /// \param src_height     Source height in pixels (used as a divisor, so it
  ///                       must be non-zero).
  /// \param dst            Output buffer of 3 * dst_width * dst_height floats;
  ///                       must be device-accessible.
  /// \param dst_width      Destination width in pixels.
  /// \param dst_height     Destination height in pixels.
  /// \param padding_value  Padding colour; elements [0], [1], [2] are read and
  ///                       passed as the kernel's B, G, R padding bytes (each
  ///                       float is converted to uint8_t). Must hold at least
  ///                       three elements.
  /// \param stream         CUDA stream the kernel is launched on.
  void CudaYoloPreprocess(uint8_t *src, int src_width, int src_height,
                          float *dst, int dst_width, int dst_height,
                          const std::vector<float> padding_value,
                          cudaStream_t stream) {
    // An empty destination has no pixels to produce, and a launch with zero
    // blocks is an invalid configuration, so there is nothing to enqueue.
    const int jobs = dst_height * dst_width;
    if (jobs <= 0) {
      return;
    }

    // Source -> destination transform: uniform scale that fits the source
    // inside the destination, plus a translation that centres it.
    AffineMatrix s2d, d2s;
    float scale =
        std::min(dst_height / (float)src_height, dst_width / (float)src_width);
    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

    // The kernel walks destination pixels, so it needs the inverse mapping.
    // The inverse goes into an OpenCV-owned matrix and is then copied out;
    // wrapping d2s.value as the output would make the memcpy below copy a
    // buffer onto itself, which memcpy does not permit.
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s;
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
    memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));

    // Integer ceiling division (jobs > 0 here). Going through float would
    // round large pixel counts and could launch one block too few.
    const int threads = 256;
    const int blocks = (jobs - 1) / threads + 1;
    YoloPreprocessCudaKernel<<<blocks, threads, 0, stream>>>(
        src, src_width * 3, src_width, src_height, dst, dst_width, dst_height,
        padding_value[0], padding_value[1], padding_value[2], d2s, jobs);
  }
  129. } // namespace utils
  130. } // namespace vision
  131. } // namespace ultra_infer
  132. #endif