normalize.cu

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef WITH_GPU
#include "ultra_infer/vision/common/processors/normalize.h"

namespace ultra_infer {
namespace vision {

__global__ void NormalizeKernel(const uint8_t *src, float *dst,
                                const float *alpha, const float *beta,
                                int num_channel, bool swap_rb, int batch_size,
                                int edge) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= edge)
    return;

  int img_size = edge / batch_size;
  int n = idx / img_size;       // batch index
  int p = idx - (n * img_size); // pixel index within the image

  for (int i = 0; i < num_channel; ++i) {
    int j = i;
    if (swap_rb) {
      j = 2 - i;
    }
    dst[num_channel * idx + j] =
        src[num_channel * idx + j] * alpha[i] + beta[i];
  }
}
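
// Notes on NormalizeKernel above (informational, derived from the code):
// each thread handles one pixel `idx` and applies the fused per-channel
// transform dst = src * alpha + beta across all `num_channel` channels.
// When swap_rb is true, the coefficients for channel i are applied to
// channel j = 2 - i (a 3-channel assumption), which folds the red/blue swap
// into the normalization coefficients instead of physically reordering the
// channel data.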

bool Normalize::ImplByCuda(FDMat *mat) {
  if (mat->layout != Layout::HWC) {
    FDERROR << "The input data must be HWC format!" << std::endl;
    return false;
  }
  // Prepare the input tensor and view it as a batch of one (1 x H x W x C)
  FDTensor *src = CreateCachedGpuInputTensor(mat);
  src->ExpandDim(0);
  FDMatBatch mat_batch;
  mat_batch.SetTensor(src);
  mat_batch.mat_type = ProcLib::CUDA;
  mat_batch.input_cache = mat->input_cache;
  mat_batch.output_cache = mat->output_cache;
  // Delegate to the batched implementation, then drop the batch dimension
  if (!ImplByCuda(&mat_batch)) {
    return false;
  }
  FDTensor *dst = mat_batch.Tensor();
  dst->Squeeze(0);
  mat->SetTensor(dst);
  mat->mat_type = ProcLib::CUDA;
  return true;
}

bool Normalize::ImplByCuda(FDMatBatch *mat_batch) {
  if (mat_batch->layout != FDMatBatchLayout::NHWC) {
    FDERROR << "The input data must be NHWC format!" << std::endl;
    return false;
  }
  // Prepare input tensor
  FDTensor *src = CreateCachedGpuInputTensor(mat_batch);

  // Prepare output tensor
  mat_batch->output_cache->Resize(src->Shape(), FDDataType::FP32,
                                  "batch_output_cache", Device::GPU);

  // Copy alpha and beta to GPU
  gpu_alpha_.Resize({1, 1, static_cast<int>(alpha_.size())}, FDDataType::FP32,
                    "alpha", Device::GPU);
  cudaMemcpy(gpu_alpha_.Data(), alpha_.data(), gpu_alpha_.Nbytes(),
             cudaMemcpyHostToDevice);

  gpu_beta_.Resize({1, 1, static_cast<int>(beta_.size())}, FDDataType::FP32,
                   "beta", Device::GPU);
  cudaMemcpy(gpu_beta_.Data(), beta_.data(), gpu_beta_.Nbytes(),
             cudaMemcpyHostToDevice);

  int jobs =
      mat_batch->output_cache->Numel() / mat_batch->output_cache->shape[3];
  int threads = 256;
  int blocks = ceil(jobs / (float)threads);
  NormalizeKernel<<<blocks, threads, 0, mat_batch->Stream()>>>(
      reinterpret_cast<uint8_t *>(src->Data()),
      reinterpret_cast<float *>(mat_batch->output_cache->Data()),
      reinterpret_cast<float *>(gpu_alpha_.Data()),
      reinterpret_cast<float *>(gpu_beta_.Data()),
      mat_batch->output_cache->shape[3], swap_rb_,
      mat_batch->output_cache->shape[0], jobs);
  mat_batch->SetTensor(mat_batch->output_cache);
  mat_batch->mat_type = ProcLib::CUDA;
  return true;
}
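
// Notes on the launch configuration above (informational): `jobs` counts
// pixels rather than scalar elements, i.e. N * H * W, because each thread
// loops over all channels of its pixel. For example, a batch of two
// 224 x 224 x 3 images gives jobs = 2 * 224 * 224 = 100352 and, with 256
// threads per block, ceil(100352 / 256) = 392 blocks.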

#ifdef ENABLE_CVCUDA
bool Normalize::ImplByCvCuda(FDMat *mat) { return ImplByCuda(mat); }

bool Normalize::ImplByCvCuda(FDMatBatch *mat_batch) {
  return ImplByCuda(mat_batch);
}
#endif

} // namespace vision
} // namespace ultra_infer
#endif
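
// ---------------------------------------------------------------------------
// Illustrative only, not part of the library: a minimal host-side sketch of
// the same per-pixel transform, handy when spot-checking kernel output. The
// function name and its standalone form are hypothetical; it mirrors the
// kernel's 3-channel swap_rb assumption (j = 2 - i).
//
//   void NormalizeReferenceCPU(const uint8_t *src, float *dst,
//                              const float *alpha, const float *beta,
//                              int num_pixels, int num_channel, bool swap_rb) {
//     for (int idx = 0; idx < num_pixels; ++idx) {
//       for (int i = 0; i < num_channel; ++i) {
//         int j = swap_rb ? (2 - i) : i;
//         dst[num_channel * idx + j] =
//             src[num_channel * idx + j] * alpha[i] + beta[i];
//       }
//     }
//   }
// ---------------------------------------------------------------------------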