// float16.h (19 KB)
  1. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #pragma once
#include <stdint.h>

#include <cmath>
#include <iostream>
#include <limits>
#include <type_traits>

#if defined(FD_WITH_NATIVE_FP16)
#include <arm_neon.h>  // float16_t and the NEON conversion intrinsics
#elif defined(__F16C__)
#include <immintrin.h>  // _cvtss_sh / _cvtsh_ss
#endif
  // FD_ALIGN(x): request x-byte alignment for the declaration it decorates.
  // Uses the __declspec spelling when _WIN32 is defined and the
  // __attribute__ spelling otherwise.
#if defined(_WIN32)
#define FD_ALIGN(x) __declspec(align(x))
#else
#define FD_ALIGN(x) __attribute__((aligned(x)))
#endif
  24. namespace ultra_infer {
  // Half-precision (binary16) floating point value stored as its raw 16-bit
  // pattern in `x`.
  //
  // Conversions to and from float use, in order of preference:
  //   * ARM NEON FP16 intrinsics when FD_WITH_NATIVE_FP16 is defined,
  //   * x86 F16C intrinsics when __F16C__ is defined,
  //   * a portable bit-manipulation fallback otherwise.
  //
  // The type is 2-byte aligned (standard alignas, equivalent to FD_ALIGN(2))
  // and keeps all special members defaulted so that it stays trivial.
  struct alignas(2) float16 {
   public:
    uint16_t x;  // raw binary16 bit pattern

    // The following defaulted special class member functions
    // are added to make float16 pass the std::is_trivial test
    float16() = default;
    float16(const float16 &o) = default;
    float16 &operator=(const float16 &o) = default;
    float16(float16 &&o) = default;
    float16 &operator=(float16 &&o) = default;
    ~float16() = default;

    // ---------------------------------------------------------------------
    // Constructors
    // ---------------------------------------------------------------------
#ifdef FD_WITH_NATIVE_FP16
    // __fp16 is a native half precision data type for arm cpu,
    // float16_t is an alias for __fp16
    inline explicit float16(const float16_t &h) {
      x = *reinterpret_cast<const uint16_t *>(&h);
    }
#endif

    // Converts a single-precision value to half precision.
    inline explicit float16(float val) {
#if defined(FD_WITH_NATIVE_FP16)
      float32x4_t tmp = vld1q_dup_f32(&val);
      float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0);
      x = *reinterpret_cast<uint16_t *>(&res);
#elif defined(__F16C__)
      x = _cvtss_sh(val, 0);
#else
      // Conversion routine adapted from
      // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
      Bits v, s;
      v.f = val;
      uint32_t sign = v.si & sigN;
      v.si ^= sign;        // work on the magnitude only
      sign >>= shiftSign;  // logical shift
      if (minN > v.si) {
        // Magnitude is below the smallest normal half: rescale so that the
        // integer conversion yields the subnormal mantissa. Doing this only
        // inside the branch avoids converting an out-of-range float product
        // to int32_t for larger inputs (the result was discarded there
        // anyway).
        s.si = mulN;
        s.si = static_cast<int32_t>(s.f * v.f);  // correct subnormals
        v.si = s.si;
      }
      v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
      v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
      v.ui >>= shift;  // logical shift
      v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
      v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
      x = v.ui | sign;
#endif
    }

    // true -> 1.0 (0x3c00), false -> +0.0.
    inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}

    // Any other source type is first converted to float.
    template <class T>
    inline explicit float16(const T &val)
        : x(float16(static_cast<float>(val)).x) {}

    // ---------------------------------------------------------------------
    // Assignment operators
    // ---------------------------------------------------------------------
#ifdef FD_WITH_NATIVE_FP16
    inline float16 &operator=(const float16_t &rhs) {
      x = *reinterpret_cast<const uint16_t *>(&rhs);
      return *this;
    }
#endif
    inline float16 &operator=(bool b) {
      x = b ? 0x3c00 : 0;
      return *this;
    }
    inline float16 &operator=(int8_t val) {
      x = float16(val).x;
      return *this;
    }
    inline float16 &operator=(uint8_t val) {
      x = float16(val).x;
      return *this;
    }
    inline float16 &operator=(int16_t val) {
      x = float16(val).x;
      return *this;
    }
    inline float16 &operator=(uint16_t val) {
      x = float16(val).x;
      return *this;
    }
    inline float16 &operator=(int32_t val) {
      x = float16(val).x;
      return *this;
    }
    inline float16 &operator=(uint32_t val) {
      x = float16(val).x;
      return *this;
    }
    inline float16 &operator=(int64_t val) {
      x = float16(val).x;
      return *this;
    }
    inline float16 &operator=(uint64_t val) {
      x = float16(val).x;
      return *this;
    }
    inline float16 &operator=(float val) {
      x = float16(val).x;
      return *this;
    }
    inline float16 &operator=(double val) {
      x = float16(val).x;
      return *this;
    }

    // ---------------------------------------------------------------------
    // Conversion operators
    // ---------------------------------------------------------------------
#ifdef FD_WITH_NATIVE_FP16
    // The HOSTDEVICE qualifier that used to decorate this member is not
    // defined anywhere in this header and no other member uses it, so it
    // has been dropped.
    inline explicit operator float16_t() const {
      return *reinterpret_cast<const float16_t *>(this);
    }
#endif

    // Widens the stored half-precision value to float.
    inline operator float() const {
#if defined(FD_WITH_NATIVE_FP16)
      float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t *>(this));
      return vgetq_lane_f32(vcvt_f32_f16(res), 0);
#elif defined(__F16C__)
      return _cvtsh_ss(this->x);
#else
      // Conversion routine adapted from
      // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
      Bits v;
      v.ui = this->x;
      int32_t sign = v.si & sigC;
      v.si ^= sign;
      sign <<= shiftSign;
      v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
      v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
      Bits s;
      s.si = mulC;
      s.f *= v.si;
      int32_t mask = -(norC > v.si);  // all ones for subnormal inputs
      v.si <<= shift;
      v.si ^= (s.si ^ v.si) & mask;
      v.si |= sign;
      return v.f;
#endif
    }

    // Non-zero test that treats -0.0 as false.
    inline explicit operator bool() const { return (x & 0x7fff) != 0; }

    // Integer conversions go through float.
    inline explicit operator int8_t() const {
      return static_cast<int8_t>(static_cast<float>(*this));
    }
    inline explicit operator uint8_t() const {
      return static_cast<uint8_t>(static_cast<float>(*this));
    }
    inline explicit operator int16_t() const {
      return static_cast<int16_t>(static_cast<float>(*this));
    }
    inline explicit operator uint16_t() const {
      return static_cast<uint16_t>(static_cast<float>(*this));
    }
    inline explicit operator int32_t() const {
      return static_cast<int32_t>(static_cast<float>(*this));
    }
    inline explicit operator uint32_t() const {
      return static_cast<uint32_t>(static_cast<float>(*this));
    }
    inline explicit operator int64_t() const {
      return static_cast<int64_t>(static_cast<float>(*this));
    }
    inline explicit operator uint64_t() const {
      return static_cast<uint64_t>(static_cast<float>(*this));
    }
    inline operator double() const {
      return static_cast<double>(static_cast<float>(*this));
    }

    // ---------------------------------------------------------------------
    // Mixed comparisons against float / double
    // ---------------------------------------------------------------------
    inline bool operator>(const float &other) const {
      return this->operator float() > other;
    }
    inline bool operator>(const double &other) const {
      return this->operator double() > other;
    }
    // The less-than overloads previously compared with '>' and therefore
    // returned the result of the opposite comparison.
    inline bool operator<(const float &other) const {
      return this->operator float() < other;
    }
    inline bool operator<(const double &other) const {
      return this->operator double() < other;
    }

    // Accumulates a value of any non-float16 type: the sum is formed in T
    // and then converted back to half precision.
    template <typename T,
              typename std::enable_if<!std::is_same<T, float16>::value,
                                      bool>::type = true>
    inline float16 &operator+=(const T &other) {
      *this = float16(static_cast<T>(*this) + other);
      return *this;
    }

   private:
    // Storage used by the portable conversion routines to view the same 32
    // bits as a float, a signed integer or an unsigned integer.
    union Bits {
      float f;
      int32_t si;
      uint32_t ui;
    };

    static const int shift = 13;      // mantissa width difference (23 - 10)
    static const int shiftSign = 16;  // sign bit distance (bit 31 -> bit 15)

    static const int32_t infN = 0x7F800000;
    static const int32_t maxN = 0x477FE000;  // max flt16 as flt32
    static const int32_t minN = 0x38800000;  // min flt16 normal as flt32
    static const int32_t sigN = 0x80000000;  // sign bit

    static constexpr int32_t infC = infN >> shift;
    static constexpr int32_t nanN = (infC + 1)
                                    << shift;  // minimum flt16 nan as float32
    static constexpr int32_t maxC = maxN >> shift;
    static constexpr int32_t minC = minN >> shift;
    static constexpr int32_t sigC = sigN >> shiftSign;

    static const int32_t mulN = 0x52000000;  // (1 << 23) / minN
    static const int32_t mulC = 0x33800000;  // minN / (1 << (23 - shift))
    static const int32_t subC = 0x003FF;     // max flt32 subnormal downshifted
    static const int32_t norC = 0x00400;     // min flt32 normal downshifted

    static constexpr int32_t maxD = infC - maxC - 1;
    static constexpr int32_t minD = minC - subC - 1;
  };
  229. // Arithmetic operators for float16 on ARMv8.2-A CPU
  230. #if defined(FD_WITH_NATIVE_FP16)
  231. inline float16 operator+(const float16 &a, const float16 &b) {
  232. float16 res;
  233. asm volatile("ld1 {v0.h}[0], [%[a_ptr]]\n"
  234. "ld1 {v1.h}[0], [%[b_ptr]]\n"
  235. "fadd h0, h0, h1\n"
  236. "st1 {v0.h}[0], [%[res_ptr]]\n"
  237. : // outputs
  238. : // inputs
  239. [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
  240. [res_ptr] "r"(&(res.x))
  241. : // clobbers
  242. "memory", "v0", "v1");
  243. return res;
  244. }
  245. inline float16 operator-(const float16 &a, const float16 &b) {
  246. float16 res;
  247. asm volatile("ld1 {v0.h}[0], [%[a_ptr]]\n"
  248. "ld1 {v1.h}[0], [%[b_ptr]]\n"
  249. "fsub h0, h0, h1\n"
  250. "st1 {v0.h}[0], [%[res_ptr]]\n"
  251. : // outputs
  252. : // inputs
  253. [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
  254. [res_ptr] "r"(&(res.x))
  255. : // clobbers
  256. "memory", "v0", "v1");
  257. return res;
  258. }
  259. inline float16 operator*(const float16 &a, const float16 &b) {
  260. float16 res;
  261. asm volatile("ld1 {v0.h}[0], [%[a_ptr]]\n"
  262. "ld1 {v1.h}[0], [%[b_ptr]]\n"
  263. "fmul h0, h0, h1\n"
  264. "st1 {v0.h}[0], [%[res_ptr]]\n"
  265. : // outputs
  266. : // inputs
  267. [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
  268. [res_ptr] "r"(&(res.x))
  269. : // clobbers
  270. "memory", "v0", "v1");
  271. return res;
  272. }
  273. inline float16 operator/(const float16 &a, const float16 &b) {
  274. float16 res;
  275. asm volatile("ld1 {v0.h}[0], [%[a_ptr]]\n"
  276. "ld1 {v1.h}[0], [%[b_ptr]]\n"
  277. "fdiv h0, h0, h1\n"
  278. "st1 {v0.h}[0], [%[res_ptr]]\n"
  279. : // outputs
  280. : // inputs
  281. [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
  282. [res_ptr] "r"(&(res.x))
  283. : // clobbers
  284. "memory", "v0", "v1");
  285. return res;
  286. }
  287. inline float16 operator-(const float16 &a) {
  288. float16 res;
  289. asm volatile("ld1 {v0.h}[0], [%[a_ptr]]\n"
  290. "fneg h0, h0\n"
  291. "st1 {v0.h}[0], [%[res_ptr]]\n"
  292. : // outputs
  293. : // inputs
  294. [a_ptr] "r"(&(a.x)),
  295. [res_ptr] "r"(&(res.x))
  296. : // clobbers
  297. "memory", "v0");
  298. return res;
  299. }
  300. inline float16 &operator+=(float16 &a, const float16 &b) { // NOLINT
  301. a = a + b;
  302. return a;
  303. }
  304. inline float16 &operator-=(float16 &a, const float16 &b) { // NOLINT
  305. a = a - b;
  306. return a;
  307. }
  308. inline float16 &operator*=(float16 &a, const float16 &b) { // NOLINT
  309. a = a * b;
  310. return a;
  311. }
  312. inline float16 &operator/=(float16 &a, const float16 &b) { // NOLINT
  313. a = a / b;
  314. return a;
  315. }
  316. inline bool operator==(const float16 &a, const float16 &b) {
  317. uint16_t res;
  318. asm volatile("ld1 {v0.h}[0], [%[a_ptr]]\n"
  319. "ld1 {v1.h}[0], [%[b_ptr]]\n"
  320. "fcmeq h0, h0, h1\n"
  321. "st1 {v0.h}[0], [%[res_ptr]]\n"
  322. : // outputs
  323. : // inputs
  324. [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
  325. [res_ptr] "r"(&res)
  326. : // clobbers
  327. "memory", "v0", "v1");
  328. return (res & 0xffff) != 0;
  329. }
  330. inline bool operator!=(const float16 &a, const float16 &b) { return !(a == b); }
  331. inline bool operator<(const float16 &a, const float16 &b) {
  332. uint16_t res;
  333. asm volatile("ld1 {v1.h}[0], [%[a_ptr]]\n"
  334. "ld1 {v0.h}[0], [%[b_ptr]]\n"
  335. "fcmgt h0, h0, h1\n"
  336. "st1 {v0.h}[0], [%[res_ptr]]\n"
  337. : // outputs
  338. : // inputs
  339. [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
  340. [res_ptr] "r"(&res)
  341. : // clobbers
  342. "memory", "v0", "v1");
  343. return (res & 0xffff) != 0;
  344. }
  345. inline bool operator<=(const float16 &a, const float16 &b) {
  346. uint16_t res;
  347. asm volatile("ld1 {v1.h}[0], [%[a_ptr]]\n"
  348. "ld1 {v0.h}[0], [%[b_ptr]]\n"
  349. "fcmge h0, h0, h1\n"
  350. "st1 {v0.h}[0], [%[res_ptr]]\n"
  351. : // outputs
  352. : // inputs
  353. [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
  354. [res_ptr] "r"(&res)
  355. : // clobbers
  356. "memory", "v0", "v1");
  357. return (res & 0xffff) != 0;
  358. }
  359. inline bool operator>(const float16 &a, const float16 &b) {
  360. uint16_t res;
  361. asm volatile("ld1 {v0.h}[0], [%[a_ptr]]\n"
  362. "ld1 {v1.h}[0], [%[b_ptr]]\n"
  363. "fcmgt h0, h0, h1\n"
  364. "st1 {v0.h}[0], [%[res_ptr]]\n"
  365. : // outputs
  366. : // inputs
  367. [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
  368. [res_ptr] "r"(&res)
  369. : // clobbers
  370. "memory", "v0", "v1");
  371. return (res & 0xffff) != 0;
  372. }
  373. inline bool operator>=(const float16 &a, const float16 &b) {
  374. uint16_t res;
  375. asm volatile("ld1 {v0.h}[0], [%[a_ptr]]\n"
  376. "ld1 {v1.h}[0], [%[b_ptr]]\n"
  377. "fcmge h0, h0, h1\n"
  378. "st1 {v0.h}[0], [%[res_ptr]]\n"
  379. : // outputs
  380. : // inputs
  381. [a_ptr] "r"(&(a.x)), [b_ptr] "r"(&(b.x)),
  382. [res_ptr] "r"(&res)
  383. : // clobbers
  384. "memory", "v0", "v1");
  385. return (res & 0xffff) != 0;
  386. #else
  387. inline float16 operator+(const float16 &a, const float16 &b) {
  388. return float16(static_cast<float>(a) + static_cast<float>(b));
  389. }
  390. inline float16 operator-(const float16 &a, const float16 &b) {
  391. return float16(static_cast<float>(a) - static_cast<float>(b));
  392. }
  393. inline float16 operator*(const float16 &a, const float16 &b) {
  394. return float16(static_cast<float>(a) * static_cast<float>(b));
  395. }
  396. inline float16 operator/(const float16 &a, const float16 &b) {
  397. return float16(static_cast<float>(a) / static_cast<float>(b));
  398. }
  399. inline float16 operator-(const float16 &a) {
  400. float16 res;
  401. res.x = a.x ^ 0x8000;
  402. return res;
  403. }
  404. inline float16 &operator+=(float16 &a, const float16 &b) { // NOLINT
  405. a = float16(static_cast<float>(a) + static_cast<float>(b));
  406. return a;
  407. }
  408. inline float16 &operator-=(float16 &a, const float16 &b) { // NOLINT
  409. a = float16(static_cast<float>(a) - static_cast<float>(b));
  410. return a;
  411. }
  412. inline float16 &operator*=(float16 &a, const float16 &b) { // NOLINT
  413. a = float16(static_cast<float>(a) * static_cast<float>(b));
  414. return a;
  415. }
  416. inline float16 &operator/=(float16 &a, const float16 &b) { // NOLINT
  417. a = float16(static_cast<float>(a) / static_cast<float>(b));
  418. return a;
  419. }
  420. inline bool operator==(const float16 &a, const float16 &b) {
  421. return static_cast<float>(a) == static_cast<float>(b);
  422. }
  423. inline bool operator!=(const float16 &a, const float16 &b) {
  424. return static_cast<float>(a) != static_cast<float>(b);
  425. }
  426. inline bool operator<(const float16 &a, const float16 &b) {
  427. return static_cast<float>(a) < static_cast<float>(b);
  428. }
  429. inline bool operator<=(const float16 &a, const float16 &b) {
  430. return static_cast<float>(a) <= static_cast<float>(b);
  431. }
  432. inline bool operator>(const float16 &a, const float16 &b) {
  433. return static_cast<float>(a) > static_cast<float>(b);
  434. }
  435. inline bool operator>=(const float16 &a, const float16 &b) {
  436. return static_cast<float>(a) >= static_cast<float>(b);
  437. }
  438. #endif
  439. template <typename T,
  440. typename std::enable_if<std::is_integral<T>::value ||
  441. std::is_same<T, float>::value,
  442. bool>::type = true>
  443. inline T &operator+=(T &a, const float16 &b) { // NOLINT
  444. auto c = static_cast<float>(a) + static_cast<float>(b);
  445. a = static_cast<T>(c);
  446. return a;
  447. }
  448. inline double &operator+=(double &a, const float16 &b) { // NOLINT
  449. a = a + static_cast<double>(b);
  450. return a;
  451. }
  452. inline float16 raw_uint16_to_float16(uint16_t a) {
  453. float16 res;
  454. res.x = a;
  455. return res;
  456. }
  457. inline bool(isnan)(const float16 &a) { return (a.x & 0x7fff) > 0x7c00; }
  458. inline bool(isinf)(const float16 &a) { return (a.x & 0x7fff) == 0x7c00; }
  459. inline bool(isfinite)(const float16 &a) {
  460. return !((isnan)(a)) && !((isinf)(a));
  461. }
  462. inline float16(abs)(const float16 &a) {
  463. return float16(std::abs(static_cast<float>(a)));
  464. }
  465. inline std::ostream &operator<<(std::ostream &os, const float16 &a) {
  466. os << static_cast<float>(a);
  467. return os;
  468. }
  469. } // namespace ultra_infer
  470. namespace std {
  471. // Override the std::is_pod::value for float16
  472. // The reason is that different compilers implemented std::is_pod based on
  473. // different C++ standards. float16 class is a plain old data in C++11 given
  474. // that it is both trivial and standard_layout.
  475. // However, std::is_pod in nvcc 8.0 host c++ compiler follows C++0x and is
  476. // more restricted in that you cannot provide any customized
  477. // constructor in float16. Hence, we override is_pod here following C++11
  478. // so that .cu files can be successfully compiled by nvcc.
  479. template <> struct is_pod<ultra_infer::float16> {
  480. static const bool value = is_trivial<ultra_infer::float16>::value &&
  481. is_standard_layout<ultra_infer::float16>::value;
  482. };
  483. template <>
  484. struct is_floating_point<ultra_infer::float16>
  485. : std::integral_constant<
  486. bool, std::is_same<ultra_infer::float16,
  487. typename std::remove_cv<
  488. ultra_infer::float16>::type>::value> {};
  489. template <> struct is_signed<ultra_infer::float16> {
  490. static const bool value = true;
  491. };
  492. template <> struct is_unsigned<ultra_infer::float16> {
  493. static const bool value = false;
  494. };
  495. inline bool isnan(const ultra_infer::float16 &a) {
  496. return ultra_infer::isnan(a);
  497. }
  498. inline bool isinf(const ultra_infer::float16 &a) {
  499. return ultra_infer::isinf(a);
  500. }
  501. template <> struct numeric_limits<ultra_infer::float16> {
  502. static const bool is_specialized = true;
  503. static const bool is_signed = true;
  504. static const bool is_integer = false;
  505. static const bool is_exact = false;
  506. static const bool has_infinity = true;
  507. static const bool has_quiet_NaN = true;
  508. static const bool has_signaling_NaN = true;
  509. static const float_denorm_style has_denorm = denorm_present;
  510. static const bool has_denorm_loss = false;
  511. static const std::float_round_style round_style = std::round_to_nearest;
  512. static const bool is_iec559 = false;
  513. static const bool is_bounded = false;
  514. static const bool is_modulo = false;
  515. static const int digits = 11;
  516. static const int digits10 = 3;
  517. static const int max_digits10 = 5;
  518. static const int radix = 2;
  519. static const int min_exponent = -13;
  520. static const int min_exponent10 = -4;
  521. static const int max_exponent = 16;
  522. static const int max_exponent10 = 4;
  523. static const bool traps = true;
  524. static const bool tinyness_before = false;
  525. static ultra_infer::float16(min)() {
  526. return ultra_infer::raw_uint16_to_float16(0x400);
  527. }
  528. static ultra_infer::float16 lowest() {
  529. return ultra_infer::raw_uint16_to_float16(0xfbff);
  530. }
  531. static ultra_infer::float16(max)() {
  532. return ultra_infer::raw_uint16_to_float16(0x7bff);
  533. }
  534. static ultra_infer::float16 epsilon() {
  535. return ultra_infer::raw_uint16_to_float16(0x0800);
  536. }
  537. static ultra_infer::float16 round_error() {
  538. return ultra_infer::float16(0.5);
  539. }
  540. static ultra_infer::float16 infinity() {
  541. return ultra_infer::raw_uint16_to_float16(0x7c00);
  542. }
  543. static ultra_infer::float16 quiet_NaN() {
  544. return ultra_infer::raw_uint16_to_float16(0x7e00);
  545. }
  546. static ultra_infer::float16 signaling_NaN() {
  547. return ultra_infer::raw_uint16_to_float16(0x7e00);
  548. }
  549. static ultra_infer::float16 denorm_min() {
  550. return ultra_infer::raw_uint16_to_float16(0x1);
  551. }
  552. };
  553. inline ultra_infer::float16 abs(const ultra_infer::float16 &a) {
  554. return ultra_infer::abs(a);
  555. }
  556. } // namespace std