33 const auto scale =
static_cast<float>((1 << N) - 1);
38 return static_cast<int>(v * scale + 0.5f);
43 return q /
static_cast<float>((1 << Nbits) - 1);
54 const auto scale =
static_cast<float>((1 << (N - 1)) - 1);
56 float round = (v >= 0 ? 0.5f : -0.5f);
58 v = (v >= -1) ? v : -1;
59 v = (v <= +1) ? v : +1;
61 return static_cast<int>(v * scale + round);
66 return q /
static_cast<float>((1 << (Nbits - 1)) - 1);
Mathematical utilities and types.
Definition Math.hpp:21
float dequantizeFP16(unsigned short h)
Definition Quantization.cpp:53
uint32_t quantizeSnormShifted(float v, int32_t Nbits)
Definition Quantization.hpp:70
int32_t quantizeSnorm(float v, int32_t N)
Definition Quantization.hpp:53
float dequantizeSnorm(int32_t q, int32_t Nbits)
Definition Quantization.hpp:65
float dequantizeUnorm(int32_t q, int32_t Nbits)
Definition Quantization.hpp:42
float dequantizeSnormShifted(uint32_t q, int32_t Nbits)
Definition Quantization.hpp:75
float quantizeFP32(float v, int N)
Definition Quantization.cpp:32
uint32_t quantizeUnorm(float v, int32_t N)
Definition Quantization.hpp:32
unsigned short quantizeFP16(float v)
Definition Quantization.cpp:9