30 const auto scale =
static_cast<float>((1 << N) - 1);
35 return static_cast<int>(v * scale + 0.5f);
/// @brief Converts an unsigned-normalized integer code back to a float.
///
/// Computes q / (2^Nbits - 1), so q == 0 yields 0.0f and q == 2^Nbits - 1
/// yields 1.0f.
///
/// @param q     Quantized code. It is not range-checked or clamped; values
///              outside [0, 2^Nbits - 1] produce results outside [0, 1].
/// @param Nbits Bit width of the encoding. Must be at least 1: with
///              Nbits == 0 the divisor is zero.
/// @return q divided by the largest code representable in Nbits bits.
inline float dequantizeUnorm(int32_t q, int32_t Nbits) {
  // Shift in 64-bit unsigned arithmetic: evaluating `(1 << Nbits) - 1` in a
  // plain int overflows once Nbits reaches 31.
  const uint64_t maxCode = (uint64_t{1} << Nbits) - 1;
  return static_cast<float>(q) / static_cast<float>(maxCode);
}
50 const auto scale =
static_cast<float>((1 << (N - 1)) - 1);
52 float round = (v >= 0 ? 0.5f : -0.5f);
54 v = (v >= -1) ? v : -1;
55 v = (v <= +1) ? v : +1;
57 return static_cast<int>(v * scale + round);
/// @brief Converts a signed-normalized integer code back to a float.
///
/// Computes q / (2^(Nbits-1) - 1), so q == 2^(Nbits-1) - 1 yields 1.0f and
/// q == -(2^(Nbits-1) - 1) yields -1.0f.
///
/// @param q     Quantized code. It is not range-checked or clamped, so the
///              most negative two's-complement code, -2^(Nbits-1), maps
///              slightly below -1.0f.
/// @param Nbits Bit width of the encoding, sign bit included. Must be at
///              least 2: with Nbits == 1 the divisor is zero.
/// @return q divided by the largest positive code representable in Nbits bits.
inline float dequantizeSnorm(int32_t q, int32_t Nbits) {
  // Shift in 64-bit unsigned arithmetic: evaluating `(1 << (Nbits - 1)) - 1`
  // in a plain int overflows for Nbits == 32.
  const uint64_t maxCode = (uint64_t{1} << (Nbits - 1)) - 1;
  return static_cast<float>(q) / static_cast<float>(maxCode);
}
Definition Decompose.hpp:4
float dequantizeFP16(unsigned short h)
Definition Quantize.cpp:54
uint32_t quantizeSnormShifted(float v, int32_t Nbits)
Definition Quantize.hpp:64
int32_t quantizeSnorm(float v, int32_t N)
Definition Quantize.hpp:48
float dequantizeSnorm(int32_t q, int32_t Nbits)
Definition Quantize.hpp:61
float dequantizeUnorm(int32_t q, int32_t Nbits)
Definition Quantize.hpp:39
float dequantizeSnormShifted(uint32_t q, int32_t Nbits)
Definition Quantize.hpp:70
float quantizeFP32(float v, int N)
Definition Quantize.cpp:33
uint32_t quantizeUnorm(float v, int32_t N)
Definition Quantize.hpp:28
unsigned short quantizeFP16(float v)
Definition Quantize.cpp:10