Foundation
Loading...
Searching...
No Matches
Quantize.hpp
Go to the documentation of this file.
#pragma once
#include <bit>
#include <cstdint>
#include "Math.hpp"
namespace Foundation::Math
{
    // NOTE(review): the next three functions are only declared here; the index at the
    // end of this listing places their definitions in Quantize.cpp. Their exact
    // contracts are not visible from this header -- the summaries below are inferred
    // from the names and signatures and must be confirmed against the implementation.

    // Presumably reduces `v` to `N` bits of precision while staying a float -- TODO confirm.
    float quantizeFP32(float v, int32_t N);

    // Presumably packs `v` into a 16-bit half-float bit pattern -- TODO confirm.
    uint16_t quantizeFP16(float v);

    // Presumably the inverse of quantizeFP16 -- TODO confirm.
    float dequantizeFP16(uint16_t h);

    // Quantizes a [0, 1] value to an unsigned integer code in [0, (1 << N) - 1].
    //
    // v: input value; anything outside [0, 1] is clamped, NaN becomes 0.
    // N: number of bits of the code, 0..32.
    // Returns the nearest code (v * maxCode + 0.5, truncated), never above maxCode.
    //
    // The scale is held in a float, so for N > 24 the scale itself is rounded and
    // the low bits of the code carry no extra precision.
    inline uint32_t quantizeUnorm(float v, int32_t N)
    {
        // 64-bit shift so that N == 31 and N == 32 do not overflow a signed int.
        const auto maxCode = static_cast<uint32_t>((uint64_t{1} << N) - 1u);
        const auto scale   = static_cast<float>(maxCode);

        // Ternaries instead of std::clamp on purpose: NaN fails `v >= 0` and maps to 0.
        v = (v >= 0) ? v : 0;
        v = (v <= 1) ? v : 1;

        // Float rounding can push the sum one step past maxCode (e.g. v == 1, N == 24:
        // 16777215.5f ties to 16777216), so widen first and clamp afterwards.
        const auto code = static_cast<uint64_t>(v * scale + 0.5f);
        return (code < maxCode) ? static_cast<uint32_t>(code) : maxCode;
    }

    // Maps an unsigned code in [0, (1 << Nbits) - 1] back to [0, 1].
    // Nbits == 0 gives a zero divisor; callers are expected to pass Nbits >= 1.
    inline float dequantizeUnorm(int32_t q, int32_t Nbits)
    {
        const auto maxCode = static_cast<uint32_t>((uint64_t{1} << Nbits) - 1u);
        return q / static_cast<float>(maxCode);
    }

    // Quantizes a [-1, 1] value to a signed integer code in [-maxCode, +maxCode],
    // where maxCode = (1 << (N - 1)) - 1 (symmetric range; -(1 << (N - 1)) is unused).
    //
    // v: input value; anything outside [-1, 1] is clamped, NaN becomes -maxCode.
    // N: number of bits of the code including sign, 1..32.
    // Returns the nearest code, rounding half away from zero.
    inline int32_t quantizeSnorm(float v, int32_t N)
    {
        // Unsigned shift keeps N == 32 well defined.
        const auto maxCode = static_cast<int32_t>((uint32_t{1} << (N - 1)) - 1u);
        const auto scale   = static_cast<float>(maxCode);

        // Rounding bias follows the sign of the input so truncation rounds to nearest.
        const float bias = (v >= 0) ? 0.5f : -0.5f;

        // Ternaries instead of std::clamp on purpose: NaN fails `v >= -1` and maps to -1.
        v = (v >= -1) ? v : -1;
        v = (v <= +1) ? v : +1;

        // Float rounding can overshoot maxCode by one step for large N (e.g. N == 25),
        // so widen first and clamp symmetrically.
        const auto code = static_cast<int64_t>(v * scale + bias);
        if (code > maxCode)
        {
            return maxCode;
        }
        if (code < -maxCode)
        {
            return -maxCode;
        }
        return static_cast<int32_t>(code);
    }

    // Maps a signed code in [-((1 << (Nbits - 1)) - 1), (1 << (Nbits - 1)) - 1] back to [-1, 1].
    // Nbits == 1 gives a zero divisor; callers are expected to pass Nbits >= 2.
    inline float dequantizeSnorm(int32_t q, int32_t Nbits)
    {
        const auto maxCode = static_cast<int32_t>((uint32_t{1} << (Nbits - 1)) - 1u);
        return q / static_cast<float>(maxCode);
    }

    // [-1, 1] range -> unsigned code in [1, (1 << Nbits) - 1].
    // This is quantizeSnorm biased by 1 << (Nbits - 1); code 0 is never produced.
    inline uint32_t quantizeSnormShifted(float v, int32_t Nbits)
    {
        const uint32_t offset = uint32_t{1} << (Nbits - 1);
        // Unsigned (modular) addition: a negative snorm code plus the offset lands in range.
        return static_cast<uint32_t>(quantizeSnorm(v, Nbits)) + offset;
    }

    // Unsigned code in [0, 1 << Nbits) -> [-1, 1] range; inverse of quantizeSnormShifted.
    // Code 0 (never produced by quantizeSnormShifted) decodes to slightly below -1.
    inline float dequantizeSnormShifted(uint32_t q, int32_t Nbits)
    {
        // Remove the bias in 64-bit so the subtraction cannot wrap before narrowing.
        const int64_t offset = int64_t{1} << (Nbits - 1);
        return dequantizeSnorm(static_cast<int32_t>(static_cast<int64_t>(q) - offset), Nbits);
    }
} // namespace Foundation::Math
Definition Decompose.hpp:4
float dequantizeFP16(unsigned short h)
Definition Quantize.cpp:54
uint32_t quantizeSnormShifted(float v, int32_t Nbits)
Definition Quantize.hpp:64
int32_t quantizeSnorm(float v, int32_t N)
Definition Quantize.hpp:48
float dequantizeSnorm(int32_t q, int32_t Nbits)
Definition Quantize.hpp:61
float dequantizeUnorm(int32_t q, int32_t Nbits)
Definition Quantize.hpp:39
float dequantizeSnormShifted(uint32_t q, int32_t Nbits)
Definition Quantize.hpp:70
float quantizeFP32(float v, int N)
Definition Quantize.cpp:33
uint32_t quantizeUnorm(float v, int32_t N)
Definition Quantize.hpp:28
unsigned short quantizeFP16(float v)
Definition Quantize.cpp:10