Foundation
Loading...
Searching...
No Matches
Quantization.hpp
Go to the documentation of this file.
1
6#pragma once
7#include <cstdint>
8#include <bitset>
9namespace Foundation::Math {
15 float quantizeFP32(float v, int32_t N);
16
23 uint16_t quantizeFP16(float v);
24
29 float dequantizeFP16(uint16_t h);
30
/**
 * Quantizes a value from the [0, 1] range to an unsigned N-bit code in
 * [0, (1 << N) - 1].
 *
 * Inputs outside [0, 1] are clamped. NaN fails the lower-bound comparison and
 * therefore maps to 0. The scaled value is rounded to the nearest code, with
 * halves rounding up.
 *
 * @param v Value to quantize.
 * @param N Bit width of the code. Widths <= 0 produce 0; widths >= 32 use the
 *          full 32-bit range.
 * @return The quantized code.
 */
inline uint32_t quantizeUnorm(float v, int32_t N) {
    if (N <= 0) {
        return 0;
    }

    // Build the largest code with unsigned arithmetic: `1 << N` on a signed
    // int overflows once N reaches 31.
    const uint32_t maxCode = (N >= 32) ? 0xFFFFFFFFu : ((uint32_t{1} << N) - 1u);

    v = (v >= 0) ? v : 0;
    v = (v <= 1) ? v : 1;

    // Scale in double precision. A float cannot hold `maxCode + 0.5` exactly
    // for N >= 24, which made v == 1 round up to `1 << N` (one past the
    // largest valid code).
    const double scaled = static_cast<double>(v) * static_cast<double>(maxCode) + 0.5;
    return static_cast<uint32_t>(scaled);
}
40
/*
 * Converts an unsigned Nbits-wide code back to a float by dividing it by the
 * largest code, (1 << Nbits) - 1. Codes in [0, (1 << Nbits) - 1] therefore
 * map onto [0, 1].
 */
inline float dequantizeUnorm(int32_t q, int32_t Nbits) {
    const int32_t maxCode = (1 << Nbits) - 1;
    const float denominator = static_cast<float>(maxCode);
    return static_cast<float>(q) / denominator;
}
45
/**
 * Quantizes a value from the [-1, 1] range to a signed N-bit code in
 * [-((1 << (N - 1)) - 1), (1 << (N - 1)) - 1].
 *
 * Inputs outside [-1, 1] are clamped. NaN fails the lower-bound comparison and
 * therefore maps to the most negative code. The scaled value is rounded to
 * the nearest code, with halves rounding away from zero.
 *
 * @param v Value to quantize.
 * @param N Bit width of the code, sign bit included. Widths <= 1 leave no
 *          magnitude bits and produce 0; widths > 32 are treated as 32.
 * @return The quantized code.
 */
inline int32_t quantizeSnorm(float v, int32_t N) {
    if (N <= 1) {
        return 0;
    }

    // Build the largest magnitude with unsigned arithmetic: for N == 32 the
    // signed expression `(1 << 31) - 1` overflows.
    const int32_t width = (N > 32) ? 32 : N;
    const uint32_t maxCode = (uint32_t{1} << (width - 1)) - 1u;

    // Half-unit bias with the sign of the input, so truncation toward zero
    // rounds to the nearest code.
    const double bias = (v >= 0) ? 0.5 : -0.5;

    v = (v >= -1) ? v : -1;
    v = (v <= +1) ? v : +1;

    // Scale in double precision. A float cannot hold `maxCode + 0.5` exactly
    // for N >= 25, which made v == +/-1 overshoot the largest valid code.
    const double scaled = static_cast<double>(v) * static_cast<double>(maxCode) + bias;
    return static_cast<int32_t>(scaled);
}
63
// Converts a signed Nbits-wide code back to a float by dividing it by the
// largest magnitude, (1 << (Nbits - 1)) - 1. Codes in
// [-((1 << (Nbits - 1)) - 1), (1 << (Nbits - 1)) - 1] therefore map onto [-1, 1].
inline float dequantizeSnorm(int32_t q, int32_t Nbits) {
    const int32_t maxMagnitude = (1 << (Nbits - 1)) - 1;
    const float denominator = static_cast<float>(maxMagnitude);
    return static_cast<float>(q) / denominator;
}
68
69 // [-1, 1] range -> [0, 1 << NBits) \in N
70 inline uint32_t quantizeSnormShifted(float v, int32_t Nbits) {
71 return quantizeSnorm(v, Nbits) + (1 << (Nbits - 1));
72 }
73
74 // [0, 1 << NBits) \in N -> [-1, 1] range
75 inline float dequantizeSnormShifted(uint32_t q, int32_t Nbits) {
76 return dequantizeSnorm(q - (1 << (Nbits - 1)), Nbits);
77 }
78}
Mathematical utilities and types.
Definition Math.hpp:21
float dequantizeFP16(unsigned short h)
Definition Quantization.cpp:53
uint32_t quantizeSnormShifted(float v, int32_t Nbits)
Definition Quantization.hpp:70
int32_t quantizeSnorm(float v, int32_t N)
Definition Quantization.hpp:53
float dequantizeSnorm(int32_t q, int32_t Nbits)
Definition Quantization.hpp:65
float dequantizeUnorm(int32_t q, int32_t Nbits)
Definition Quantization.hpp:42
float dequantizeSnormShifted(uint32_t q, int32_t Nbits)
Definition Quantization.hpp:75
float quantizeFP32(float v, int N)
Definition Quantization.cpp:32
uint32_t quantizeUnorm(float v, int32_t N)
Definition Quantization.hpp:32
unsigned short quantizeFP16(float v)
Definition Quantization.cpp:9