"""
Title: Customizing Quantization with QuantizationConfig
Author: [Jyotinder Singh](https://x.com/Jyotinder_Singh)
Date created: 2025/12/18
Last modified: 2025/12/18
Description: Guide on using QuantizationConfig for weight-only quantization and custom quantizers.
Accelerator: GPU
"""

"""
## Introduction

This guide explores the flexible `QuantizationConfig` API in Keras, introduced to give you granular control over how your models are quantized.
While `model.quantize("int8")` provides a great default, you often need more control: for example, to perform **weight-only quantization** (common in LLMs) or to use **custom quantization schemes** (like percentile-based clipping).

We will cover:

1. **Customizing INT8 Quantization**: Modifying the default parameters (e.g., using a custom value range).
2. **Weight-Only Quantization (INT4)**: Quantizing weights to 4-bit while keeping activations in float, using `Int4QuantizationConfig`.
3. **Custom Quantizers**: Implementing a completely custom quantizer (e.g., a `PercentileQuantizer`) and using it with `QuantizationConfig`.
"""

"""
## Setup
"""

import keras
import numpy as np
from keras import ops

rng = np.random.default_rng()


def get_model():
    """Builds a simple Sequential model for demonstration."""
    return keras.Sequential(
        [
            keras.Input(shape=(10,)),
            keras.layers.Dense(32, activation="relu"),
            keras.layers.Dense(1),
        ]
    )
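

"""
As a baseline, the one-liner mentioned in the introduction applies the stock int8
scheme with no configuration at all (a minimal sketch of the default path, using the
`get_model` helper defined above):
"""

baseline = get_model()
baseline.quantize("int8")
print("Baseline kernel dtype:", baseline.layers[0].kernel.dtype)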


"""
## 1. Customizing INT8 Quantization

By default, `model.quantize("int8")` uses `AbsMaxQuantizer` for both weights and activations, with a default value range of `[-127, 127]`.
You might want to specify different parameters, such as a restricted value range (if you expect your activations to fall within a certain range).
You can do this by creating an `Int8QuantizationConfig`.
"""

from keras.quantizers import Int8QuantizationConfig, AbsMaxQuantizer

model = get_model()

# Create a custom config.
# Here we restrict the value range for both weights and activations to [-100, 100]
# instead of the default [-127, 127].
custom_int8_config = Int8QuantizationConfig(
    weight_quantizer=AbsMaxQuantizer(value_range=(-100, 100), axis=0),
    activation_quantizer=AbsMaxQuantizer(value_range=(-100, 100), axis=-1),
)

# Apply quantization with the custom config
model.quantize(config=custom_int8_config)

print("Layer 0 kernel dtype:", model.layers[0].kernel.dtype)
# Ensure all kernel values are within the specified range
assert ops.all(
    ops.less_equal(model.layers[0].kernel, 100)
), "Kernel values are not <= 100"
assert ops.all(
    ops.greater_equal(model.layers[0].kernel, -100)
), "Kernel values are not >= -100"
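
"""
The quantized model can be called like any other Keras model. As a quick smoke test
(a minimal sketch, reusing the `rng` generator from the Setup section), we run a
forward pass on random inputs:
"""

x = rng.standard_normal((4, 10)).astype("float32")
y = model(x)
print("Output shape:", y.shape, "dtype:", y.dtype)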

"""
## 2. Weight-Only Quantization (INT4)

By default, `model.quantize("int4")` stores weights in INT4 and quantizes activations to INT8.
For large language models and memory-constrained environments, **weight-only quantization** is a popular technique.
It significantly reduces model size (weights are kept in 4 bits) while maintaining higher precision for activations, which stay in float.

To achieve this, we set `activation_quantizer=None` in the `Int4QuantizationConfig`.
"""

from keras.quantizers import Int4QuantizationConfig

model = get_model()

# Define an int4 weight-only config.
# We enable int4 for weights, but disable activation quantization by setting
# `activation_quantizer` to None.
# Note that we use `"int8"` as the output dtype, since TensorFlow and PyTorch don't
# support a native `int4` dtype. We still benefit from the lower memory usage of
# int4 weights thanks to the bit-packing implemented by Keras.
custom_int4_config = Int4QuantizationConfig(
    weight_quantizer=AbsMaxQuantizer(value_range=(-8, 7), output_dtype="int8", axis=0),
    activation_quantizer=None,
)

model.quantize(config=custom_int4_config)

# Verify that the weights are quantized (int4 packed into an int8 backing tensor)
# and that no activation quantizer was attached.
print("Layer 0 kernel dtype:", model.layers[0].kernel.dtype)
print("Layer 0 has inputs_quantizer:", model.layers[0].inputs_quantizer is not None)
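
"""
Since we did not attach an activation quantizer, activations flow through in float and
only the stored weights are quantized. A quick forward pass (again a minimal sketch,
reusing the `rng` generator from Setup) confirms that the outputs remain floating point:
"""

x = rng.standard_normal((4, 10)).astype("float32")
y = model(x)
print("Output dtype:", y.dtype)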

"""
## 3. Custom Quantizers: Implementing a Percentile Quantizer

Sometimes, standard absolute-max quantization isn't enough: a single large outlier can
inflate the scale and crush the resolution of every other value. You can make the scheme
robust to outliers by using **percentile-based quantization**, which derives the scale
from a percentile of the absolute values and simply clips the outliers.
Keras allows you to define your own quantizer by subclassing `keras.quantizers.Quantizer`.

Below is an implementation of a `PercentileQuantizer` that sets the scale based on a
specified percentile of the absolute values.
"""

from keras.quantizers import Quantizer
from keras import backend


class PercentileQuantizer(Quantizer):
    """Quantizes inputs using a percentile-based scale."""

    def __init__(
        self,
        percentile=99.9,
        value_range=(-127, 127),  # Default range for int8
        epsilon=backend.epsilon(),
        output_dtype="int8",  # Default dtype for int8
    ):
        super().__init__(output_dtype=output_dtype)
        self.percentile = percentile
        self.value_range = value_range
        self.epsilon = epsilon

    def __call__(self, x, axis, to_numpy=False):
        """Quantizes x using the percentile-based scale.

        `to_numpy` can be set to True to perform the computation on the host CPU,
        which saves device memory.
        """
        # 1. Compute the percentile value of the absolute inputs
        x_abs = ops.abs(x)

        if to_numpy:
            x_np = ops.convert_to_numpy(x_abs)
            max_val = np.percentile(x_np, self.percentile, axis=axis, keepdims=True)
        else:
            max_val = ops.quantile(
                x_abs, self.percentile / 100, axis=axis, keepdims=True
            )

        # 2. Compute the scale: scale = range_max / max_val.
        # Adding epsilon keeps the denominator strictly positive.
        scale = ops.divide(self.value_range[1], ops.add(max_val, self.epsilon))
        if not to_numpy:
            scale = ops.cast(scale, backend.standardize_dtype(x.dtype))

        # 3. Quantize: q = clip(round(x * scale))
        outputs = ops.multiply(x, scale)
        outputs = ops.clip(ops.round(outputs), self.value_range[0], self.value_range[1])
        outputs = ops.cast(outputs, self.output_dtype)

        return outputs, scale

    def get_config(self):
        """Returns the config of the quantizer for serialization support."""
        return {
            "percentile": self.percentile,
            "value_range": self.value_range,
            "epsilon": self.epsilon,
            "output_dtype": self.output_dtype,
        }
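
"""
Before wiring it into a config, we can sanity-check the quantizer directly. The small
tensor below (an illustrative example; the values are arbitrary) contains one large
outlier, and we use a deliberately low percentile so the effect is visible on just
four values. The outlier saturates at the clip boundary, while the remaining values
keep more resolution than abs-max scaling would give them:
"""

sample = ops.convert_to_tensor([[0.1, -0.2, 0.3, 25.0]])
quantized, scale = PercentileQuantizer(percentile=75.0)(sample, axis=-1)
print("Quantized values:", quantized)
print("Scale:", scale)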

"""
Now we can use this `PercentileQuantizer` in our configuration.
"""

model = get_model()

# Use the custom quantizer for activations
custom_int8_config = Int8QuantizationConfig(
    weight_quantizer=AbsMaxQuantizer(axis=0),
    activation_quantizer=PercentileQuantizer(percentile=99.9),
)

model.quantize(config=custom_int8_config)

# Verify the integration
print(
    "Layer 0 uses custom activation quantizer:",
    isinstance(model.layers[0].inputs_quantizer, PercentileQuantizer),
)
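
"""
Because `PercentileQuantizer` implements `get_config`, it can participate in Keras
serialization. As a minimal sketch of the round trip (reconstructing the quantizer
by hand from its config, rather than through a full model save/load):
"""

config = model.layers[0].inputs_quantizer.get_config()
restored = PercentileQuantizer(**config)
print("Restored percentile:", restored.percentile)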

"""
## Conclusion

With `QuantizationConfig`, you are no longer limited to stock quantization options.
Whether you need weight-only quantization or custom quantizers for specialized hardware
or research, Keras provides the modularity to build exactly what you need.
"""