Path: blob/master/guides/customizing_quantization.py
"""1Title: Customizing Quantization with QuantizationConfig2Author: [Jyotinder Singh](https://x.com/Jyotinder_Singh)3Date created: 2025/12/184Last modified: 2025/12/185Description: Guide on using QuantizationConfig for weight-only quantization and custom quantizers.6Accelerator: GPU7"""89"""10## Introduction1112This guide explores the flexible `QuantizationConfig` API in Keras, introduced to give you granular control over how your models are quantized.13While `model.quantize("int8")` provides a great default, you often need more control. For example, to perform **weight-only quantization** (common in LLMs) or to use **custom quantization schemes** (like percentile-based clipping).1415We will cover:16171. **Customizing INT8 Quantization**: Modifying the default parameters (e.g., custom value range).182. **Weight-Only Quantization (INT4)**: Quantizing weights to 4-bit while keeping activations in float, using `Int4QuantizationConfig`.193. **Custom Quantizers**: Implementing a completely custom quantizer (e.g., `PercentileQuantizer`) and using it with `QuantizationConfig`.20"""2122"""23## Setup24"""2526import keras27import numpy as np28from keras import ops2930rng = np.random.default_rng()313233def get_model():34"""Builds a simple Sequential model for demonstration."""35return keras.Sequential(36[37keras.Input(shape=(10,)),38keras.layers.Dense(32, activation="relu"),39keras.layers.Dense(1),40]41)424344"""45## 1. Customizing INT8 Quantization4647By default, `model.quantize("int8")` uses `AbsMaxQuantizer` for both weights and activations which uses the default value range of [-127, 127].48You might want to specify different parameters, such as a restricted value range (if you expect your activations to be within a certain range).49You can do this by creating an `Int8QuantizationConfig`.50"""5152from keras.quantizers import Int8QuantizationConfig, AbsMaxQuantizer5354model = get_model()5556# Create a custom config57# Here we restrict the weight range to [-100, 100] instead of the default [-127, 127]58custom_int8_config = Int8QuantizationConfig(59weight_quantizer=AbsMaxQuantizer(value_range=(-100, 100), axis=0),60activation_quantizer=AbsMaxQuantizer(value_range=(-100, 100), axis=-1),61)6263# Apply quantization with the custom config64model.quantize(config=custom_int8_config)6566print("Layer 0 kernel dtype:", model.layers[0].kernel.dtype)67# Ensure all kernel values are within the specified range68assert ops.all(69ops.less_equal(model.layers[0].kernel, 100)70), "Kernel values are not <= 100"71assert ops.all(72ops.greater_equal(model.layers[0].kernel, -100)73), "Kernel values are not >= -100"7475"""76## 2. Weight-Only Quantization (INT4)7778By default, `model.quantize("int4")` quantizes activations to INT8 while keeping weights in INT4.79For large language models and memory-constrained environments, **weight-only quantization** is a popular technique.80It reduces the model size significantly (keeping weights in 4-bit) while maintaining higher precision for activations.8182To achieve this, we set `activation_quantizer=None` in the `Int4QuantizationConfig`.83"""8485from keras.quantizers import Int4QuantizationConfig8687model = get_model()8889# Define Int4 weight-only config90# We enable Int4 for weights, but disable activation quantization by setting it to None.91# Note that we use `"int8"` as the output dtype since TensorFlow and PyTorch don't support92# `int4`. 
"""
## 3. Custom Quantizers: Implementing a Percentile Quantizer

Sometimes, standard absolute-max quantization isn't enough. You might want to be robust to outliers by using **percentile-based quantization**.
Keras allows you to define your own quantizer by subclassing `keras.quantizers.Quantizer`.

Below is an implementation of a `PercentileQuantizer` that sets the scale based on a specified percentile of the absolute values.
"""

from keras.quantizers import Quantizer
from keras import backend


class PercentileQuantizer(Quantizer):
    """Quantizes x using a percentile-based scale."""

    def __init__(
        self,
        percentile=99.9,
        value_range=(-127, 127),  # Default range for int8
        epsilon=backend.epsilon(),
        output_dtype="int8",  # Default dtype for int8
    ):
        super().__init__(output_dtype=output_dtype)
        self.percentile = percentile
        self.value_range = value_range
        self.epsilon = epsilon

    def __call__(self, x, axis, to_numpy=False):
        """Quantizes x using the percentile-based scale.

        `to_numpy` can be set to True to perform the computation on the host CPU,
        which saves device memory.
        """
        # 1. Compute the percentile value of the absolute inputs
        x_abs = ops.abs(x)

        if to_numpy:
            x_np = ops.convert_to_numpy(x_abs)
            max_val = np.percentile(x_np, self.percentile, axis=axis, keepdims=True)
        else:
            max_val = ops.quantile(
                x_abs, self.percentile / 100, axis=axis, keepdims=True
            )

        # 2. Compute the scale
        # scale = range_max / max_val
        # We add epsilon so that max_val can never cause a division by zero.
        scale = ops.divide(self.value_range[1], ops.add(max_val, self.epsilon))
        if not to_numpy:
            scale = ops.cast(scale, backend.standardize_dtype(x.dtype))

        # 3. Quantize
        # q = round(x * scale), clipped to the value range
        outputs = ops.multiply(x, scale)
        outputs = ops.clip(ops.round(outputs), self.value_range[0], self.value_range[1])
        outputs = ops.cast(outputs, self.output_dtype)

        return outputs, scale

    def get_config(self):
        """Returns the config of the quantizer for serialization support."""
        return {
            "percentile": self.percentile,
            "value_range": self.value_range,
            "epsilon": self.epsilon,
            "output_dtype": self.output_dtype,
        }
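"""
Before wiring the quantizer into a config, it helps to see it act on raw data. The short sketch
below (not part of the class above) calls the quantizer directly on a random matrix containing
an injected outlier and prints the quantized dtype along with the per-column scales.
"""

# Sketch: quantize a random matrix with one large outlier; `axis=0` yields one scale per column.
percentile_quantizer = PercentileQuantizer(percentile=99.0)
sample = rng.standard_normal((1000, 4)).astype("float32")
sample[0, 0] = 100.0  # outlier that plain abs-max scaling would latch onto
quantized, scales = percentile_quantizer(ops.convert_to_tensor(sample), axis=0)
print("Quantized dtype:", quantized.dtype)
print("Per-column scales:", ops.convert_to_numpy(scales).flatten())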
"""
Now we can use this `PercentileQuantizer` in our configuration.
"""

model = get_model()

# Use the custom quantizer for activations
custom_int8_config = Int8QuantizationConfig(
    weight_quantizer=AbsMaxQuantizer(axis=0),
    activation_quantizer=PercentileQuantizer(percentile=99.9),
)

model.quantize(config=custom_int8_config)

# Verify the integration
print(
    "Layer 0 uses custom activation quantizer:",
    isinstance(model.layers[0].inputs_quantizer, PercentileQuantizer),
)

"""
## Conclusion

With `QuantizationConfig`, you are no longer limited to stock quantization options.
Whether you need weight-only quantization or custom quantizers for specialized hardware or research,
Keras provides the modularity to build exactly what you need.
"""