Quantized T5-XXL text encoder of FLUX.1 [schnell], produced with Hugging Face optimum-quanto.
Quantize
# Quantize the FLUX.1 [schnell] T5-XXL text encoder to float8 weights
# and save the result to a local directory.
import torch
from transformers import T5EncoderModel
from optimum.quanto import (
    QuantizedTransformersModel,
    qfloat8_e4m3fn,
    qfloat8_e5m2,
    qint8,
    qint4,
)

REPO_NAME = "black-forest-labs/FLUX.1-schnell"
TEXT_ENCODER = "text_encoder_2"

# Load the full-precision encoder in bfloat16 from the FLUX.1 repo,
# quantize its weights to float8 (e4m3fn), then serialize to disk.
encoder = T5EncoderModel.from_pretrained(
    REPO_NAME,
    subfolder=TEXT_ENCODER,
    torch_dtype=torch.bfloat16,
)
quantized_encoder = QuantizedTransformersModel.quantize(
    encoder, weights=qfloat8_e4m3fn
)
quantized_encoder.save_pretrained("./t5_xxl/qfloat8_e4m3fn")
Load
Currently, `QuantizedTransformersModel` does not support loading a quantized
model directly from the Hugging Face Hub, so it is loaded from a local directory.
# Load the quantized T5-XXL text encoder from a local directory.
# QuantizedTransformersModel does not yet load quantized checkpoints
# straight from the Hugging Face Hub, and it needs a subclass that binds
# the correct transformers auto-class for a text encoder.
from transformers import T5EncoderModel, AutoModelForTextEncoding
from optimum.quanto import QuantizedTransformersModel

# Must match the directory the quantize step saved to.
MODEL_PATH = "./t5_xxl/qfloat8_e4m3fn"


class QuantizedModelForTextEncoding(QuantizedTransformersModel):
    """QuantizedTransformersModel bound to AutoModelForTextEncoding."""

    auto_class = AutoModelForTextEncoding


# Fix: the original hard-coded "./t5_xxl/qint8" here, which does not exist —
# the quantize step saved to the qfloat8_e4m3fn directory. Use the
# (previously unused) MODEL_PATH constant so the two stay in sync.
qmodel = QuantizedModelForTextEncoding.from_pretrained(MODEL_PATH)