From dd64bae183b4bedb3cefeaa1ae090bcc455957ef Mon Sep 17 00:00:00 2001 From: liusy58 Date: Fri, 17 Apr 2026 16:08:13 +0800 Subject: [PATCH] support quant --- specforge/args.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/specforge/args.py b/specforge/args.py index 2cd5efc30..389497458 100644 --- a/specforge/args.py +++ b/specforge/args.py @@ -102,6 +102,7 @@ class SGLangBackendArgs: sglang_ep_size: int = 1 sglang_max_running_requests: int = None # assign based on batch size sglang_max_total_tokens: int = None # assign based on batch size and seq length + sglang_quantization: str = None @staticmethod def add_args(parser: argparse.ArgumentParser) -> None: @@ -174,6 +175,13 @@ def add_args(parser: argparse.ArgumentParser) -> None: default=1, help="The ep size of the SGLang backend", ) + parser.add_argument( + "--sglang-quantization", + type=str, + default=None, + help="The quantization method for the SGLang backend (e.g., w8a8_int8). " + "If not set, no quantization is applied.", + ) @staticmethod def from_args(args: argparse.Namespace) -> "SGLangBackendArgs": @@ -198,6 +206,7 @@ def from_args(args: argparse.Namespace) -> "SGLangBackendArgs": if hasattr(args, "target_batch_size") and hasattr(args, "max_length") else None ), + sglang_quantization=getattr(args, "sglang_quantization", None), ) def to_kwargs(self) -> Dict[str, Any]: @@ -214,6 +223,7 @@ def to_kwargs(self) -> Dict[str, Any]: piecewise_cuda_graph_max_tokens=self.sglang_piecewise_cuda_graph_max_tokens, piecewise_cuda_graph_tokens=self.sglang_piecewise_cuda_graph_tokens, ep_size=self.sglang_ep_size, + quantization=self.sglang_quantization, max_running_requests=self.sglang_max_running_requests, max_total_tokens=self.sglang_max_total_tokens, )