Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions specforge/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ class SGLangBackendArgs:
sglang_ep_size: int = 1
sglang_max_running_requests: int = None # assign based on batch size
sglang_max_total_tokens: int = None # assign based on batch size and seq length
sglang_quantization: str = None

@staticmethod
def add_args(parser: argparse.ArgumentParser) -> None:
Expand Down Expand Up @@ -174,6 +175,13 @@ def add_args(parser: argparse.ArgumentParser) -> None:
default=1,
help="The ep size of the SGLang backend",
)
parser.add_argument(
"--sglang-quantization",
type=str,
default=None,
help="The quantization method for the SGLang backend (e.g., w8a8_int8). "
"If not set, no quantization is applied.",
)

@staticmethod
def from_args(args: argparse.Namespace) -> "SGLangBackendArgs":
Expand All @@ -198,6 +206,7 @@ def from_args(args: argparse.Namespace) -> "SGLangBackendArgs":
if hasattr(args, "target_batch_size") and hasattr(args, "max_length")
else None
),
sglang_quantization=getattr(args, "sglang_quantization", None),

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

For consistency with the other `sglang_*` fields in this method (lines 189-200), access `args.sglang_quantization` directly instead of going through `getattr`. The `add_args` method of this class registers `--sglang-quantization` with `default=None`, so the attribute is always present on a namespace produced by that parser when `from_args` is called.

Suggested change
sglang_quantization=getattr(args, "sglang_quantization", None),
sglang_quantization=args.sglang_quantization,

)

def to_kwargs(self) -> Dict[str, Any]:
Expand All @@ -214,6 +223,7 @@ def to_kwargs(self) -> Dict[str, Any]:
piecewise_cuda_graph_max_tokens=self.sglang_piecewise_cuda_graph_max_tokens,
piecewise_cuda_graph_tokens=self.sglang_piecewise_cuda_graph_tokens,
ep_size=self.sglang_ep_size,
quantization=self.sglang_quantization,
max_running_requests=self.sglang_max_running_requests,
max_total_tokens=self.sglang_max_total_tokens,
)