-
Notifications
You must be signed in to change notification settings - Fork 88
Add user_vision_size in VLM's get_specializations for chunked embedding in vLLM v1 #996
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: release/v1.21.6
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -731,7 +731,12 @@ def get_specializations( | |
| elif img_size is None: | ||
| img_size = 896 # FIXME based on gemma3 Image size | ||
| logger.warning("Setting img_size to be 336, as it was neither passed nor found in vision_config") | ||
| mm_tokens_per_image = getattr(self.config, "mm_tokens_per_image", 256) | ||
| user_vision_size = compiler_options.pop("vision_size", None) | ||
| if user_vision_size: | ||
| assert user_vision_size < ctx_len, "vision_size must be less than ctx_len" | ||
| vision_size = user_vision_size | ||
| else: | ||
| vision_size = getattr(self.config, "mm_tokens_per_image", 256) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a warning that `mm_tokens_per_image` will be deprecated in the next release. Having two input arguments for a single input introduces ambiguity. |
||
|
|
||
| vision = [ | ||
| { | ||
|
|
@@ -752,7 +757,7 @@ def get_specializations( | |
| "comp_ctx_lengths": comp_ctx_lengths_prefill[i], | ||
| "sliding_window": self.language_model.config.sliding_window, | ||
| "img_size": img_size, | ||
| "mm_tokens_per_image": mm_tokens_per_image, | ||
| "vision_size": vision_size, | ||
| "vision_batch_size": batch_size, | ||
| } | ||
| if continuous_batching: | ||
|
|
@@ -771,7 +776,7 @@ def get_specializations( | |
| "comp_ctx_lengths": comp_ctx_lengths_decode[i], | ||
| "sliding_window": self.language_model.config.sliding_window, | ||
| "img_size": img_size, | ||
| "mm_tokens_per_image": mm_tokens_per_image, | ||
| "vision_size": vision_size, | ||
| "vision_batch_size": batch_size, | ||
| } | ||
| if continuous_batching: | ||
|
|
@@ -787,7 +792,7 @@ def get_specializations( | |
| "ctx_len": ctx_len, | ||
| "sliding_window": self.language_model.config.sliding_window, | ||
| "img_size": img_size, | ||
| "mm_tokens_per_image": mm_tokens_per_image, | ||
| "vision_size": vision_size, | ||
| "vision_batch_size": batch_size, | ||
| } | ||
| if continuous_batching: | ||
|
|
@@ -803,7 +808,7 @@ def get_specializations( | |
| "ctx_len": ctx_len, | ||
| "sliding_window": self.language_model.config.sliding_window, | ||
| "img_size": img_size, | ||
| "mm_tokens_per_image": mm_tokens_per_image, | ||
| "vision_size": vision_size, | ||
| "vision_batch_size": batch_size, | ||
| } | ||
| if continuous_batching: | ||
|
|
@@ -829,7 +834,7 @@ def get_onnx_dynamic_axes( | |
| lang_dynamic_axes = {} | ||
| lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} | ||
| lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"} | ||
| lang_dynamic_axes["vision_embeds"] = {0: "vision_batch_size", 1: "mm_tokens_per_image"} | ||
| lang_dynamic_axes["vision_embeds"] = {0: "vision_batch_size", 1: "vision_size"} | ||
| if continuous_batching: | ||
| lang_dynamic_axes["batch_index"] = {0: "batch_size"} | ||
| vision_dynamic_axes["pixel_values"] = {0: "batch_size", 2: "img_size", 3: "img_size"} | ||
|
|
@@ -911,13 +916,13 @@ def get_dummy_inputs( | |
| else: | ||
| img_size = 896 | ||
|
|
||
| mm_tokens_per_image = getattr(self.config, "mm_tokens_per_image", 256) | ||
| vision_size = getattr(self.config, "mm_tokens_per_image", 256) | ||
| # Define shapes | ||
| inputs_shapes = {} | ||
| inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) | ||
| inputs_shapes["vision_embeds"] = ( | ||
| 1, # constants.INTERN_NUM_PATCHES, | ||
| mm_tokens_per_image, # constants.INTERN_FEATURE_SIZE, | ||
| vision_size, # constants.INTERN_FEATURE_SIZE, | ||
| self.language_model.config.hidden_size, # 5120 | ||
| ) | ||
| inputs_shapes["position_ids"] = ( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -972,13 +972,14 @@ def get_specializations( | |
| "resolution." | ||
| ) | ||
| else: | ||
| assert vision_size * f <= user_vision_size, ( | ||
| f"Computed vision_size of {vision_size * f} tokens " | ||
| f"(vision_size={vision_size}, num_frames={f}) for image resolution " | ||
| f"(width={w}, height={h}) cannot exceed the provided " | ||
| f"vision_size={user_vision_size}. Please adjust the image resolution or " | ||
| "increase the vision_size." | ||
| ) | ||
| if vision_size * f >= user_vision_size: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: Do we want the warning to be raised even if `vision_size * f == user_vision_size`? I think we should raise the warning only when `user_vision_size` is strictly less than the calculated size. |
||
| logger.warning_once( | ||
| f"Computed vision_size of {vision_size * f} tokens " | ||
| f"(vision_size={vision_size}, num_frames={f}) for image resolution " | ||
| f"(width={w}, height={h}) exceed the provided " | ||
| f"vision_size={user_vision_size}. " | ||
| f"Vision embedding need to be chunked during prefill." | ||
| ) | ||
|
|
||
| vision.append( | ||
| { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Raise an exception instead of using `assert` in all modelling changes.