From f72d5d1e32b62dba17efccc33a2a0e1f66892167 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 8 Jun 2026 15:56:55 -0400 Subject: [PATCH 1/7] Make HF generate work on transformers v4.57 and v5 Override `can_generate() -> True` on the HF inference wrapper base. v5's `PreTrainedModel.can_generate` walks `__bases__` by name and stops at any base whose name contains "PreTrainedModel"; the intermediate base hides the `GenerationMixin` inheritance, so the check returned False and `generate()` died with "no attribute 'generation_config'". The override is unconditional, correct on both transformers majors. Also fix stale assertions in the generate tests: `vocab_size` now lives under `base_model.embeddings`, and the returned hidden-states count is `num_blocks + 1`. Co-Authored-By: Claude Opus 4.8 (1M context) --- fast_llm/engine/inference/huggingface.py | 6 ++++++ tests/models/test_generate.py | 14 +++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index 8c6365a5f..e38dd073b 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -249,3 +249,9 @@ def stop_workers(self): def inner_forward(self, *args, **kwargs) -> tuple | transformers.utils.generic.ModelOutput: # Meant to be overridden in derived classes raise NotImplementedError() + + @classmethod + def can_generate(cls) -> bool: + # `PreTrainedModel.can_generate` walks `__bases__` by name and stops at any base containing + # "PreTrainedModel"; this intermediate base hides the `GenerationMixin` inheritance from that check. + return True diff --git a/tests/models/test_generate.py b/tests/models/test_generate.py index c595b5148..9ca925448 100644 --- a/tests/models/test_generate.py +++ b/tests/models/test_generate.py @@ -151,7 +151,9 @@ def _test_for_batches( if tokenizer is not None: inputs = _prepare_data(tokenizer, use_batch_size2=False) else: - inputs = _prepare_rand_data(fast_llm_model.config.fast_llm_config.base_model.vocab_size, use_batch_size2=False) + inputs = _prepare_rand_data( + fast_llm_model.config.fast_llm_config.base_model.embeddings.vocab_size, use_batch_size2=False + ) outputs = _generate( inputs, hf_model, @@ -163,7 +165,9 @@ def _test_for_batches( if tokenizer is not None: inputs = _prepare_data(tokenizer, use_batch_size2=True) else: - inputs = _prepare_rand_data(fast_llm_model.config.fast_llm_config.base_model.vocab_size, use_batch_size2=True) + inputs = _prepare_rand_data( + fast_llm_model.config.fast_llm_config.base_model.embeddings.vocab_size, use_batch_size2=True + ) outputs = _generate( inputs, hf_model, @@ -334,7 +338,7 @@ def _test_forward_return_hidden_states( inputs_ids = torch.randint( 1, - fast_llm_model.config.fast_llm_config.base_model.vocab_size if vocab_size is None else vocab_size, + fast_llm_model.config.fast_llm_config.base_model.embeddings.vocab_size if vocab_size is None else vocab_size, [1, 10], dtype=torch.int64, generator=torch.Generator().manual_seed(42), @@ -345,8 +349,8 @@ def _test_forward_return_hidden_states( input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False ) - # hidden_states include embeddings layer - assert len(res_fast_llm.hidden_states) - 1 == len(fast_llm_model.config.fast_llm_config.base_model.decoder) + # Embeddings + one state per decoder block (the last block's output carries the final norm). + assert len(res_fast_llm.hidden_states) == fast_llm_model.config.fast_llm_config.base_model.decoder.num_blocks + 1 @pytest.mark.extra_slow From 429c62df533b6f9758643c35db530e15160eb0f7 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 10 Jun 2026 14:36:19 -0400 Subject: [PATCH 2/7] Fix v4.57 generate signature, honor source EOS, repair generate test harness Three fixes needed to make the per-model generate tests run and match HF on transformers 4.57: - `inner_forward` absorbs `cache_position` (and other version-dependent generate plumbing) via `**kwargs`. v4.57's `generate` passes `cache_position` to forward; v5 filters it out. Ignoring it is correct on the `use_cache=False` path. - The HF wrapper's `from_pretrained` applies the source HF config's `bos/eos/pad` token ids to `generation_config` (Fast-LLM's import drops them as non-architecture metadata), so `generate` stops at EOS like `AutoModelForCausalLM`. Exposed as `_apply_generation_token_ids` so manually-constructed wrappers can opt in. - `test_export_for_generate` now passes a `DistributedTestingConfig` (the helper's current signature) instead of a positional list. Co-Authored-By: Claude Opus 4.8 (1M context) --- fast_llm/engine/inference/huggingface.py | 17 ++++++++++++++++- fast_llm/models/gpt/huggingface.py | 4 ++++ tests/models/test_generate.py | 18 ++++++++++++------ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index e38dd073b..a23801d5c 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -10,6 +10,7 @@ from fast_llm.core.distributed import broadcast, broadcast_object, safe_barrier from fast_llm.engine.checkpoint.config import CheckpointLoadConfig, FastLLMCheckpointFormat +from fast_llm.engine.checkpoint.huggingface import HuggingfaceStateDictCheckpointHandler from fast_llm.engine.distributed.distributed import Distributed from fast_llm.engine.inference.config import _TRANSFORMERS_V4, HuggingfaceModelConfig from fast_llm.engine.inference.runner import InferenceRunner @@ -113,7 +114,21 @@ def from_pretrained( stage_filter=stage_filter, ) - return cls(fast_llm_model, **kwargs) + model = cls(fast_llm_model, **kwargs) + model._apply_generation_token_ids(pretrained_model_name_or_path) + return model + + def _apply_generation_token_ids(self, pretrained: CheckpointLoadConfig) -> None: + # Honor the source HF config's generation token ids: Fast-LLM's import drops them (they are + # generation metadata, not architecture), so `generate` would otherwise never stop at EOS. + # Only external (HF) checkpoints carry them; native Fast-LLM checkpoints leave the defaults. + handler_class = pretrained.format.get_handler_class() + if not issubclass(handler_class, HuggingfaceStateDictCheckpointHandler): + return + hf_config = handler_class._load_config(pretrained.path) + for key in ("bos_token_id", "eos_token_id", "pad_token_id"): + if (token_id := hf_config.get(key)) is not None: + setattr(self.generation_config, key, token_id) def _init_weights(self, module) -> None: raise NotImplementedError(module) diff --git a/fast_llm/models/gpt/huggingface.py b/fast_llm/models/gpt/huggingface.py index 7d1383b00..252c53d20 100644 --- a/fast_llm/models/gpt/huggingface.py +++ b/fast_llm/models/gpt/huggingface.py @@ -47,6 +47,10 @@ def inner_forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, return_all_prediction_heads: bool = False, + # `generate` passes version-dependent plumbing kwargs (`cache_position`, `logits_to_keep`, ...). + # They don't apply to the `use_cache=False` path: positions are reconstructed from `attention_mask`, + # and the full logits are computed and the last position selected downstream. + **kwargs, ) -> tuple | transformers.modeling_outputs.CausalLMOutputWithPast: return self._inner_forward( self._get_batch(input_ids, attention_mask), diff --git a/tests/models/test_generate.py b/tests/models/test_generate.py index 9ca925448..3fb24296c 100644 --- a/tests/models/test_generate.py +++ b/tests/models/test_generate.py @@ -10,6 +10,7 @@ from fast_llm.models.gpt.config import PretrainedGPTModelConfig from fast_llm.models.gpt.conversion.config import LlamaCheckpointFormat from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM +from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingGroup @@ -108,7 +109,9 @@ def _get_fast_llm_model_from_model( multi_stage.load_checkpoint(config.pretrained) - return HuggingfaceGPTModelForCausalLM(multi_stage, runner=runner) + model = HuggingfaceGPTModelForCausalLM(multi_stage, runner=runner) + model._apply_generation_token_ids(config.pretrained) + return model def _trim_output(output, inputs): @@ -246,11 +249,14 @@ def test_export_for_generate(run_test_script_for_all_models, model_testing_confi if model_testing_config.checkpoint_format is None: pytest.skip(f"Conversion not supported for {model_testing_config.name}") run_test_script_for_all_models( - [ - "training.train_iters=1", - f"training.export.format={model_testing_config.checkpoint_format.name}", - "training.export.interval=1", - ], + distributed_testing_config=DistributedTestingConfig( + name="test_export_for_generate", + config_args=[ + "training.train_iters=1", + f"training.export.format={model_testing_config.checkpoint_format.name}", + "training.export.interval=1", + ], + ) ) From 1081d8514bd206808c6581b3157481da4c3eb3ae Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 10 Jun 2026 14:45:30 -0400 Subject: [PATCH 3/7] Enable generate test group for llama/mistral/mixtral MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the `generate` model-testing group from `broken` to `normal` for the models where it now passes end-to-end (verified on transformers 4.57 GPU). The CUDA-bound generate tests gain `@requires_cuda` so they skip on the CPU-only CI runner instead of crashing; `test_export_for_generate` stays CPU-runnable as the dependency root. Models left `broken`, with reasons: qwen_2 (bf16/flash near-tie argmax flip), mtp_llama (forward hidden-states count not modeled for multi-head), starcoder_2 (no converter to export through), diffusion_llama/dream (bidirectional decoding). Split lm_eval into its own `lm_eval` testing group (it previously shared `generate`) so enabling generate doesn't pull the lm_eval tests — broken on transformers v5 — into normal CI. The lm_eval group is unlisted per-config, defaulting to extra-slow. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/models/test_generate.py | 7 +++++++ tests/models/test_lm_eval.py | 6 +++--- tests/utils/model_configs.py | 15 ++++++++------- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/tests/models/test_generate.py b/tests/models/test_generate.py index 3fb24296c..e40cda12d 100644 --- a/tests/models/test_generate.py +++ b/tests/models/test_generate.py @@ -12,6 +12,7 @@ from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingGroup +from tests.utils.utils import requires_cuda def _prepare_data(tokenizer, use_batch_size2: bool): @@ -211,6 +212,7 @@ def _test_generate( ) +@requires_cuda @pytest.mark.extra_slow @pytest.mark.parametrize( "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2", @@ -260,6 +262,7 @@ def test_export_for_generate(run_test_script_for_all_models, model_testing_confi ) +@requires_cuda @pytest.mark.slow @pytest.mark.depends_on(on=["test_export_for_generate[{model_testing_config}]"]) @pytest.mark.parametrize( @@ -313,6 +316,7 @@ def _test_generate_from_model(model_path, tokenizer, fast_llm_checkpoint_format) ) +@requires_cuda @pytest.mark.extra_slow def test_generate_from_model( model_path, @@ -320,6 +324,7 @@ def test_generate_from_model( _test_generate_from_model(model_path, AutoTokenizer.from_pretrained(model_path), LlamaCheckpointFormat) +@requires_cuda @pytest.mark.slow @pytest.mark.depends_on(on=["test_export_for_generate[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.generate) @@ -359,6 +364,7 @@ def _test_forward_return_hidden_states( assert len(res_fast_llm.hidden_states) == fast_llm_model.config.fast_llm_config.base_model.decoder.num_blocks + 1 +@requires_cuda @pytest.mark.extra_slow def test_forward_return_hidden_states(model_path): _test_forward_return_hidden_states( @@ -366,6 +372,7 @@ def test_forward_return_hidden_states(model_path): ) +@requires_cuda @pytest.mark.slow @pytest.mark.model_testing_group(ModelTestingGroup.generate) @pytest.mark.depends_on(on=["test_export_for_generate[{model_testing_config}]"]) diff --git a/tests/models/test_lm_eval.py b/tests/models/test_lm_eval.py index 7ae26c2d6..545f42d7c 100644 --- a/tests/models/test_lm_eval.py +++ b/tests/models/test_lm_eval.py @@ -54,7 +54,7 @@ def do_get_lm_eval_config(base_path): # "gsm8k,xnli_en,wikitext" -@pytest.mark.model_testing_group(ModelTestingGroup.generate) +@pytest.mark.model_testing_group(ModelTestingGroup.lm_eval) def test_lm_eval_in_training(run_test_script_for_all_models, run_test_script_base_path, get_lm_eval_config): run_test_script_for_all_models( distributed_testing_config=DistributedTestingConfig( @@ -75,7 +75,7 @@ def do_copy_training_output(distributed_testing_config: DistributedTestingConfig @pytest.mark.depends_on(on=["test_lm_eval_in_training[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.generate) +@pytest.mark.model_testing_group(ModelTestingGroup.lm_eval) def test_lm_eval_evaluation_last_checkpoint( run_test_script_for_all_models, run_test_script_base_path, get_lm_eval_config, copy_training_output ): @@ -89,7 +89,7 @@ def test_lm_eval_evaluation_last_checkpoint( @pytest.mark.depends_on(on=["test_lm_eval_in_training[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.generate) +@pytest.mark.model_testing_group(ModelTestingGroup.lm_eval) def test_lm_eval_evaluation_from_pretrained( run_test_script_for_all_models, run_test_script_base_path, get_lm_eval_config ): diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 3a54be088..f2f4a4aa9 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -48,6 +48,7 @@ class ModelTestingGroup(enum.StrEnum): checkpoint = "checkpoint" convert = "convert" generate = "generate" + lm_eval = "lm_eval" megatron = "megatron" distributed = "distributed" streaming = "streaming" @@ -393,12 +394,11 @@ def update_and_add_testing_config( "--untie-embeddings-and-output-weights", ], checkpoint_format=LlamaCheckpointFormat, - # TODO: Add back generate as `normal` when stable. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.main, ModelTestingGroup.checkpoint: ModelTestingGroupAction.main, ModelTestingGroup.convert: ModelTestingGroupAction.main, - ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.generate: ModelTestingGroupAction.normal, ModelTestingGroup.megatron: ModelTestingGroupAction.normal, ModelTestingGroup.distributed: ModelTestingGroupAction.normal, ModelTestingGroup.streaming: ModelTestingGroupAction.normal, @@ -486,7 +486,8 @@ def update_and_add_testing_config( # Megatron doesn't support multi-token prediction. megatron_args=None, checkpoint_format=MTPLlamaCheckpointFormat, - # TODO: Add back generate as `normal` when stable. + # `generate` matches HF, but the forward hidden-states check stays `broken`: multi-token prediction + # returns extra per-head states the single-head count assertion doesn't model. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.normal, ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, @@ -514,7 +515,8 @@ def update_and_add_testing_config( # Megatron doesn't support per sub layer biases. megatron_args=None, checkpoint_format=Qwen2CheckpointFormat, - # TODO: Add back generate as `normal` when stable. + # `generate` matches HF in fp32 but diverges in bf16/flash: a near-tie argmax flips on numerical + # noise within the compared horizon. Stays `broken` pending a curated low-margin-free case. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.normal, ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, @@ -560,12 +562,11 @@ def update_and_add_testing_config( # Megatron doesn't support sliding windows. megatron_args=None, checkpoint_format=MistralCheckpointFormat, - # TODO: Add back generate as `normal` when stable. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.normal, ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, ModelTestingGroup.convert: ModelTestingGroupAction.normal, - ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.generate: ModelTestingGroupAction.normal, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, }, @@ -653,7 +654,7 @@ def update_and_add_testing_config( ModelTestingGroup.basic: ModelTestingGroupAction.normal, ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, ModelTestingGroup.convert: ModelTestingGroupAction.normal, - ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.generate: ModelTestingGroupAction.normal, ModelTestingGroup.megatron: ModelTestingGroupAction.normal, ModelTestingGroup.distributed: ModelTestingGroupAction.normal, }, From 4a30b4751b942bc8b325da816e95a29bf3edef71 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 10 Jun 2026 16:08:53 -0400 Subject: [PATCH 4/7] Allowlist `is_llama_config` in HF config coverage check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit transformers v4 LlamaConfig carries an `is_llama_config` marker (dropped in v5) that Fast-LLM doesn't consume and that a bare PretrainedConfig omits, so the import-boundary coverage check rejected it — Fast-LLM could not import a real transformers-4.x Llama checkpoint. Add it to the static metadata allowlist. Verified on transformers 4.57.5: all supported HF converters (llama, qwen2, mistral, mixtral, mtp_llama) now report no unconsumed config keys. Co-Authored-By: Claude Opus 4.8 (1M context) --- fast_llm/engine/checkpoint/huggingface.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fast_llm/engine/checkpoint/huggingface.py b/fast_llm/engine/checkpoint/huggingface.py index bbc3a0a91..5b4fc2250 100644 --- a/fast_llm/engine/checkpoint/huggingface.py +++ b/fast_llm/engine/checkpoint/huggingface.py @@ -135,6 +135,9 @@ def _export_config(cls, config: FastLLMModelConfig) -> dict[str, typing.Any]: "auto_map", "torch_dtype", "use_cache", + # Architecture-family marker some transformers v4 configs carry (e.g. LlamaConfig); dropped + # in v5, not consumed by Fast-LLM, and absent from a bare ``PretrainedConfig``. + "is_llama_config", # Token ids — generation/inference, not architecture (a bare v5 config omits these). "bos_token_id", "decoder_start_token_id", From 1e8fa93597eb48d273b71266de215dd74a8524e1 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 10 Jun 2026 16:38:31 -0400 Subject: [PATCH 5/7] Demote starcoder_2 generate/convert (no HF converter) starcoder_2 has no checkpoint_format. The export-based generate tests always skip, so `generate` becomes `not_implemented` (was the misleading `broken`). The convert group still runs the native Distributed<->FastLLM round-trip, but that machinery is exercised by other models, so it drops to `unimportant`. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/utils/model_configs.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index f2f4a4aa9..4e07cf84b 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -359,12 +359,13 @@ def update_and_add_testing_config( "--no-position-embedding", ], checkpoint_format=None, - # TODO: Add back generate as `normal` when stable. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.normal, ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, - ModelTestingGroup.convert: ModelTestingGroupAction.normal, - ModelTestingGroup.generate: ModelTestingGroupAction.broken, + # No HF checkpoint format: the native conversion round-trip is redundant with other models, + # and the export-based generate tests can't run. + ModelTestingGroup.convert: ModelTestingGroupAction.unimportant, + ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented, ModelTestingGroup.megatron: ModelTestingGroupAction.unimportant, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, }, From c5cc067a9b1d8869a3f996b412408a9a5ea47188 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 10 Jun 2026 18:33:30 -0400 Subject: [PATCH 6/7] Stop prediction-head logits leaking into generate hidden states; enable mtp_llama `inner_forward` popped only the main head's logits out of the hidden-states namespace, so on multi-token-prediction models the extra heads' logits leaked into the returned `hidden_states`. Pop every head's logits (discarding the prediction heads' when not stacking them). The forward hidden-states count is then `num_blocks + prediction_heads` for any head configuration, generalizing the previous single-head `num_blocks + 1`. With both fixed, mtp_llama's generate tests pass, so its `generate` group moves to `normal` (verified on 4.57 GPU). Co-Authored-By: Claude Opus 4.8 (1M context) --- fast_llm/models/gpt/huggingface.py | 17 +++++++---------- tests/models/test_generate.py | 6 ++++-- tests/models/test_lm_eval.py | 2 +- tests/utils/model_configs.py | 4 +--- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/fast_llm/models/gpt/huggingface.py b/fast_llm/models/gpt/huggingface.py index 252c53d20..3d6a85c4b 100644 --- a/fast_llm/models/gpt/huggingface.py +++ b/fast_llm/models/gpt/huggingface.py @@ -133,16 +133,13 @@ def _inner_forward( for name, (meta, tensor) in model_input.hidden_states.items() } - logits = hidden_states.pop(f"{self.fast_llm_base_model.head.module_name}.logits") - if return_all_prediction_heads: - logits = torch.stack( - [logits] - + [ - hidden_states.pop(f"{head.module_name}.logits") - for head in self.fast_llm_base_model.multi_token_prediction.heads - ], - dim=-2, - ) + # Every head emits its logits into the hidden-states namespace; pop them all so the prediction + # heads' logits don't leak into the returned hidden states. + head_logits = [ + hidden_states.pop(f"{head.module_name}.logits") + for head in (self.fast_llm_base_model.head, *self.fast_llm_base_model.multi_token_prediction.heads) + ] + logits = torch.stack(head_logits, dim=-2) if return_all_prediction_heads else head_logits[0] output = transformers.modeling_outputs.CausalLMOutputWithPast( logits=logits, diff --git a/tests/models/test_generate.py b/tests/models/test_generate.py index e40cda12d..6eabe8683 100644 --- a/tests/models/test_generate.py +++ b/tests/models/test_generate.py @@ -360,8 +360,10 @@ def _test_forward_return_hidden_states( input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False ) - # Embeddings + one state per decoder block (the last block's output carries the final norm). - assert len(res_fast_llm.hidden_states) == fast_llm_model.config.fast_llm_config.base_model.decoder.num_blocks + 1 + # Embeddings + one state per decoder block + one final-norm state per prediction head + # (the last block's output is carried by the heads' final norms). + base_model = fast_llm_model.config.fast_llm_config.base_model + assert len(res_fast_llm.hidden_states) == base_model.decoder.num_blocks + base_model.head.prediction_heads @requires_cuda diff --git a/tests/models/test_lm_eval.py b/tests/models/test_lm_eval.py index 545f42d7c..75a1b3e98 100644 --- a/tests/models/test_lm_eval.py +++ b/tests/models/test_lm_eval.py @@ -108,7 +108,7 @@ def test_lm_eval_evaluation_from_pretrained( # TODO: rewrite for a new distributed test function # @pytest.mark.depends_on(on=["test_lm_eval_in_training[{model_testing_config}]"]) -# @pytest.mark.model_testing_group(ModelTestingGroup.generate, ModelTestingGroup.distributed) +# @pytest.mark.model_testing_group(ModelTestingGroup.lm_eval, ModelTestingGroup.distributed) # def test_lm_eval_in_training_dp2(run_test_script_for_all_models, run_test_script_base_path, get_lm_eval_config): # run_test_script_for_all_models( # distributed_testing_config=DistributedTestingConfig( diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 4e07cf84b..2057cce07 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -487,13 +487,11 @@ def update_and_add_testing_config( # Megatron doesn't support multi-token prediction. megatron_args=None, checkpoint_format=MTPLlamaCheckpointFormat, - # `generate` matches HF, but the forward hidden-states check stays `broken`: multi-token prediction - # returns extra per-head states the single-head count assertion doesn't model. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.normal, ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, ModelTestingGroup.convert: ModelTestingGroupAction.normal, - ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.generate: ModelTestingGroupAction.normal, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, }, From 85ffad78f038e18d58d7ce2d436bd781b16ab0f8 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 10 Jun 2026 19:47:38 -0400 Subject: [PATCH 7/7] Clarify qwen_2 broken-reason comment Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/utils/model_configs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 2057cce07..4f2da8b54 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -515,7 +515,8 @@ def update_and_add_testing_config( megatron_args=None, checkpoint_format=Qwen2CheckpointFormat, # `generate` matches HF in fp32 but diverges in bf16/flash: a near-tie argmax flips on numerical - # noise within the compared horizon. Stays `broken` pending a curated low-margin-free case. + # noise within the compared horizon. Stays `broken` pending a curated case free of near-tie + # (low-margin) argmax positions. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.normal, ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,