From f72d5d1e32b62dba17efccc33a2a0e1f66892167 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Mon, 8 Jun 2026 15:56:55 -0400
Subject: [PATCH 1/7] Make HF generate work on transformers v4.57 and v5

Override `can_generate() -> True` on the HF inference wrapper base. v5's
`PreTrainedModel.can_generate` walks `__bases__` by name and stops at any base
whose name contains "PreTrainedModel"; the intermediate base hides the
`GenerationMixin` inheritance, so the check returned False and `generate()`
died with "no attribute 'generation_config'". The override is unconditional
and is correct on both supported transformers major versions (v4 and v5).

Also fix stale assertions in the generate tests: `vocab_size` now lives under
`base_model.embeddings`, and the returned hidden-states count is
`num_blocks + 1`.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 fast_llm/engine/inference/huggingface.py |  6 ++++++
 tests/models/test_generate.py            | 14 +++++++++-----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py
index 8c6365a5f..e38dd073b 100644
--- a/fast_llm/engine/inference/huggingface.py
+++ b/fast_llm/engine/inference/huggingface.py
@@ -249,3 +249,9 @@ def stop_workers(self):
     def inner_forward(self, *args, **kwargs) -> tuple | transformers.utils.generic.ModelOutput:
         # Meant to be overridden in derived classes
         raise NotImplementedError()
+
+    @classmethod
+    def can_generate(cls) -> bool:
+        # `PreTrainedModel.can_generate` walks `__bases__` by name and stops at any base containing
+        # "PreTrainedModel"; this intermediate base hides the `GenerationMixin` inheritance from that check.
+        return True
diff --git a/tests/models/test_generate.py b/tests/models/test_generate.py
index c595b5148..9ca925448 100644
--- a/tests/models/test_generate.py
+++ b/tests/models/test_generate.py
@@ -151,7 +151,9 @@ def _test_for_batches(
     if tokenizer is not None:
         inputs = _prepare_data(tokenizer, use_batch_size2=False)
     else:
-        inputs = _prepare_rand_data(fast_llm_model.config.fast_llm_config.base_model.vocab_size, use_batch_size2=False)
+        inputs = _prepare_rand_data(
+            fast_llm_model.config.fast_llm_config.base_model.embeddings.vocab_size, use_batch_size2=False
+        )
     outputs = _generate(
         inputs,
         hf_model,
@@ -163,7 +165,9 @@ def _test_for_batches(
     if tokenizer is not None:
         inputs = _prepare_data(tokenizer, use_batch_size2=True)
     else:
-        inputs = _prepare_rand_data(fast_llm_model.config.fast_llm_config.base_model.vocab_size, use_batch_size2=True)
+        inputs = _prepare_rand_data(
+            fast_llm_model.config.fast_llm_config.base_model.embeddings.vocab_size, use_batch_size2=True
+        )
     outputs = _generate(
         inputs,
         hf_model,
@@ -334,7 +338,7 @@ def _test_forward_return_hidden_states(
 
     inputs_ids = torch.randint(
         1,
-        fast_llm_model.config.fast_llm_config.base_model.vocab_size if vocab_size is None else vocab_size,
+        fast_llm_model.config.fast_llm_config.base_model.embeddings.vocab_size if vocab_size is None else vocab_size,
         [1, 10],
         dtype=torch.int64,
         generator=torch.Generator().manual_seed(42),
@@ -345,8 +349,8 @@ def _test_forward_return_hidden_states(
         input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False
     )
 
-    # hidden_states include embeddings layer
-    assert len(res_fast_llm.hidden_states) - 1 == len(fast_llm_model.config.fast_llm_config.base_model.decoder)
+    # Embeddings + one state per decoder block (the last block's output carries the final norm).
+    assert len(res_fast_llm.hidden_states) == fast_llm_model.config.fast_llm_config.base_model.decoder.num_blocks + 1
 
 
 @pytest.mark.extra_slow

From 429c62df533b6f9758643c35db530e15160eb0f7 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 10 Jun 2026 14:36:19 -0400
Subject: [PATCH 2/7] Fix v4.57 generate signature, honor source EOS, repair
 generate test harness

Three fixes needed to make the per-model generate tests run and match HF on
transformers 4.57:

- `inner_forward` absorbs `cache_position` (and other version-dependent generate
  plumbing) via `**kwargs`. v4.57's `generate` passes `cache_position` to forward;
  v5 filters it out. Ignoring it is correct on the `use_cache=False` path.
- The HF wrapper's `from_pretrained` applies the source HF config's `bos/eos/pad`
  token ids to `generation_config` (Fast-LLM's import drops them as non-architecture
  metadata), so `generate` stops at EOS like `AutoModelForCausalLM`. Exposed as
  `_apply_generation_token_ids` so manually-constructed wrappers can opt in.
- `test_export_for_generate` now passes a `DistributedTestingConfig` (the helper's
  current signature) instead of a positional list.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 fast_llm/engine/inference/huggingface.py | 17 ++++++++++++++++-
 fast_llm/models/gpt/huggingface.py       |  4 ++++
 tests/models/test_generate.py            | 18 ++++++++++++------
 3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py
index e38dd073b..a23801d5c 100644
--- a/fast_llm/engine/inference/huggingface.py
+++ b/fast_llm/engine/inference/huggingface.py
@@ -10,6 +10,7 @@
 
 from fast_llm.core.distributed import broadcast, broadcast_object, safe_barrier
 from fast_llm.engine.checkpoint.config import CheckpointLoadConfig, FastLLMCheckpointFormat
+from fast_llm.engine.checkpoint.huggingface import HuggingfaceStateDictCheckpointHandler
 from fast_llm.engine.distributed.distributed import Distributed
 from fast_llm.engine.inference.config import _TRANSFORMERS_V4, HuggingfaceModelConfig
 from fast_llm.engine.inference.runner import InferenceRunner
@@ -113,7 +114,21 @@ def from_pretrained(
             stage_filter=stage_filter,
         )
 
-        return cls(fast_llm_model, **kwargs)
+        model = cls(fast_llm_model, **kwargs)
+        model._apply_generation_token_ids(pretrained_model_name_or_path)
+        return model
+
+    def _apply_generation_token_ids(self, pretrained: CheckpointLoadConfig) -> None:
+        # Honor the source HF config's generation token ids: Fast-LLM's import drops them (they are
+        # generation metadata, not architecture), so `generate` would otherwise never stop at EOS.
+        # Only external (HF) checkpoints carry them; native Fast-LLM checkpoints leave the defaults.
+        handler_class = pretrained.format.get_handler_class()
+        if not issubclass(handler_class, HuggingfaceStateDictCheckpointHandler):
+            return
+        hf_config = handler_class._load_config(pretrained.path)
+        for key in ("bos_token_id", "eos_token_id", "pad_token_id"):
+            if (token_id := hf_config.get(key)) is not None:
+                setattr(self.generation_config, key, token_id)
 
     def _init_weights(self, module) -> None:
         raise NotImplementedError(module)
diff --git a/fast_llm/models/gpt/huggingface.py b/fast_llm/models/gpt/huggingface.py
index 7d1383b00..252c53d20 100644
--- a/fast_llm/models/gpt/huggingface.py
+++ b/fast_llm/models/gpt/huggingface.py
@@ -47,6 +47,10 @@ def inner_forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         return_all_prediction_heads: bool = False,
+        # `generate` passes version-dependent plumbing kwargs (`cache_position`, `logits_to_keep`, ...).
+        # They don't apply to the `use_cache=False` path: positions are reconstructed from `attention_mask`,
+        # and the full logits are computed and the last position selected downstream.
+        **kwargs,
     ) -> tuple | transformers.modeling_outputs.CausalLMOutputWithPast:
         return self._inner_forward(
             self._get_batch(input_ids, attention_mask),
diff --git a/tests/models/test_generate.py b/tests/models/test_generate.py
index 9ca925448..3fb24296c 100644
--- a/tests/models/test_generate.py
+++ b/tests/models/test_generate.py
@@ -10,6 +10,7 @@
 from fast_llm.models.gpt.config import PretrainedGPTModelConfig
 from fast_llm.models.gpt.conversion.config import LlamaCheckpointFormat
 from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM
+from tests.utils.distributed_configs import DistributedTestingConfig
 from tests.utils.model_configs import ModelTestingGroup
 
 
@@ -108,7 +109,9 @@ def _get_fast_llm_model_from_model(
 
     multi_stage.load_checkpoint(config.pretrained)
 
-    return HuggingfaceGPTModelForCausalLM(multi_stage, runner=runner)
+    model = HuggingfaceGPTModelForCausalLM(multi_stage, runner=runner)
+    model._apply_generation_token_ids(config.pretrained)
+    return model
 
 
 def _trim_output(output, inputs):
@@ -246,11 +249,14 @@ def test_export_for_generate(run_test_script_for_all_models, model_testing_confi
     if model_testing_config.checkpoint_format is None:
         pytest.skip(f"Conversion not supported for {model_testing_config.name}")
     run_test_script_for_all_models(
-        [
-            "training.train_iters=1",
-            f"training.export.format={model_testing_config.checkpoint_format.name}",
-            "training.export.interval=1",
-        ],
+        distributed_testing_config=DistributedTestingConfig(
+            name="test_export_for_generate",
+            config_args=[
+                "training.train_iters=1",
+                f"training.export.format={model_testing_config.checkpoint_format.name}",
+                "training.export.interval=1",
+            ],
+        )
     )
 
 

From 1081d8514bd206808c6581b3157481da4c3eb3ae Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 10 Jun 2026 14:45:30 -0400
Subject: [PATCH 3/7] Enable generate test group for llama/mistral/mixtral
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the `generate` model-testing group from `broken` to `normal` for the models
where it now passes end-to-end (verified on transformers 4.57 GPU). The CUDA-bound
generate tests gain `@requires_cuda` so they skip on the CPU-only CI runner instead
of crashing; `test_export_for_generate` stays CPU-runnable as the dependency root.

Models left `broken`, with reasons: qwen_2 (bf16/flash near-tie argmax flip),
mtp_llama (forward hidden-states count not modeled for multi-head), starcoder_2
(no converter to export through), diffusion_llama/dream (bidirectional decoding).

Split lm_eval into its own `lm_eval` testing group (it previously shared `generate`)
so enabling generate doesn't pull the lm_eval tests — broken on transformers v5 —
into normal CI. No per-model config lists the `lm_eval` group, so it defaults to
extra-slow.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/models/test_generate.py |  7 +++++++
 tests/models/test_lm_eval.py  |  6 +++---
 tests/utils/model_configs.py  | 15 ++++++++-------
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/tests/models/test_generate.py b/tests/models/test_generate.py
index 3fb24296c..e40cda12d 100644
--- a/tests/models/test_generate.py
+++ b/tests/models/test_generate.py
@@ -12,6 +12,7 @@
 from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM
 from tests.utils.distributed_configs import DistributedTestingConfig
 from tests.utils.model_configs import ModelTestingGroup
+from tests.utils.utils import requires_cuda
 
 
 def _prepare_data(tokenizer, use_batch_size2: bool):
@@ -211,6 +212,7 @@ def _test_generate(
     )
 
 
+@requires_cuda
 @pytest.mark.extra_slow
 @pytest.mark.parametrize(
     "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2",
@@ -260,6 +262,7 @@ def test_export_for_generate(run_test_script_for_all_models, model_testing_confi
     )
 
 
+@requires_cuda
 @pytest.mark.slow
 @pytest.mark.depends_on(on=["test_export_for_generate[{model_testing_config}]"])
 @pytest.mark.parametrize(
@@ -313,6 +316,7 @@ def _test_generate_from_model(model_path, tokenizer, fast_llm_checkpoint_format)
     )
 
 
+@requires_cuda
 @pytest.mark.extra_slow
 def test_generate_from_model(
     model_path,
@@ -320,6 +324,7 @@ def test_generate_from_model(
     _test_generate_from_model(model_path, AutoTokenizer.from_pretrained(model_path), LlamaCheckpointFormat)
 
 
+@requires_cuda
 @pytest.mark.slow
 @pytest.mark.depends_on(on=["test_export_for_generate[{model_testing_config}]"])
 @pytest.mark.model_testing_group(ModelTestingGroup.generate)
@@ -359,6 +364,7 @@ def _test_forward_return_hidden_states(
     assert len(res_fast_llm.hidden_states) == fast_llm_model.config.fast_llm_config.base_model.decoder.num_blocks + 1
 
 
+@requires_cuda
 @pytest.mark.extra_slow
 def test_forward_return_hidden_states(model_path):
     _test_forward_return_hidden_states(
@@ -366,6 +372,7 @@ def test_forward_return_hidden_states(model_path):
     )
 
 
+@requires_cuda
 @pytest.mark.slow
 @pytest.mark.model_testing_group(ModelTestingGroup.generate)
 @pytest.mark.depends_on(on=["test_export_for_generate[{model_testing_config}]"])
diff --git a/tests/models/test_lm_eval.py b/tests/models/test_lm_eval.py
index 7ae26c2d6..545f42d7c 100644
--- a/tests/models/test_lm_eval.py
+++ b/tests/models/test_lm_eval.py
@@ -54,7 +54,7 @@ def do_get_lm_eval_config(base_path):
 # "gsm8k,xnli_en,wikitext"
 
 
-@pytest.mark.model_testing_group(ModelTestingGroup.generate)
+@pytest.mark.model_testing_group(ModelTestingGroup.lm_eval)
 def test_lm_eval_in_training(run_test_script_for_all_models, run_test_script_base_path, get_lm_eval_config):
     run_test_script_for_all_models(
         distributed_testing_config=DistributedTestingConfig(
@@ -75,7 +75,7 @@ def do_copy_training_output(distributed_testing_config: DistributedTestingConfig
 
 
 @pytest.mark.depends_on(on=["test_lm_eval_in_training[{model_testing_config}]"])
-@pytest.mark.model_testing_group(ModelTestingGroup.generate)
+@pytest.mark.model_testing_group(ModelTestingGroup.lm_eval)
 def test_lm_eval_evaluation_last_checkpoint(
     run_test_script_for_all_models, run_test_script_base_path, get_lm_eval_config, copy_training_output
 ):
@@ -89,7 +89,7 @@ def test_lm_eval_evaluation_last_checkpoint(
 
 
 @pytest.mark.depends_on(on=["test_lm_eval_in_training[{model_testing_config}]"])
-@pytest.mark.model_testing_group(ModelTestingGroup.generate)
+@pytest.mark.model_testing_group(ModelTestingGroup.lm_eval)
 def test_lm_eval_evaluation_from_pretrained(
     run_test_script_for_all_models, run_test_script_base_path, get_lm_eval_config
 ):
diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index 3a54be088..f2f4a4aa9 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -48,6 +48,7 @@ class ModelTestingGroup(enum.StrEnum):
     checkpoint = "checkpoint"
     convert = "convert"
     generate = "generate"
+    lm_eval = "lm_eval"
     megatron = "megatron"
     distributed = "distributed"
     streaming = "streaming"
@@ -393,12 +394,11 @@ def update_and_add_testing_config(
         "--untie-embeddings-and-output-weights",
     ],
     checkpoint_format=LlamaCheckpointFormat,
-    # TODO: Add back generate as `normal` when stable.
     groups={
         ModelTestingGroup.basic: ModelTestingGroupAction.main,
         ModelTestingGroup.checkpoint: ModelTestingGroupAction.main,
         ModelTestingGroup.convert: ModelTestingGroupAction.main,
-        ModelTestingGroup.generate: ModelTestingGroupAction.broken,
+        ModelTestingGroup.generate: ModelTestingGroupAction.normal,
         ModelTestingGroup.megatron: ModelTestingGroupAction.normal,
         ModelTestingGroup.distributed: ModelTestingGroupAction.normal,
         ModelTestingGroup.streaming: ModelTestingGroupAction.normal,
@@ -486,7 +486,8 @@ def update_and_add_testing_config(
     # Megatron doesn't support multi-token prediction.
     megatron_args=None,
     checkpoint_format=MTPLlamaCheckpointFormat,
-    # TODO: Add back generate as `normal` when stable.
+    # `generate` matches HF, but the forward hidden-states check stays `broken`: multi-token prediction
+    # returns extra per-head states the single-head count assertion doesn't model.
     groups={
         ModelTestingGroup.basic: ModelTestingGroupAction.normal,
         ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
@@ -514,7 +515,8 @@ def update_and_add_testing_config(
     # Megatron doesn't support per sub layer biases.
     megatron_args=None,
     checkpoint_format=Qwen2CheckpointFormat,
-    # TODO: Add back generate as `normal` when stable.
+    # `generate` matches HF in fp32 but diverges in bf16/flash: a near-tie argmax flips on numerical
+    # noise within the compared horizon. Stays `broken` pending a curated low-margin-free case.
     groups={
         ModelTestingGroup.basic: ModelTestingGroupAction.normal,
         ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
@@ -560,12 +562,11 @@ def update_and_add_testing_config(
     # Megatron doesn't support sliding windows.
     megatron_args=None,
     checkpoint_format=MistralCheckpointFormat,
-    # TODO: Add back generate as `normal` when stable.
     groups={
         ModelTestingGroup.basic: ModelTestingGroupAction.normal,
         ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
         ModelTestingGroup.convert: ModelTestingGroupAction.normal,
-        ModelTestingGroup.generate: ModelTestingGroupAction.broken,
+        ModelTestingGroup.generate: ModelTestingGroupAction.normal,
         ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented,
         ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant,
     },
@@ -653,7 +654,7 @@ def update_and_add_testing_config(
         ModelTestingGroup.basic: ModelTestingGroupAction.normal,
         ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
         ModelTestingGroup.convert: ModelTestingGroupAction.normal,
-        ModelTestingGroup.generate: ModelTestingGroupAction.broken,
+        ModelTestingGroup.generate: ModelTestingGroupAction.normal,
         ModelTestingGroup.megatron: ModelTestingGroupAction.normal,
         ModelTestingGroup.distributed: ModelTestingGroupAction.normal,
     },

From 4a30b4751b942bc8b325da816e95a29bf3edef71 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 10 Jun 2026 16:08:53 -0400
Subject: [PATCH 4/7] Allowlist `is_llama_config` in HF config coverage check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

transformers v4 LlamaConfig carries an `is_llama_config` marker (dropped in v5)
that Fast-LLM doesn't consume and that a bare PretrainedConfig omits, so the
import-boundary coverage check rejected it — Fast-LLM could not import a real
transformers-4.x Llama checkpoint. Add it to the static metadata allowlist.

Verified on transformers 4.57.5: all supported HF converters (llama, qwen2,
mistral, mixtral, mtp_llama) now report no unconsumed config keys.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 fast_llm/engine/checkpoint/huggingface.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fast_llm/engine/checkpoint/huggingface.py b/fast_llm/engine/checkpoint/huggingface.py
index bbc3a0a91..5b4fc2250 100644
--- a/fast_llm/engine/checkpoint/huggingface.py
+++ b/fast_llm/engine/checkpoint/huggingface.py
@@ -135,6 +135,9 @@ def _export_config(cls, config: FastLLMModelConfig) -> dict[str, typing.Any]:
             "auto_map",
             "torch_dtype",
             "use_cache",
+            # Architecture-family marker some transformers v4 configs carry (e.g. LlamaConfig); dropped
+            # in v5, not consumed by Fast-LLM, and absent from a bare ``PretrainedConfig``.
+            "is_llama_config",
             # Token ids — generation/inference, not architecture (a bare v5 config omits these).
             "bos_token_id",
             "decoder_start_token_id",

From 1e8fa93597eb48d273b71266de215dd74a8524e1 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 10 Jun 2026 16:38:31 -0400
Subject: [PATCH 5/7] Demote starcoder_2 generate/convert (no HF converter)

starcoder_2 has no checkpoint_format. The export-based generate tests always
skip, so `generate` becomes `not_implemented` (was the misleading `broken`).
The convert group still runs the native Distributed<->FastLLM round-trip, but
that machinery is exercised by other models, so it drops to `unimportant`.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/utils/model_configs.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index f2f4a4aa9..4e07cf84b 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -359,12 +359,13 @@ def update_and_add_testing_config(
         "--no-position-embedding",
     ],
     checkpoint_format=None,
-    # TODO: Add back generate as `normal` when stable.
     groups={
         ModelTestingGroup.basic: ModelTestingGroupAction.normal,
         ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
-        ModelTestingGroup.convert: ModelTestingGroupAction.normal,
-        ModelTestingGroup.generate: ModelTestingGroupAction.broken,
+        # No HF checkpoint format: the native conversion round-trip is redundant with other models,
+        # and the export-based generate tests can't run.
+        ModelTestingGroup.convert: ModelTestingGroupAction.unimportant,
+        ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented,
         ModelTestingGroup.megatron: ModelTestingGroupAction.unimportant,
         ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant,
     },

From c5cc067a9b1d8869a3f996b412408a9a5ea47188 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 10 Jun 2026 18:33:30 -0400
Subject: [PATCH 6/7] Stop prediction-head logits leaking into generate hidden
 states; enable mtp_llama

`inner_forward` popped only the main head's logits out of the hidden-states
namespace, so on multi-token-prediction models the extra heads' logits leaked
into the returned `hidden_states`. Pop every head's logits (discarding the
prediction heads' when not stacking them). The forward hidden-states count is
then `num_blocks + prediction_heads` for any head configuration, generalizing
the previous single-head `num_blocks + 1`. With both the logits leak and the count
assertion fixed, mtp_llama's generate tests pass, so its `generate` group moves to
`normal` (verified on transformers 4.57 on GPU).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 fast_llm/models/gpt/huggingface.py | 17 +++++++----------
 tests/models/test_generate.py      |  6 ++++--
 tests/models/test_lm_eval.py       |  2 +-
 tests/utils/model_configs.py       |  4 +---
 4 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/fast_llm/models/gpt/huggingface.py b/fast_llm/models/gpt/huggingface.py
index 252c53d20..3d6a85c4b 100644
--- a/fast_llm/models/gpt/huggingface.py
+++ b/fast_llm/models/gpt/huggingface.py
@@ -133,16 +133,13 @@ def _inner_forward(
             for name, (meta, tensor) in model_input.hidden_states.items()
         }
 
-        logits = hidden_states.pop(f"{self.fast_llm_base_model.head.module_name}.logits")
-        if return_all_prediction_heads:
-            logits = torch.stack(
-                [logits]
-                + [
-                    hidden_states.pop(f"{head.module_name}.logits")
-                    for head in self.fast_llm_base_model.multi_token_prediction.heads
-                ],
-                dim=-2,
-            )
+        # Every head emits its logits into the hidden-states namespace; pop them all so the prediction
+        # heads' logits don't leak into the returned hidden states.
+        head_logits = [
+            hidden_states.pop(f"{head.module_name}.logits")
+            for head in (self.fast_llm_base_model.head, *self.fast_llm_base_model.multi_token_prediction.heads)
+        ]
+        logits = torch.stack(head_logits, dim=-2) if return_all_prediction_heads else head_logits[0]
 
         output = transformers.modeling_outputs.CausalLMOutputWithPast(
             logits=logits,
diff --git a/tests/models/test_generate.py b/tests/models/test_generate.py
index e40cda12d..6eabe8683 100644
--- a/tests/models/test_generate.py
+++ b/tests/models/test_generate.py
@@ -360,8 +360,10 @@ def _test_forward_return_hidden_states(
         input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False
     )
 
-    # Embeddings + one state per decoder block (the last block's output carries the final norm).
-    assert len(res_fast_llm.hidden_states) == fast_llm_model.config.fast_llm_config.base_model.decoder.num_blocks + 1
+    # Embeddings + one state per decoder block + one final-norm state per prediction head
+    # (the last block's output is carried by the heads' final norms).
+    base_model = fast_llm_model.config.fast_llm_config.base_model
+    assert len(res_fast_llm.hidden_states) == base_model.decoder.num_blocks + base_model.head.prediction_heads
 
 
 @requires_cuda
diff --git a/tests/models/test_lm_eval.py b/tests/models/test_lm_eval.py
index 545f42d7c..75a1b3e98 100644
--- a/tests/models/test_lm_eval.py
+++ b/tests/models/test_lm_eval.py
@@ -108,7 +108,7 @@ def test_lm_eval_evaluation_from_pretrained(
 
 # TODO: rewrite for a new distributed test function
 # @pytest.mark.depends_on(on=["test_lm_eval_in_training[{model_testing_config}]"])
-# @pytest.mark.model_testing_group(ModelTestingGroup.generate, ModelTestingGroup.distributed)
+# @pytest.mark.model_testing_group(ModelTestingGroup.lm_eval, ModelTestingGroup.distributed)
 # def test_lm_eval_in_training_dp2(run_test_script_for_all_models, run_test_script_base_path, get_lm_eval_config):
 #     run_test_script_for_all_models(
 #         distributed_testing_config=DistributedTestingConfig(
diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index 4e07cf84b..2057cce07 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -487,13 +487,11 @@ def update_and_add_testing_config(
     # Megatron doesn't support multi-token prediction.
     megatron_args=None,
     checkpoint_format=MTPLlamaCheckpointFormat,
-    # `generate` matches HF, but the forward hidden-states check stays `broken`: multi-token prediction
-    # returns extra per-head states the single-head count assertion doesn't model.
     groups={
         ModelTestingGroup.basic: ModelTestingGroupAction.normal,
         ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
         ModelTestingGroup.convert: ModelTestingGroupAction.normal,
-        ModelTestingGroup.generate: ModelTestingGroupAction.broken,
+        ModelTestingGroup.generate: ModelTestingGroupAction.normal,
         ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented,
         ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant,
     },

From 85ffad78f038e18d58d7ce2d436bd781b16ab0f8 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 10 Jun 2026 19:47:38 -0400
Subject: [PATCH 7/7] Clarify qwen_2 broken-reason comment

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/utils/model_configs.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index 2057cce07..4f2da8b54 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -515,7 +515,8 @@ def update_and_add_testing_config(
     megatron_args=None,
     checkpoint_format=Qwen2CheckpointFormat,
     # `generate` matches HF in fp32 but diverges in bf16/flash: a near-tie argmax flips on numerical
-    # noise within the compared horizon. Stays `broken` pending a curated low-margin-free case.
+    # noise within the compared horizon. Stays `broken` pending a curated case free of near-tie
+    # (low-margin) argmax positions.
     groups={
         ModelTestingGroup.basic: ModelTestingGroupAction.normal,
         ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,