vllm - 💡(How to fix) Fix [Bug]: Qwen3.5-dense wfp8afp8 w: per-tensor a: per-tensor Output garbled text, but in sglang is norm [1 participants]

Official PRs (…)
ON THIS PAGE

Recommended Tools

×6

Utilities matched from this issue’s tags and category — try them while you read without losing context.

GitHub issue graph ai analysis

Paste a GitHub issue URL. We fetch that issue, discover linked issues from bodies/comments/timeline, collect linked pull requests, and produce a structured English report.

The report is written in English Markdown for sharing and archival.

Helpful · Quick feedback

Loading…
GitHub stats
vllm-project/vllm#38195 · Fetched 2026-04-08 01:31:44
View on GitHub
Comments
0
Participants
1
Timeline
2
Reactions
0
Author
Participants
Timeline (top)
closed ×1 · labeled ×1

Fix Action

Fix / Workaround

config.json

{
  "architectures": [
    "Qwen3_5ForConditionalGeneration"
  ],
  "dtype": "bfloat16",
  "image_token_id": 248056,
  "model_type": "qwen3_5",
  "quantization_config": {
    "config_groups": {
      "group_0": {
        "input_activations": {
          "dynamic": false,
          "group_size": -1,
          "num_bits": 8,
          "strategy": "tensor",
          "type": "float"
        },
        "output_activations": null,
        "targets": [
          "Linear"
        ],
        "weights": {
          "dynamic": false,
          "group_size": -1,
          "num_bits": 8,
          "strategy": "tensor",
          "type": "float"
        }
      }
    },
    "format": "float-quantized",
    "ignore": [
      "lm_head",
      "re:.*embed_tokens$",
      "re:.*conv1d$",
      "re:.*in_proj_a$",
      "re:.*in_proj_b$",
      "re:.*mtp.*",
      "re:.*visual.*"
    ],
    "kv_cache_scheme": null,
    "quant_method": "compressed-tensors",
    "quantization_status": "compressed"
  },
  "text_config": {
    "attention_bias": false,
    "attention_dropout": 0.0,
    "attn_output_gate": true,
    "bos_token_id": null,
    "dtype": "bfloat16",
    "eos_token_id": 248044,
    "full_attention_interval": 4,
    "head_dim": 256,
    "hidden_act": "silu",
    "hidden_size": 1024,
    "initializer_range": 0.02,
    "intermediate_size": 3584,
    "layer_types": [
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention"
    ],
    "linear_conv_kernel_dim": 4,
    "linear_key_head_dim": 128,
    "linear_num_key_heads": 16,
    "linear_num_value_heads": 16,
    "linear_value_head_dim": 128,
    "mamba_ssm_dtype": "float32",
    "max_position_embeddings": 262144,
    "mlp_only_layers": [],
    "model_type": "qwen3_5_text",
    "mtp_num_hidden_layers": 1,
    "mtp_use_dedicated_embeddings": false,
    "num_attention_heads": 8,
    "num_hidden_layers": 24,
    "num_key_value_heads": 2,
    "pad_token_id": null,
    "partial_rotary_factor": 0.25,
    "rms_norm_eps": 1e-06,
    "rope_parameters": {
      "mrope_interleaved": true,
      "mrope_section": [
        11,
        11,
        10
      ],
      "partial_rotary_factor": 0.25,
      "rope_theta": 10000000,
      "rope_type": "default"
    },
    "tie_word_embeddings": true,
    "use_cache": true,
    "vocab_size": 248320
  },
  "tie_word_embeddings": true,
  "transformers_version": "5.3.0",
  "video_token_id": 248057,
  "vision_config": {
    "deepstack_visual_indexes": [],
    "depth": 12,
    "dtype": "bfloat16",
    "hidden_act": "gelu_pytorch_tanh",
    "hidden_size": 768,
    "in_channels": 3,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "model_type": "qwen3_5",
    "num_heads": 12,
    "num_position_embeddings": 2304,
    "out_hidden_size": 1024,
    "patch_size": 16,
    "spatial_merge_size": 2,
    "temporal_patch_size": 2
  },
  "vision_end_token_id": 248054,
  "vision_start_token_id": 248053
}

Code Example

Versions of relevant libraries
==============================
[pip3] flashinfer-python==0.6.4
[pip3] numpy==2.2.6
[pip3] nvidia-cublas-cu12==12.8.4.1
[pip3] nvidia-cuda-cupti-cu12==12.8.90
[pip3] nvidia-cuda-nvrtc-cu12==12.8.93
[pip3] nvidia-cuda-runtime-cu12==12.8.90
[pip3] nvidia-cudnn-cu12==9.10.2.21
[pip3] nvidia-cudnn-frontend==1.18.0
[pip3] nvidia-cufft-cu12==11.3.3.83
[pip3] nvidia-cufile-cu12==1.13.1.3
[pip3] nvidia-curand-cu12==10.3.9.90
[pip3] nvidia-cusolver-cu12==11.7.3.90
[pip3] nvidia-cusparse-cu12==12.5.8.93
[pip3] nvidia-cusparselt-cu12==0.7.1
[pip3] nvidia-cutlass-dsl==4.4.2
[pip3] nvidia-cutlass-dsl-libs-base==4.4.2
[pip3] nvidia-ml-py==13.595.45
[pip3] nvidia-nccl-cu12==2.27.5
[pip3] nvidia-nvjitlink-cu12==12.8.93
[pip3] nvidia-nvshmem-cu12==3.4.5
[pip3] nvidia-nvtx-cu12==12.8.90
[pip3] pyzmq==27.1.0
[pip3] torch==2.10.0
[pip3] torch_c_dlpack_ext==0.1.5
[pip3] torchaudio==2.10.0
[pip3] torchvision==0.25.0
[pip3] transformers==5.3.0
[pip3] triton==3.6.0
[conda] flashinfer-python                           0.6.4            pypi_0           pypi
[conda] numpy                                       2.2.6            pypi_0           pypi
[conda] nvidia-cublas-cu12                          12.8.4.1         pypi_0           pypi
[conda] nvidia-cuda-cupti-cu12                      12.8.90          pypi_0           pypi
[conda] nvidia-cuda-nvrtc-cu12                      12.8.93          pypi_0           pypi
[conda] nvidia-cuda-runtime-cu12                    12.8.90          pypi_0           pypi
[conda] nvidia-cudnn-cu12                           9.10.2.21        pypi_0           pypi
[conda] nvidia-cudnn-frontend                       1.18.0           pypi_0           pypi
[conda] nvidia-cufft-cu12                           11.3.3.83        pypi_0           pypi
[conda] nvidia-cufile-cu12                          1.13.1.3         pypi_0           pypi
[conda] nvidia-curand-cu12                          10.3.9.90        pypi_0           pypi
[conda] nvidia-cusolver-cu12                        11.7.3.90        pypi_0           pypi
[conda] nvidia-cusparse-cu12                        12.5.8.93        pypi_0           pypi
[conda] nvidia-cusparselt-cu12                      0.7.1            pypi_0           pypi
[conda] nvidia-cutlass-dsl                          4.4.2            pypi_0           pypi
[conda] nvidia-cutlass-dsl-libs-base                4.4.2            pypi_0           pypi
[conda] nvidia-ml-py                                13.595.45        pypi_0           pypi
[conda] nvidia-nccl-cu12                            2.27.5           pypi_0           pypi
[conda] nvidia-nvjitlink-cu12                       12.8.93          pypi_0           pypi
[conda] nvidia-nvshmem-cu12                         3.4.5            pypi_0           pypi
[conda] nvidia-nvtx-cu12                            12.8.90          pypi_0           pypi
[conda] pyzmq                                       27.1.0           pypi_0           pypi
[conda] torch                                       2.10.0           pypi_0           pypi
[conda] torch-c-dlpack-ext                          0.1.5            pypi_0           pypi
[conda] torchaudio                                  2.10.0           pypi_0           pypi
[conda] torchvision                                 0.25.0           pypi_0           pypi
[conda] transformers                                5.3.0            pypi_0           pypi
[conda] triton                                      3.6.0            pypi_0           pypi

==============================
         vLLM Info
==============================
ROCM Version                 : Could not collect
vLLM Version                 : 0.17.1
vLLM Build Flags:

---

{
  "architectures": [
    "Qwen3_5ForConditionalGeneration"
  ],
  "dtype": "bfloat16",
  "image_token_id": 248056,
  "model_type": "qwen3_5",
  "quantization_config": {
    "config_groups": {
      "group_0": {
        "input_activations": {
          "dynamic": false,
          "group_size": -1,
          "num_bits": 8,
          "strategy": "tensor",
          "type": "float"
        },
        "output_activations": null,
        "targets": [
          "Linear"
        ],
        "weights": {
          "dynamic": false,
          "group_size": -1,
          "num_bits": 8,
          "strategy": "tensor",
          "type": "float"
        }
      }
    },
    "format": "float-quantized",
    "ignore": [
      "lm_head",
      "re:.*embed_tokens$",
      "re:.*conv1d$",
      "re:.*in_proj_a$",
      "re:.*in_proj_b$",
      "re:.*mtp.*",
      "re:.*visual.*"
    ],
    "kv_cache_scheme": null,
    "quant_method": "compressed-tensors",
    "quantization_status": "compressed"
  },
  "text_config": {
    "attention_bias": false,
    "attention_dropout": 0.0,
    "attn_output_gate": true,
    "bos_token_id": null,
    "dtype": "bfloat16",
    "eos_token_id": 248044,
    "full_attention_interval": 4,
    "head_dim": 256,
    "hidden_act": "silu",
    "hidden_size": 1024,
    "initializer_range": 0.02,
    "intermediate_size": 3584,
    "layer_types": [
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention",
      "linear_attention",
      "linear_attention",
      "linear_attention",
      "full_attention"
    ],
    "linear_conv_kernel_dim": 4,
    "linear_key_head_dim": 128,
    "linear_num_key_heads": 16,
    "linear_num_value_heads": 16,
    "linear_value_head_dim": 128,
    "mamba_ssm_dtype": "float32",
    "max_position_embeddings": 262144,
    "mlp_only_layers": [],
    "model_type": "qwen3_5_text",
    "mtp_num_hidden_layers": 1,
    "mtp_use_dedicated_embeddings": false,
    "num_attention_heads": 8,
    "num_hidden_layers": 24,
    "num_key_value_heads": 2,
    "pad_token_id": null,
    "partial_rotary_factor": 0.25,
    "rms_norm_eps": 1e-06,
    "rope_parameters": {
      "mrope_interleaved": true,
      "mrope_section": [
        11,
        11,
        10
      ],
      "partial_rotary_factor": 0.25,
      "rope_theta": 10000000,
      "rope_type": "default"
    },
    "tie_word_embeddings": true,
    "use_cache": true,
    "vocab_size": 248320
  },
  "tie_word_embeddings": true,
  "transformers_version": "5.3.0",
  "video_token_id": 248057,
  "vision_config": {
    "deepstack_visual_indexes": [],
    "depth": 12,
    "dtype": "bfloat16",
    "hidden_act": "gelu_pytorch_tanh",
    "hidden_size": 768,
    "in_channels": 3,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "model_type": "qwen3_5",
    "num_heads": 12,
    "num_position_embeddings": 2304,
    "out_hidden_size": 1024,
    "patch_size": 16,
    "spatial_merge_size": 2,
    "temporal_patch_size": 2
  },
  "vision_end_token_id": 248054,
  "vision_start_token_id": 248053
}
RAW_BUFFER · Click to expand / collapse

Your current environment

sglang: 0.5.9, vllm: 0.17.1, transformers: 5.3.0

<details> <summary>The output of <code>python collect_env.py</code></summary> ```text Versions of relevant libraries ============================== [pip3] flashinfer-python==0.6.4 [pip3] numpy==2.2.6 [pip3] nvidia-cublas-cu12==12.8.4.1 [pip3] nvidia-cuda-cupti-cu12==12.8.90 [pip3] nvidia-cuda-nvrtc-cu12==12.8.93 [pip3] nvidia-cuda-runtime-cu12==12.8.90 [pip3] nvidia-cudnn-cu12==9.10.2.21 [pip3] nvidia-cudnn-frontend==1.18.0 [pip3] nvidia-cufft-cu12==11.3.3.83 [pip3] nvidia-cufile-cu12==1.13.1.3 [pip3] nvidia-curand-cu12==10.3.9.90 [pip3] nvidia-cusolver-cu12==11.7.3.90 [pip3] nvidia-cusparse-cu12==12.5.8.93 [pip3] nvidia-cusparselt-cu12==0.7.1 [pip3] nvidia-cutlass-dsl==4.4.2 [pip3] nvidia-cutlass-dsl-libs-base==4.4.2 [pip3] nvidia-ml-py==13.595.45 [pip3] nvidia-nccl-cu12==2.27.5 [pip3] nvidia-nvjitlink-cu12==12.8.93 [pip3] nvidia-nvshmem-cu12==3.4.5 [pip3] nvidia-nvtx-cu12==12.8.90 [pip3] pyzmq==27.1.0 [pip3] torch==2.10.0 [pip3] torch_c_dlpack_ext==0.1.5 [pip3] torchaudio==2.10.0 [pip3] torchvision==0.25.0 [pip3] transformers==5.3.0 [pip3] triton==3.6.0 [conda] flashinfer-python 0.6.4 pypi_0 pypi [conda] numpy 2.2.6 pypi_0 pypi [conda] nvidia-cublas-cu12 12.8.4.1 pypi_0 pypi [conda] nvidia-cuda-cupti-cu12 12.8.90 pypi_0 pypi [conda] nvidia-cuda-nvrtc-cu12 12.8.93 pypi_0 pypi [conda] nvidia-cuda-runtime-cu12 12.8.90 pypi_0 pypi [conda] nvidia-cudnn-cu12 9.10.2.21 pypi_0 pypi [conda] nvidia-cudnn-frontend 1.18.0 pypi_0 pypi [conda] nvidia-cufft-cu12 11.3.3.83 pypi_0 pypi [conda] nvidia-cufile-cu12 1.13.1.3 pypi_0 pypi [conda] nvidia-curand-cu12 10.3.9.90 pypi_0 pypi [conda] nvidia-cusolver-cu12 11.7.3.90 pypi_0 pypi [conda] nvidia-cusparse-cu12 12.5.8.93 pypi_0 pypi [conda] nvidia-cusparselt-cu12 0.7.1 pypi_0 pypi [conda] nvidia-cutlass-dsl 4.4.2 pypi_0 pypi [conda] nvidia-cutlass-dsl-libs-base 4.4.2 pypi_0 pypi [conda] nvidia-ml-py 13.595.45 pypi_0 pypi [conda] nvidia-nccl-cu12 2.27.5 pypi_0 pypi [conda] nvidia-nvjitlink-cu12 12.8.93 pypi_0 pypi [conda] 
nvidia-nvshmem-cu12 3.4.5 pypi_0 pypi [conda] nvidia-nvtx-cu12 12.8.90 pypi_0 pypi [conda] pyzmq 27.1.0 pypi_0 pypi [conda] torch 2.10.0 pypi_0 pypi [conda] torch-c-dlpack-ext 0.1.5 pypi_0 pypi [conda] torchaudio 2.10.0 pypi_0 pypi [conda] torchvision 0.25.0 pypi_0 pypi [conda] transformers 5.3.0 pypi_0 pypi [conda] triton 3.6.0 pypi_0 pypi

============================== vLLM Info

ROCM Version : Could not collect vLLM Version : 0.17.1 vLLM Build Flags:


</details>


### 🐛 Describe the bug

## sglang
qwen3.5-dense wfp8afp8 dynamic(a: per-token, w: per-channel) ✅
qwen3.5-dense wfp8afp8 static(a: per-tensor, w: per-tensor)✅
## vllm
qwen3.5-dense wfp8afp8 dynamic(a: per-token, w: per-channel) ✅
qwen3.5-dense wfp8afp8 static(a: per-tensor, w: per-channel)✅
qwen3.5-dense wfp8afp8 static(a: per-tensor, w: per-tensor)❌
Output garbled text
**With the same model and FP8 static quantization (per-tensor for both activations and weights), the output is normal in SGLang but garbled in vLLM** 
### sglang
<img width="754" height="101" alt="Image" src="https://github.com/user-attachments/assets/d223b03a-c397-4796-9ad4-c119a9bce99d" />

### vllm
<img width="759" height="269" alt="Image" src="https://github.com/user-attachments/assets/1110af13-df34-4369-a274-b809c2c48be6" />

config.json

{ "architectures": [ "Qwen3_5ForConditionalGeneration" ], "dtype": "bfloat16", "image_token_id": 248056, "model_type": "qwen3_5", "quantization_config": { "config_groups": { "group_0": { "input_activations": { "dynamic": false, "group_size": -1, "num_bits": 8, "strategy": "tensor", "type": "float" }, "output_activations": null, "targets": [ "Linear" ], "weights": { "dynamic": false, "group_size": -1, "num_bits": 8, "strategy": "tensor", "type": "float" } } }, "format": "float-quantized", "ignore": [ "lm_head", "re:.*embed_tokens$", "re:.*conv1d$", "re:.*in_proj_a$", "re:.*in_proj_b$", "re:.*mtp.*", "re:.*visual.*" ], "kv_cache_scheme": null, "quant_method": "compressed-tensors", "quantization_status": "compressed" }, "text_config": { "attention_bias": false, "attention_dropout": 0.0, "attn_output_gate": true, "bos_token_id": null, "dtype": "bfloat16", "eos_token_id": 248044, "full_attention_interval": 4, "head_dim": 256, "hidden_act": "silu", "hidden_size": 1024, "initializer_range": 0.02, "intermediate_size": 3584, "layer_types": [ "linear_attention", "linear_attention", "linear_attention", "full_attention", "linear_attention", "linear_attention", "linear_attention", "full_attention", "linear_attention", "linear_attention", "linear_attention", "full_attention", "linear_attention", "linear_attention", "linear_attention", "full_attention", "linear_attention", "linear_attention", "linear_attention", "full_attention", "linear_attention", "linear_attention", "linear_attention", "full_attention" ], "linear_conv_kernel_dim": 4, "linear_key_head_dim": 128, "linear_num_key_heads": 16, "linear_num_value_heads": 16, "linear_value_head_dim": 128, "mamba_ssm_dtype": "float32", "max_position_embeddings": 262144, "mlp_only_layers": [], "model_type": "qwen3_5_text", "mtp_num_hidden_layers": 1, "mtp_use_dedicated_embeddings": false, "num_attention_heads": 8, "num_hidden_layers": 24, "num_key_value_heads": 2, "pad_token_id": null, "partial_rotary_factor": 0.25, "rms_norm_eps": 1e-06, 
"rope_parameters": { "mrope_interleaved": true, "mrope_section": [ 11, 11, 10 ], "partial_rotary_factor": 0.25, "rope_theta": 10000000, "rope_type": "default" }, "tie_word_embeddings": true, "use_cache": true, "vocab_size": 248320 }, "tie_word_embeddings": true, "transformers_version": "5.3.0", "video_token_id": 248057, "vision_config": { "deepstack_visual_indexes": [], "depth": 12, "dtype": "bfloat16", "hidden_act": "gelu_pytorch_tanh", "hidden_size": 768, "in_channels": 3, "initializer_range": 0.02, "intermediate_size": 3072, "model_type": "qwen3_5", "num_heads": 12, "num_position_embeddings": 2304, "out_hidden_size": 1024, "patch_size": 16, "spatial_merge_size": 2, "temporal_patch_size": 2 }, "vision_end_token_id": 248054, "vision_start_token_id": 248053 }





### Before submitting a new issue...

- [x] Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.

extent analysis

Fix Plan

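According to the report, vLLM 0.17.1 produces garbled output only when both activations and weights use static per-tensor FP8 quantization. Static per-tensor activations combined with per-channel weights work in vLLM, and the same per-tensor checkpoint produces normal output in SGLang. As a workaround, switch the weight quantization of the qwen3.5-dense wfp8afp8 static model from per-tensor to per-channel.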

Step 1: Update Quantization Configuration

Change the quantization_config section in the config.json file to use per-channel quantization for weights, i.e. set "strategy" under "weights" to "channel" instead of "tensor":

"quantization_config": {
  "config_groups": {
    "group_0": {
      "input_activations": {
        "dynamic": false,
        "group_size": -1,
        "num_bits": 8,
        "strategy": "tensor",
        "type": "float"
      },
      "output_activations": null,
      "targets": [
        "Linear"
      ],
      "weights": {
        "dynamic": false,
        "group_size": -1,
        "num_bits": 8,
        "strategy": "channel",
        "type": "float"
      }
    }
  },
  ...
}

Step 2: Verify Model Quantization

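Editing the config.json file by itself is not enough: re-run the model quantization process with the per-channel weight strategy so that the saved weights and scales match the updated configuration, then load the regenerated checkpoint.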

Verification

To verify that the fix worked, run the model with the updated quantization configuration and check the output. The output should no longer be garbled.

Extra Tips

  • Make sure to update the config.json file correctly and re-run the model quantization process.
  • If the issue persists, try adjusting other quantization settings or seeking further assistance from the vLLM community.

Vote matrix · Quick signals

Works
Did the solution work? Tap to confirm.
Easy Fix
Was it a quick fix?
Time Saver
Did it save you time?
Blocking
Was it severely blocking?
Common Issue
Are others likely hitting this too?
Flaky / Intermittent
Is it intermittent?
Verified / Reproducible
Can you reproduce it reliably?
Loading…

Still need to ship something?

×6

Another batch ranked right after the header list — different links, same matching logic.

Back to top recommendations

TRENDING

vllm - 💡(How to fix) Fix [Bug]: Qwen3.5-dense wfp8afp8 w: per-tensor a: per-tensor Output garbled text, but in sglang is norm [1 participants]