litellm - 💡(How to fix) Fix [Bug]: Proxied Anthropic Response is Buffered

Code Example

# TLS-terminating reverse proxy that forwards every request to the LiteLLM
# service. ${litellm_host} and ${litellm_port} look like template placeholders,
# presumably substituted before nginx loads this file (TODO confirm).
server {
  listen 443 ssl http2;
  server_name ${litellm_host};

  # Certificate and key are read from the Let's Encrypt "live" directory for this host.
  ssl_certificate     /etc/letsencrypt/live/${litellm_host}/fullchain.pem;
  ssl_certificate_key /etc/letsencrypt/live/${litellm_host}/privkey.pem;
  ssl_protocols       TLSv1.2 TLSv1.3;
  ssl_prefer_server_ciphers on;

  location / {
    # The upstream address is held in a variable and passed to proxy_pass.
    # NOTE(review): when proxy_pass contains a variable, nginx resolves the
    # name at request time, which normally needs a `resolver` directive; none
    # is visible in this snippet, so confirm it is configured elsewhere.
    set $litellm_backend litellm.litellm.svc.cluster.local:${litellm_port};
    proxy_pass http://$litellm_backend;
    # Response buffering and caching are disabled so upstream data is relayed
    # to the client as it is received (required for streamed responses).
    proxy_buffering off;
    proxy_cache off;
    proxy_http_version 1.1;
    # Forward the original host, client address and scheme to the upstream.
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
    # NOTE(review): $connection_upgrade is not a built-in nginx variable; it is
    # presumably defined by a `map $http_upgrade $connection_upgrade { ... }`
    # block in the http context, which is not shown here. Confirm.
    proxy_set_header Upgrade $http_upgrade;
    proxy_set_header Connection $connection_upgrade;
    # Maximum time allowed between two successive reads from the upstream.
    proxy_read_timeout 600s;
  }
}

---

# LiteLLM Configuration
# For detailed documentation, visit: https://docs.litellm.ai/
#
# NOTE(review): the LiteLLM proxy documentation describes top-level sections
# such as model_list, litellm_settings, general_settings and router_settings.
# The sections api_base, debug, database, logging, auth, router, metrics,
# rate_limit and cors below are not among those; confirm whether the proxy
# reads them at all or silently ignores them.

# API Configuration
api_base: "http://0.0.0.0:4000"
debug: false

# Model Configuration - LiteLLM proxy expects model_list
# Each entry maps a client-facing alias (model_name) to a provider/model
# string (litellm_params.model). API keys use the "os.environ/<NAME>" form,
# i.e. they reference an environment variable instead of a literal secret.
model_list:
  - model_name: "gpt-3.5-turbo"
    litellm_params:
      model: "openai/gpt-3.5-turbo"
      api_key: "os.environ/OPENAI_API_KEY" # Set via environment variable

  # Note: the alias "gpt-4" is routed to openai/gpt-4o, not to a gpt-4 model.
  - model_name: "gpt-4"
    litellm_params:
      model: "openai/gpt-4o"
      api_key: "os.environ/OPENAI_API_KEY" # Set via environment variable

  # "claude-4-haiku" is the alias used by the reproduction script.
  - model_name: "claude-4-haiku"
    litellm_params:
      model: "anthropic/claude-haiku-4-5-20251001"
      api_key: "os.environ/ANTHROPIC_API_KEY" # Set via environment variable

  - model_name: "claude-4-opus"
    litellm_params:
      model: "anthropic/claude-opus-4-7"
      api_key: "os.environ/ANTHROPIC_API_KEY" # Set via environment variable

# Database Configuration
database:
  type: "postgresql"
  connection_string: "os.environ/DATABASE_URL" # Injected from Kubernetes secret
  pool_size: 10
  max_overflow: 20
  pool_pre_ping: true
  echo: false

# Logging Configuration
logging:
  level: "INFO"
  format: "json"
  handlers:
    - type: "console"
    - type: "file"
      filename: "/var/log/litellm/app.log"
      max_bytes: 10485760 # 10MB
      backup_count: 5

# Authentication (optional)
# Uncomment and configure for API key authentication
auth:
  enabled: false
  # type: "bearer"
  # keys:
  #   - "your-api-key-here"

# Router Configuration
router:
  strategy: "least-cost"
  cooldown_window: 300
  emergency_fallback: true

# LiteLLM settings: response caching (Redis) and logging callbacks
litellm_settings:
  always_include_stream_usage: false
  # Caching is enabled and backed by the Redis instance in cache_params.
  cache: true
  require_auth_for_metrics_endpoint: true
  # Successful calls are reported to the "s3_v2" and "prometheus" callbacks.
  success_callback: ["s3_v2", "prometheus"]
  cache_params:
    type: "redis"
    ttl: 3600 # presumably seconds (one hour); confirm
    namespace: "litellm-cache"
    host: "os.environ/REDIS_HOST"
    port: "os.environ/REDIS_PORT"
    password: "os.environ/REDIS_PASSWORD"
  # Bucket, region, credentials and endpoint all come from the environment;
  # only the object prefix (s3_path) is hard-coded.
  s3_callback_params:
    s3_bucket_name: "os.environ/S3_BUCKET"
    s3_region_name: "os.environ/S3_REGION"
    s3_aws_access_key_id: "os.environ/AWS_ACCESS_KEY_ID"
    s3_aws_secret_access_key: "os.environ/AWS_SECRET_ACCESS_KEY"
    s3_path: "logs/litellm"
    s3_endpoint_url: "os.environ/S3_ENDPOINT_URL"

# General Settings
general_settings:
  store_model_in_db: true
  # NOTE(review): judging by the key name this persists prompt content in the
  # spend logs; confirm this is intended for the data being sent.
  store_prompts_in_spend_logs: true

# Metrics
metrics:
  enabled: true
  export_to_s3: true
  s3_bucket: "os.environ/S3_BUCKET"
  s3_region: "os.environ/S3_REGION"
  s3_endpoint: "os.environ/S3_ENDPOINT_URL"

# Rate Limiting
# The limits below are configured but inactive because enabled is false.
rate_limit:
  enabled: false
  requests_per_minute: 100
  tokens_per_minute: 90000

# CORS Configuration
# NOTE(review): a wildcard origin combined with allow_credentials: true is a
# combination browsers reject for credentialed requests under the CORS spec;
# confirm whether this section is applied and whether the wildcard is intended.
cors:
  enabled: true
  allow_origins: ["*"]
  allow_credentials: true
  allow_methods: ["*"]
  allow_headers: ["*"]

---

from anthropic import Anthropic
from dotenv import load_dotenv
import os

# Reproduction script: stream one completion through the LiteLLM proxy with the
# Anthropic SDK and echo every text delta as soon as the SDK yields it.
# LITELLM_API_KEY and LITELLM_BASE_URL are read from the environment
# (load_dotenv() is called first so they may also come from a .env file).
load_dotenv()
client = Anthropic(api_key=os.getenv("LITELLM_API_KEY"), base_url=os.getenv("LITELLM_BASE_URL"))

with client.messages.stream(
    # "claude-4-haiku" is the alias declared in the proxy's model_list.
    model="claude-4-haiku",
    max_tokens=2048,
    # random_value makes each prompt unique (presumably to avoid cached responses).
    messages=[{"role": "user", "content": f"Write a 500 word essay on the history of the internet. random_value={os.urandom(16).hex()}"}],
) as stream:
    for text in stream.text_stream:
        # flush=True is essential for this test: stdout is line-buffered on a
        # terminal and block-buffered when piped, so without an explicit flush
        # Python itself can hold back output and make a correctly streamed
        # response look as if it arrived in one batch.
        print(text, end="", flush=True)

---

Check for existing issues

I have searched the existing issues and checked that my issue is not a duplicate.

What happened?

Whenever I make Anthropic LLM calls through the LiteLLM proxy, the response is buffered despite setting stream=true. Apart from that, the request works correctly. The response contains the delta JSON lines; they are just sent in one large batch. This issue does NOT happen for requests to OpenAI models, even though both use the same settings with no model-specific logic.

I'm using nginx as a TLS reverse proxy in front of LiteLLM, but buffering is disabled there:

# TLS-terminating reverse proxy that forwards every request to the LiteLLM
# service. ${litellm_host} and ${litellm_port} look like template placeholders,
# presumably substituted before nginx loads this file (TODO confirm).
server {
  listen 443 ssl http2;
  server_name ${litellm_host};

  # Certificate and key are read from the Let's Encrypt "live" directory for this host.
  ssl_certificate     /etc/letsencrypt/live/${litellm_host}/fullchain.pem;
  ssl_certificate_key /etc/letsencrypt/live/${litellm_host}/privkey.pem;
  ssl_protocols       TLSv1.2 TLSv1.3;
  ssl_prefer_server_ciphers on;

  location / {
    # The upstream address is held in a variable and passed to proxy_pass.
    # NOTE(review): when proxy_pass contains a variable, nginx resolves the
    # name at request time, which normally needs a `resolver` directive; none
    # is visible in this snippet, so confirm it is configured elsewhere.
    set $litellm_backend litellm.litellm.svc.cluster.local:${litellm_port};
    proxy_pass http://$litellm_backend;
    # Response buffering and caching are disabled so upstream data is relayed
    # to the client as it is received (required for streamed responses).
    proxy_buffering off;
    proxy_cache off;
    proxy_http_version 1.1;
    # Forward the original host, client address and scheme to the upstream.
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
    # NOTE(review): $connection_upgrade is not a built-in nginx variable; it is
    # presumably defined by a `map $http_upgrade $connection_upgrade { ... }`
    # block in the http context, which is not shown here. Confirm.
    proxy_set_header Upgrade $http_upgrade;
    proxy_set_header Connection $connection_upgrade;
    # Maximum time allowed between two successive reads from the upstream.
    proxy_read_timeout 600s;
  }
}

There is no CDN/VPN or any other module in the request path that would be buffering as far as I'm aware.

Here is my litellm config:

# LiteLLM Configuration
# For detailed documentation, visit: https://docs.litellm.ai/
#
# NOTE(review): the LiteLLM proxy documentation describes top-level sections
# such as model_list, litellm_settings, general_settings and router_settings.
# The sections api_base, debug, database, logging, auth, router, metrics,
# rate_limit and cors below are not among those; confirm whether the proxy
# reads them at all or silently ignores them.

# API Configuration
api_base: "http://0.0.0.0:4000"
debug: false

# Model Configuration - LiteLLM proxy expects model_list
# Each entry maps a client-facing alias (model_name) to a provider/model
# string (litellm_params.model). API keys use the "os.environ/<NAME>" form,
# i.e. they reference an environment variable instead of a literal secret.
model_list:
  - model_name: "gpt-3.5-turbo"
    litellm_params:
      model: "openai/gpt-3.5-turbo"
      api_key: "os.environ/OPENAI_API_KEY" # Set via environment variable

  # Note: the alias "gpt-4" is routed to openai/gpt-4o, not to a gpt-4 model.
  - model_name: "gpt-4"
    litellm_params:
      model: "openai/gpt-4o"
      api_key: "os.environ/OPENAI_API_KEY" # Set via environment variable

  # "claude-4-haiku" is the alias used by the reproduction script.
  - model_name: "claude-4-haiku"
    litellm_params:
      model: "anthropic/claude-haiku-4-5-20251001"
      api_key: "os.environ/ANTHROPIC_API_KEY" # Set via environment variable

  - model_name: "claude-4-opus"
    litellm_params:
      model: "anthropic/claude-opus-4-7"
      api_key: "os.environ/ANTHROPIC_API_KEY" # Set via environment variable

# Database Configuration
database:
  type: "postgresql"
  connection_string: "os.environ/DATABASE_URL" # Injected from Kubernetes secret
  pool_size: 10
  max_overflow: 20
  pool_pre_ping: true
  echo: false

# Logging Configuration
logging:
  level: "INFO"
  format: "json"
  handlers:
    - type: "console"
    - type: "file"
      filename: "/var/log/litellm/app.log"
      max_bytes: 10485760 # 10MB
      backup_count: 5

# Authentication (optional)
# Uncomment and configure for API key authentication
auth:
  enabled: false
  # type: "bearer"
  # keys:
  #   - "your-api-key-here"

# Router Configuration
router:
  strategy: "least-cost"
  cooldown_window: 300
  emergency_fallback: true

# LiteLLM settings: response caching (Redis) and logging callbacks
litellm_settings:
  always_include_stream_usage: false
  # Caching is enabled and backed by the Redis instance in cache_params.
  cache: true
  require_auth_for_metrics_endpoint: true
  # Successful calls are reported to the "s3_v2" and "prometheus" callbacks.
  success_callback: ["s3_v2", "prometheus"]
  cache_params:
    type: "redis"
    ttl: 3600 # presumably seconds (one hour); confirm
    namespace: "litellm-cache"
    host: "os.environ/REDIS_HOST"
    port: "os.environ/REDIS_PORT"
    password: "os.environ/REDIS_PASSWORD"
  # Bucket, region, credentials and endpoint all come from the environment;
  # only the object prefix (s3_path) is hard-coded.
  s3_callback_params:
    s3_bucket_name: "os.environ/S3_BUCKET"
    s3_region_name: "os.environ/S3_REGION"
    s3_aws_access_key_id: "os.environ/AWS_ACCESS_KEY_ID"
    s3_aws_secret_access_key: "os.environ/AWS_SECRET_ACCESS_KEY"
    s3_path: "logs/litellm"
    s3_endpoint_url: "os.environ/S3_ENDPOINT_URL"

# General Settings
general_settings:
  store_model_in_db: true
  # NOTE(review): judging by the key name this persists prompt content in the
  # spend logs; confirm this is intended for the data being sent.
  store_prompts_in_spend_logs: true

# Metrics
metrics:
  enabled: true
  export_to_s3: true
  s3_bucket: "os.environ/S3_BUCKET"
  s3_region: "os.environ/S3_REGION"
  s3_endpoint: "os.environ/S3_ENDPOINT_URL"

# Rate Limiting
# The limits below are configured but inactive because enabled is false.
rate_limit:
  enabled: false
  requests_per_minute: 100
  tokens_per_minute: 90000

# CORS Configuration
# NOTE(review): a wildcard origin combined with allow_credentials: true is a
# combination browsers reject for credentialed requests under the CORS spec;
# confirm whether this section is applied and whether the wildcard is intended.
cors:
  enabled: true
  allow_origins: ["*"]
  allow_credentials: true
  allow_methods: ["*"]
  allow_headers: ["*"]

Steps to Reproduce

Set the LITELLM_API_KEY and LITELLM_BASE_URL environment variables, run the script below, and observe whether the response is streamed or buffered. Example values: LITELLM_BASE_URL=https://<litellm-host>/ LITELLM_API_KEY=sk-...

from anthropic import Anthropic
from dotenv import load_dotenv
import os

# Reproduction script: stream one completion through the LiteLLM proxy with the
# Anthropic SDK and echo every text delta as soon as the SDK yields it.
# LITELLM_API_KEY and LITELLM_BASE_URL are read from the environment
# (load_dotenv() is called first so they may also come from a .env file).
load_dotenv()
client = Anthropic(api_key=os.getenv("LITELLM_API_KEY"), base_url=os.getenv("LITELLM_BASE_URL"))

with client.messages.stream(
    # "claude-4-haiku" is the alias declared in the proxy's model_list.
    model="claude-4-haiku",
    max_tokens=2048,
    # random_value makes each prompt unique (presumably to avoid cached responses).
    messages=[{"role": "user", "content": f"Write a 500 word essay on the history of the internet. random_value={os.urandom(16).hex()}"}],
) as stream:
    for text in stream.text_stream:
        # flush=True is essential for this test: stdout is line-buffered on a
        # terminal and block-buffered when piped, so without an explicit flush
        # Python itself can hold back output and make a correctly streamed
        # response look as if it arrived in one batch.
        print(text, end="", flush=True)

Relevant log output

What part of LiteLLM is this about?

Proxy

What LiteLLM version are you on ?

1.85.0

Twitter / LinkedIn details

No response

Data

Security

Network

Code

UI/UX

Text

System

Multimedia

Protocol

API

Engineering

litellm - 💡(How to fix) Fix [Bug]: Proxied Anthropic Response is Buffered

Recommended Tools

GitHub issue graph ai analysis