833 lines
30 KiB
Python
833 lines
30 KiB
Python
|
|
# Debug and memory-diagnostics endpoints for the proxy; tracemalloc tracing itself is started further down by tracemalloc.start(10).
|
|||
|
|
import asyncio
|
|||
|
|
import gc
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import tracemalloc
|
|||
|
|
from collections import Counter
|
|||
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|||
|
|
|
|||
|
|
from fastapi import APIRouter, Depends, HTTPException, Query
|
|||
|
|
|
|||
|
|
from litellm import get_secret_str
|
|||
|
|
from litellm._logging import verbose_proxy_logger
|
|||
|
|
from litellm.constants import PYTHON_GC_THRESHOLD
|
|||
|
|
from litellm.proxy._types import UserAPIKeyAuth
|
|||
|
|
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
|
|||
|
|
|
|||
|
|
# Router on which the endpoints declared in this module are registered.
router = APIRouter()
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Configure garbage collection thresholds from environment variables
|
|||
|
|
def configure_gc_thresholds():
    """Apply the GC thresholds given by ``PYTHON_GC_THRESHOLD`` and log the result.

    The setting is expected to look like ``"1000,50,50"`` (gen0, gen1, gen2).
    A value that does not parse as integers, or that does not contain exactly
    three numbers, is reported with a warning and leaves the thresholds
    untouched. The thresholds in effect are always logged at the end.
    """
    raw_setting = PYTHON_GC_THRESHOLD
    if raw_setting:
        try:
            # e.g. "1000,50,50" -> [1000, 50, 50]
            parsed = [int(piece.strip()) for piece in raw_setting.split(",")]
            if len(parsed) != 3:
                verbose_proxy_logger.warning(
                    f"GC threshold not set: {raw_setting}. Expected format: 'gen0,gen1,gen2'"
                )
            else:
                gc.set_threshold(*parsed)
                verbose_proxy_logger.info(f"GC thresholds set to: {parsed}")
        except ValueError as e:
            verbose_proxy_logger.warning(
                f"Failed to parse GC threshold: {raw_setting}. Error: {e}"
            )

    # Report whatever is active now, whether or not it was changed above.
    gen0, gen1, gen2 = gc.get_threshold()
    verbose_proxy_logger.info(
        f"Current GC thresholds: gen0={gen0}, gen1={gen1}, gen2={gen2}"
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Initialize GC configuration
|
|||
|
|
# Module-level call: the thresholds are applied as a side effect of importing this module.
configure_gc_thresholds()
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get("/debug/asyncio-tasks")
async def get_active_tasks_stats():
    """
    Summarise the asyncio tasks that are still pending on the current event loop.

    Returns:
      total_active_tasks: number of tasks that are not done
      by_name: { coroutine_name: count }, built from at most
               MAX_TASKS_TO_CHECK of those tasks
    """
    MAX_TASKS_TO_CHECK = 5000

    # all_tasks() also contains the task serving this very request.
    pending_tasks = [task for task in asyncio.all_tasks() if not task.done()]

    # Only a bounded prefix is inspected so a huge task backlog cannot make
    # this endpoint itself expensive.
    name_counts = Counter()
    for task in pending_tasks[:MAX_TASKS_TO_CHECK]:
        coro = task.get_coro()
        # Prefer the qualified name, then the plain name, then the repr.
        label = getattr(coro, "__qualname__", None)
        if not label:
            label = getattr(coro, "__name__", None)
        if not label:
            label = repr(coro)
        name_counts[label] += 1

    return {
        "total_active_tasks": len(pending_tasks),
        "by_name": dict(name_counts),
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Opt-in memory profiling, enabled with LITELLM_PROFILE=true.
# NOTE(review): the nesting below was reconstructed from a whitespace-stripped
# source. tracemalloc.start() and the /memory-usage route are kept inside the
# guard because take_snapshot() only works once tracing has started - confirm.
if os.environ.get("LITELLM_PROFILE", "false").lower() == "true":
    try:
        import objgraph  # type: ignore

        print("growth of objects")  # noqa
        objgraph.show_growth()
        print("\n\nMost common types")  # noqa
        objgraph.show_most_common_types()
        leaking_roots = objgraph.get_leaking_objects()
        print("\n\nLeaking objects")  # noqa
        objgraph.show_most_common_types(objects=leaking_roots)
    except ImportError:
        raise ImportError(
            "objgraph not found. Please install objgraph to use this feature."
        )

    # Keep up to 10 frames per allocation traceback.
    tracemalloc.start(10)

    @router.get("/memory-usage", include_in_schema=False)
    async def memory_usage():
        """Return the 50 largest allocation sites from a fresh tracemalloc snapshot."""
        line_stats = tracemalloc.take_snapshot().statistics("lineno")
        verbose_proxy_logger.debug("TOP STATS: %s", line_stats)

        # One text entry per allocation site: formatted traceback plus size in KiB.
        return {
            "top_50_memory_usage": [
                f"{entry.traceback.format(limit=10)}: {entry.size / 1024} KiB"
                for entry in line_stats[:50]
            ]
        }
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get("/memory-usage-in-mem-cache", include_in_schema=False)
async def memory_usage_in_mem_cache(
    _: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    """
    Report how many entries each in-memory proxy cache currently holds.

    Every figure is len(cache_dict) + len(ttl_dict) for:
    1. user_api_key_cache
    2. the router cache (0 when no router is configured)
    3. the proxy logging object's internal usage cache
    """
    from litellm.proxy.proxy_server import (
        llm_router,
        proxy_logging_obj,
        user_api_key_cache,
    )

    def _entry_count(in_memory_cache) -> int:
        # Values and their TTL bookkeeping are stored in two separate dicts.
        return len(in_memory_cache.cache_dict) + len(in_memory_cache.ttl_dict)

    router_entries = (
        0 if llm_router is None else _entry_count(llm_router.cache.in_memory_cache)
    )
    key_cache_entries = _entry_count(user_api_key_cache.in_memory_cache)
    logging_entries = _entry_count(
        proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache
    )

    return {
        "num_items_in_user_api_key_cache": key_cache_entries,
        "num_items_in_llm_router_cache": router_entries,
        "num_items_in_proxy_logging_obj_cache": logging_entries,
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get("/memory-usage-in-mem-cache-items", include_in_schema=False)
async def memory_usage_in_mem_cache_items(
    _: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    """
    Return the raw contents (value dict and TTL dict) of the in-memory caches.

    Covers:
    1. user_api_key_cache
    2. the router cache (empty dicts when no router is configured)
    3. the proxy logging object's internal usage cache
    """
    from litellm.proxy.proxy_server import (
        llm_router,
        proxy_logging_obj,
        user_api_key_cache,
    )

    router_store = None if llm_router is None else llm_router.cache.in_memory_cache
    router_items = {} if router_store is None else router_store.cache_dict
    router_ttls = {} if router_store is None else router_store.ttl_dict

    key_store = user_api_key_cache.in_memory_cache
    usage_store = proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache

    return {
        "user_api_key_cache": key_store.cache_dict,
        "user_api_key_ttl": key_store.ttl_dict,
        "llm_router_cache": router_items,
        "llm_router_ttl": router_ttls,
        "proxy_logging_obj_cache": usage_store.cache_dict,
        "proxy_logging_obj_ttl": usage_store.ttl_dict,
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get("/debug/memory/summary", include_in_schema=False)
async def get_memory_summary(
    _: UserAPIKeyAuth = Depends(user_api_key_auth),
) -> Dict[str, Any]:
    """
    Return a compact snapshot of this worker's memory state.

    Returns:
    - worker_pid: Process ID
    - status: "healthy", "warning" (> 60% of system memory) or "critical" (> 80%)
    - memory: RSS in MB and share of system memory, or an "error" entry when
      psutil is missing or fails
    - caches: total item count plus a per-cache breakdown
    - garbage_collector: GC status, gen-0 count, and a warning when
      gc.garbage is not empty

    Example usage:
        curl http://localhost:4000/debug/memory/summary -H "Authorization: Bearer sk-1234"

    For detailed analysis, call GET /debug/memory/details
    For cache management, use the cache management endpoints
    """
    from litellm.proxy.proxy_server import (
        llm_router,
        proxy_logging_obj,
        user_api_key_cache,
    )

    def _cache_entry(item_count: int, description: str) -> Dict[str, Any]:
        # Common shape of one entry in the cache breakdown.
        return {
            "count": item_count,
            "count_readable": f"{item_count:,}",
            "what_it_stores": description,
        }

    # --- process memory (psutil is optional) ---
    health_status = "healthy"
    process_memory = {}
    try:
        import psutil

        current_process = psutil.Process()
        rss_mb = current_process.memory_info().rss / (1024 * 1024)
        system_share = current_process.memory_percent()

        process_memory = {
            "summary": f"{rss_mb:.1f} MB ({system_share:.1f}% of system memory)",
            "ram_usage_mb": round(rss_mb, 2),
            "system_memory_percent": round(system_share, 2),
        }

        # Anything at or below 60% keeps the initial "healthy" status.
        if system_share > 80:
            health_status = "critical"
        elif system_share > 60:
            health_status = "warning"
    except ImportError:
        process_memory["error"] = (
            "Install psutil for memory monitoring: pip install psutil"
        )
    except Exception as e:
        process_memory["error"] = str(e)

    # --- cache item counts ---
    # A failure part-way through keeps what was already collected and adds an
    # "error" entry to the breakdown.
    caches: Dict[str, Any] = {}
    total_cache_items = 0
    try:
        key_items = len(user_api_key_cache.in_memory_cache.cache_dict)
        total_cache_items += key_items
        caches["user_api_keys"] = _cache_entry(
            key_items, "Validated API keys for faster authentication"
        )

        if llm_router is not None:
            router_items = len(llm_router.cache.in_memory_cache.cache_dict)
            total_cache_items += router_items
            caches["llm_responses"] = _cache_entry(
                router_items, "LLM responses for identical requests"
            )

        usage_items = len(
            proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache.cache_dict
        )
        total_cache_items += usage_items
        caches["usage_tracking"] = _cache_entry(
            usage_items, "Usage metrics before database write"
        )
    except Exception as e:
        caches["error"] = str(e)

    # --- garbage collector ---
    gc_info = {
        "status": "enabled" if gc.isenabled() else "disabled",
        "objects_awaiting_collection": gc.get_count()[0],
    }
    uncollectable = len(gc.garbage)
    if uncollectable > 0:
        gc_info["warning"] = (
            f"{uncollectable} uncollectable objects (possible memory leak)"
        )

    return {
        "worker_pid": os.getpid(),
        "status": health_status,
        "memory": process_memory,
        "caches": {
            "total_items": total_cache_items,
            "breakdown": caches,
        },
        "garbage_collector": gc_info,
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _get_gc_statistics() -> Dict[str, Any]:
|
|||
|
|
"""Get garbage collector statistics."""
|
|||
|
|
return {
|
|||
|
|
"enabled": gc.isenabled(),
|
|||
|
|
"thresholds": {
|
|||
|
|
"generation_0": gc.get_threshold()[0],
|
|||
|
|
"generation_1": gc.get_threshold()[1],
|
|||
|
|
"generation_2": gc.get_threshold()[2],
|
|||
|
|
"explanation": "Number of allocations before automatic collection for each generation",
|
|||
|
|
},
|
|||
|
|
"current_counts": {
|
|||
|
|
"generation_0": gc.get_count()[0],
|
|||
|
|
"generation_1": gc.get_count()[1],
|
|||
|
|
"generation_2": gc.get_count()[2],
|
|||
|
|
"explanation": "Current number of allocated objects in each generation",
|
|||
|
|
},
|
|||
|
|
"collection_history": [
|
|||
|
|
{
|
|||
|
|
"generation": i,
|
|||
|
|
"total_collections": stat["collections"],
|
|||
|
|
"total_collected": stat["collected"],
|
|||
|
|
"uncollectable": stat["uncollectable"],
|
|||
|
|
}
|
|||
|
|
for i, stat in enumerate(gc.get_stats())
|
|||
|
|
],
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _get_object_type_counts(top_n: int) -> Tuple[int, List[Dict[str, Any]]]:
|
|||
|
|
"""Count objects by type and return total count and top N types."""
|
|||
|
|
type_counts: Counter = Counter()
|
|||
|
|
total_objects = 0
|
|||
|
|
|
|||
|
|
for obj in gc.get_objects():
|
|||
|
|
total_objects += 1
|
|||
|
|
obj_type = type(obj).__name__
|
|||
|
|
type_counts[obj_type] += 1
|
|||
|
|
|
|||
|
|
top_object_types = [
|
|||
|
|
{"type": obj_type, "count": count, "count_readable": f"{count:,}"}
|
|||
|
|
for obj_type, count in type_counts.most_common(top_n)
|
|||
|
|
]
|
|||
|
|
|
|||
|
|
return total_objects, top_object_types
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _get_uncollectable_objects_info() -> Dict[str, Any]:
|
|||
|
|
"""Get information about uncollectable objects (potential memory leaks)."""
|
|||
|
|
uncollectable = gc.garbage
|
|||
|
|
return {
|
|||
|
|
"count": len(uncollectable),
|
|||
|
|
"sample_types": [type(obj).__name__ for obj in uncollectable[:10]],
|
|||
|
|
"warning": "If count > 0, you may have reference cycles preventing garbage collection"
|
|||
|
|
if len(uncollectable) > 0
|
|||
|
|
else None,
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _get_cache_memory_stats(
|
|||
|
|
user_api_key_cache, llm_router, proxy_logging_obj, redis_usage_cache
|
|||
|
|
) -> Dict[str, Any]:
|
|||
|
|
"""Calculate memory usage for all caches."""
|
|||
|
|
cache_stats: Dict[str, Any] = {}
|
|||
|
|
try:
|
|||
|
|
# User API key cache
|
|||
|
|
user_cache_size = sys.getsizeof(user_api_key_cache.in_memory_cache.cache_dict)
|
|||
|
|
user_ttl_size = sys.getsizeof(user_api_key_cache.in_memory_cache.ttl_dict)
|
|||
|
|
cache_stats["user_api_key_cache"] = {
|
|||
|
|
"num_items": len(user_api_key_cache.in_memory_cache.cache_dict),
|
|||
|
|
"cache_dict_size_bytes": user_cache_size,
|
|||
|
|
"ttl_dict_size_bytes": user_ttl_size,
|
|||
|
|
"total_size_mb": round(
|
|||
|
|
(user_cache_size + user_ttl_size) / (1024 * 1024), 2
|
|||
|
|
),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# Router cache
|
|||
|
|
if llm_router is not None:
|
|||
|
|
router_cache_size = sys.getsizeof(
|
|||
|
|
llm_router.cache.in_memory_cache.cache_dict
|
|||
|
|
)
|
|||
|
|
router_ttl_size = sys.getsizeof(llm_router.cache.in_memory_cache.ttl_dict)
|
|||
|
|
cache_stats["llm_router_cache"] = {
|
|||
|
|
"num_items": len(llm_router.cache.in_memory_cache.cache_dict),
|
|||
|
|
"cache_dict_size_bytes": router_cache_size,
|
|||
|
|
"ttl_dict_size_bytes": router_ttl_size,
|
|||
|
|
"total_size_mb": round(
|
|||
|
|
(router_cache_size + router_ttl_size) / (1024 * 1024), 2
|
|||
|
|
),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# Proxy logging cache
|
|||
|
|
logging_cache_size = sys.getsizeof(
|
|||
|
|
proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache.cache_dict
|
|||
|
|
)
|
|||
|
|
logging_ttl_size = sys.getsizeof(
|
|||
|
|
proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache.ttl_dict
|
|||
|
|
)
|
|||
|
|
cache_stats["proxy_logging_cache"] = {
|
|||
|
|
"num_items": len(
|
|||
|
|
proxy_logging_obj.internal_usage_cache.dual_cache.in_memory_cache.cache_dict
|
|||
|
|
),
|
|||
|
|
"cache_dict_size_bytes": logging_cache_size,
|
|||
|
|
"ttl_dict_size_bytes": logging_ttl_size,
|
|||
|
|
"total_size_mb": round(
|
|||
|
|
(logging_cache_size + logging_ttl_size) / (1024 * 1024), 2
|
|||
|
|
),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# Redis cache info
|
|||
|
|
if redis_usage_cache is not None:
|
|||
|
|
cache_stats["redis_usage_cache"] = {
|
|||
|
|
"enabled": True,
|
|||
|
|
"cache_type": type(redis_usage_cache).__name__,
|
|||
|
|
}
|
|||
|
|
# Try to get Redis connection pool info if available
|
|||
|
|
try:
|
|||
|
|
if (
|
|||
|
|
hasattr(redis_usage_cache, "redis_client")
|
|||
|
|
and redis_usage_cache.redis_client
|
|||
|
|
):
|
|||
|
|
if hasattr(redis_usage_cache.redis_client, "connection_pool"):
|
|||
|
|
pool_info = redis_usage_cache.redis_client.connection_pool # type: ignore
|
|||
|
|
cache_stats["redis_usage_cache"]["connection_pool"] = {
|
|||
|
|
"max_connections": pool_info.max_connections
|
|||
|
|
if hasattr(pool_info, "max_connections")
|
|||
|
|
else None,
|
|||
|
|
"connection_class": pool_info.connection_class.__name__
|
|||
|
|
if hasattr(pool_info, "connection_class")
|
|||
|
|
else None,
|
|||
|
|
}
|
|||
|
|
except Exception as e:
|
|||
|
|
verbose_proxy_logger.debug(f"Error getting Redis pool info: {e}")
|
|||
|
|
else:
|
|||
|
|
cache_stats["redis_usage_cache"] = {"enabled": False}
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
verbose_proxy_logger.debug(f"Error calculating cache stats: {e}")
|
|||
|
|
cache_stats["error"] = str(e)
|
|||
|
|
|
|||
|
|
return cache_stats
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _get_router_memory_stats(llm_router) -> Dict[str, Any]:
|
|||
|
|
"""Get memory usage statistics for LiteLLM router."""
|
|||
|
|
litellm_router_memory: Dict[str, Any] = {}
|
|||
|
|
try:
|
|||
|
|
if llm_router is not None:
|
|||
|
|
# Model list memory size
|
|||
|
|
if hasattr(llm_router, "model_list") and llm_router.model_list:
|
|||
|
|
model_list_size = sys.getsizeof(llm_router.model_list)
|
|||
|
|
litellm_router_memory["model_list"] = {
|
|||
|
|
"num_models": len(llm_router.model_list),
|
|||
|
|
"size_bytes": model_list_size,
|
|||
|
|
"size_mb": round(model_list_size / (1024 * 1024), 4),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# Model names set
|
|||
|
|
if hasattr(llm_router, "model_names") and llm_router.model_names:
|
|||
|
|
model_names_size = sys.getsizeof(llm_router.model_names)
|
|||
|
|
litellm_router_memory["model_names_set"] = {
|
|||
|
|
"num_model_groups": len(llm_router.model_names),
|
|||
|
|
"size_bytes": model_names_size,
|
|||
|
|
"size_mb": round(model_names_size / (1024 * 1024), 4),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# Deployment names list
|
|||
|
|
if hasattr(llm_router, "deployment_names") and llm_router.deployment_names:
|
|||
|
|
deployment_names_size = sys.getsizeof(llm_router.deployment_names)
|
|||
|
|
litellm_router_memory["deployment_names"] = {
|
|||
|
|
"num_deployments": len(llm_router.deployment_names),
|
|||
|
|
"size_bytes": deployment_names_size,
|
|||
|
|
"size_mb": round(deployment_names_size / (1024 * 1024), 4),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# Deployment latency map
|
|||
|
|
if (
|
|||
|
|
hasattr(llm_router, "deployment_latency_map")
|
|||
|
|
and llm_router.deployment_latency_map
|
|||
|
|
):
|
|||
|
|
latency_map_size = sys.getsizeof(llm_router.deployment_latency_map)
|
|||
|
|
litellm_router_memory["deployment_latency_map"] = {
|
|||
|
|
"num_tracked_deployments": len(llm_router.deployment_latency_map),
|
|||
|
|
"size_bytes": latency_map_size,
|
|||
|
|
"size_mb": round(latency_map_size / (1024 * 1024), 4),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# Fallback configuration
|
|||
|
|
if hasattr(llm_router, "fallbacks") and llm_router.fallbacks:
|
|||
|
|
fallbacks_size = sys.getsizeof(llm_router.fallbacks)
|
|||
|
|
litellm_router_memory["fallbacks"] = {
|
|||
|
|
"num_fallback_configs": len(llm_router.fallbacks),
|
|||
|
|
"size_bytes": fallbacks_size,
|
|||
|
|
"size_mb": round(fallbacks_size / (1024 * 1024), 4),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# Total router object size
|
|||
|
|
router_obj_size = sys.getsizeof(llm_router)
|
|||
|
|
litellm_router_memory["router_object"] = {
|
|||
|
|
"size_bytes": router_obj_size,
|
|||
|
|
"size_mb": round(router_obj_size / (1024 * 1024), 4),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
else:
|
|||
|
|
litellm_router_memory = {"note": "Router not initialized"}
|
|||
|
|
except Exception as e:
|
|||
|
|
verbose_proxy_logger.debug(f"Error getting router memory info: {e}")
|
|||
|
|
litellm_router_memory = {"error": str(e)}
|
|||
|
|
|
|||
|
|
return litellm_router_memory
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _get_process_memory_info(
|
|||
|
|
worker_pid: int, include_process_info: bool
|
|||
|
|
) -> Optional[Dict[str, Any]]:
|
|||
|
|
"""Get process-level memory information using psutil."""
|
|||
|
|
if not include_process_info:
|
|||
|
|
return None
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
import psutil
|
|||
|
|
|
|||
|
|
process = psutil.Process()
|
|||
|
|
memory_info = process.memory_info()
|
|||
|
|
ram_usage_mb = round(memory_info.rss / (1024 * 1024), 2)
|
|||
|
|
virtual_memory_mb = round(memory_info.vms / (1024 * 1024), 2)
|
|||
|
|
memory_percent = round(process.memory_percent(), 2)
|
|||
|
|
|
|||
|
|
return {
|
|||
|
|
"pid": worker_pid,
|
|||
|
|
"summary": f"Worker PID {worker_pid} using {ram_usage_mb:.1f} MB of RAM ({memory_percent:.1f}% of system memory)",
|
|||
|
|
"ram_usage": {
|
|||
|
|
"megabytes": ram_usage_mb,
|
|||
|
|
"description": "Actual physical RAM used by this process",
|
|||
|
|
},
|
|||
|
|
"virtual_memory": {
|
|||
|
|
"megabytes": virtual_memory_mb,
|
|||
|
|
"description": "Total virtual memory allocated (includes swapped memory)",
|
|||
|
|
},
|
|||
|
|
"system_memory_percent": {
|
|||
|
|
"percent": memory_percent,
|
|||
|
|
"description": "Percentage of total system RAM being used",
|
|||
|
|
},
|
|||
|
|
"open_file_handles": {
|
|||
|
|
"count": process.num_fds()
|
|||
|
|
if hasattr(process, "num_fds")
|
|||
|
|
else "N/A (Windows)",
|
|||
|
|
"description": "Number of open file descriptors/handles",
|
|||
|
|
},
|
|||
|
|
"threads": {
|
|||
|
|
"count": process.num_threads(),
|
|||
|
|
"description": "Number of active threads in this process",
|
|||
|
|
},
|
|||
|
|
}
|
|||
|
|
except ImportError:
|
|||
|
|
return {
|
|||
|
|
"pid": worker_pid,
|
|||
|
|
"error": "psutil not installed. Install with: pip install psutil",
|
|||
|
|
}
|
|||
|
|
except Exception as e:
|
|||
|
|
verbose_proxy_logger.debug(f"Error getting process info: {e}")
|
|||
|
|
return {"pid": worker_pid, "error": str(e)}
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get("/debug/memory/details", include_in_schema=False)
async def get_memory_details(
    _: UserAPIKeyAuth = Depends(user_api_key_auth),
    top_n: int = Query(20, description="Number of top object types to return"),
    include_process_info: bool = Query(True, description="Include process memory info"),
) -> Dict[str, Any]:
    """
    Return the full set of memory diagnostics for this worker.

    Returns:
    - worker_pid: Process ID
    - process_memory: RAM usage, virtual memory, file handles, threads
      (null when include_process_info is false)
    - garbage_collector: GC thresholds, counts, collection history
    - objects: total GC-tracked objects and the most common object types
    - uncollectable: contents of gc.garbage (potential leaks)
    - cache_memory: sizes of the user_api_key, router, and logging caches
    - router_memory: sizes of router components (model_list, deployment_names, etc.)

    Query Parameters:
    - top_n: Number of top object types to return (default: 20)
    - include_process_info: Include process-level memory info using psutil (default: true)

    Example usage:
        curl "http://localhost:4000/debug/memory/details?top_n=30" -H "Authorization: Bearer sk-1234"

    Sizes are shallow sys.getsizeof figures, reported in bytes and MB.
    """
    from litellm.proxy.proxy_server import (
        llm_router,
        proxy_logging_obj,
        user_api_key_cache,
        redis_usage_cache,
    )

    pid = os.getpid()

    # Each section is produced by its own helper, in a fixed order.
    gc_section = _get_gc_statistics()
    tracked_total, leading_types = _get_object_type_counts(top_n)
    objects_section = {
        "total_tracked": tracked_total,
        "total_tracked_readable": f"{tracked_total:,}",
        "top_types": leading_types,
    }
    uncollectable_section = _get_uncollectable_objects_info()
    cache_section = _get_cache_memory_stats(
        user_api_key_cache, llm_router, proxy_logging_obj, redis_usage_cache
    )
    router_section = _get_router_memory_stats(llm_router)
    process_section = _get_process_memory_info(pid, include_process_info)

    return {
        "worker_pid": pid,
        "process_memory": process_section,
        "garbage_collector": gc_section,
        "objects": objects_section,
        "uncollectable": uncollectable_section,
        "cache_memory": cache_section,
        "router_memory": router_section,
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.post("/debug/memory/gc/configure", include_in_schema=False)
async def configure_gc_thresholds_endpoint(
    _: UserAPIKeyAuth = Depends(user_api_key_auth),
    generation_0: int = Query(700, description="Generation 0 threshold (default: 700)"),
    generation_1: int = Query(10, description="Generation 1 threshold (default: 10)"),
    generation_2: int = Query(10, description="Generation 2 threshold (default: 10)"),
) -> Dict[str, Any]:
    """
    Configure Python garbage collection thresholds.

    Lower thresholds mean more frequent GC cycles (less memory, more CPU overhead).
    Higher thresholds mean less frequent GC cycles (more memory, less CPU overhead).

    Returns:
    - message: Confirmation message
    - previous_thresholds: Old threshold values
    - new_thresholds: New threshold values
    - objects_awaiting_collection: Current object count in gen-0
    - tip: Hint about when next collection will occur

    Raises:
    - HTTPException (500) if gc.set_threshold rejects the values

    Query Parameters:
    - generation_0: Number of allocations before gen-0 collection (default: 700)
    - generation_1: Number of gen-0 collections before gen-1 collection (default: 10)
    - generation_2: Number of gen-1 collections before gen-2 collection (default: 10)

    Example for more aggressive collection:
        curl -X POST "http://localhost:4000/debug/memory/gc/configure?generation_0=500" -H "Authorization: Bearer sk-1234"

    Example for less aggressive collection:
        curl -X POST "http://localhost:4000/debug/memory/gc/configure?generation_0=1000" -H "Authorization: Bearer sk-1234"

    Monitor memory usage with GET /debug/memory/summary after changes.
    """
    # Remember the old values so the response and the log can show the change.
    old_thresholds = gc.get_threshold()

    try:
        gc.set_threshold(generation_0, generation_1, generation_2)
        verbose_proxy_logger.info(
            f"GC thresholds updated from {old_thresholds} to "
            f"({generation_0}, {generation_1}, {generation_2})"
        )
    except Exception as e:
        verbose_proxy_logger.error(f"Failed to set GC thresholds: {e}")
        # Chain the cause so the original traceback is not lost.
        raise HTTPException(
            status_code=500, detail=f"Failed to set GC thresholds: {str(e)}"
        ) from e

    # Gen-0 count right after the change, to show the immediate impact.
    current_count = gc.get_count()[0]

    if generation_0 == 0:
        # Per the gc docs, a generation-0 threshold of zero disables collection.
        tip = "Automatic collection is disabled because generation_0 is 0"
    else:
        # The count may already be at or past the new threshold; never report
        # a negative number of remaining allocations.
        remaining = max(generation_0 - current_count, 0)
        tip = f"Next collection will run after {remaining} more allocations"

    return {
        "message": "GC thresholds updated",
        "previous_thresholds": f"{old_thresholds[0]}, {old_thresholds[1]}, {old_thresholds[2]}",
        "new_thresholds": f"{generation_0}, {generation_1}, {generation_2}",
        "objects_awaiting_collection": current_count,
        "tip": tip,
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
@router.get("/otel-spans", include_in_schema=False)
async def get_otel_spans():
    """
    List the spans held by the OpenTelemetry exporter, if it records them.

    Returns:
    - otel_spans: names of all recorded spans
    - spans_grouped_by_parent: { parent trace id: [span names] } for spans that have a parent
    - most_recent_parent: parent trace id of the latest-starting span that has a parent, or None

    All three are empty / None when no OpenTelemetry logger is configured or
    the exporter has no ``get_finished_spans`` method.
    """
    from litellm.proxy.proxy_server import open_telemetry_logger

    if open_telemetry_logger is None:
        return {
            "otel_spans": [],
            "spans_grouped_by_parent": {},
            "most_recent_parent": None,
        }

    otel_exporter = open_telemetry_logger.OTEL_EXPORTER
    if hasattr(otel_exporter, "get_finished_spans"):
        recorded_spans = otel_exporter.get_finished_spans()  # type: ignore
    else:
        recorded_spans = []

    # Log at debug level instead of printing to stdout on every request.
    verbose_proxy_logger.debug("Spans: %s", recorded_spans)

    most_recent_parent = None
    # Low starting point that any real span start time is expected to exceed.
    most_recent_start_time = 1000000
    spans_grouped_by_parent = {}
    for span in recorded_spans:
        if span.parent is None:
            continue
        parent_trace_id = span.parent.trace_id
        spans_grouped_by_parent.setdefault(parent_trace_id, []).append(span.name)

        # NOTE(review): the recency check is kept under the parent check so
        # parent_trace_id is always bound here - confirm against the original
        # indentation.
        if span.start_time > most_recent_start_time:
            most_recent_parent = parent_trace_id
            most_recent_start_time = span.start_time

    return {
        "otel_spans": [span.name for span in recorded_spans],
        "spans_grouped_by_parent": spans_grouped_by_parent,
        "most_recent_parent": most_recent_parent,
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Helper functions for debugging
|
|||
|
|
def init_verbose_loggers():
    """Set the litellm logger levels from the ``WORKER_CONFIG`` secret.

    Nothing is changed when the secret is unset, names an existing file, or
    does not decode to a JSON object. Otherwise:

    - ``debug`` is True: package, router and proxy loggers go to INFO.
    - ``detailed_debug`` is True: the same three loggers go to DEBUG.
    - both are False: the ``LITELLM_LOG`` env var ("INFO" or "DEBUG", any
      case) sets the router and proxy loggers only.

    Any exception is reported with ``logging.warning`` and not re-raised.
    """
    import logging

    def _set_levels(level: int, include_package_logger: bool) -> None:
        # Imported lazily, only when a level is actually applied.
        from litellm._logging import (
            verbose_logger,
            verbose_proxy_logger,
            verbose_router_logger,
        )

        if include_package_logger:
            verbose_logger.setLevel(level=level)  # package logs
        verbose_router_logger.setLevel(level=level)  # router logs
        verbose_proxy_logger.setLevel(level=level)  # proxy logs

    try:
        config_value = get_secret_str("WORKER_CONFIG")
        # Unset, or a path to a config file: nothing to do here.
        if config_value is None or os.path.isfile(config_value):
            return

        # Anything else is treated as a JSON string.
        settings = json.loads(config_value)
        if not isinstance(settings, dict):
            return

        debug_flag = settings.get("debug", None)
        detailed_flag = settings.get("detailed_debug", None)

        # This needs to be first, so users can see Router init debug output.
        if debug_flag is True:
            # this must ALWAYS remain logging.INFO, DO NOT MODIFY THIS
            _set_levels(logging.INFO, include_package_logger=True)

        if detailed_flag is True:
            _set_levels(logging.DEBUG, include_package_logger=True)
        elif debug_flag is False and detailed_flag is False:
            # users can control proxy debugging using env variable = 'LITELLM_LOG'
            requested = os.environ.get("LITELLM_LOG", "").upper()
            env_level = {"INFO": logging.INFO, "DEBUG": logging.DEBUG}.get(requested)
            if env_level is not None:
                _set_levels(env_level, include_package_logger=False)
    except Exception as e:
        logging.warning(f"Failed to init verbose loggers: {str(e)}")
|