chore: initial public snapshot for github upload

2026-03-26 20:06:14 +08:00
commit 0e5ecd930e
3497 changed files with 1586236 additions and 0 deletions
--- a/llm-gateway-competitors/litellm-wheel-src/litellm/router_utils/cooldown_handlers.py
+++ b/llm-gateway-competitors/litellm-wheel-src/litellm/router_utils/cooldown_handlers.py
@@ -0,0 +1,459 @@
+"""
+Router cooldown handlers
+- _set_cooldown_deployments: puts a deployment in the cooldown list
+- _get_cooldown_deployments: returns the list of deployments in the cooldown list
+- _async_get_cooldown_deployments: ASYNC: returns the list of deployments in the cooldown list
+
+"""
+
+import asyncio
+import math
+from typing import TYPE_CHECKING, Any, List, Optional, Union
+
+import litellm
+from litellm._logging import verbose_router_logger
+from litellm.constants import (
+    DEFAULT_COOLDOWN_TIME_SECONDS,
+    DEFAULT_FAILURE_THRESHOLD_MINIMUM_REQUESTS,
+    DEFAULT_FAILURE_THRESHOLD_PERCENT,
+    SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD,
+)
+from litellm.router_utils.cooldown_callbacks import router_cooldown_event_callback
+
+from .router_callbacks.track_deployment_metrics import (
+    get_deployment_failures_for_current_minute,
+    get_deployment_successes_for_current_minute,
+)
+
+# Typing-only aliases: the Router and OpenTelemetry Span classes are imported
+# solely under TYPE_CHECKING. At runtime both aliases are plain `Any`, so
+# neither import is executed when this module is loaded.
+if TYPE_CHECKING:
+    from opentelemetry.trace import Span as _Span
+
+    from litellm.router import Router as _Router
+
+    LitellmRouter = _Router
+    Span = Union[_Span, Any]
+else:
+    LitellmRouter = Any
+    Span = Any
+
+
+def _is_cooldown_required(
+    litellm_router_instance: LitellmRouter,
+    model_id: str,
+    exception_status: Union[str, int],
+    exception_str: Optional[str] = None,
+) -> bool:
+    """
+    Decide from the failure's status code whether a cooldown is required.
+
+    Parameters:
+        litellm_router_instance (LitellmRouter): Not read by this function.
+        model_id (str): The id of the model in the model list. Not read by this function.
+        exception_status (Union[str, int]): The status of the exception; a
+            string value is converted with int().
+        exception_str (Optional[str]): String form of the exception. When it
+            contains "APIConnectionError" no cooldown is required.
+
+    Returns:
+        bool: True for 401, 404, 408, 429 and for every status outside the
+        4XX range. False for an empty status string, for every other 4XX
+        status and for litellm API connection errors. If evaluating the
+        status raises, the result defaults to True.
+    """
+    # The only 4XX statuses that cool a deployment down:
+    # 429 rate limit, 401 auth, 408 request timeout, 404 not found.
+    cooldown_4xx_statuses = (429, 401, 408, 404)
+
+    try:
+        # don't cooldown on litellm api connection errors
+        if exception_str is not None and "APIConnectionError" in exception_str:
+            return False
+
+        status = exception_status
+        if isinstance(status, str):
+            if not status:
+                return False
+            status = int(status)
+
+        if 400 <= status < 500:
+            # Do NOT cool down any other 4XX error
+            return status in cooldown_4xx_statuses
+
+        # should cool down for all other errors
+        return True
+    except Exception:
+        # Catch all - if anything above raises, default to cooling down
+        return True
+
+
+def _should_run_cooldown_logic(
+    litellm_router_instance: LitellmRouter,
+    deployment: Optional[str],
+    exception_status: Union[str, int],
+    original_exception: Any,
+    time_to_cooldown: Optional[float] = None,
+) -> bool:
+    """
+    Helper that decides if cooldown logic should be run
+
+    Parameters:
+        litellm_router_instance (LitellmRouter): Router whose settings are consulted.
+        deployment (Optional[str]): Id of the deployment that failed.
+        exception_status (Union[str, int]): Status of the exception.
+        original_exception (Any): The exception; its str() form is passed to
+            _is_cooldown_required().
+        time_to_cooldown (Optional[float]): Requested cooldown duration, if any.
+
+    Returns:
+        bool: True if cooldown logic should be run, False otherwise.
+
+    Does not run cooldown logic when (checked in this order):
+    - deployment is None, or no model group is found for the deployment id
+    - time_to_cooldown is effectively 0 (within an absolute tolerance of 1e-9)
+    - router.disable_cooldowns is True
+    - _is_cooldown_required() returns False
+    - deployment is in litellm_router_instance.provider_default_deployment_ids
+    """
+    # This guard also covers `deployment is None`, so no later check for it is needed.
+    if (
+        deployment is None
+        or litellm_router_instance.get_model_group(id=deployment) is None
+    ):
+        verbose_router_logger.debug(
+            "Should Not Run Cooldown Logic: deployment id is none or model group can't be found."
+        )
+        return False
+
+    #########################################################
+    # If time_to_cooldown is 0 or 0.0000000, don't run cooldown logic
+    #########################################################
+    if time_to_cooldown is not None and math.isclose(
+        a=time_to_cooldown, b=0.0, abs_tol=1e-9
+    ):
+        verbose_router_logger.debug(
+            "Should Not Run Cooldown Logic: time_to_cooldown is effectively 0"
+        )
+        return False
+
+    if litellm_router_instance.disable_cooldowns:
+        verbose_router_logger.debug(
+            "Should Not Run Cooldown Logic: disable_cooldowns is True"
+        )
+        return False
+
+    if not _is_cooldown_required(
+        litellm_router_instance=litellm_router_instance,
+        model_id=deployment,
+        exception_status=exception_status,
+        exception_str=str(original_exception),
+    ):
+        verbose_router_logger.debug(
+            "Should Not Run Cooldown Logic: _is_cooldown_required returned False"
+        )
+        return False
+
+    if deployment in litellm_router_instance.provider_default_deployment_ids:
+        verbose_router_logger.debug(
+            "Should Not Run Cooldown Logic: deployment is in provider_default_deployment_ids"
+        )
+        return False
+
+    return True
+
+
+def _should_cooldown_deployment(
+    litellm_router_instance: LitellmRouter,
+    deployment: str,
+    exception_status: Union[str, int],
+    original_exception: Any,
+) -> bool:
+    """
+    Helper that decides if a deployment should be put in cooldown
+
+    Returns True if the deployment should be put in cooldown
+    Returns False if the deployment should not be put in cooldown
+
+    Parameters:
+        litellm_router_instance (LitellmRouter): Router that owns the deployment.
+        deployment (str): Id of the deployment that failed.
+        exception_status (Union[str, int]): Status of the exception.
+        original_exception (Any): The exception; only used by the v1 logic.
+
+    Deployment is put in cooldown when:
+    - v2 logic (Current) - used when no allowed fails policy is set and
+      Router.allowed_fails is unset / default. Cooldown if any of:
+        - got a 429 error and the model group has more than one deployment
+        - 100% of this minute's requests failed and the request count is
+          >= SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD
+        - fails / (successes + fails) > DEFAULT_FAILURE_THRESHOLD_PERCENT, the
+          request count is >= DEFAULT_FAILURE_THRESHOLD_MINIMUM_REQUESTS and
+          the model group has more than one deployment
+        - litellm._should_retry() returns False for the status code
+
+    - v1 logic (Legacy): if allowed fails or an allowed fails policy is set,
+      the decision is delegated to should_cooldown_based_on_allowed_fails_policy()
+    """
+    ## BASE CASE - single deployment
+    model_group = litellm_router_instance.get_model_group(id=deployment)
+    is_single_deployment_model_group = (
+        model_group is not None and len(model_group) == 1
+    )
+
+    uses_legacy_allowed_fails_logic = (
+        litellm_router_instance.allowed_fails_policy is not None
+        or _is_allowed_fails_set_on_router(
+            litellm_router_instance=litellm_router_instance
+        )
+    )
+    if uses_legacy_allowed_fails_logic:
+        return should_cooldown_based_on_allowed_fails_policy(
+            litellm_router_instance=litellm_router_instance,
+            deployment=deployment,
+            original_exception=original_exception,
+        )
+
+    num_successes_this_minute = get_deployment_successes_for_current_minute(
+        litellm_router_instance=litellm_router_instance, deployment_id=deployment
+    )
+    num_fails_this_minute = get_deployment_failures_for_current_minute(
+        litellm_router_instance=litellm_router_instance, deployment_id=deployment
+    )
+
+    total_requests_this_minute = num_successes_this_minute + num_fails_this_minute
+    percent_fails = 0.0
+    if total_requests_this_minute > 0:
+        percent_fails = num_fails_this_minute / total_requests_this_minute
+    verbose_router_logger.debug(
+        "percent fails for deployment = %s, percent fails = %s, num successes = %s, num fails = %s",
+        deployment,
+        percent_fails,
+        num_successes_this_minute,
+        num_fails_this_minute,
+    )
+
+    # Cast once and reuse for both the 429 check and the retry check below.
+    exception_status_int = cast_exception_status_to_int(exception_status)
+
+    if exception_status_int == 429 and not is_single_deployment_model_group:
+        return True
+
+    if (
+        percent_fails == 1.0
+        and total_requests_this_minute >= SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD
+    ):
+        # Cooldown if all requests failed and we have reasonable traffic
+        return True
+
+    if (
+        percent_fails > DEFAULT_FAILURE_THRESHOLD_PERCENT
+        and total_requests_this_minute >= DEFAULT_FAILURE_THRESHOLD_MINIMUM_REQUESTS
+        and not is_single_deployment_model_group  # by default we should avoid cooldowns on single deployment model groups
+    ):
+        # Only apply error rate cooldown when we have enough requests to make the percentage meaningful
+        return True
+
+    # Errors that litellm would not retry put the deployment in cooldown.
+    if litellm._should_retry(status_code=exception_status_int) is False:
+        return True
+
+    return False
+
+
+def _set_cooldown_deployments(
+    litellm_router_instance: LitellmRouter,
+    original_exception: Any,
+    exception_status: Union[str, int],
+    deployment: Optional[str] = None,
+    time_to_cooldown: Optional[float] = None,
+) -> bool:
+    """
+    Put a deployment into the router's cooldown cache when the failure warrants it.
+
+    The deployment is added only if _should_run_cooldown_logic() allows the
+    cooldown logic to run and _should_cooldown_deployment() then decides the
+    deployment should cool down. After adding it, the cooldown event callback
+    is scheduled as an asyncio task.
+
+    Parameters:
+        litellm_router_instance (LitellmRouter): Router that owns the cooldown cache.
+        original_exception (Any): The exception that triggered this call.
+        exception_status (Union[str, int]): Status of the exception.
+        deployment (Optional[str]): Id of the deployment that failed.
+        time_to_cooldown (Optional[float]): Cooldown duration passed to the cache.
+
+    Returns:
+    - True if the deployment was put in cooldown
+    - False if the deployment was not put in cooldown
+    """
+    verbose_router_logger.debug("checks 'should_run_cooldown_logic'")
+
+    cooldown_logic_enabled = _should_run_cooldown_logic(
+        litellm_router_instance=litellm_router_instance,
+        deployment=deployment,
+        exception_status=exception_status,
+        original_exception=original_exception,
+        time_to_cooldown=time_to_cooldown,
+    )
+    # NOTE(review): the extra `deployment is None` test presumably exists so
+    # type checkers narrow Optional[str] to str for the calls below - confirm.
+    if not cooldown_logic_enabled or deployment is None:
+        verbose_router_logger.debug("should_run_cooldown_logic returned False")
+        return False
+
+    exception_status_int = cast_exception_status_to_int(exception_status)
+    verbose_router_logger.debug(f"Attempting to add {deployment} to cooldown list")
+
+    needs_cooldown = _should_cooldown_deployment(
+        litellm_router_instance=litellm_router_instance,
+        deployment=deployment,
+        exception_status=exception_status,
+        original_exception=original_exception,
+    )
+    if not needs_cooldown:
+        return False
+
+    litellm_router_instance.cooldown_cache.add_deployment_to_cooldown(
+        model_id=deployment,
+        original_exception=original_exception,
+        exception_status=exception_status_int,
+        cooldown_time=time_to_cooldown,
+    )
+
+    # Trigger cooldown callback handler
+    # NOTE(review): asyncio.create_task() raises RuntimeError when no event
+    # loop is running - verify every caller runs inside a loop.
+    cooldown_event = router_cooldown_event_callback(
+        litellm_router_instance=litellm_router_instance,
+        deployment_id=deployment,
+        exception_status=exception_status,
+        cooldown_time=time_to_cooldown,
+    )
+    asyncio.create_task(cooldown_event)
+    return True
+
+
+async def _async_get_cooldown_deployments(
+    litellm_router_instance: LitellmRouter,
+    parent_otel_span: Optional[Span],
+) -> List[str]:
+    """
+    Async counterpart of '_get_cooldown_deployments'.
+
+    Asks the router's cooldown cache for the active cooldowns of all model ids
+    known to the router and returns the first element of each returned tuple.
+    An empty list is returned unless the cache result is a non-empty list
+    whose first entry is a tuple.
+    """
+    all_model_ids = litellm_router_instance.get_model_ids()
+    cooldown_models = (
+        await litellm_router_instance.cooldown_cache.async_get_active_cooldowns(
+            model_ids=all_model_ids,
+            parent_otel_span=parent_otel_span,
+        )
+    )
+
+    # isinstance() already rejects None, so no separate None check is needed
+    looks_like_tuple_list = (
+        isinstance(cooldown_models, list)
+        and len(cooldown_models) > 0
+        and isinstance(cooldown_models[0], tuple)
+    )
+    deployment_ids = (
+        [entry[0] for entry in cooldown_models] if looks_like_tuple_list else []
+    )
+
+    verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
+    return deployment_ids
+
+
+async def _async_get_cooldown_deployments_with_debug_info(
+    litellm_router_instance: LitellmRouter,
+    parent_otel_span: Optional[Span],
+) -> List[tuple]:
+    """
+    Async lookup of the active cooldowns, returned exactly as the cooldown
+    cache provides them.
+
+    Unlike '_async_get_cooldown_deployments', the cache result is passed
+    through unchanged instead of being reduced to deployment ids.
+    """
+    all_model_ids = litellm_router_instance.get_model_ids()
+    cache = litellm_router_instance.cooldown_cache
+    cooldown_models = await cache.async_get_active_cooldowns(
+        model_ids=all_model_ids, parent_otel_span=parent_otel_span
+    )
+
+    verbose_router_logger.debug(f"retrieve cooldown models: {cooldown_models}")
+    return cooldown_models
+
+
+def _get_cooldown_deployments(
+    litellm_router_instance: LitellmRouter, parent_otel_span: Optional[Span]
+) -> List[str]:
+    """
+    Return the ids of the deployments that currently have an active cooldown.
+
+    Asks the router's cooldown cache for the active cooldowns of all model ids
+    known to the router and returns the first element of each returned tuple.
+    An empty list is returned unless the cache result is a non-empty list
+    whose first entry is a tuple.
+    """
+    all_model_ids = litellm_router_instance.get_model_ids()
+    active_cooldowns = litellm_router_instance.cooldown_cache.get_active_cooldowns(
+        model_ids=all_model_ids, parent_otel_span=parent_otel_span
+    )
+
+    # isinstance() already rejects None, so no separate None check is needed
+    if not isinstance(active_cooldowns, list):
+        return []
+    if len(active_cooldowns) == 0 or not isinstance(active_cooldowns[0], tuple):
+        return []
+
+    return [entry[0] for entry in active_cooldowns]
+
+
+def should_cooldown_based_on_allowed_fails_policy(
+    litellm_router_instance: LitellmRouter,
+    deployment: str,
+    original_exception: Any,
+) -> bool:
+    """
+    Check if fails are within the allowed limit and update the number of fails.
+
+    The limit is the value returned by the router's
+    get_allowed_fails_from_policy() for this exception; Router.allowed_fails is
+    used only when that lookup returns None. The failure counter is read from
+    the router's `failed_calls` cache and written back, with a TTL of the
+    router's cooldown_time (or DEFAULT_COOLDOWN_TIME_SECONDS), only while the
+    limit has not been exceeded.
+
+    Parameters:
+        litellm_router_instance (LitellmRouter): Router holding the policy and counters.
+        deployment (str): Id of the deployment that failed; used as the cache key.
+        original_exception (Any): The exception used for the policy lookup.
+
+    Returns:
+    - True if fails exceed the allowed limit (should cooldown)
+    - False if fails are within the allowed limit (should not cooldown)
+    """
+    allowed_fails = litellm_router_instance.get_allowed_fails_from_policy(
+        exception=original_exception,
+    )
+    if allowed_fails is None:
+        # Explicit None check instead of `or`: a policy value of 0 is falsy
+        # but is a real limit and must not fall back to the router-wide value.
+        allowed_fails = litellm_router_instance.allowed_fails
+
+    cooldown_time = (
+        litellm_router_instance.cooldown_time or DEFAULT_COOLDOWN_TIME_SECONDS
+    )
+
+    current_fails = litellm_router_instance.failed_calls.get_cache(key=deployment) or 0
+    updated_fails = current_fails + 1
+
+    if updated_fails > allowed_fails:
+        return True
+
+    litellm_router_instance.failed_calls.set_cache(
+        key=deployment, value=updated_fails, ttl=cooldown_time
+    )
+    return False
+
+
+def _is_allowed_fails_set_on_router(
+    litellm_router_instance: LitellmRouter,
+) -> bool:
+    """
+    Tell whether Router.allowed_fails carries a non-default value.
+
+    Returns:
+    - True if Router.allowed_fails is not None and differs from litellm.allowed_fails
+    - False if Router.allowed_fails is None or equals litellm.allowed_fails
+    """
+    router_allowed_fails = litellm_router_instance.allowed_fails
+    return router_allowed_fails is not None and bool(
+        router_allowed_fails != litellm.allowed_fails
+    )
+
+
+def cast_exception_status_to_int(exception_status: Union[str, int]) -> int:
+    """
+    Convert an exception status to an int.
+
+    Non-string values are returned unchanged. A string is converted with
+    int(); if that conversion fails, the failure is logged at debug level and
+    500 is returned.
+    """
+    if not isinstance(exception_status, str):
+        return exception_status
+
+    try:
+        return int(exception_status)
+    except Exception:
+        verbose_router_logger.debug(
+            f"Unable to cast exception status to int {exception_status}. Defaulting to status=500."
+        )
+        return 500