chore: initial snapshot for gitea/github upload
This commit is contained in:
@@ -0,0 +1,46 @@
|
||||
# Slack Alerting on LiteLLM Gateway
|
||||
|
||||
This folder contains the Slack Alerting integration for LiteLLM Gateway.
|
||||
|
||||
## Folder Structure
|
||||
|
||||
- `slack_alerting.py`: This is the main file that handles sending different types of alerts
|
||||
- `batching_handler.py`: Handles batching and sending httpx POST requests to Slack. Alerts are sent every 10s or once more than X events are queued, so that LiteLLM keeps good performance under high traffic.
|
||||
- `types.py`: This file contains the AlertType enum which is used to define the different types of alerts that can be sent to Slack.
|
||||
- `utils.py`: This file contains common utils used specifically for slack alerting
|
||||
|
||||
## Budget Alert Types
|
||||
|
||||
The `budget_alert_types.py` module provides a flexible framework for handling different types of budget alerts:
|
||||
|
||||
- `BaseBudgetAlertType`: An abstract base class with abstract methods that all alert types must implement:
|
||||
- `get_event_group()`: Returns the Litellm_EntityType for the alert
|
||||
- `get_event_message()`: Returns the message prefix for the alert
|
||||
- `get_id(user_info)`: Returns the ID to use for caching/tracking the alert
|
||||
|
||||
Concrete implementations include:
|
||||
- `ProxyBudgetAlert`: Alerting for proxy-level budget concerns
|
||||
- `SoftBudgetAlert`: Alerting when soft budgets are crossed
|
||||
- `UserBudgetAlert`: Alerting for user-level budget concerns
|
||||
- `TeamBudgetAlert`: Alerting for team-level budget concerns
- `OrganizationBudgetAlert`: Alerting for organization-level budget concerns
|
||||
- `TokenBudgetAlert`: Alerting for API key budget concerns
|
||||
- `ProjectedLimitExceededAlert`: Alerting when projected spend will exceed budget
- `ProjectBudgetAlert`: Alerting for project-level budget concerns
|
||||
|
||||
Use the `get_budget_alert_type()` factory function to get the appropriate alert type class for a given alert type string:
|
||||
|
||||
```python
|
||||
from litellm.integrations.SlackAlerting.budget_alert_types import get_budget_alert_type
|
||||
|
||||
# Get the appropriate handler
|
||||
budget_alert_class = get_budget_alert_type("user_budget")
|
||||
|
||||
# Use the handler methods
|
||||
event_group = budget_alert_class.get_event_group() # Returns Litellm_EntityType.USER
|
||||
event_message = budget_alert_class.get_event_message() # Returns "User Budget: "
|
||||
cache_id = budget_alert_class.get_id(user_info) # Returns user_id
|
||||
```
|
||||
|
||||
To add a new budget alert type, simply create a new class that extends `BaseBudgetAlertType` and implements all the required methods, then add it to the dictionary in the `get_budget_alert_type()` function.
|
||||
|
||||
## Further Reading
|
||||
- [Doc setting up Alerting on LiteLLM Proxy (Gateway)](https://docs.litellm.ai/docs/proxy/alerting)
|
||||
@@ -0,0 +1,81 @@
|
||||
"""
|
||||
Handles Batching + sending Httpx Post requests to slack
|
||||
|
||||
Slack alerts are sent every 10s or when events are greater than X events
|
||||
|
||||
see custom_batch_logger.py for more details / defaults
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
from litellm._logging import verbose_proxy_logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .slack_alerting import SlackAlerting as _SlackAlerting
|
||||
|
||||
SlackAlertingType = _SlackAlerting
|
||||
else:
|
||||
SlackAlertingType = Any
|
||||
|
||||
|
||||
def squash_payloads(queue):
    """
    Collapse queued alert items into one entry per (url, alert_type) pair.

    Returns a dict whose values have the shape ``{"item": <first item seen for
    that pair>, "count": <number of items seen for that pair>}``.

    - An empty queue yields an empty dict.
    - A queue holding exactly one item is returned under the literal key
      ``"key"`` without reading the item's "url" / "alert_type" fields.
    - Otherwise the keys are ``(url, alert_type)`` tuples, in first-seen order.
    """
    pending = len(queue)
    if pending == 0:
        return {}
    if pending == 1:
        return {"key": {"item": queue[0], "count": 1}}

    grouped = {}
    for entry in queue:
        group_key = (entry["url"], entry["alert_type"])
        bucket = grouped.get(group_key)
        if bucket is None:
            # First alert for this webhook/alert-type pair: keep it as the representative item
            grouped[group_key] = {"item": entry, "count": 1}
        else:
            # Repeat alert: only the counter changes, the payloads are not merged
            bucket["count"] += 1

    return grouped
|
||||
|
||||
|
||||
def _print_alerting_payload_warning(
    payload: dict, slackAlertingInstance: SlackAlertingType
):
    """
    Log the alert payload at WARNING level when console logging is enabled.

    The payload is only logged when
    ``slackAlertingInstance.alerting_args.log_to_console`` is exactly ``True``;
    any other value makes this a no-op.

    Relevant issue: https://github.com/BerriAI/litellm/issues/7372
    """
    console_logging_enabled = slackAlertingInstance.alerting_args.log_to_console
    if console_logging_enabled is not True:
        return
    verbose_proxy_logger.warning(payload)
|
||||
|
||||
|
||||
async def send_to_webhook(slackAlertingInstance: SlackAlertingType, item, count):
    """
    Send a single slack alert to the webhook

    Args:
        slackAlertingInstance: object providing ``async_http_handler.post`` and
            ``alerting_args.log_to_console``.
        item: queued alert dict; "url" and "headers" are read for the request and
            "payload" (defaulting to ``{}``) is JSON-encoded as the request body.
        count: number of alerts this item represents. When greater than 1 the
            payload text is prefixed with ``[Num Alerts: <count>]``.

    Failures are best-effort: a non-200 response or any exception is logged at
    debug level and never raised to the caller. The payload is always handed to
    ``_print_alerting_payload_warning`` afterwards.
    """
    import json

    queued_payload = item.get("payload", {})
    # Work on a shallow copy so the "[Num Alerts: N]" prefix is never written back
    # into the queued item; re-sending the same item would otherwise stack prefixes.
    payload = dict(queued_payload) if isinstance(queued_payload, dict) else queued_payload
    try:
        if count > 1:
            payload["text"] = f"[Num Alerts: {count}]\n\n{payload['text']}"

        response = await slackAlertingInstance.async_http_handler.post(
            url=item["url"],
            headers=item["headers"],
            data=json.dumps(payload),
        )
        if response.status_code != 200:
            verbose_proxy_logger.debug(
                f"Error sending slack alert to url={item['url']}. Error={response.text}"
            )
    except Exception as e:
        # Deliberate best-effort: alert delivery must not break the caller
        verbose_proxy_logger.debug(f"Error sending slack alert: {str(e)}")
    finally:
        _print_alerting_payload_warning(
            payload, slackAlertingInstance=slackAlertingInstance
        )
|
||||
@@ -0,0 +1,115 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Literal
|
||||
|
||||
from litellm.proxy._types import CallInfo
|
||||
|
||||
|
||||
class BaseBudgetAlertType(ABC):
    """Abstract interface that every budget alert type implements."""

    @abstractmethod
    def get_event_message(self) -> str:
        """Return the message text associated with this alert type."""
        ...

    @abstractmethod
    def get_id(self, user_info: CallInfo) -> str:
        """Return the identifier used to cache/track this alert for ``user_info``."""
        ...
|
||||
|
||||
|
||||
class ProxyBudgetAlert(BaseBudgetAlertType):
    """Budget alert type for the proxy as a whole."""

    def get_event_message(self) -> str:
        """Return the message prefix for proxy budget alerts."""
        prefix = "Proxy Budget: "
        return prefix

    def get_id(self, user_info: CallInfo) -> str:
        """Return a fixed ID; ``user_info`` is not consulted."""
        fixed_id = "default_id"
        return fixed_id
|
||||
|
||||
|
||||
class SoftBudgetAlert(BaseBudgetAlertType):
    """Budget alert type used when a soft budget is crossed."""

    def get_event_message(self) -> str:
        """Return the message prefix for soft budget alerts."""
        prefix = "Soft Budget Crossed: "
        return prefix

    def get_id(self, user_info: CallInfo) -> str:
        """Return ``user_info.token``, or "default_id" when it is empty/None."""
        token = user_info.token
        return token if token else "default_id"
|
||||
|
||||
|
||||
class UserBudgetAlert(BaseBudgetAlertType):
    """Budget alert type for user-level budgets."""

    def get_event_message(self) -> str:
        """Return the message prefix for user budget alerts."""
        prefix = "User Budget: "
        return prefix

    def get_id(self, user_info: CallInfo) -> str:
        """Return ``user_info.user_id``, or "default_id" when it is empty/None."""
        user_id = user_info.user_id
        return user_id if user_id else "default_id"
|
||||
|
||||
|
||||
class TeamBudgetAlert(BaseBudgetAlertType):
    """Budget alert type for team-level budgets."""

    def get_event_message(self) -> str:
        """Return the message prefix for team budget alerts."""
        prefix = "Team Budget: "
        return prefix

    def get_id(self, user_info: CallInfo) -> str:
        """Return ``user_info.team_id``, or "default_id" when it is empty/None."""
        team_id = user_info.team_id
        return team_id if team_id else "default_id"
|
||||
|
||||
|
||||
class OrganizationBudgetAlert(BaseBudgetAlertType):
    """Budget alert type for organization-level budgets."""

    def get_event_message(self) -> str:
        """Return the message prefix for organization budget alerts."""
        prefix = "Organization Budget: "
        return prefix

    def get_id(self, user_info: CallInfo) -> str:
        """Return ``user_info.organization_id``, or "default_id" when it is empty/None."""
        organization_id = user_info.organization_id
        return organization_id if organization_id else "default_id"
|
||||
|
||||
|
||||
class TokenBudgetAlert(BaseBudgetAlertType):
    """Budget alert type for API key (token) budgets."""

    def get_event_message(self) -> str:
        """Return the message prefix for key budget alerts."""
        prefix = "Key Budget: "
        return prefix

    def get_id(self, user_info: CallInfo) -> str:
        """Return ``user_info.token``, or "default_id" when it is empty/None."""
        token = user_info.token
        return token if token else "default_id"
|
||||
|
||||
|
||||
class ProjectedLimitExceededAlert(BaseBudgetAlertType):
    """Budget alert type used when projected spend exceeds the limit."""

    def get_event_message(self) -> str:
        """Return the message for projected-limit-exceeded alerts."""
        message = "Key Budget: Projected Limit Exceeded"
        return message

    def get_id(self, user_info: CallInfo) -> str:
        """Return ``user_info.token``, or "default_id" when it is empty/None."""
        token = user_info.token
        return token if token else "default_id"
|
||||
|
||||
|
||||
class ProjectBudgetAlert(BaseBudgetAlertType):
    """Budget alert type for project-level budgets."""

    def get_event_message(self) -> str:
        """Return the message prefix for project budget alerts."""
        prefix = "Project Budget: "
        return prefix

    def get_id(self, user_info: CallInfo) -> str:
        """Return ``user_info.token``, or "default_id" when it is empty/None."""
        # NOTE(review): keyed by token rather than a project identifier — confirm this is intended
        token = user_info.token
        return token if token else "default_id"
|
||||
|
||||
|
||||
def get_budget_alert_type(
    type: Literal[
        "token_budget",
        "user_budget",
        "soft_budget",
        "max_budget_alert",
        "team_budget",
        "organization_budget",
        "proxy_budget",
        "projected_limit_exceeded",
        "project_budget",
    ],
) -> BaseBudgetAlertType:
    """
    Factory: return a new budget alert handler instance for ``type``.

    "max_budget_alert" and "token_budget" both map to ``TokenBudgetAlert``.
    Any value that is not a known alert type falls back to ``ProxyBudgetAlert``.
    """
    # Dispatch table from alert type string to handler class
    handler_classes = {
        "proxy_budget": ProxyBudgetAlert,
        "soft_budget": SoftBudgetAlert,
        "user_budget": UserBudgetAlert,
        "max_budget_alert": TokenBudgetAlert,
        "team_budget": TeamBudgetAlert,
        "organization_budget": OrganizationBudgetAlert,
        "token_budget": TokenBudgetAlert,
        "projected_limit_exceeded": ProjectedLimitExceededAlert,
        "project_budget": ProjectBudgetAlert,
    }

    handler_cls = handler_classes.get(type, ProxyBudgetAlert)
    return handler_cls()
|
||||
@@ -0,0 +1,177 @@
|
||||
"""
|
||||
Class to check for LLM API hanging requests
|
||||
|
||||
|
||||
Notes:
|
||||
- Do not create tasks that sleep, that can saturate the event loop
|
||||
- Do not store large objects (eg. messages in memory) that can increase RAM usage
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
import litellm
|
||||
from litellm._logging import verbose_proxy_logger
|
||||
from litellm.caching.in_memory_cache import InMemoryCache
|
||||
from litellm.litellm_core_utils.core_helpers import get_litellm_metadata_from_kwargs
|
||||
from litellm.types.integrations.slack_alerting import (
|
||||
HANGING_ALERT_BUFFER_TIME_SECONDS,
|
||||
MAX_OLDEST_HANGING_REQUESTS_TO_CHECK,
|
||||
HangingRequestData,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
|
||||
else:
|
||||
SlackAlerting = Any
|
||||
|
||||
|
||||
class AlertingHangingRequestCheck:
    """
    Class to safely handle checking hanging requests alerts

    Requests are registered in an in-memory cache; a periodic check then sends
    a Slack alert (through ``slack_alerting_object``) for every tracked request
    that has no recorded request status.
    """

    def __init__(
        self,
        slack_alerting_object: SlackAlerting,
    ):
        """
        Store the alerting object and create the cache of tracked requests.

        The cache's default TTL is ``alerting_threshold`` plus
        ``HANGING_ALERT_BUFFER_TIME_SECONDS``, truncated to an int.
        """
        self.slack_alerting_object = slack_alerting_object
        self.hanging_request_cache = InMemoryCache(
            default_ttl=int(
                self.slack_alerting_object.alerting_threshold
                + HANGING_ALERT_BUFFER_TIME_SECONDS
            ),
        )

    async def add_request_to_hanging_request_check(
        self,
        request_data: Optional[dict] = None,
    ):
        """
        Add a request to the hanging request cache. This is the list of request_ids that gets periodically checked for hanging requests

        Does nothing when ``request_data`` is None. Only a small
        ``HangingRequestData`` record (request id, model, api base, key alias,
        team alias) is stored, not the request itself.
        """
        if request_data is None:
            return

        request_metadata = get_litellm_metadata_from_kwargs(kwargs=request_data)
        model = request_data.get("model", "")
        api_base: Optional[str] = None

        # api_base is only resolved when the request carries a dict-valued "deployment"
        if request_data.get("deployment", None) is not None and isinstance(
            request_data["deployment"], dict
        ):
            api_base = litellm.get_api_base(
                model=model,
                optional_params=request_data["deployment"].get("litellm_params", {}),
            )

        hanging_request_data = HangingRequestData(
            request_id=request_data.get("litellm_call_id", ""),
            model=model,
            api_base=api_base,
            key_alias=request_metadata.get("user_api_key_alias", ""),
            team_alias=request_metadata.get("user_api_key_team_alias", ""),
        )

        # Cached under the request id, with the same TTL as the cache default
        await self.hanging_request_cache.async_set_cache(
            key=hanging_request_data.request_id,
            value=hanging_request_data,
            ttl=int(
                self.slack_alerting_object.alerting_threshold
                + HANGING_ALERT_BUFFER_TIME_SECONDS
            ),
        )
        return

    async def send_alerts_for_hanging_requests(self):
        """
        Send alerts for hanging requests

        Looks at up to ``MAX_OLDEST_HANGING_REQUESTS_TO_CHECK`` keys from the
        hanging request cache. Entries with a recorded request status are
        removed from the cache; every other entry triggers an alert.
        Returns early without checking anything when
        ``proxy_logging_obj.internal_usage_cache`` is None.
        """
        # NOTE(review): function-level import — presumably avoids a circular import; confirm
        from litellm.proxy.proxy_server import proxy_logging_obj

        #########################################################
        # Find all requests that have been hanging for more than the alerting threshold
        # Get the oldest items in the cache (up to MAX_OLDEST_HANGING_REQUESTS_TO_CHECK)
        # and check if they have completed
        #########################################################
        # check if request_id is in internal usage cache
        if proxy_logging_obj.internal_usage_cache is None:
            return

        hanging_requests = await self.hanging_request_cache.async_get_oldest_n_keys(
            n=MAX_OLDEST_HANGING_REQUESTS_TO_CHECK,
        )

        for request_id in hanging_requests:
            hanging_request_data: Optional[
                HangingRequestData
            ] = await self.hanging_request_cache.async_get_cache(
                key=request_id,
            )

            # Key was listed but has no value any more (presumably expired) — skip it
            if hanging_request_data is None:
                continue

            request_status = (
                await proxy_logging_obj.internal_usage_cache.async_get_cache(
                    key="request_status:{}".format(hanging_request_data.request_id),
                    litellm_parent_otel_span=None,
                    local_only=True,
                )
            )
            # this means the request status was either success or fail
            # and is not hanging
            if request_status is not None:
                # clear this request from hanging request cache since the request was either success or failed
                self.hanging_request_cache._remove_key(
                    key=request_id,
                )
                continue

            ################
            # Send the Alert on Slack
            ################
            # NOTE(review): the entry is not removed after alerting, so it can be
            # picked up again by a later check until its TTL expires — confirm intended
            await self.send_hanging_request_alert(
                hanging_request_data=hanging_request_data
            )

        return

    async def check_for_hanging_requests(
        self,
    ):
        """
        Background task that checks all request ids in self.hanging_request_cache to check if they have completed

        Runs every alerting_threshold/2 seconds to check for hanging requests
        """
        # Loops forever; nothing here catches exceptions, so an exception raised by
        # send_alerts_for_hanging_requests propagates out of this coroutine.
        while True:
            verbose_proxy_logger.debug("Checking for hanging requests....")
            await self.send_alerts_for_hanging_requests()
            await asyncio.sleep(self.slack_alerting_object.alerting_threshold / 2)

    async def send_hanging_request_alert(
        self,
        hanging_request_data: HangingRequestData,
    ):
        """
        Send a hanging request alert

        Builds a message containing the request's model, API base, key alias and
        team alias, and sends it through ``slack_alerting_object.send_alert`` as
        a "Medium" level ``AlertType.llm_requests_hanging`` alert.
        """
        from litellm.integrations.SlackAlerting.slack_alerting import AlertType

        ################
        # Send the Alert on Slack
        ################
        # Continuation lines of this string start at column 0 so that no leading
        # whitespace ends up in the alert text.
        request_info = f"""Request Model: `{hanging_request_data.model}`
API Base: `{hanging_request_data.api_base}`
Key Alias: `{hanging_request_data.key_alias}`
Team Alias: `{hanging_request_data.team_alias}`"""

        alerting_message = f"`Requests are hanging - {self.slack_alerting_object.alerting_threshold}s+ request time`"
        await self.slack_alerting_object.send_alert(
            message=alerting_message + "\n" + request_info,
            level="Medium",
            alert_type=AlertType.llm_requests_hanging,
            alerting_metadata=hanging_request_data.alerting_metadata or {},
            request_model=hanging_request_data.model,
            api_base=hanging_request_data.api_base,
        )
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,99 @@
|
||||
"""
|
||||
Utils used for slack alerting
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
||||
|
||||
import litellm
|
||||
from litellm.proxy._types import AlertType
|
||||
from litellm.secret_managers.main import get_secret
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from litellm.litellm_core_utils.litellm_logging import Logging as _Logging
|
||||
|
||||
Logging = _Logging
|
||||
else:
|
||||
Logging = Any
|
||||
|
||||
|
||||
def process_slack_alerting_variables(
    alert_to_webhook_url: Optional[Dict[AlertType, Union[List[str], str]]]
) -> Optional[Dict[AlertType, Union[List[str], str]]]:
    """
    Resolve secret references inside ``alert_to_webhook_url``.

    Every webhook URL containing "os.environ/" (e.g. os.environ/SLACK_WEBHOOK_URL_1)
    is looked up through ``get_secret`` and replaced by the returned value; all
    other URLs are kept unchanged. Values may be a single URL or a list of URLs.

    The mapping is updated in place and the same object is returned.
    ``None`` is returned unchanged.

    Raises:
        ValueError: if ``get_secret`` returns anything other than a string.
    """

    def _resolve(raw_url: str) -> str:
        # Plain URLs pass through untouched; only secret references are looked up
        if "os.environ/" not in raw_url:
            return raw_url
        secret_value = get_secret(secret_name=raw_url)
        if not isinstance(secret_value, str):
            raise ValueError(
                f"Invalid webhook url value for: {raw_url}. Got type={type(secret_value)}"
            )
        return secret_value

    if alert_to_webhook_url is None:
        return None

    # Only existing keys are reassigned, so the dict's size is stable during iteration
    for alert_type, configured in alert_to_webhook_url.items():
        if isinstance(configured, list):
            alert_to_webhook_url[alert_type] = [_resolve(url) for url in configured]
        else:
            alert_to_webhook_url[alert_type] = _resolve(configured)

    return alert_to_webhook_url
|
||||
|
||||
|
||||
async def _add_langfuse_trace_id_to_alert(
    request_data: Optional[dict] = None,
) -> Optional[str]:
    """
    Returns langfuse trace url

    The trace id is read from ``request_data["litellm_logging_obj"]`` via
    ``_get_trace_id(service_name="langfuse")``; the lookup is attempted up to
    3 times, waiting 3s between attempts.

    Returns ``None`` when:
    - "langfuse" is not among the registered callbacks,
    - ``request_data`` is None or carries no "litellm_logging_obj",
    - no trace id is available after all attempts, or
    - no langfuse callback object is found.
    """
    #########################################################
    # Only run if langfuse is added as a callback
    #########################################################
    if "langfuse" not in litellm.logging_callback_manager._get_all_callbacks():
        return None

    if request_data is None:
        return None

    litellm_logging_obj: Logging = request_data.get("litellm_logging_obj", None)
    if litellm_logging_obj is None:
        return None

    max_attempts = 3
    retry_delay_seconds = 3
    trace_id: Optional[str] = None
    for attempt in range(max_attempts):
        trace_id = litellm_logging_obj._get_trace_id(service_name="langfuse")
        if trace_id is not None:
            break
        # Wait before retrying, but not after the final attempt: nothing follows it
        if attempt < max_attempts - 1:
            await asyncio.sleep(retry_delay_seconds)

    # Without a trace id the link would end in "/trace/None"; report "no url" instead
    if trace_id is None:
        return None

    langfuse_object = litellm_logging_obj._get_callback_object(
        service_name="langfuse"
    )
    if langfuse_object is None:
        return None

    base_url = langfuse_object.Langfuse.base_url
    return f"{base_url}/trace/{trace_id}"
|
||||
Reference in New Issue
Block a user