chore: initial public snapshot for github upload
This commit is contained in:
@@ -0,0 +1,286 @@
|
||||
import asyncio
|
||||
import gzip
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.integrations.custom_batch_logger import CustomBatchLogger
|
||||
from litellm.integrations.datadog.datadog_handler import (
|
||||
get_datadog_env,
|
||||
get_datadog_hostname,
|
||||
get_datadog_pod_name,
|
||||
get_datadog_service,
|
||||
)
|
||||
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
|
||||
from litellm.llms.custom_httpx.http_handler import (
|
||||
get_async_httpx_client,
|
||||
httpxSpecialProvider,
|
||||
)
|
||||
from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus
|
||||
from litellm.types.integrations.datadog_metrics import (
|
||||
DatadogMetricPoint,
|
||||
DatadogMetricSeries,
|
||||
DatadogMetricsPayload,
|
||||
)
|
||||
from litellm.types.utils import StandardLoggingPayload
|
||||
|
||||
|
||||
class DatadogMetricsLogger(CustomBatchLogger):
    """
    Batching logger that ships LiteLLM latency and request-count metrics to
    the Datadog metrics intake (``POST /api/v2/series``).

    Metric series are accumulated on ``self.log_queue`` (managed by
    ``CustomBatchLogger``) and uploaded gzip-compressed, either when the
    queue reaches ``batch_size`` or on the periodic flush tick.
    """

    def __init__(self, start_periodic_flush: bool = True, **kwargs):
        """
        Initialize the Datadog metrics logger.

        Args:
            start_periodic_flush: when True, schedule ``periodic_flush``
                on the running event loop.
            **kwargs: forwarded to ``CustomBatchLogger`` (``batch_size``,
                ``flush_interval``, ``flush_lock``, ...).
        """
        # Credentials / site are taken from the environment.
        self.dd_api_key = os.getenv("DD_API_KEY")
        self.dd_app_key = os.getenv("DD_APP_KEY")
        self.dd_site = os.getenv("DD_SITE", "datadoghq.com")

        if not self.dd_api_key:
            # _upload_to_datadog() silently no-ops without an API key,
            # so this warning is the only visible signal of misconfiguration.
            verbose_logger.warning(
                "Datadog Metrics: DD_API_KEY is required. Integration will not work."
            )

        self.upload_url = f"https://api.{self.dd_site}/api/v2/series"

        self.async_client = get_async_httpx_client(
            llm_provider=httpxSpecialProvider.LoggingCallback
        )

        # Reuse a caller-provided flush_lock so self.flush_lock and the lock
        # used by CustomBatchLogger are the same object. (Previously a fresh
        # lock was always created, leaving the two out of sync whenever the
        # caller supplied one.)
        if kwargs.get("flush_lock") is not None:
            self.flush_lock = kwargs["flush_lock"]
        else:
            self.flush_lock = asyncio.Lock()
            kwargs["flush_lock"] = self.flush_lock

        # Send metrics more quickly to datadog (every 5 seconds)
        if "flush_interval" not in kwargs:
            kwargs["flush_interval"] = 5

        super().__init__(**kwargs)

        # Start periodic flush task only if instructed. asyncio.create_task
        # raises RuntimeError when there is no running event loop (e.g. the
        # logger is constructed from synchronous startup code); degrade
        # gracefully instead of failing the constructor.
        if start_periodic_flush:
            try:
                asyncio.create_task(self.periodic_flush())
            except RuntimeError:
                verbose_logger.warning(
                    "Datadog Metrics: no running event loop; periodic flush not started."
                )

    def _extract_tags(
        self,
        log: StandardLoggingPayload,
        status_code: Optional[Union[str, int]] = None,
    ) -> List[str]:
        """
        Builds the list of tags for a Datadog metric point.

        Args:
            log: standard logging payload for the request.
            status_code: status code to tag, if known.

        Returns:
            Tags in ``key:value`` form.
        """
        # Base deployment tags.
        # NOTE(review): HOSTNAME / POD_NAME are uppercase tag keys, which is
        # unusual for Datadog (tag keys are conventionally lowercase) — kept
        # as-is because existing dashboards may depend on this casing.
        tags = [
            f"env:{get_datadog_env()}",
            f"service:{get_datadog_service()}",
            f"version:{os.getenv('DD_VERSION', 'unknown')}",
            f"HOSTNAME:{get_datadog_hostname()}",
            f"POD_NAME:{get_datadog_pod_name()}",
        ]

        # Request-specific tags, added only when present on the payload.
        if provider := log.get("custom_llm_provider"):
            tags.append(f"provider:{provider}")

        if model := log.get("model"):
            tags.append(f"model_name:{model}")

        if model_group := log.get("model_group"):
            tags.append(f"model_group:{model_group}")

        if status_code is not None:
            tags.append(f"status_code:{status_code}")

        # Team attribution: prefer human-readable aliases over raw ids.
        metadata = log.get("metadata", {}) or {}
        team_tag = (
            metadata.get("user_api_key_team_alias")
            or metadata.get("team_alias")  # type: ignore
            or metadata.get("user_api_key_team_id")
            or metadata.get("team_id")  # type: ignore
        )

        if team_tag:
            tags.append(f"team:{team_tag}")

        return tags

    def _add_metrics_from_log(
        self,
        log: StandardLoggingPayload,
        kwargs: dict,
        status_code: Union[str, int] = "200",
    ):
        """
        Extracts latencies and appends Datadog metric series to the queue.

        Three series are queued per call (when their timing inputs exist):
        end-to-end latency, provider-only latency, and a request count.

        Args:
            log: standard logging payload for the request.
            kwargs: raw callback kwargs carrying ``start_time`` /
                ``end_time`` / ``api_call_start_time`` datetimes.
            status_code: status code to tag the series with.
        """
        tags = self._extract_tags(log, status_code=status_code)

        # We record metrics with the end_time as the timestamp for the point.
        # NOTE(review): datetime.now() is naive local time — presumably
        # end_time is too; confirm callers pass consistent datetimes.
        end_time_dt = kwargs.get("end_time") or datetime.now()
        timestamp = int(end_time_dt.timestamp())

        # 1. Total Request Latency Metric (End to End)
        start_time_dt = kwargs.get("start_time")
        if start_time_dt and end_time_dt:
            total_duration = (end_time_dt - start_time_dt).total_seconds()
            series_total_latency: DatadogMetricSeries = {
                "metric": "litellm.request.total_latency",
                "type": 3,  # gauge
                "points": [{"timestamp": timestamp, "value": total_duration}],
                "tags": tags,
            }
            self.log_queue.append(series_total_latency)

        # 2. LLM API Latency Metric (Provider alone)
        api_call_start_time = kwargs.get("api_call_start_time")
        if api_call_start_time and end_time_dt:
            llm_api_duration = (end_time_dt - api_call_start_time).total_seconds()
            series_llm_latency: DatadogMetricSeries = {
                "metric": "litellm.llm_api.latency",
                "type": 3,  # gauge
                "points": [{"timestamp": timestamp, "value": llm_api_duration}],
                "tags": tags,
            }
            self.log_queue.append(series_llm_latency)

        # 3. Request Count / Status Code (count metrics require an interval)
        series_count: DatadogMetricSeries = {
            "metric": "litellm.llm_api.request_count",
            "type": 1,  # count
            "points": [{"timestamp": timestamp, "value": 1.0}],
            "tags": tags,
            "interval": self.flush_interval,
        }
        self.log_queue.append(series_count)

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        """
        Queue metrics for a successful request; flush when the batch is full.

        Errors are logged and swallowed so metric emission never breaks the
        request path.
        """
        try:
            standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
                "standard_logging_object", None
            )

            if standard_logging_object is None:
                return

            self._add_metrics_from_log(
                log=standard_logging_object, kwargs=kwargs, status_code="200"
            )

            if len(self.log_queue) >= self.batch_size:
                await self.flush_queue()

        except Exception as e:
            verbose_logger.exception(
                f"Datadog Metrics: Error in async_log_success_event: {str(e)}"
            )

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        """
        Queue metrics for a failed request, tagged with the error's status
        code (default "500"); flush when the batch is full.

        Errors are logged and swallowed so metric emission never breaks the
        request path.
        """
        try:
            standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
                "standard_logging_object", None
            )

            if standard_logging_object is None:
                return

            # Extract status code from error information
            status_code = "500"  # default
            error_information = (
                standard_logging_object.get("error_information", {}) or {}
            )
            error_code = error_information.get("error_code")  # type: ignore
            if error_code is not None:
                status_code = str(error_code)

            self._add_metrics_from_log(
                log=standard_logging_object, kwargs=kwargs, status_code=status_code
            )

            if len(self.log_queue) >= self.batch_size:
                await self.flush_queue()

        except Exception as e:
            verbose_logger.exception(
                f"Datadog Metrics: Error in async_log_failure_event: {str(e)}"
            )

    async def async_send_batch(self):
        """
        Upload the currently queued metric series to Datadog.

        Re-raises upload errors after logging them. The queue itself is not
        cleared here — presumably CustomBatchLogger.flush_queue clears it
        after a successful send (verify against the base class).
        """
        if not self.log_queue:
            return

        batch = self.log_queue.copy()
        payload_data: DatadogMetricsPayload = {"series": batch}

        try:
            await self._upload_to_datadog(payload_data)
        except Exception as e:
            verbose_logger.exception(
                f"Datadog Metrics: Error in async_send_batch: {str(e)}"
            )
            raise

    async def _upload_to_datadog(self, payload: DatadogMetricsPayload):
        """
        POST a gzip-compressed series payload to the Datadog v2 series API.

        No-ops when DD_API_KEY is unset; raises ``httpx.HTTPStatusError``
        on a non-2xx response.
        """
        if not self.dd_api_key:
            return

        headers = {
            "Content-Type": "application/json",
            "DD-API-KEY": self.dd_api_key,
        }

        if self.dd_app_key:
            headers["DD-APPLICATION-KEY"] = self.dd_app_key

        # Compress the JSON body; Datadog accepts gzip-encoded submissions.
        json_data = safe_dumps(payload)
        compressed_data = gzip.compress(json_data.encode("utf-8"))
        headers["Content-Encoding"] = "gzip"

        response = await self.async_client.post(
            self.upload_url, content=compressed_data, headers=headers  # type: ignore
        )

        response.raise_for_status()

        verbose_logger.debug(
            f"Datadog Metrics: Uploaded {len(payload['series'])} metric points. Status: {response.status_code}"
        )

    async def async_health_check(self) -> IntegrationHealthCheckStatus:
        """
        Check if the service is healthy by uploading a single test gauge
        point; any exception maps to an "unhealthy" status.
        """
        try:
            # Send a test metric point to Datadog
            test_metric_point: DatadogMetricPoint = {
                "timestamp": int(time.time()),
                "value": 1.0,
            }
            test_metric_series: DatadogMetricSeries = {
                "metric": "litellm.health_check",
                "type": 3,  # Gauge
                "points": [test_metric_point],
                "tags": ["env:health_check"],
            }

            payload_data: DatadogMetricsPayload = {"series": [test_metric_series]}

            await self._upload_to_datadog(payload_data)

            return IntegrationHealthCheckStatus(
                status="healthy",
                error_message=None,
            )
        except Exception as e:
            return IntegrationHealthCheckStatus(
                status="unhealthy",
                error_message=str(e),
            )

    async def get_request_response_payload(
        self,
        request_id: str,
        start_time_utc: Optional[datetime],
        end_time_utc: Optional[datetime],
    ) -> Optional[dict]:
        """
        Not supported for this integration: metrics are aggregated series,
        so individual request/response payloads cannot be retrieved.
        Returns None (intentional stub).
        """
        pass
|
||||
Reference in New Issue
Block a user