chore: initial public snapshot for github upload
This commit is contained in:
@@ -0,0 +1,777 @@
|
||||
"""
|
||||
DataDog Integration - sends logs to /api/v2/log
|
||||
|
||||
DD Reference API: https://docs.datadoghq.com/api/latest/logs
|
||||
|
||||
`async_log_success_event` - used by litellm proxy to send logs to datadog
|
||||
`log_success_event` - sync version of logging to DataDog, only used on litellm Python SDK, if user opts in to using sync functions
|
||||
|
||||
async_log_success_event: will store batch of DD_MAX_BATCH_SIZE in memory and flush to Datadog once it reaches DD_MAX_BATCH_SIZE or every 5 seconds
|
||||
|
||||
async_service_failure_hook: Logs failures from Redis, Postgres (Adjacent systems), as 'WARNING' on DataDog
|
||||
|
||||
For batching specific details see CustomBatchLogger class
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import datetime
|
||||
import os
|
||||
import traceback
|
||||
from datetime import datetime as datetimeObj
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import httpx
|
||||
from httpx import Response
|
||||
|
||||
import litellm
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm._uuid import uuid
|
||||
from litellm.integrations.custom_batch_logger import CustomBatchLogger
|
||||
from litellm.integrations.datadog.datadog_mock_client import (
|
||||
should_use_datadog_mock,
|
||||
create_mock_datadog_client,
|
||||
)
|
||||
from litellm.integrations.datadog.datadog_handler import (
|
||||
get_datadog_hostname,
|
||||
get_datadog_service,
|
||||
get_datadog_source,
|
||||
get_datadog_tags,
|
||||
get_datadog_base_url_from_env,
|
||||
)
|
||||
from litellm.litellm_core_utils.dd_tracing import tracer
|
||||
from litellm.llms.custom_httpx.http_handler import (
|
||||
_get_httpx_client,
|
||||
get_async_httpx_client,
|
||||
httpxSpecialProvider,
|
||||
)
|
||||
from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus
|
||||
from litellm.types.integrations.datadog import (
|
||||
DD_ERRORS,
|
||||
DD_MAX_BATCH_SIZE,
|
||||
DataDogStatus,
|
||||
DatadogInitParams,
|
||||
DatadogPayload,
|
||||
DatadogProxyFailureHookJsonMessage,
|
||||
)
|
||||
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
|
||||
from litellm.types.utils import StandardLoggingPayload
|
||||
|
||||
from ..additional_logging_utils import AdditionalLoggingUtils
|
||||
|
||||
# max number of logs DD API can accept
|
||||
|
||||
|
||||
# specify what ServiceTypes are logged as success events to DD. (We don't want to spam DD traces with large number of service types)
# Only services listed here are forwarded by async_service_success_hook.
DD_LOGGED_SUCCESS_SERVICE_TYPES: List[ServiceTypes] = [
    ServiceTypes.RESET_BUDGET_JOB,
]
|
||||
|
||||
|
||||
class DataDogLogger(
|
||||
CustomBatchLogger,
|
||||
AdditionalLoggingUtils,
|
||||
):
|
||||
# Class variables or attributes
|
||||
    def __init__(
        self,
        **kwargs,
    ):
        """
        Initializes the datadog logger, checks if the correct env variables are set

        Required environment variables (Direct API):
        `DD_API_KEY` - your datadog api key
        `DD_SITE` - your datadog site, example = `"us5.datadoghq.com"`

        Optional environment variables (DataDog Agent):
        `LITELLM_DD_AGENT_HOST` - hostname or IP of DataDog agent, example = `"localhost"`
        `LITELLM_DD_AGENT_PORT` - port of DataDog agent (default: 10518 for logs)

        Note: We use LITELLM_DD_AGENT_HOST instead of DD_AGENT_HOST to avoid conflicts
        with ddtrace which automatically sets DD_AGENT_HOST for APM tracing.

        Raises:
            Exception: re-raised if any part of initialization fails (e.g. missing
                DD_API_KEY / DD_SITE when not routing through an agent).
        """
        try:
            verbose_logger.debug("Datadog: in init datadog logger")

            # Mock mode intercepts outbound DD API calls (used for testing).
            self.is_mock_mode = should_use_datadog_mock()

            if self.is_mock_mode:
                create_mock_datadog_client()
                verbose_logger.debug(
                    "[DATADOG MOCK] Datadog logger initialized in mock mode"
                )

            #########################################################
            # Handle datadog_params set as litellm.datadog_params
            #########################################################
            dict_datadog_params = self._get_datadog_params()
            kwargs.update(dict_datadog_params)

            self.async_client = get_async_httpx_client(
                llm_provider=httpxSpecialProvider.LoggingCallback
            )

            # Configure DataDog endpoint (Agent or Direct API)
            # Use LITELLM_DD_AGENT_HOST to avoid conflicts with ddtrace's DD_AGENT_HOST
            dd_agent_host = os.getenv("LITELLM_DD_AGENT_HOST")
            if dd_agent_host:
                self._configure_dd_agent(dd_agent_host=dd_agent_host)
            else:
                self._configure_dd_direct_api()

            # Optional override for testing
            dd_base_url = get_datadog_base_url_from_env()
            if dd_base_url:
                self.intake_url = f"{dd_base_url}/api/v2/logs"

            self.sync_client = _get_httpx_client()
            # NOTE(review): create_task assumes a running event loop at
            # construction time — confirm callers always build this logger
            # from inside one.
            asyncio.create_task(self.periodic_flush())
            self.flush_lock = asyncio.Lock()
            super().__init__(
                **kwargs, flush_lock=self.flush_lock, batch_size=DD_MAX_BATCH_SIZE
            )
        except Exception as e:
            verbose_logger.exception(
                f"Datadog: Got exception on init Datadog client {str(e)}"
            )
            raise e
|
||||
|
||||
def _get_datadog_params(self) -> Dict:
|
||||
"""
|
||||
Get the datadog_params from litellm.datadog_params
|
||||
|
||||
These are params specific to initializing the DataDogLogger e.g. turn_off_message_logging
|
||||
"""
|
||||
dict_datadog_params: Dict = {}
|
||||
if litellm.datadog_params is not None:
|
||||
if isinstance(litellm.datadog_params, DatadogInitParams):
|
||||
dict_datadog_params = litellm.datadog_params.model_dump()
|
||||
elif isinstance(litellm.datadog_params, Dict):
|
||||
# only allow params that are of DatadogInitParams
|
||||
dict_datadog_params = DatadogInitParams(
|
||||
**litellm.datadog_params
|
||||
).model_dump()
|
||||
return dict_datadog_params
|
||||
|
||||
def _configure_dd_agent(self, dd_agent_host: str) -> None:
|
||||
"""
|
||||
Configure DataDog Agent for log forwarding
|
||||
|
||||
Args:
|
||||
dd_agent_host: Hostname or IP of DataDog agent
|
||||
"""
|
||||
dd_agent_port = os.getenv(
|
||||
"LITELLM_DD_AGENT_PORT", "10518"
|
||||
) # default port for logs
|
||||
self.intake_url = f"http://{dd_agent_host}:{dd_agent_port}/api/v2/logs"
|
||||
self.DD_API_KEY = os.getenv("DD_API_KEY") # Optional when using agent
|
||||
verbose_logger.debug(f"Datadog: Using DD Agent at {self.intake_url}")
|
||||
|
||||
def _configure_dd_direct_api(self) -> None:
|
||||
"""
|
||||
Configure direct DataDog API connection
|
||||
|
||||
Raises:
|
||||
Exception: If required environment variables are not set
|
||||
"""
|
||||
if os.getenv("DD_API_KEY", None) is None:
|
||||
raise Exception("DD_API_KEY is not set, set 'DD_API_KEY=<>")
|
||||
if os.getenv("DD_SITE", None) is None:
|
||||
raise Exception("DD_SITE is not set in .env, set 'DD_SITE=<>")
|
||||
|
||||
self.DD_API_KEY = os.getenv("DD_API_KEY")
|
||||
self.intake_url = f"https://http-intake.logs.{os.getenv('DD_SITE')}/api/v2/logs"
|
||||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
"""
|
||||
Async Log success events to Datadog
|
||||
|
||||
- Creates a Datadog payload
|
||||
- Adds the Payload to the in memory logs queue
|
||||
- Payload is flushed every 10 seconds or when batch size is greater than 100
|
||||
|
||||
|
||||
Raises:
|
||||
Raises a NON Blocking verbose_logger.exception if an error occurs
|
||||
"""
|
||||
try:
|
||||
verbose_logger.debug(
|
||||
"Datadog: Logging - Enters logging function for model %s", kwargs
|
||||
)
|
||||
await self._log_async_event(kwargs, response_obj, start_time, end_time)
|
||||
|
||||
except Exception as e:
|
||||
verbose_logger.exception(
|
||||
f"Datadog Layer Error - {str(e)}\n{traceback.format_exc()}"
|
||||
)
|
||||
pass
|
||||
|
||||
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
verbose_logger.debug(
|
||||
"Datadog: Logging - Enters logging function for model %s", kwargs
|
||||
)
|
||||
await self._log_async_event(kwargs, response_obj, start_time, end_time)
|
||||
|
||||
except Exception as e:
|
||||
verbose_logger.exception(
|
||||
f"Datadog Layer Error - {str(e)}\n{traceback.format_exc()}"
|
||||
)
|
||||
pass
|
||||
|
||||
    async def async_post_call_failure_hook(
        self,
        request_data: dict,
        original_exception: Exception,
        user_api_key_dict: Any,
        traceback_str: Optional[str] = None,
    ) -> Optional[Any]:
        """
        Log proxy-level failures (e.g. 401 auth, DB connection errors) to Datadog.

        Ensures failures that occur before or outside the LLM completion flow
        (e.g. ConnectError during auth when DB is down) are visible in Datadog
        alongside Prometheus.

        Args:
            request_data: incoming proxy request payload (not serialized here).
            original_exception: the exception that caused the failure.
            user_api_key_dict: auth object for the request.
            traceback_str: optional pre-formatted traceback string.

        Returns:
            None. Errors inside this hook are swallowed after logging (non-blocking).
        """
        try:
            # Imported lazily to avoid import cycles with the logging core.
            from litellm.litellm_core_utils.litellm_logging import (
                StandardLoggingPayloadSetup,
            )
            from litellm.litellm_core_utils.safe_json_dumps import safe_dumps

            error_information = StandardLoggingPayloadSetup.get_error_information(
                original_exception=original_exception,
                traceback_str=traceback_str,
            )
            # error_code may be non-numeric; only expose it as an int
            # status_code when it is purely digits.
            _code = error_information.get("error_code") or ""
            status_code: Optional[int] = None
            if _code and str(_code).strip().isdigit():
                status_code = int(_code)

            # Use project-standard sanitized user context when running in proxy
            user_context: Dict[str, Any] = {}
            try:
                from litellm.proxy.litellm_pre_call_utils import (
                    LiteLLMProxyRequestSetup,
                )

                _meta = (
                    LiteLLMProxyRequestSetup.get_sanitized_user_information_from_key(
                        user_api_key_dict=user_api_key_dict
                    )
                )
                user_context = dict(_meta) if isinstance(_meta, dict) else _meta
            except Exception:
                # Fallback if proxy not available (e.g. SDK-only): minimal safe fields
                if hasattr(user_api_key_dict, "request_route"):
                    user_context["request_route"] = getattr(
                        user_api_key_dict, "request_route", None
                    )
                if hasattr(user_api_key_dict, "team_id"):
                    user_context["team_id"] = getattr(
                        user_api_key_dict, "team_id", None
                    )
                if hasattr(user_api_key_dict, "user_id"):
                    user_context["user_id"] = getattr(
                        user_api_key_dict, "user_id", None
                    )
                if hasattr(user_api_key_dict, "end_user_id"):
                    user_context["end_user_id"] = getattr(
                        user_api_key_dict, "end_user_id", None
                    )

            message_payload: DatadogProxyFailureHookJsonMessage = {
                "exception": error_information.get("error_message")
                or str(original_exception),
                "error_class": error_information.get("error_class")
                or original_exception.__class__.__name__,
                "status_code": status_code,
                "traceback": error_information.get("traceback") or "",
                "user_api_key_dict": user_context,
            }

            dd_payload = DatadogPayload(
                ddsource=get_datadog_source(),
                ddtags=get_datadog_tags(),
                hostname=get_datadog_hostname(),
                message=safe_dumps(message_payload),
                service=get_datadog_service(),
                status=DataDogStatus.ERROR,
            )
            # Correlate with any active APM trace before queueing.
            self._add_trace_context_to_payload(dd_payload=dd_payload)
            self.log_queue.append(dd_payload)

            # Flush eagerly once the batch is full; otherwise the periodic
            # flush task handles it.
            if len(self.log_queue) >= self.batch_size:
                await self.async_send_batch()
        except Exception as e:
            verbose_logger.exception(
                f"Datadog: async_post_call_failure_hook - {str(e)}\n{traceback.format_exc()}"
            )
        return None
|
||||
|
||||
    async def async_send_batch(self):
        """
        Sends the in memory logs queue to datadog api

        Logs sent to /api/v2/logs

        DD Ref: https://docs.datadoghq.com/api/latest/logs/

        Raises:
            Raises a NON Blocking verbose_logger.exception if an error occurs
        """
        try:
            if not self.log_queue:
                verbose_logger.exception("Datadog: log_queue does not exist")
                return

            verbose_logger.debug(
                "Datadog - about to flush %s events on %s",
                len(self.log_queue),
                self.intake_url,
            )

            if self.is_mock_mode:
                verbose_logger.debug(
                    "[DATADOG MOCK] Mock mode enabled - API calls will be intercepted"
                )

            response = await self.async_send_compressed_data(self.log_queue)
            # 413 = payload too large; logged and dropped rather than retried.
            if response.status_code == 413:
                verbose_logger.exception(DD_ERRORS.DATADOG_413_ERROR.value)
                return

            response.raise_for_status()
            # DD log intake returns 202 Accepted on success; any other 2xx is unexpected.
            if response.status_code != 202:
                raise Exception(
                    f"Response from datadog API status_code: {response.status_code}, text: {response.text}"
                )

            if self.is_mock_mode:
                verbose_logger.debug(
                    f"[DATADOG MOCK] Batch of {len(self.log_queue)} events successfully mocked"
                )
            else:
                verbose_logger.debug(
                    "Datadog: Response from datadog API status_code: %s, text: %s",
                    response.status_code,
                    response.text,
                )
        except Exception as e:
            # Non-blocking; queue lifecycle is managed by CustomBatchLogger.
            verbose_logger.exception(
                f"Datadog Error sending batch API - {str(e)}\n{traceback.format_exc()}"
            )
|
||||
|
||||
def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
"""
|
||||
Sync Log success events to Datadog
|
||||
|
||||
- Creates a Datadog payload
|
||||
- instantly logs it on DD API
|
||||
"""
|
||||
try:
|
||||
if litellm.datadog_use_v1 is True:
|
||||
dd_payload = self._create_v0_logging_payload(
|
||||
kwargs=kwargs,
|
||||
response_obj=response_obj,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
else:
|
||||
dd_payload = self.create_datadog_logging_payload(
|
||||
kwargs=kwargs,
|
||||
response_obj=response_obj,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
|
||||
# Build headers
|
||||
headers = {}
|
||||
# Add API key if available (required for direct API, optional for agent)
|
||||
if self.DD_API_KEY:
|
||||
headers["DD-API-KEY"] = self.DD_API_KEY
|
||||
|
||||
response = self.sync_client.post(
|
||||
url=self.intake_url,
|
||||
json=dd_payload, # type: ignore
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
if response.status_code != 202:
|
||||
raise Exception(
|
||||
f"Response from datadog API status_code: {response.status_code}, text: {response.text}"
|
||||
)
|
||||
|
||||
verbose_logger.debug(
|
||||
"Datadog: Response from datadog API status_code: %s, text: %s",
|
||||
response.status_code,
|
||||
response.text,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
verbose_logger.exception(
|
||||
f"Datadog Layer Error - {str(e)}\n{traceback.format_exc()}"
|
||||
)
|
||||
pass
|
||||
pass
|
||||
|
||||
async def _log_async_event(self, kwargs, response_obj, start_time, end_time):
|
||||
dd_payload = self.create_datadog_logging_payload(
|
||||
kwargs=kwargs,
|
||||
response_obj=response_obj,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
|
||||
self.log_queue.append(dd_payload)
|
||||
verbose_logger.debug(
|
||||
f"Datadog, event added to queue. Will flush in {self.flush_interval} seconds..."
|
||||
)
|
||||
|
||||
if len(self.log_queue) >= self.batch_size:
|
||||
await self.async_send_batch()
|
||||
|
||||
def _create_datadog_logging_payload_helper(
|
||||
self,
|
||||
standard_logging_object: StandardLoggingPayload,
|
||||
status: DataDogStatus,
|
||||
) -> DatadogPayload:
|
||||
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
|
||||
|
||||
json_payload = safe_dumps(standard_logging_object)
|
||||
verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload)
|
||||
dd_payload = DatadogPayload(
|
||||
ddsource=get_datadog_source(),
|
||||
ddtags=get_datadog_tags(standard_logging_object=standard_logging_object),
|
||||
hostname=get_datadog_hostname(),
|
||||
message=json_payload,
|
||||
service=get_datadog_service(),
|
||||
status=status,
|
||||
)
|
||||
self._add_trace_context_to_payload(dd_payload=dd_payload)
|
||||
return dd_payload
|
||||
|
||||
def create_datadog_logging_payload(
|
||||
self,
|
||||
kwargs: Union[dict, Any],
|
||||
response_obj: Any,
|
||||
start_time: datetime.datetime,
|
||||
end_time: datetime.datetime,
|
||||
) -> DatadogPayload:
|
||||
"""
|
||||
Helper function to create a datadog payload for logging
|
||||
|
||||
Args:
|
||||
kwargs (Union[dict, Any]): request kwargs
|
||||
response_obj (Any): llm api response
|
||||
start_time (datetime.datetime): start time of request
|
||||
end_time (datetime.datetime): end time of request
|
||||
|
||||
Returns:
|
||||
DatadogPayload: defined in types.py
|
||||
"""
|
||||
|
||||
standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
|
||||
"standard_logging_object", None
|
||||
)
|
||||
if standard_logging_object is None:
|
||||
raise ValueError("standard_logging_object not found in kwargs")
|
||||
|
||||
status = DataDogStatus.INFO
|
||||
if standard_logging_object.get("status") == "failure":
|
||||
status = DataDogStatus.ERROR
|
||||
|
||||
# Build the initial payload
|
||||
self.truncate_standard_logging_payload_content(standard_logging_object)
|
||||
|
||||
dd_payload = self._create_datadog_logging_payload_helper(
|
||||
standard_logging_object=standard_logging_object,
|
||||
status=status,
|
||||
)
|
||||
return dd_payload
|
||||
|
||||
async def async_send_compressed_data(self, data: List) -> Response:
|
||||
"""
|
||||
Async helper to send compressed data to datadog self.intake_url
|
||||
|
||||
Datadog recommends using gzip to compress data
|
||||
https://docs.datadoghq.com/api/latest/logs/
|
||||
|
||||
"Datadog recommends sending your logs compressed. Add the Content-Encoding: gzip header to the request when sending"
|
||||
"""
|
||||
|
||||
import gzip
|
||||
|
||||
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
|
||||
|
||||
compressed_data = gzip.compress(safe_dumps(data).encode("utf-8"))
|
||||
|
||||
# Build headers
|
||||
headers = {
|
||||
"Content-Encoding": "gzip",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
# Add API key if available (required for direct API, optional for agent)
|
||||
if self.DD_API_KEY:
|
||||
headers["DD-API-KEY"] = self.DD_API_KEY
|
||||
|
||||
response = await self.async_client.post(
|
||||
url=self.intake_url,
|
||||
data=compressed_data, # type: ignore
|
||||
headers=headers,
|
||||
)
|
||||
return response
|
||||
|
||||
async def async_service_failure_hook(
|
||||
self,
|
||||
payload: ServiceLoggerPayload,
|
||||
error: Optional[str] = "",
|
||||
parent_otel_span: Optional[Any] = None,
|
||||
start_time: Optional[Union[datetimeObj, float]] = None,
|
||||
end_time: Optional[Union[float, datetimeObj]] = None,
|
||||
event_metadata: Optional[dict] = None,
|
||||
):
|
||||
"""
|
||||
Logs failures from Redis, Postgres (Adjacent systems), as 'WARNING' on DataDog
|
||||
|
||||
- example - Redis is failing / erroring, will be logged on DataDog
|
||||
"""
|
||||
try:
|
||||
_payload_dict = payload.model_dump()
|
||||
_payload_dict.update(event_metadata or {})
|
||||
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
|
||||
|
||||
_dd_message_str = safe_dumps(_payload_dict)
|
||||
_dd_payload = DatadogPayload(
|
||||
ddsource=get_datadog_source(),
|
||||
ddtags=get_datadog_tags(),
|
||||
hostname=get_datadog_hostname(),
|
||||
message=_dd_message_str,
|
||||
service=get_datadog_service(),
|
||||
status=DataDogStatus.WARN,
|
||||
)
|
||||
|
||||
self.log_queue.append(_dd_payload)
|
||||
|
||||
except Exception as e:
|
||||
verbose_logger.exception(
|
||||
f"Datadog: Logger - Exception in async_service_failure_hook: {e}"
|
||||
)
|
||||
pass
|
||||
|
||||
async def async_service_success_hook(
|
||||
self,
|
||||
payload: ServiceLoggerPayload,
|
||||
error: Optional[str] = "",
|
||||
parent_otel_span: Optional[Any] = None,
|
||||
start_time: Optional[Union[datetimeObj, float]] = None,
|
||||
end_time: Optional[Union[float, datetimeObj]] = None,
|
||||
event_metadata: Optional[dict] = None,
|
||||
):
|
||||
"""
|
||||
Logs success from Redis, Postgres (Adjacent systems), as 'INFO' on DataDog
|
||||
|
||||
No user has asked for this so far, this might be spammy on datatdog. If need arises we can implement this
|
||||
"""
|
||||
try:
|
||||
# intentionally done. Don't want to log all service types to DD
|
||||
if payload.service not in DD_LOGGED_SUCCESS_SERVICE_TYPES:
|
||||
return
|
||||
|
||||
_payload_dict = payload.model_dump()
|
||||
_payload_dict.update(event_metadata or {})
|
||||
|
||||
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
|
||||
|
||||
_dd_message_str = safe_dumps(_payload_dict)
|
||||
_dd_payload = DatadogPayload(
|
||||
ddsource=get_datadog_source(),
|
||||
ddtags=get_datadog_tags(),
|
||||
hostname=get_datadog_hostname(),
|
||||
message=_dd_message_str,
|
||||
service=get_datadog_service(),
|
||||
status=DataDogStatus.INFO,
|
||||
)
|
||||
|
||||
self.log_queue.append(_dd_payload)
|
||||
|
||||
except Exception as e:
|
||||
verbose_logger.exception(
|
||||
f"Datadog: Logger - Exception in async_service_failure_hook: {e}"
|
||||
)
|
||||
|
||||
    def _create_v0_logging_payload(
        self,
        kwargs: Union[dict, Any],
        response_obj: Any,
        start_time: datetime.datetime,
        end_time: datetime.datetime,
    ) -> DatadogPayload:
        """
        Note: This is our V1 Version of DataDog Logging Payload

        (Not Recommended) If you want this to get logged set `litellm.datadog_use_v1 = True`

        Args:
            kwargs (Union[dict, Any]): request kwargs
            response_obj (Any): llm api response
            start_time (datetime.datetime): start time of request
            end_time (datetime.datetime): end time of request

        Returns:
            DatadogPayload: legacy-format payload, always at INFO status.
        """
        litellm_params = kwargs.get("litellm_params", {})
        metadata = (
            litellm_params.get("metadata", {}) or {}
        )  # if litellm_params['metadata'] == None
        messages = kwargs.get("messages")
        optional_params = kwargs.get("optional_params", {})
        call_type = kwargs.get("call_type", "litellm.completion")
        cache_hit = kwargs.get("cache_hit", False)
        usage = response_obj["usage"]
        # NOTE: shadows the `id` builtin; kept for backwards compatibility.
        id = response_obj.get("id", str(uuid.uuid4()))
        usage = dict(usage)
        try:
            # request latency in milliseconds
            response_time = (end_time - start_time).total_seconds() * 1000
        except Exception:
            response_time = None

        try:
            response_obj = dict(response_obj)
        except Exception:
            # best-effort: keep response_obj as-is when not dict-convertible
            response_obj = response_obj

        # Clean Metadata before logging - never log raw metadata
        # the raw metadata can contain circular references which leads to infinite recursion
        # we clean out all extra litellm metadata params before logging
        clean_metadata = {}
        if isinstance(metadata, dict):
            for key, value in metadata.items():
                # clean litellm metadata before logging
                if key in [
                    "endpoint",
                    "caching_groups",
                    "previous_models",
                ]:
                    continue
                else:
                    clean_metadata[key] = value

        # Build the initial payload
        payload = {
            "id": id,
            "call_type": call_type,
            "cache_hit": cache_hit,
            "start_time": start_time,
            "end_time": end_time,
            "response_time": response_time,
            "model": kwargs.get("model", ""),
            "user": kwargs.get("user", ""),
            "model_parameters": optional_params,
            "spend": kwargs.get("response_cost", 0),
            "messages": messages,
            "response": response_obj,
            "usage": usage,
            "metadata": clean_metadata,
        }

        from litellm.litellm_core_utils.safe_json_dumps import safe_dumps

        json_payload = safe_dumps(payload)

        verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload)

        dd_payload = DatadogPayload(
            ddsource=get_datadog_source(),
            ddtags=get_datadog_tags(),
            hostname=get_datadog_hostname(),
            message=json_payload,
            service=get_datadog_service(),
            status=DataDogStatus.INFO,
        )
        return dd_payload
|
||||
|
||||
def _add_trace_context_to_payload(
|
||||
self,
|
||||
dd_payload: DatadogPayload,
|
||||
) -> None:
|
||||
"""Attach Datadog APM trace context if one is active."""
|
||||
|
||||
try:
|
||||
trace_context = self._get_active_trace_context()
|
||||
if trace_context is None:
|
||||
return
|
||||
|
||||
dd_payload["dd.trace_id"] = trace_context["trace_id"]
|
||||
span_id = trace_context.get("span_id")
|
||||
if span_id is not None:
|
||||
dd_payload["dd.span_id"] = span_id
|
||||
except Exception:
|
||||
verbose_logger.exception(
|
||||
"Datadog: Failed to attach trace context to payload"
|
||||
)
|
||||
|
||||
def _get_active_trace_context(self) -> Optional[Dict[str, str]]:
|
||||
try:
|
||||
current_span = None
|
||||
current_span_fn = getattr(tracer, "current_span", None)
|
||||
if callable(current_span_fn):
|
||||
current_span = current_span_fn()
|
||||
|
||||
if current_span is None:
|
||||
current_root_span_fn = getattr(tracer, "current_root_span", None)
|
||||
if callable(current_root_span_fn):
|
||||
current_span = current_root_span_fn()
|
||||
|
||||
if current_span is None:
|
||||
return None
|
||||
|
||||
trace_id = getattr(current_span, "trace_id", None)
|
||||
if trace_id is None:
|
||||
return None
|
||||
|
||||
span_id = getattr(current_span, "span_id", None)
|
||||
trace_context: Dict[str, str] = {"trace_id": str(trace_id)}
|
||||
if span_id is not None:
|
||||
trace_context["span_id"] = str(span_id)
|
||||
return trace_context
|
||||
except Exception:
|
||||
verbose_logger.exception(
|
||||
"Datadog: Failed to retrieve active trace context from tracer"
|
||||
)
|
||||
return None
|
||||
|
||||
async def async_health_check(self) -> IntegrationHealthCheckStatus:
|
||||
"""
|
||||
Check if the service is healthy
|
||||
"""
|
||||
from litellm.litellm_core_utils.litellm_logging import (
|
||||
create_dummy_standard_logging_payload,
|
||||
)
|
||||
|
||||
standard_logging_object = create_dummy_standard_logging_payload()
|
||||
dd_payload = self._create_datadog_logging_payload_helper(
|
||||
standard_logging_object=standard_logging_object,
|
||||
status=DataDogStatus.INFO,
|
||||
)
|
||||
log_queue = [dd_payload]
|
||||
response = await self.async_send_compressed_data(log_queue)
|
||||
try:
|
||||
response.raise_for_status()
|
||||
return IntegrationHealthCheckStatus(
|
||||
status="healthy",
|
||||
error_message=None,
|
||||
)
|
||||
except httpx.HTTPStatusError as e:
|
||||
return IntegrationHealthCheckStatus(
|
||||
status="unhealthy",
|
||||
error_message=e.response.text,
|
||||
)
|
||||
except Exception as e:
|
||||
return IntegrationHealthCheckStatus(
|
||||
status="unhealthy",
|
||||
error_message=str(e),
|
||||
)
|
||||
|
||||
    async def get_request_response_payload(
        self,
        request_id: str,
        start_time_utc: Optional[datetimeObj],
        end_time_utc: Optional[datetimeObj],
    ) -> Optional[dict]:
        # Not implemented for DataDog — implicitly returns None.
        # (Part of the AdditionalLoggingUtils interface.)
        pass
|
||||
@@ -0,0 +1,216 @@
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.integrations.custom_batch_logger import CustomBatchLogger
|
||||
from litellm.llms.custom_httpx.http_handler import (
|
||||
get_async_httpx_client,
|
||||
httpxSpecialProvider,
|
||||
)
|
||||
from litellm.types.integrations.datadog_cost_management import (
|
||||
DatadogFOCUSCostEntry,
|
||||
)
|
||||
from litellm.types.utils import StandardLoggingPayload
|
||||
|
||||
|
||||
class DatadogCostManagementLogger(CustomBatchLogger):
|
||||
    def __init__(self, **kwargs):
        """
        Initialize the Datadog Custom Costs uploader.

        Reads DD_API_KEY / DD_APP_KEY / DD_SITE from the environment. Missing
        keys only produce a warning here — uploads are skipped later in
        `_upload_to_datadog`.
        """
        self.dd_api_key = os.getenv("DD_API_KEY")
        self.dd_app_key = os.getenv("DD_APP_KEY")
        self.dd_site = os.getenv("DD_SITE", "datadoghq.com")

        if not self.dd_api_key or not self.dd_app_key:
            verbose_logger.warning(
                "Datadog Cost Management: DD_API_KEY and DD_APP_KEY are required. Integration will not work."
            )

        # Custom Costs upload endpoint.
        self.upload_url = f"https://api.{self.dd_site}/api/v2/cost/custom_costs"

        self.async_client = get_async_httpx_client(
            llm_provider=httpxSpecialProvider.LoggingCallback
        )

        # Initialize lock and start periodic flush task
        # NOTE(review): create_task assumes a running event loop at
        # construction time — confirm callers construct this inside one.
        self.flush_lock = asyncio.Lock()
        asyncio.create_task(self.periodic_flush())

        # Check if flush_lock is already in kwargs to avoid double passing (unlikely but safe)
        if "flush_lock" not in kwargs:
            kwargs["flush_lock"] = self.flush_lock

        super().__init__(**kwargs)
|
||||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
|
||||
"standard_logging_object", None
|
||||
)
|
||||
|
||||
if standard_logging_object is None:
|
||||
return
|
||||
|
||||
# Only log if there is a cost associated
|
||||
if standard_logging_object.get("response_cost", 0) > 0:
|
||||
self.log_queue.append(standard_logging_object)
|
||||
|
||||
if len(self.log_queue) >= self.batch_size:
|
||||
await self.async_send_batch()
|
||||
|
||||
except Exception as e:
|
||||
verbose_logger.exception(
|
||||
f"Datadog Cost Management: Error in async_log_success_event: {str(e)}"
|
||||
)
|
||||
|
||||
async def async_send_batch(self):
|
||||
if not self.log_queue:
|
||||
return
|
||||
|
||||
try:
|
||||
# Aggregate costs from the batch
|
||||
aggregated_entries = self._aggregate_costs(self.log_queue)
|
||||
|
||||
if not aggregated_entries:
|
||||
return
|
||||
|
||||
# Send to Datadog
|
||||
await self._upload_to_datadog(aggregated_entries)
|
||||
|
||||
# Clear queue only on success (or if we decide to drop on failure)
|
||||
# CustomBatchLogger clears queue in flush_queue, so we just process here
|
||||
|
||||
except Exception as e:
|
||||
verbose_logger.exception(
|
||||
f"Datadog Cost Management: Error in async_send_batch: {str(e)}"
|
||||
)
|
||||
|
||||
def _aggregate_costs(
|
||||
self, logs: List[StandardLoggingPayload]
|
||||
) -> List[DatadogFOCUSCostEntry]:
|
||||
"""
|
||||
Aggregates costs by Provider, Model, and Date.
|
||||
Returns a list of DatadogFOCUSCostEntry.
|
||||
"""
|
||||
aggregator: Dict[
|
||||
Tuple[str, str, str, Tuple[Tuple[str, str], ...]], DatadogFOCUSCostEntry
|
||||
] = {}
|
||||
|
||||
for log in logs:
|
||||
try:
|
||||
# Extract keys for aggregation
|
||||
provider = log.get("custom_llm_provider") or "unknown"
|
||||
model = log.get("model") or "unknown"
|
||||
cost = log.get("response_cost", 0)
|
||||
|
||||
if cost == 0:
|
||||
continue
|
||||
|
||||
# Get date strings (FOCUS format requires specific keys, but for aggregation we group by Day)
|
||||
# UTC date
|
||||
# We interpret "ChargePeriod" as the day of the request.
|
||||
ts = log.get("startTime") or time.time()
|
||||
dt = datetime.fromtimestamp(ts)
|
||||
date_str = dt.strftime("%Y-%m-%d")
|
||||
|
||||
# ChargePeriodStart and End
|
||||
# If we want daily granularity, end date is usually same day or next day?
|
||||
# Datadog Custom Costs usually expects periods.
|
||||
# "ChargePeriodStart": "2023-01-01", "ChargePeriodEnd": "2023-12-31" in example.
|
||||
# If we send daily, we can say Start=Date, End=Date.
|
||||
|
||||
# Grouping Key: Provider + Model + Date + Tags?
|
||||
# For simplicity, let's aggregate by Provider + Model + Date first.
|
||||
# If we handle tags, we need to include them in the key.
|
||||
|
||||
tags = self._extract_tags(log)
|
||||
tags_key = tuple(sorted(tags.items())) if tags else ()
|
||||
|
||||
key = (provider, model, date_str, tags_key)
|
||||
|
||||
if key not in aggregator:
|
||||
aggregator[key] = {
|
||||
"ProviderName": provider,
|
||||
"ChargeDescription": f"LLM Usage for {model}",
|
||||
"ChargePeriodStart": date_str,
|
||||
"ChargePeriodEnd": date_str,
|
||||
"BilledCost": 0.0,
|
||||
"BillingCurrency": "USD",
|
||||
"Tags": tags if tags else None,
|
||||
}
|
||||
|
||||
aggregator[key]["BilledCost"] += cost
|
||||
|
||||
except Exception as e:
|
||||
verbose_logger.warning(
|
||||
f"Error processing log for cost aggregation: {e}"
|
||||
)
|
||||
continue
|
||||
|
||||
return list(aggregator.values())
|
||||
|
||||
def _extract_tags(self, log: StandardLoggingPayload) -> Dict[str, str]:
|
||||
from litellm.integrations.datadog.datadog_handler import (
|
||||
get_datadog_env,
|
||||
get_datadog_hostname,
|
||||
get_datadog_pod_name,
|
||||
get_datadog_service,
|
||||
)
|
||||
|
||||
tags = {
|
||||
"env": get_datadog_env(),
|
||||
"service": get_datadog_service(),
|
||||
"host": get_datadog_hostname(),
|
||||
"pod_name": get_datadog_pod_name(),
|
||||
}
|
||||
|
||||
# Add metadata as tags
|
||||
metadata = log.get("metadata", {})
|
||||
if metadata:
|
||||
# Add user info
|
||||
# Add user info
|
||||
if metadata.get("user_api_key_alias"):
|
||||
tags["user"] = str(metadata["user_api_key_alias"])
|
||||
|
||||
# Add Team Tag
|
||||
team_tag = (
|
||||
metadata.get("user_api_key_team_alias")
|
||||
or metadata.get("team_alias") # type: ignore
|
||||
or metadata.get("user_api_key_team_id")
|
||||
or metadata.get("team_id") # type: ignore
|
||||
)
|
||||
|
||||
if team_tag:
|
||||
tags["team"] = str(team_tag)
|
||||
# model_group is not in StandardLoggingMetadata TypedDict, so we need to access it via dict.get()
|
||||
model_group = metadata.get("model_group") # type: ignore[misc]
|
||||
if model_group:
|
||||
tags["model_group"] = str(model_group)
|
||||
|
||||
return tags
|
||||
|
||||
async def _upload_to_datadog(self, payload: List[Dict]):
|
||||
if not self.dd_api_key or not self.dd_app_key:
|
||||
return
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"DD-API-KEY": self.dd_api_key,
|
||||
"DD-APPLICATION-KEY": self.dd_app_key,
|
||||
}
|
||||
|
||||
# The API endpoint expects a list of objects directly in the body (file content behavior)
|
||||
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
|
||||
|
||||
data_json = safe_dumps(payload)
|
||||
|
||||
response = await self.async_client.put(
|
||||
self.upload_url, content=data_json, headers=headers
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
|
||||
verbose_logger.debug(
|
||||
f"Datadog Cost Management: Uploaded {len(payload)} cost entries. Status: {response.status_code}"
|
||||
)
|
||||
@@ -0,0 +1,69 @@
|
||||
"""Shared helpers for Datadog integrations."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from litellm.types.utils import StandardLoggingPayload
|
||||
|
||||
|
||||
def get_datadog_source() -> str:
    """Source tag for Datadog logs; overridable via ``DD_SOURCE``."""
    return os.environ.get("DD_SOURCE", "litellm")
|
||||
|
||||
|
||||
def get_datadog_service() -> str:
    """Service tag for Datadog; overridable via ``DD_SERVICE``."""
    return os.environ.get("DD_SERVICE", "litellm-server")
|
||||
|
||||
|
||||
def get_datadog_hostname() -> str:
    """Hostname tag, taken from the ``HOSTNAME`` env var (empty if unset)."""
    return os.environ.get("HOSTNAME", "")
|
||||
|
||||
|
||||
def get_datadog_base_url_from_env() -> Optional[str]:
    """Return the ``DD_BASE_URL`` override if set, else ``None``.

    Useful for testing or for pointing integrations at custom endpoints.
    """
    return os.environ.get("DD_BASE_URL")
|
||||
|
||||
|
||||
def get_datadog_env() -> str:
    """Environment tag; overridable via ``DD_ENV`` (defaults to "unknown")."""
    return os.environ.get("DD_ENV", "unknown")
|
||||
|
||||
|
||||
def get_datadog_pod_name() -> str:
    """Pod-name tag from ``POD_NAME`` (defaults to "unknown")."""
    return os.environ.get("POD_NAME", "unknown")
|
||||
|
||||
|
||||
def get_datadog_tags(
    standard_logging_object: Optional[StandardLoggingPayload] = None,
) -> str:
    """Build the comma-separated Datadog tag string shared by integrations.

    Always includes env/service/version/HOSTNAME/POD_NAME; when a logging
    payload is supplied, appends its request tags and a team tag.
    """
    tag_map = {
        "env": get_datadog_env(),
        "service": get_datadog_service(),
        "version": os.environ.get("DD_VERSION", "unknown"),
        "HOSTNAME": get_datadog_hostname(),
        "POD_NAME": get_datadog_pod_name(),
    }
    tags: List[str] = [f"{key}:{value}" for key, value in tag_map.items()]

    if standard_logging_object:
        for request_tag in standard_logging_object.get("request_tags", []) or []:
            tags.append(f"request_tag:{request_tag}")

        # Team attribution: prefer human-readable aliases, fall back to ids
        metadata = standard_logging_object.get("metadata", {}) or {}
        team = (
            metadata.get("user_api_key_team_alias")
            or metadata.get("team_alias")
            or metadata.get("user_api_key_team_id")
            or metadata.get("team_id")
        )
        if team:
            tags.append(f"team:{team}")

    return ",".join(tags)
|
||||
@@ -0,0 +1,856 @@
|
||||
"""
|
||||
Implements logging integration with Datadog's LLM Observability Service
|
||||
|
||||
|
||||
API Reference: https://docs.datadoghq.com/llm_observability/setup/api/?tab=example#api-standards
|
||||
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from litellm._uuid import uuid
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Literal, Optional, Union
|
||||
|
||||
import httpx
|
||||
|
||||
import litellm
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.integrations.custom_batch_logger import CustomBatchLogger
|
||||
from litellm.integrations.datadog.datadog_mock_client import (
|
||||
should_use_datadog_mock,
|
||||
create_mock_datadog_client,
|
||||
)
|
||||
from litellm.integrations.datadog.datadog_handler import (
|
||||
get_datadog_service,
|
||||
get_datadog_tags,
|
||||
get_datadog_base_url_from_env,
|
||||
)
|
||||
from litellm.litellm_core_utils.dd_tracing import tracer
|
||||
from litellm.litellm_core_utils.prompt_templates.common_utils import (
|
||||
handle_any_messages_to_chat_completion_str_messages_conversion,
|
||||
)
|
||||
from litellm.llms.custom_httpx.http_handler import (
|
||||
get_async_httpx_client,
|
||||
httpxSpecialProvider,
|
||||
)
|
||||
from litellm.types.integrations.datadog_llm_obs import *
|
||||
from litellm.types.utils import (
|
||||
CallTypes,
|
||||
StandardLoggingGuardrailInformation,
|
||||
StandardLoggingPayload,
|
||||
StandardLoggingPayloadErrorInformation,
|
||||
)
|
||||
|
||||
|
||||
class DataDogLLMObsLogger(CustomBatchLogger):
|
||||
def __init__(self, **kwargs):
|
||||
try:
|
||||
verbose_logger.debug("DataDogLLMObs: Initializing logger")
|
||||
|
||||
self.is_mock_mode = should_use_datadog_mock()
|
||||
|
||||
if self.is_mock_mode:
|
||||
create_mock_datadog_client()
|
||||
verbose_logger.debug(
|
||||
"[DATADOG MOCK] DataDogLLMObs logger initialized in mock mode"
|
||||
)
|
||||
|
||||
# Configure DataDog endpoint (Agent or Direct API)
|
||||
# Use LITELLM_DD_AGENT_HOST to avoid conflicts with ddtrace's DD_AGENT_HOST
|
||||
# Check for agent mode FIRST - agent mode doesn't require DD_API_KEY or DD_SITE
|
||||
dd_agent_host = os.getenv("LITELLM_DD_AGENT_HOST")
|
||||
|
||||
self.async_client = get_async_httpx_client(
|
||||
llm_provider=httpxSpecialProvider.LoggingCallback
|
||||
)
|
||||
self.DD_API_KEY = os.getenv("DD_API_KEY")
|
||||
|
||||
if dd_agent_host:
|
||||
self._configure_dd_agent(dd_agent_host=dd_agent_host)
|
||||
else:
|
||||
# Only require DD_API_KEY and DD_SITE for direct API mode
|
||||
if os.getenv("DD_API_KEY", None) is None:
|
||||
raise Exception("DD_API_KEY is not set, set 'DD_API_KEY=<>'")
|
||||
if os.getenv("DD_SITE", None) is None:
|
||||
raise Exception(
|
||||
"DD_SITE is not set, set 'DD_SITE=<>', example sit = `us5.datadoghq.com`"
|
||||
)
|
||||
self._configure_dd_direct_api()
|
||||
|
||||
# Optional override for testing
|
||||
dd_base_url = get_datadog_base_url_from_env()
|
||||
if dd_base_url:
|
||||
self.intake_url = f"{dd_base_url}/api/intake/llm-obs/v1/trace/spans"
|
||||
|
||||
asyncio.create_task(self.periodic_flush())
|
||||
self.flush_lock = asyncio.Lock()
|
||||
self.log_queue: List[LLMObsPayload] = []
|
||||
|
||||
#########################################################
|
||||
# Handle datadog_llm_observability_params set as litellm.datadog_llm_observability_params
|
||||
#########################################################
|
||||
dict_datadog_llm_obs_params = self._get_datadog_llm_obs_params()
|
||||
kwargs.update(dict_datadog_llm_obs_params)
|
||||
CustomBatchLogger.__init__(self, **kwargs, flush_lock=self.flush_lock)
|
||||
except Exception as e:
|
||||
verbose_logger.exception(f"DataDogLLMObs: Error initializing - {str(e)}")
|
||||
raise e
|
||||
|
||||
def _configure_dd_agent(self, dd_agent_host: str):
|
||||
"""
|
||||
Configure the Datadog logger to send traces to the Agent.
|
||||
"""
|
||||
# When using the Agent, LLM Observability Intake does NOT require the API Key
|
||||
# Reference: https://docs.datadoghq.com/llm_observability/setup/sdk/#agent-setup
|
||||
|
||||
# Use specific port for LLM Obs (Trace Agent) to avoid conflict with Logs Agent (10518)
|
||||
agent_port = os.getenv("LITELLM_DD_LLM_OBS_PORT", "8126")
|
||||
self.DD_SITE = "localhost" # Not used for URL construction in agent mode
|
||||
self.intake_url = (
|
||||
f"http://{dd_agent_host}:{agent_port}/api/intake/llm-obs/v1/trace/spans"
|
||||
)
|
||||
verbose_logger.debug(f"DataDogLLMObs: Using DD Agent at {self.intake_url}")
|
||||
|
||||
def _configure_dd_direct_api(self):
|
||||
"""
|
||||
Configure the Datadog logger to send traces directly to the Datadog API.
|
||||
"""
|
||||
if not self.DD_API_KEY:
|
||||
raise Exception("DD_API_KEY is not set, set 'DD_API_KEY=<>'")
|
||||
|
||||
self.DD_SITE = os.getenv("DD_SITE")
|
||||
if not self.DD_SITE:
|
||||
raise Exception(
|
||||
"DD_SITE is not set, set 'DD_SITE=<>', example site = `us5.datadoghq.com`"
|
||||
)
|
||||
|
||||
self.intake_url = (
|
||||
f"https://api.{self.DD_SITE}/api/intake/llm-obs/v1/trace/spans"
|
||||
)
|
||||
|
||||
def _get_datadog_llm_obs_params(self) -> Dict:
|
||||
"""
|
||||
Get the datadog_llm_observability_params from litellm.datadog_llm_observability_params
|
||||
|
||||
These are params specific to initializing the DataDogLLMObsLogger e.g. turn_off_message_logging
|
||||
"""
|
||||
dict_datadog_llm_obs_params: Dict = {}
|
||||
if litellm.datadog_llm_observability_params is not None:
|
||||
if isinstance(
|
||||
litellm.datadog_llm_observability_params, DatadogLLMObsInitParams
|
||||
):
|
||||
dict_datadog_llm_obs_params = (
|
||||
litellm.datadog_llm_observability_params.model_dump()
|
||||
)
|
||||
elif isinstance(litellm.datadog_llm_observability_params, Dict):
|
||||
# only allow params that are of DatadogLLMObsInitParams
|
||||
dict_datadog_llm_obs_params = DatadogLLMObsInitParams(
|
||||
**litellm.datadog_llm_observability_params
|
||||
).model_dump()
|
||||
return dict_datadog_llm_obs_params
|
||||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
verbose_logger.debug(
|
||||
f"DataDogLLMObs: Logging success event for model {kwargs.get('model', 'unknown')}"
|
||||
)
|
||||
payload = self.create_llm_obs_payload(kwargs, start_time, end_time)
|
||||
verbose_logger.debug(f"DataDogLLMObs: Payload: {payload}")
|
||||
self.log_queue.append(payload)
|
||||
|
||||
if len(self.log_queue) >= self.batch_size:
|
||||
await self.async_send_batch()
|
||||
except Exception as e:
|
||||
verbose_logger.exception(
|
||||
f"DataDogLLMObs: Error logging success event - {str(e)}"
|
||||
)
|
||||
|
||||
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
verbose_logger.debug(
|
||||
f"DataDogLLMObs: Logging failure event for model {kwargs.get('model', 'unknown')}"
|
||||
)
|
||||
payload = self.create_llm_obs_payload(kwargs, start_time, end_time)
|
||||
verbose_logger.debug(f"DataDogLLMObs: Payload: {payload}")
|
||||
self.log_queue.append(payload)
|
||||
|
||||
if len(self.log_queue) >= self.batch_size:
|
||||
await self.async_send_batch()
|
||||
except Exception as e:
|
||||
verbose_logger.exception(
|
||||
f"DataDogLLMObs: Error logging failure event - {str(e)}"
|
||||
)
|
||||
|
||||
    async def async_send_batch(self):
        """
        Flush all queued spans to the configured intake endpoint in one POST.

        The queue is cleared only after a 202 response; on any error the
        exception is logged and the queue is left intact, so the batch is
        retried on the next flush.
        """
        try:
            if not self.log_queue:
                return

            verbose_logger.debug(
                f"DataDogLLMObs: Flushing {len(self.log_queue)} events"
            )

            if self.is_mock_mode:
                verbose_logger.debug(
                    "[DATADOG MOCK] Mock mode enabled - API calls will be intercepted"
                )

            # Prepare the payload: all queued spans go into a single
            # DDIntakePayload under the "data" key.
            payload = {
                "data": DDIntakePayload(
                    type="span",
                    attributes=DDSpanAttributes(
                        ml_app=get_datadog_service(),
                        tags=[get_datadog_tags()],
                        spans=self.log_queue,
                    ),
                ),
            }

            # serialize datetime objects - for budget reset time in spend metrics
            from litellm.litellm_core_utils.safe_json_dumps import safe_dumps

            # Best-effort debug dump of the payload; serialization problems
            # here must not abort the flush.
            try:
                verbose_logger.debug("payload %s", safe_dumps(payload))
            except Exception as debug_error:
                verbose_logger.debug(
                    "payload serialization failed: %s", str(debug_error)
                )

            json_payload = safe_dumps(payload)

            # Agent mode has no API key; only attach the header when present.
            headers = {"Content-Type": "application/json"}
            if self.DD_API_KEY:
                headers["DD-API-KEY"] = self.DD_API_KEY

            response = await self.async_client.post(
                url=self.intake_url,
                content=json_payload,
                headers=headers,
            )

            # The intake endpoint acknowledges with 202 Accepted.
            if response.status_code != 202:
                raise Exception(
                    f"DataDogLLMObs: Unexpected response - status_code: {response.status_code}, text: {response.text}"
                )

            if self.is_mock_mode:
                verbose_logger.debug(
                    f"[DATADOG MOCK] Batch of {len(self.log_queue)} events successfully mocked"
                )
            else:
                verbose_logger.debug(
                    f"DataDogLLMObs: Successfully sent batch - status_code: {response.status_code}"
                )
            self.log_queue.clear()
        except httpx.HTTPStatusError as e:
            verbose_logger.exception(
                f"DataDogLLMObs: Error sending batch - {e.response.text}"
            )
        except Exception as e:
            verbose_logger.exception(f"DataDogLLMObs: Error sending batch - {str(e)}")
|
||||
|
||||
    def create_llm_obs_payload(
        self, kwargs: Dict, start_time: datetime, end_time: datetime
    ) -> LLMObsPayload:
        """
        Build the LLM Observability span payload for a single request.

        Args:
            kwargs: litellm logging kwargs; must contain "standard_logging_object".
            start_time: wall-clock datetime when the request started.
            end_time: wall-clock datetime when the request finished.

        Returns:
            LLMObsPayload with meta (input/output/error), token and cost
            metrics, trace/span identifiers, and Datadog tags.

        Raises:
            Exception: if kwargs has no standard_logging_object.
        """
        standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get(
            "standard_logging_object"
        )
        if standard_logging_payload is None:
            raise Exception("DataDogLLMObs: standard_logging_object is not set")

        # Normalize messages to a list before the string conversion below.
        messages = standard_logging_payload["messages"]
        messages = self._ensure_string_content(messages=messages)

        # Request-level metadata (may carry parent_id/span_id/name overrides).
        metadata = kwargs.get("litellm_params", {}).get("metadata", {})

        input_meta = InputMeta(
            messages=handle_any_messages_to_chat_completion_str_messages_conversion(
                messages
            )
        )
        output_meta = OutputMeta(
            messages=self._get_response_messages(
                standard_logging_payload=standard_logging_payload,
                call_type=standard_logging_payload.get("call_type"),
            )
        )

        # Populated only for failed requests; also drives status below.
        error_info = self._assemble_error_info(standard_logging_payload)

        metadata_parent_id: Optional[str] = None
        if isinstance(metadata, dict):
            metadata_parent_id = metadata.get("parent_id")

        meta = Meta(
            kind=self._get_datadog_span_kind(
                standard_logging_payload.get("call_type"), metadata_parent_id
            ),
            input=input_meta,
            output=output_meta,
            metadata=self._get_dd_llm_obs_payload_metadata(standard_logging_payload),
            error=error_info,
        )

        # Calculate metrics (you may need to adjust these based on available data)
        metrics = LLMMetrics(
            input_tokens=float(standard_logging_payload.get("prompt_tokens", 0)),
            output_tokens=float(standard_logging_payload.get("completion_tokens", 0)),
            total_tokens=float(standard_logging_payload.get("total_tokens", 0)),
            total_cost=float(standard_logging_payload.get("response_cost", 0)),
            time_to_first_token=self._get_time_to_first_token_seconds(
                standard_logging_payload
            ),
        )

        # Timestamps/durations are reported in nanoseconds.
        payload: LLMObsPayload = LLMObsPayload(
            parent_id=metadata_parent_id if metadata_parent_id else "undefined",
            trace_id=standard_logging_payload.get("trace_id", str(uuid.uuid4())),
            span_id=metadata.get("span_id", str(uuid.uuid4())),
            name=metadata.get("name", "litellm_llm_call"),
            meta=meta,
            start_ns=int(start_time.timestamp() * 1e9),
            duration=int((end_time - start_time).total_seconds() * 1e9),
            metrics=metrics,
            status="error" if error_info else "ok",
            tags=[get_datadog_tags(standard_logging_object=standard_logging_payload)],
        )

        # Correlate with APM when an active tracer span is available.
        apm_trace_id = self._get_apm_trace_id()
        if apm_trace_id is not None:
            payload["apm_id"] = apm_trace_id

        return payload
|
||||
|
||||
def _get_apm_trace_id(self) -> Optional[str]:
|
||||
"""Retrieve the current APM trace ID if available."""
|
||||
try:
|
||||
current_span_fn = getattr(tracer, "current_span", None)
|
||||
if callable(current_span_fn):
|
||||
current_span = current_span_fn()
|
||||
if current_span is not None:
|
||||
trace_id = getattr(current_span, "trace_id", None)
|
||||
if trace_id is not None:
|
||||
return str(trace_id)
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
def _assemble_error_info(
|
||||
self, standard_logging_payload: StandardLoggingPayload
|
||||
) -> Optional[DDLLMObsError]:
|
||||
"""
|
||||
Assemble error information for failure cases according to DD LLM Obs API spec
|
||||
"""
|
||||
# Handle error information for failure cases according to DD LLM Obs API spec
|
||||
error_info: Optional[DDLLMObsError] = None
|
||||
|
||||
if standard_logging_payload.get("status") == "failure":
|
||||
# Try to get structured error information first
|
||||
error_information: Optional[
|
||||
StandardLoggingPayloadErrorInformation
|
||||
] = standard_logging_payload.get("error_information")
|
||||
|
||||
if error_information:
|
||||
error_info = DDLLMObsError(
|
||||
message=error_information.get("error_message")
|
||||
or standard_logging_payload.get("error_str")
|
||||
or "Unknown error",
|
||||
type=error_information.get("error_class"),
|
||||
stack=error_information.get("traceback"),
|
||||
)
|
||||
return error_info
|
||||
|
||||
def _get_time_to_first_token_seconds(
|
||||
self, standard_logging_payload: StandardLoggingPayload
|
||||
) -> float:
|
||||
"""
|
||||
Get the time to first token in seconds
|
||||
|
||||
CompletionStartTime - StartTime = Time to first token
|
||||
|
||||
For non streaming calls, CompletionStartTime is time we get the response back
|
||||
"""
|
||||
start_time: Optional[float] = standard_logging_payload.get("startTime")
|
||||
completion_start_time: Optional[float] = standard_logging_payload.get(
|
||||
"completionStartTime"
|
||||
)
|
||||
end_time: Optional[float] = standard_logging_payload.get("endTime")
|
||||
|
||||
if completion_start_time is not None and start_time is not None:
|
||||
return completion_start_time - start_time
|
||||
elif end_time is not None and start_time is not None:
|
||||
return end_time - start_time
|
||||
else:
|
||||
return 0.0
|
||||
|
||||
def _get_response_messages(
|
||||
self, standard_logging_payload: StandardLoggingPayload, call_type: Optional[str]
|
||||
) -> List[Any]:
|
||||
"""
|
||||
Get the messages from the response object
|
||||
|
||||
for now this handles logging /chat/completions responses
|
||||
"""
|
||||
|
||||
response_obj = standard_logging_payload.get("response")
|
||||
if response_obj is None:
|
||||
return []
|
||||
|
||||
# edge case: handle response_obj is a string representation of a dict
|
||||
if isinstance(response_obj, str):
|
||||
try:
|
||||
import ast
|
||||
|
||||
response_obj = ast.literal_eval(response_obj)
|
||||
except (ValueError, SyntaxError):
|
||||
try:
|
||||
# fallback to json parsing
|
||||
response_obj = json.loads(str(response_obj))
|
||||
except json.JSONDecodeError:
|
||||
return []
|
||||
|
||||
if call_type in [
|
||||
CallTypes.completion.value,
|
||||
CallTypes.acompletion.value,
|
||||
CallTypes.text_completion.value,
|
||||
CallTypes.atext_completion.value,
|
||||
CallTypes.generate_content.value,
|
||||
CallTypes.agenerate_content.value,
|
||||
CallTypes.generate_content_stream.value,
|
||||
CallTypes.agenerate_content_stream.value,
|
||||
CallTypes.anthropic_messages.value,
|
||||
]:
|
||||
try:
|
||||
# Safely extract message from response_obj, handle failure cases
|
||||
if isinstance(response_obj, dict) and "choices" in response_obj:
|
||||
choices = response_obj["choices"]
|
||||
if choices and len(choices) > 0 and "message" in choices[0]:
|
||||
return [choices[0]["message"]]
|
||||
return []
|
||||
except (KeyError, IndexError, TypeError):
|
||||
# In case of any error accessing the response structure, return empty list
|
||||
return []
|
||||
return []
|
||||
|
||||
def _get_datadog_span_kind(
|
||||
self, call_type: Optional[str], parent_id: Optional[str] = None
|
||||
) -> Literal["llm", "tool", "task", "embedding", "retrieval"]:
|
||||
"""
|
||||
Map liteLLM call_type to appropriate DataDog LLM Observability span kind.
|
||||
|
||||
Available DataDog span kinds: "llm", "tool", "task", "embedding", "retrieval"
|
||||
see: https://docs.datadoghq.com/ja/llm_observability/terms/
|
||||
"""
|
||||
# Non llm/workflow/agent kinds cannot be root spans, so fallback to "llm" when parent metadata is missing
|
||||
if call_type is None or parent_id is None:
|
||||
return "llm"
|
||||
|
||||
# Embedding operations
|
||||
if call_type in [CallTypes.embedding.value, CallTypes.aembedding.value]:
|
||||
return "embedding"
|
||||
|
||||
# LLM completion operations
|
||||
if call_type in [
|
||||
CallTypes.completion.value,
|
||||
CallTypes.acompletion.value,
|
||||
CallTypes.text_completion.value,
|
||||
CallTypes.atext_completion.value,
|
||||
CallTypes.generate_content.value,
|
||||
CallTypes.agenerate_content.value,
|
||||
CallTypes.generate_content_stream.value,
|
||||
CallTypes.agenerate_content_stream.value,
|
||||
CallTypes.anthropic_messages.value,
|
||||
CallTypes.responses.value,
|
||||
CallTypes.aresponses.value,
|
||||
]:
|
||||
return "llm"
|
||||
|
||||
# Tool operations
|
||||
if call_type in [CallTypes.call_mcp_tool.value]:
|
||||
return "tool"
|
||||
|
||||
# Retrieval operations
|
||||
if call_type in [
|
||||
CallTypes.get_assistants.value,
|
||||
CallTypes.aget_assistants.value,
|
||||
CallTypes.get_thread.value,
|
||||
CallTypes.aget_thread.value,
|
||||
CallTypes.get_messages.value,
|
||||
CallTypes.aget_messages.value,
|
||||
CallTypes.afile_retrieve.value,
|
||||
CallTypes.file_retrieve.value,
|
||||
CallTypes.afile_list.value,
|
||||
CallTypes.file_list.value,
|
||||
CallTypes.afile_content.value,
|
||||
CallTypes.file_content.value,
|
||||
CallTypes.retrieve_batch.value,
|
||||
CallTypes.aretrieve_batch.value,
|
||||
CallTypes.retrieve_fine_tuning_job.value,
|
||||
CallTypes.aretrieve_fine_tuning_job.value,
|
||||
CallTypes.alist_input_items.value,
|
||||
]:
|
||||
return "retrieval"
|
||||
|
||||
# Task operations (batch, fine-tuning, file operations, etc.)
|
||||
if call_type in [
|
||||
CallTypes.create_batch.value,
|
||||
CallTypes.acreate_batch.value,
|
||||
CallTypes.create_fine_tuning_job.value,
|
||||
CallTypes.acreate_fine_tuning_job.value,
|
||||
CallTypes.cancel_fine_tuning_job.value,
|
||||
CallTypes.acancel_fine_tuning_job.value,
|
||||
CallTypes.list_fine_tuning_jobs.value,
|
||||
CallTypes.alist_fine_tuning_jobs.value,
|
||||
CallTypes.create_assistants.value,
|
||||
CallTypes.acreate_assistants.value,
|
||||
CallTypes.delete_assistant.value,
|
||||
CallTypes.adelete_assistant.value,
|
||||
CallTypes.create_thread.value,
|
||||
CallTypes.acreate_thread.value,
|
||||
CallTypes.add_message.value,
|
||||
CallTypes.a_add_message.value,
|
||||
CallTypes.run_thread.value,
|
||||
CallTypes.arun_thread.value,
|
||||
CallTypes.run_thread_stream.value,
|
||||
CallTypes.arun_thread_stream.value,
|
||||
CallTypes.file_delete.value,
|
||||
CallTypes.afile_delete.value,
|
||||
CallTypes.create_file.value,
|
||||
CallTypes.acreate_file.value,
|
||||
CallTypes.image_generation.value,
|
||||
CallTypes.aimage_generation.value,
|
||||
CallTypes.image_edit.value,
|
||||
CallTypes.aimage_edit.value,
|
||||
CallTypes.moderation.value,
|
||||
CallTypes.amoderation.value,
|
||||
CallTypes.transcription.value,
|
||||
CallTypes.atranscription.value,
|
||||
CallTypes.speech.value,
|
||||
CallTypes.aspeech.value,
|
||||
CallTypes.rerank.value,
|
||||
CallTypes.arerank.value,
|
||||
]:
|
||||
return "task"
|
||||
|
||||
# Default fallback for unknown or passthrough operations
|
||||
return "llm"
|
||||
|
||||
def _ensure_string_content(
|
||||
self, messages: Optional[Union[str, List[Any], Dict[Any, Any]]]
|
||||
) -> List[Any]:
|
||||
if messages is None:
|
||||
return []
|
||||
if isinstance(messages, str):
|
||||
return [messages]
|
||||
elif isinstance(messages, list):
|
||||
return [message for message in messages]
|
||||
elif isinstance(messages, dict):
|
||||
return [str(messages.get("content", ""))]
|
||||
return []
|
||||
|
||||
def _get_dd_llm_obs_payload_metadata(
|
||||
self, standard_logging_payload: StandardLoggingPayload
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Fields to track in DD LLM Observability metadata from litellm standard logging payload
|
||||
"""
|
||||
_metadata: Dict[str, Any] = {
|
||||
"model_name": standard_logging_payload.get("model", "unknown"),
|
||||
"model_provider": standard_logging_payload.get(
|
||||
"custom_llm_provider", "unknown"
|
||||
),
|
||||
"id": standard_logging_payload.get("id", "unknown"),
|
||||
"trace_id": standard_logging_payload.get("trace_id", "unknown"),
|
||||
"cache_hit": standard_logging_payload.get("cache_hit", "unknown"),
|
||||
"cache_key": standard_logging_payload.get("cache_key", "unknown"),
|
||||
"saved_cache_cost": standard_logging_payload.get("saved_cache_cost", 0),
|
||||
"guardrail_information": standard_logging_payload.get(
|
||||
"guardrail_information", None
|
||||
),
|
||||
"is_streamed_request": self._get_stream_value_from_payload(
|
||||
standard_logging_payload
|
||||
),
|
||||
}
|
||||
|
||||
#########################################################
|
||||
# Add latency metrics to metadata
|
||||
#########################################################
|
||||
latency_metrics = self._get_latency_metrics(standard_logging_payload)
|
||||
_metadata.update({"latency_metrics": dict(latency_metrics)})
|
||||
|
||||
#########################################################
|
||||
# Add spend metrics to metadata
|
||||
#########################################################
|
||||
spend_metrics = self._get_spend_metrics(standard_logging_payload)
|
||||
_metadata.update({"spend_metrics": dict(spend_metrics)})
|
||||
|
||||
## extract tool calls and add to metadata
|
||||
tool_call_metadata = self._extract_tool_call_metadata(standard_logging_payload)
|
||||
_metadata.update(tool_call_metadata)
|
||||
|
||||
_standard_logging_metadata: dict = (
|
||||
dict(standard_logging_payload.get("metadata", {})) or {}
|
||||
)
|
||||
_metadata.update(_standard_logging_metadata)
|
||||
return _metadata
|
||||
|
||||
def _get_latency_metrics(
|
||||
self, standard_logging_payload: StandardLoggingPayload
|
||||
) -> DDLLMObsLatencyMetrics:
|
||||
"""
|
||||
Get the latency metrics from the standard logging payload
|
||||
"""
|
||||
latency_metrics: DDLLMObsLatencyMetrics = DDLLMObsLatencyMetrics()
|
||||
# Add latency metrics to metadata
|
||||
# Time to first token (convert from seconds to milliseconds for consistency)
|
||||
time_to_first_token_seconds = self._get_time_to_first_token_seconds(
|
||||
standard_logging_payload
|
||||
)
|
||||
if time_to_first_token_seconds > 0:
|
||||
latency_metrics["time_to_first_token_ms"] = (
|
||||
time_to_first_token_seconds * 1000
|
||||
)
|
||||
|
||||
# LiteLLM overhead time
|
||||
hidden_params = standard_logging_payload.get("hidden_params", {})
|
||||
litellm_overhead_ms = hidden_params.get("litellm_overhead_time_ms")
|
||||
if litellm_overhead_ms is not None:
|
||||
latency_metrics["litellm_overhead_time_ms"] = litellm_overhead_ms
|
||||
|
||||
# Guardrail overhead latency
|
||||
guardrail_info: Optional[
|
||||
list[StandardLoggingGuardrailInformation]
|
||||
] = standard_logging_payload.get("guardrail_information")
|
||||
if guardrail_info is not None:
|
||||
total_duration = 0.0
|
||||
for info in guardrail_info:
|
||||
_guardrail_duration_seconds: Optional[float] = info.get("duration")
|
||||
if _guardrail_duration_seconds is not None:
|
||||
total_duration += float(_guardrail_duration_seconds)
|
||||
|
||||
if total_duration > 0:
|
||||
# Convert from seconds to milliseconds for consistency
|
||||
latency_metrics["guardrail_overhead_time_ms"] = total_duration * 1000
|
||||
|
||||
return latency_metrics
|
||||
|
||||
def _get_stream_value_from_payload(
|
||||
self, standard_logging_payload: StandardLoggingPayload
|
||||
) -> bool:
|
||||
"""
|
||||
Extract the stream value from standard logging payload.
|
||||
|
||||
The stream field in StandardLoggingPayload is only set to True for completed streaming responses.
|
||||
For non-streaming requests, it's None. The original stream parameter is in model_parameters.
|
||||
|
||||
Returns:
|
||||
bool: True if this was a streaming request, False otherwise
|
||||
"""
|
||||
# Check top-level stream field first (only True for completed streaming)
|
||||
stream_value = standard_logging_payload.get("stream")
|
||||
if stream_value is True:
|
||||
return True
|
||||
|
||||
# Fallback to model_parameters.stream for original request parameters
|
||||
model_params = standard_logging_payload.get("model_parameters", {})
|
||||
if isinstance(model_params, dict):
|
||||
stream_value = model_params.get("stream")
|
||||
if stream_value is True:
|
||||
return True
|
||||
|
||||
# Default to False for non-streaming requests
|
||||
return False
|
||||
|
||||
    def _get_spend_metrics(
        self, standard_logging_payload: StandardLoggingPayload
    ) -> DDLLMObsSpendMetrics:
        """
        Collect spend metrics from the standard logging payload.

        Includes the request's response cost plus, when present in metadata,
        the API key's max budget, current spend, and budget reset time (the
        latter normalized to a timezone-aware ISO-8601 string). Values that
        fail to parse are skipped with a debug log rather than raising.
        """
        spend_metrics: DDLLMObsSpendMetrics = DDLLMObsSpendMetrics()

        # send response cost
        spend_metrics["response_cost"] = standard_logging_payload.get(
            "response_cost", 0.0
        )

        # Get budget information from metadata
        metadata = standard_logging_payload.get("metadata", {})

        # API key max budget
        user_api_key_max_budget = metadata.get("user_api_key_max_budget")
        if user_api_key_max_budget is not None:
            spend_metrics["user_api_key_max_budget"] = float(user_api_key_max_budget)

        # API key spend - tolerate non-numeric values
        user_api_key_spend = metadata.get("user_api_key_spend")
        if user_api_key_spend is not None:
            try:
                spend_metrics["user_api_key_spend"] = float(user_api_key_spend)
            except (ValueError, TypeError):
                verbose_logger.debug(
                    f"Invalid user_api_key_spend value: {user_api_key_spend}"
                )

        # API key budget reset datetime - may arrive as an ISO string or a
        # datetime object; normalized below.
        user_api_key_budget_reset_at = metadata.get("user_api_key_budget_reset_at")
        if user_api_key_budget_reset_at is not None:
            try:
                from datetime import datetime, timezone

                budget_reset_at = None
                if isinstance(user_api_key_budget_reset_at, str):
                    # Handle ISO format strings that might have 'Z' suffix
                    iso_string = user_api_key_budget_reset_at.replace("Z", "+00:00")
                    budget_reset_at = datetime.fromisoformat(iso_string)
                elif isinstance(user_api_key_budget_reset_at, datetime):
                    budget_reset_at = user_api_key_budget_reset_at

                if budget_reset_at is not None:
                    # Preserve timezone info if already present; naive values
                    # are assumed UTC
                    if budget_reset_at.tzinfo is None:
                        budget_reset_at = budget_reset_at.replace(tzinfo=timezone.utc)

                    # Convert to ISO string format for JSON serialization
                    # This prevents circular reference issues and ensures proper timezone representation
                    iso_string = budget_reset_at.isoformat()
                    spend_metrics["user_api_key_budget_reset_at"] = iso_string

                    # Debug logging to verify the conversion
                    verbose_logger.debug(
                        f"Converted budget_reset_at to ISO format: {iso_string}"
                    )
            except Exception as e:
                # best-effort: a bad reset time should never break span logging
                verbose_logger.debug(f"Error processing budget reset datetime: {e}")
                verbose_logger.debug(f"Original value: {user_api_key_budget_reset_at}")

        return spend_metrics
|
||||
|
||||
def _process_input_messages_preserving_tool_calls(
|
||||
self, messages: List[Any]
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process input messages while preserving tool_calls and tool message types.
|
||||
|
||||
This bypasses the lossy string conversion when tool calls are present,
|
||||
allowing complex nested tool_calls objects to be preserved for Datadog.
|
||||
"""
|
||||
processed = []
|
||||
for msg in messages:
|
||||
if isinstance(msg, dict):
|
||||
# Preserve messages with tool_calls or tool role as-is
|
||||
if "tool_calls" in msg or msg.get("role") == "tool":
|
||||
processed.append(msg)
|
||||
else:
|
||||
# For regular messages, still apply string conversion
|
||||
converted = (
|
||||
handle_any_messages_to_chat_completion_str_messages_conversion(
|
||||
[msg]
|
||||
)
|
||||
)
|
||||
processed.extend(converted)
|
||||
else:
|
||||
# For non-dict messages, apply string conversion
|
||||
converted = (
|
||||
handle_any_messages_to_chat_completion_str_messages_conversion(
|
||||
[msg]
|
||||
)
|
||||
)
|
||||
processed.extend(converted)
|
||||
return processed
|
||||
|
||||
@staticmethod
|
||||
def _tool_calls_kv_pair(tool_calls: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract tool call information into key-value pairs for Datadog metadata.
|
||||
|
||||
Similar to OpenTelemetry's implementation but adapted for Datadog's format.
|
||||
"""
|
||||
kv_pairs: Dict[str, Any] = {}
|
||||
for idx, tool_call in enumerate(tool_calls):
|
||||
try:
|
||||
# Extract tool call ID
|
||||
tool_id = tool_call.get("id")
|
||||
if tool_id:
|
||||
kv_pairs[f"tool_calls.{idx}.id"] = tool_id
|
||||
|
||||
# Extract tool call type
|
||||
tool_type = tool_call.get("type")
|
||||
if tool_type:
|
||||
kv_pairs[f"tool_calls.{idx}.type"] = tool_type
|
||||
|
||||
# Extract function information
|
||||
function = tool_call.get("function")
|
||||
if function:
|
||||
function_name = function.get("name")
|
||||
if function_name:
|
||||
kv_pairs[f"tool_calls.{idx}.function.name"] = function_name
|
||||
|
||||
function_arguments = function.get("arguments")
|
||||
if function_arguments:
|
||||
# Store arguments as JSON string for Datadog
|
||||
if isinstance(function_arguments, str):
|
||||
kv_pairs[
|
||||
f"tool_calls.{idx}.function.arguments"
|
||||
] = function_arguments
|
||||
else:
|
||||
import json
|
||||
|
||||
kv_pairs[
|
||||
f"tool_calls.{idx}.function.arguments"
|
||||
] = json.dumps(function_arguments)
|
||||
except (KeyError, TypeError, ValueError) as e:
|
||||
verbose_logger.debug(
|
||||
f"DataDogLLMObs: Error processing tool call {idx}: {str(e)}"
|
||||
)
|
||||
continue
|
||||
|
||||
return kv_pairs
|
||||
|
||||
def _extract_tool_call_metadata(
    self, standard_logging_payload: StandardLoggingPayload
) -> Dict[str, Any]:
    """
    Extract tool call information from both input messages and response for Datadog metadata.

    Keys coming from request messages are prefixed with ``input_``; keys
    coming from response choices are prefixed with ``output_``. Any failure
    during extraction is logged at debug level and yields whatever was
    collected so far instead of raising.
    """
    metadata: Dict[str, Any] = {}

    try:
        # Input side: tool calls embedded in the request messages.
        messages = standard_logging_payload.get("messages", [])
        if isinstance(messages, list):
            for msg in messages:
                if not isinstance(msg, dict):
                    continue
                calls = msg.get("tool_calls")
                if not calls:
                    continue
                # Prefix with "input_" to distinguish from response tool calls
                for key, value in self._tool_calls_kv_pair(calls).items():
                    metadata[f"input_{key}"] = value

        # Output side: tool calls embedded in the response choices.
        response_obj = standard_logging_payload.get("response")
        if isinstance(response_obj, dict):
            for choice in response_obj.get("choices", []):
                if not isinstance(choice, dict):
                    continue
                message = choice.get("message")
                if not isinstance(message, dict):
                    continue
                calls = message.get("tool_calls")
                if not calls:
                    continue
                # Prefix with "output_" to distinguish from input tool calls
                for key, value in self._tool_calls_kv_pair(calls).items():
                    metadata[f"output_{key}"] = value

    except Exception as e:
        verbose_logger.debug(
            f"DataDogLLMObs: Error extracting tool call metadata: {str(e)}"
        )

    return metadata
|
||||
@@ -0,0 +1,286 @@
|
||||
import asyncio
|
||||
import gzip
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from litellm._logging import verbose_logger
|
||||
from litellm.integrations.custom_batch_logger import CustomBatchLogger
|
||||
from litellm.integrations.datadog.datadog_handler import (
|
||||
get_datadog_env,
|
||||
get_datadog_hostname,
|
||||
get_datadog_pod_name,
|
||||
get_datadog_service,
|
||||
)
|
||||
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
|
||||
from litellm.llms.custom_httpx.http_handler import (
|
||||
get_async_httpx_client,
|
||||
httpxSpecialProvider,
|
||||
)
|
||||
from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus
|
||||
from litellm.types.integrations.datadog_metrics import (
|
||||
DatadogMetricPoint,
|
||||
DatadogMetricSeries,
|
||||
DatadogMetricsPayload,
|
||||
)
|
||||
from litellm.types.utils import StandardLoggingPayload
|
||||
|
||||
|
||||
class DatadogMetricsLogger(CustomBatchLogger):
    """
    Sends LiteLLM latency and request-count metrics to the Datadog metrics
    intake (``POST https://api.<DD_SITE>/api/v2/series``).

    Metric series accumulate in ``self.log_queue`` (managed by
    CustomBatchLogger) and are flushed either when ``batch_size`` is reached
    or by the periodic flush task (every 5 seconds by default).

    Environment variables:
        DD_API_KEY  - required; without it ``_upload_to_datadog`` is a no-op.
        DD_APP_KEY  - optional application key, sent when present.
        DD_SITE     - Datadog site, defaults to "datadoghq.com".
        DD_VERSION  - reported as the ``version`` tag (defaults to "unknown").
    """

    def __init__(self, start_periodic_flush: bool = True, **kwargs):
        """
        Args:
            start_periodic_flush: when True, schedule the periodic flush task.
                NOTE(review): ``asyncio.create_task`` requires a running event
                loop at construction time and raises RuntimeError otherwise —
                confirm all construction sites run inside a loop.
            **kwargs: forwarded to CustomBatchLogger (e.g. ``batch_size``,
                ``flush_interval``, ``flush_lock``).
        """
        self.dd_api_key: Optional[str] = os.getenv("DD_API_KEY")
        self.dd_app_key: Optional[str] = os.getenv("DD_APP_KEY")
        self.dd_site: str = os.getenv("DD_SITE", "datadoghq.com")

        if not self.dd_api_key:
            # Warn but do not fail: uploads are silently skipped later.
            verbose_logger.warning(
                "Datadog Metrics: DD_API_KEY is required. Integration will not work."
            )

        self.upload_url: str = f"https://api.{self.dd_site}/api/v2/series"

        self.async_client = get_async_httpx_client(
            llm_provider=httpxSpecialProvider.LoggingCallback
        )

        # Initialize lock used to serialize queue flushes.
        self.flush_lock = asyncio.Lock()

        # Only set flush_lock if not already provided by caller
        if "flush_lock" not in kwargs:
            kwargs["flush_lock"] = self.flush_lock

        # Send metrics more quickly to datadog (every 5 seconds)
        if "flush_interval" not in kwargs:
            kwargs["flush_interval"] = 5

        super().__init__(**kwargs)

        # Start periodic flush task only if instructed
        if start_periodic_flush:
            asyncio.create_task(self.periodic_flush())

    def _extract_tags(
        self,
        log: StandardLoggingPayload,
        status_code: Optional[Union[str, int]] = None,
    ) -> List[str]:
        """
        Builds the list of tags for a Datadog metric point.

        Base env/service/version/host/pod tags are always emitted; provider,
        model, model group, status-code and team tags are appended only when
        the corresponding field is present and truthy in the payload.
        """
        # Base tags
        tags = [
            f"env:{get_datadog_env()}",
            f"service:{get_datadog_service()}",
            f"version:{os.getenv('DD_VERSION', 'unknown')}",
            # NOTE(review): Datadog tag keys are conventionally lowercase;
            # these uppercase HOSTNAME/POD_NAME keys are emitted as-is —
            # confirm this is intentional.
            f"HOSTNAME:{get_datadog_hostname()}",
            f"POD_NAME:{get_datadog_pod_name()}",
        ]

        # Add metric-specific tags
        if provider := log.get("custom_llm_provider"):
            tags.append(f"provider:{provider}")

        if model := log.get("model"):
            tags.append(f"model_name:{model}")

        if model_group := log.get("model_group"):
            tags.append(f"model_group:{model_group}")

        if status_code is not None:
            tags.append(f"status_code:{status_code}")

        # Extract team tag — first non-empty of alias/id variants wins.
        metadata = log.get("metadata", {}) or {}
        team_tag = (
            metadata.get("user_api_key_team_alias")
            or metadata.get("team_alias")  # type: ignore
            or metadata.get("user_api_key_team_id")
            or metadata.get("team_id")  # type: ignore
        )

        if team_tag:
            tags.append(f"team:{team_tag}")

        return tags

    def _add_metrics_from_log(
        self,
        log: StandardLoggingPayload,
        kwargs: dict,
        status_code: Union[str, int] = "200",
    ):
        """
        Extracts latencies and appends Datadog metric series to the queue.

        Emits up to three series per request:
          1. litellm.request.total_latency (gauge) — end-to-end duration.
          2. litellm.llm_api.latency (gauge) — provider-call duration.
          3. litellm.llm_api.request_count (count) — always emitted.
        """
        tags = self._extract_tags(log, status_code=status_code)

        # We record metrics with the end_time as the timestamp for the point.
        # NOTE(review): the datetime.now() fallback is timezone-naive (local
        # time); confirm callers always supply "end_time".
        end_time_dt = kwargs.get("end_time") or datetime.now()
        timestamp = int(end_time_dt.timestamp())

        # 1. Total Request Latency Metric (End to End)
        start_time_dt = kwargs.get("start_time")
        if start_time_dt and end_time_dt:
            total_duration = (end_time_dt - start_time_dt).total_seconds()
            series_total_latency: DatadogMetricSeries = {
                "metric": "litellm.request.total_latency",
                "type": 3,  # gauge
                "points": [{"timestamp": timestamp, "value": total_duration}],
                "tags": tags,
            }
            self.log_queue.append(series_total_latency)

        # 2. LLM API Latency Metric (Provider alone)
        api_call_start_time = kwargs.get("api_call_start_time")
        if api_call_start_time and end_time_dt:
            llm_api_duration = (end_time_dt - api_call_start_time).total_seconds()
            series_llm_latency: DatadogMetricSeries = {
                "metric": "litellm.llm_api.latency",
                "type": 3,  # gauge
                "points": [{"timestamp": timestamp, "value": llm_api_duration}],
                "tags": tags,
            }
            self.log_queue.append(series_llm_latency)

        # 3. Request Count / Status Code
        series_count: DatadogMetricSeries = {
            "metric": "litellm.llm_api.request_count",
            "type": 1,  # count
            "points": [{"timestamp": timestamp, "value": 1.0}],
            "tags": tags,
            # Count metrics carry the reporting interval so Datadog can
            # normalize to a rate.
            "interval": self.flush_interval,
        }
        self.log_queue.append(series_count)

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        """
        Queue metrics for a successful request (status_code "200"); flush
        when the batch is full. Errors are logged, never raised.
        """
        try:
            standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
                "standard_logging_object", None
            )

            # Nothing to record without the standard payload.
            if standard_logging_object is None:
                return

            self._add_metrics_from_log(
                log=standard_logging_object, kwargs=kwargs, status_code="200"
            )

            if len(self.log_queue) >= self.batch_size:
                await self.flush_queue()

        except Exception as e:
            verbose_logger.exception(
                f"Datadog Metrics: Error in async_log_success_event: {str(e)}"
            )

    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        """
        Queue metrics for a failed request, tagging with the error's status
        code (default "500"); flush when the batch is full. Errors are
        logged, never raised.
        """
        try:
            standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get(
                "standard_logging_object", None
            )

            if standard_logging_object is None:
                return

            # Extract status code from error information
            status_code = "500"  # default
            error_information = (
                standard_logging_object.get("error_information", {}) or {}
            )
            error_code = error_information.get("error_code")  # type: ignore
            if error_code is not None:
                status_code = str(error_code)

            self._add_metrics_from_log(
                log=standard_logging_object, kwargs=kwargs, status_code=status_code
            )

            if len(self.log_queue) >= self.batch_size:
                await self.flush_queue()

        except Exception as e:
            verbose_logger.exception(
                f"Datadog Metrics: Error in async_log_failure_event: {str(e)}"
            )

    async def async_send_batch(self):
        """
        Upload the current queue contents as one metrics payload.

        Re-raises upload errors after logging so CustomBatchLogger can see
        the failure. NOTE(review): the queue itself appears to be cleared by
        the base class, not here — confirm against CustomBatchLogger.
        """
        if not self.log_queue:
            return

        # Copy so the upload payload is stable even if the queue changes.
        batch = self.log_queue.copy()
        payload_data: DatadogMetricsPayload = {"series": batch}

        try:
            await self._upload_to_datadog(payload_data)
        except Exception as e:
            verbose_logger.exception(
                f"Datadog Metrics: Error in async_send_batch: {str(e)}"
            )
            raise

    async def _upload_to_datadog(self, payload: DatadogMetricsPayload):
        """
        POST a gzip-compressed JSON payload to the Datadog v2 series
        endpoint. Silently no-ops when DD_API_KEY is missing; raises
        ``httpx.HTTPStatusError`` on non-2xx responses.
        """
        if not self.dd_api_key:
            return

        headers = {
            "Content-Type": "application/json",
            "DD-API-KEY": self.dd_api_key,
        }

        if self.dd_app_key:
            headers["DD-APPLICATION-KEY"] = self.dd_app_key

        # safe_dumps avoids serialization failures on exotic values.
        json_data = safe_dumps(payload)
        compressed_data = gzip.compress(json_data.encode("utf-8"))
        headers["Content-Encoding"] = "gzip"

        response = await self.async_client.post(
            self.upload_url, content=compressed_data, headers=headers  # type: ignore
        )

        response.raise_for_status()

        verbose_logger.debug(
            f"Datadog Metrics: Uploaded {len(payload['series'])} metric points. Status: {response.status_code}"
        )

    async def async_health_check(self) -> IntegrationHealthCheckStatus:
        """
        Check if the service is healthy.

        Sends a single ``litellm.health_check`` gauge point; any exception
        (missing key is NOT one — upload no-ops) maps to "unhealthy" with
        the error message attached.
        """
        try:
            # Send a test metric point to Datadog
            test_metric_point: DatadogMetricPoint = {
                "timestamp": int(time.time()),
                "value": 1.0,
            }
            test_metric_series: DatadogMetricSeries = {
                "metric": "litellm.health_check",
                "type": 3,  # Gauge
                "points": [test_metric_point],
                "tags": ["env:health_check"],
            }

            payload_data: DatadogMetricsPayload = {"series": [test_metric_series]}

            await self._upload_to_datadog(payload_data)

            return IntegrationHealthCheckStatus(
                status="healthy",
                error_message=None,
            )
        except Exception as e:
            return IntegrationHealthCheckStatus(
                status="unhealthy",
                error_message=str(e),
            )

    async def get_request_response_payload(
        self,
        request_id: str,
        start_time_utc: Optional[datetime],
        end_time_utc: Optional[datetime],
    ) -> Optional[dict]:
        """
        Not implemented for the metrics logger; always returns None.

        Present to satisfy the logger interface — metrics series are not
        retrievable per request.
        """
        pass
|
||||
@@ -0,0 +1,33 @@
|
||||
"""
|
||||
Mock client for Datadog integration testing.
|
||||
|
||||
This module intercepts Datadog API calls and returns successful mock responses,
|
||||
allowing full code execution without making actual network calls.
|
||||
|
||||
Usage:
|
||||
Set DATADOG_MOCK=true in environment variables or config to enable mock mode.
|
||||
"""
|
||||
|
||||
from litellm.integrations.mock_client_factory import (
|
||||
MockClientConfig,
|
||||
create_mock_client_factory,
|
||||
)
|
||||
|
||||
# Create mock client using factory
|
||||
# Shared mock configuration for the Datadog integration. Per the module
# docstring, setting DATADOG_MOCK=true makes HTTP calls whose URL matches
# one of `url_matchers` return the canned response below instead of hitting
# the network.
_config = MockClientConfig(
    name="DATADOG",
    env_var="DATADOG_MOCK",
    default_latency_ms=100,  # simulated round-trip latency
    default_status_code=202,  # 202 Accepted — Datadog intake's success status
    default_json_data={"status": "ok"},
    url_matchers=[
        ".datadoghq.com",
        "datadoghq.com",
    ],
    patch_async_handler=True,
    patch_sync_client=True,
)

# Factory yields the two names re-exported to datadog.py:
#   create_mock_datadog_client - builds the mock HTTP client
#   should_use_datadog_mock    - predicate gated on the DATADOG_MOCK env var
# NOTE(review): exact semantics live in mock_client_factory; inferred from
# names and the config above — confirm there.
create_mock_datadog_client, should_use_datadog_mock = create_mock_client_factory(
    _config
)
|
||||
Reference in New Issue
Block a user