chore: initial snapshot for gitea/github upload

This commit is contained in:
Your Name
2026-03-26 16:04:46 +08:00
commit a699a1ac98
3497 changed files with 1586237 additions and 0 deletions

View File

@@ -0,0 +1,41 @@
from typing import List
from litellm.llms.base_llm.audio_transcription.transformation import (
AudioTranscriptionRequestData,
)
from litellm.types.llms.openai import OpenAIAudioTranscriptionOptionalParams
from litellm.types.utils import FileTypes
from .whisper_transformation import OpenAIWhisperAudioTranscriptionConfig
class OpenAIGPTAudioTranscriptionConfig(OpenAIWhisperAudioTranscriptionConfig):
    """Audio transcription config for OpenAI's `gpt-4o-transcribe` model family.

    Reuses the Whisper config's behaviour, but advertises the parameter set
    accepted by the `gpt-4o-transcribe` models and builds the request payload
    without any response-format rewriting.
    """

    def get_supported_openai_params(
        self, model: str
    ) -> List[OpenAIAudioTranscriptionOptionalParams]:
        """Return the OpenAI params supported by the `gpt-4o-transcribe` models."""
        supported: List[OpenAIAudioTranscriptionOptionalParams] = [
            "language",
            "prompt",
            "response_format",
            "temperature",
            "include",
        ]
        return supported

    def transform_audio_transcription_request(
        self,
        model: str,
        audio_file: FileTypes,
        optional_params: dict,
        litellm_params: dict,
    ) -> AudioTranscriptionRequestData:
        """Assemble the request payload for the transcription call.

        `optional_params` entries take precedence over the base `model`/`file`
        keys, matching the original merge order.
        """
        request_body: dict = {"model": model, "file": audio_file}
        request_body.update(optional_params)
        return AudioTranscriptionRequestData(data=request_body)

View File

@@ -0,0 +1,159 @@
# OpenAI Audio Transcription Guardrail Translation Handler
Handler for processing OpenAI's audio transcription endpoint (`/v1/audio/transcriptions`) with guardrails.
## Overview
This handler processes audio transcription responses by:
1. Applying guardrails to the transcribed text output
2. Returning the input unchanged (since input is an audio file, not text)
## Data Format
### Input Format
The input is an audio file, which cannot be guardrailed (it's binary data, not text).
```json
{
"model": "whisper-1",
"file": "<audio file>",
"response_format": "json",
"language": "en"
}
```
### Output Format
```json
{
"text": "This is the transcribed text from the audio file."
}
```
Or with additional metadata:
```json
{
"text": "This is the transcribed text from the audio file.",
"duration": 3.5,
"language": "en"
}
```
## Usage
The handler is automatically discovered and applied when guardrails are used with the audio transcription endpoint.
### Example: Using Guardrails with Audio Transcription
```bash
curl -X POST 'http://localhost:4000/v1/audio/transcriptions' \
-H 'Authorization: Bearer your-api-key' \
-F 'file=@audio.mp3' \
-F 'model=whisper-1' \
-F 'guardrails=["pii_mask"]'
```
The guardrail will be applied to the **output** transcribed text only.
### Example: PII Masking in Transcribed Text
```bash
curl -X POST 'http://localhost:4000/v1/audio/transcriptions' \
-H 'Authorization: Bearer your-api-key' \
-F 'file=@meeting_recording.mp3' \
-F 'model=whisper-1' \
-F 'guardrails=["mask_pii"]' \
-F 'response_format=json'
```
If the audio contains: "My name is John Doe and my email is john@example.com"
The transcription output will be: "My name is [NAME_REDACTED] and my email is [EMAIL_REDACTED]"
### Example: Content Moderation on Transcriptions
```bash
curl -X POST 'http://localhost:4000/v1/audio/transcriptions' \
-H 'Authorization: Bearer your-api-key' \
-F 'file=@audio.wav' \
-F 'model=whisper-1' \
-F 'guardrails=["content_moderation"]'
```
## Implementation Details
### Input Processing
- **Status**: Not applicable
- **Reason**: Input is an audio file (binary data), not text
- **Result**: Request data returned unchanged
### Output Processing
- **Field**: `text` (string)
- **Processing**: Applies guardrail to the transcribed text
- **Result**: Updated text in response
## Use Cases
1. **PII Protection**: Automatically redact personally identifiable information from transcriptions
2. **Content Filtering**: Remove or flag inappropriate content in transcribed audio
3. **Compliance**: Ensure transcriptions meet regulatory requirements
4. **Data Sanitization**: Clean up transcriptions before storage or further processing
## Extension
Override these methods to customize behavior:
- `process_output_response()`: Customize how transcribed text is processed
- `process_input_messages()`: Currently a no-op, but can be overridden if needed
## Supported Call Types
- `CallTypes.transcription` - Synchronous audio transcription
- `CallTypes.atranscription` - Asynchronous audio transcription
## Notes
- Input processing is a no-op since audio files cannot be text-guardrailed
- Only the transcribed text output is processed
- Guardrails apply after transcription is complete
- Both sync and async call types use the same handler
- Works with all Whisper models and response formats
## Common Patterns
### Transcribe and Redact PII
```python
import litellm
response = litellm.transcription(
model="whisper-1",
file=open("interview.mp3", "rb"),
guardrails=["mask_pii"],
)
# response.text will have PII redacted
print(response.text)
```
### Async Transcription with Guardrails
```python
import litellm
import asyncio
async def transcribe_with_guardrails():
response = await litellm.atranscription(
model="whisper-1",
file=open("audio.mp3", "rb"),
guardrails=["content_filter"],
)
return response.text
text = asyncio.run(transcribe_with_guardrails())
```

View File

@@ -0,0 +1,13 @@
"""OpenAI Audio Transcription handler for Unified Guardrails."""
from litellm.llms.openai.transcriptions.guardrail_translation.handler import (
OpenAIAudioTranscriptionHandler,
)
from litellm.types.utils import CallTypes
# Route both transcription call types to the same handler: the guardrail
# logic only touches the transcribed text *output*, which is identical for
# the sync (`transcription`) and async (`atranscription`) entry points.
guardrail_translation_mappings = {
    CallTypes.transcription: OpenAIAudioTranscriptionHandler,
    CallTypes.atranscription: OpenAIAudioTranscriptionHandler,
}

# Explicit public API of this module.
__all__ = ["guardrail_translation_mappings", "OpenAIAudioTranscriptionHandler"]

View File

@@ -0,0 +1,117 @@
"""
OpenAI Audio Transcription Handler for Unified Guardrails
This module provides guardrail translation support for OpenAI's audio transcription endpoint.
The handler processes the output transcribed text (input is audio, so no text to guardrail).
"""
from typing import TYPE_CHECKING, Any, Optional
from litellm._logging import verbose_proxy_logger
from litellm.llms.base_llm.guardrail_translation.base_translation import BaseTranslation
from litellm.types.utils import GenericGuardrailAPIInputs
if TYPE_CHECKING:
from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.utils import TranscriptionResponse
class OpenAIAudioTranscriptionHandler(BaseTranslation):
    """
    Handler for processing OpenAI audio transcription responses with guardrails.

    Only the transcribed text on the *output* side is guardrailed: the input
    to the endpoint is a binary audio file, so there is no input text to
    process and ``process_input_messages`` is a no-op.
    """

    async def process_input_messages(
        self,
        data: dict,
        guardrail_to_apply: "CustomGuardrail",
        litellm_logging_obj: Optional[Any] = None,
    ) -> Any:
        """
        No-op input hook: the request carries an audio file, not text.

        Args:
            data: Request data dictionary containing the audio file.
            guardrail_to_apply: The guardrail instance (unused).

        Returns:
            The request data, unchanged.
        """
        verbose_proxy_logger.debug(
            "OpenAI Audio Transcription: Input processing not applicable "
            "(input is audio file, not text)"
        )
        return data

    async def process_output_response(
        self,
        response: "TranscriptionResponse",
        guardrail_to_apply: "CustomGuardrail",
        litellm_logging_obj: Optional[Any] = None,
        user_api_key_dict: Optional[Any] = None,
    ) -> Any:
        """
        Apply the guardrail to the transcribed text of *response*.

        Args:
            response: Transcription response object containing transcribed text.
            guardrail_to_apply: The guardrail instance to apply.
            litellm_logging_obj: Optional logging object.
            user_api_key_dict: User API key metadata to pass to guardrails.

        Returns:
            The response, with the guardrail applied to its transcribed text.
        """
        text = getattr(response, "text", None)
        if text is None:
            verbose_proxy_logger.debug(
                "OpenAI Audio Transcription: No text in response to process"
            )
            return response

        if not isinstance(text, str):
            verbose_proxy_logger.debug(
                "OpenAI Audio Transcription: Unexpected text type: %s. Expected string.",
                type(text),
            )
            return response

        # Build the request_data payload handed to the guardrail, attaching
        # user API key metadata (with prefixed keys) when available.
        request_data: dict = {"response": response}
        user_metadata = self.transform_user_api_key_dict_to_metadata(
            user_api_key_dict
        )
        if user_metadata:
            request_data["litellm_metadata"] = user_metadata

        inputs = GenericGuardrailAPIInputs(texts=[text])
        # Forward the model name when the response exposes a truthy one.
        if getattr(response, "model", None):
            inputs["model"] = response.model

        guardrailed = await guardrail_to_apply.apply_guardrail(
            inputs=inputs,
            request_data=request_data,
            input_type="response",
            logging_obj=litellm_logging_obj,
        )
        new_texts = guardrailed.get("texts", [])
        # Fall back to the original text if the guardrail returned nothing.
        response.text = new_texts[0] if new_texts else text
        verbose_proxy_logger.debug(
            "OpenAI Audio Transcription: Applied guardrail to transcribed text. "
            "Original length: %d, New length: %d",
            len(text),
            len(response.text),
        )
        return response

View File

@@ -0,0 +1,231 @@
from typing import TYPE_CHECKING, Optional, Union, cast
import httpx
from openai import AsyncOpenAI, OpenAI
from pydantic import BaseModel
import litellm
if TYPE_CHECKING:
from aiohttp import ClientSession
from litellm.litellm_core_utils.audio_utils.utils import get_audio_file_name
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.base_llm.audio_transcription.transformation import (
BaseAudioTranscriptionConfig,
)
from litellm.types.utils import FileTypes
from litellm.utils import (
TranscriptionResponse,
convert_to_model_response_object,
extract_duration_from_srt_or_vtt,
)
from ..openai import OpenAIChatCompletion
class OpenAIAudioTranscription(OpenAIChatCompletion):
    """Handler for OpenAI's `/v1/audio/transcriptions` endpoint (sync + async)."""

    # Audio Transcriptions
    async def make_openai_audio_transcriptions_request(
        self,
        openai_aclient: AsyncOpenAI,
        data: dict,
        timeout: Union[float, httpx.Timeout],
    ):
        """
        Call `openai_aclient.audio.transcriptions.with_raw_response.create`.

        Always uses `with_raw_response` so the response headers are available
        to the caller (fixed docstring: the previous one claimed a
        `litellm.return_response_headers` branch that did not exist here).

        Returns:
            Tuple of (headers dict, parsed transcription response).
        """
        raw_response = (
            await openai_aclient.audio.transcriptions.with_raw_response.create(
                **data, timeout=timeout
            )
        )  # type: ignore
        headers = dict(raw_response.headers)
        response = raw_response.parse()
        return headers, response

    def make_sync_openai_audio_transcriptions_request(
        self,
        openai_client: OpenAI,
        data: dict,
        timeout: Union[float, httpx.Timeout],
    ):
        """
        Call the sync transcriptions API.

        Uses `with_raw_response` when `litellm.return_response_headers` is True
        (so headers can be surfaced); otherwise calls `.create` directly.

        Returns:
            Tuple of (headers dict or None, transcription response).
        """
        if litellm.return_response_headers is True:
            raw_response = (
                openai_client.audio.transcriptions.with_raw_response.create(
                    **data, timeout=timeout
                )
            )  # type: ignore
            headers = dict(raw_response.headers)
            response = raw_response.parse()
            return headers, response
        response = openai_client.audio.transcriptions.create(**data, timeout=timeout)  # type: ignore
        return None, response

    def audio_transcriptions(
        self,
        model: str,
        audio_file: FileTypes,
        optional_params: dict,
        litellm_params: dict,
        model_response: TranscriptionResponse,
        timeout: float,
        max_retries: int,
        logging_obj: LiteLLMLoggingObj,
        api_key: Optional[str],
        api_base: Optional[str],
        client=None,
        atranscription: bool = False,
        provider_config: Optional[BaseAudioTranscriptionConfig] = None,
        shared_session: Optional["ClientSession"] = None,
    ) -> TranscriptionResponse:
        """
        Handle an audio transcription request.

        Dispatches to `async_audio_transcriptions` when `atranscription=True`;
        otherwise performs the call synchronously.

        Args:
            model: Model name to send to the provider.
            audio_file: Audio file to transcribe.
            optional_params: Provider-mapped optional params.
            litellm_params: litellm-specific params.
            model_response: Response object to populate.
            timeout: Request timeout.
            max_retries: Retry budget for the client.
            logging_obj: litellm logging object for pre/post call hooks.
            api_key / api_base: Credentials and endpoint override.
            client: Optional pre-built OpenAI client.
            atranscription: When True, return the async coroutine instead.
            provider_config: Optional provider transformation config.
            shared_session: Optional shared aiohttp session (async path only).
        """
        if provider_config is not None:
            transformed_data = provider_config.transform_audio_transcription_request(
                model=model,
                audio_file=audio_file,
                optional_params=optional_params,
                litellm_params=litellm_params,
            )
            data = cast(dict, transformed_data.data)
        else:
            data = {"model": model, "file": audio_file, **optional_params}

        if atranscription is True:
            return self.async_audio_transcriptions(  # type: ignore
                audio_file=audio_file,
                data=data,
                model_response=model_response,
                timeout=timeout,
                api_key=api_key,
                api_base=api_base,
                client=client,
                max_retries=max_retries,
                logging_obj=logging_obj,
                shared_session=shared_session,
            )

        openai_client: OpenAI = self._get_openai_client(  # type: ignore
            is_async=False,
            api_key=api_key,
            api_base=api_base,
            timeout=timeout,
            max_retries=max_retries,
            client=client,
        )

        ## LOGGING
        logging_obj.pre_call(
            input=None,
            api_key=openai_client.api_key,
            additional_args={
                "api_base": openai_client._base_url._uri_reference,
                # Fix: this is the sync path, so atranscription is False
                # (was hardcoded True, making logs misleading).
                "atranscription": False,
                "complete_input_dict": data,
            },
        )

        _, response = self.make_sync_openai_audio_transcriptions_request(
            openai_client=openai_client,
            data=data,
            timeout=timeout,
        )

        if isinstance(response, BaseModel):
            stringified_response = response.model_dump()
        else:
            # Mirror the async path: extract duration from srt/vtt string
            # responses so downstream cost calculation has access to it.
            duration = extract_duration_from_srt_or_vtt(response)
            stringified_response = TranscriptionResponse(text=response).model_dump()
            stringified_response["_audio_transcription_duration"] = duration

        ## LOGGING
        logging_obj.post_call(
            input=get_audio_file_name(audio_file),
            api_key=api_key,
            additional_args={"complete_input_dict": data},
            original_response=stringified_response,
        )
        hidden_params = {"model": model, "custom_llm_provider": "openai"}
        final_response: TranscriptionResponse = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
        return final_response

    async def async_audio_transcriptions(
        self,
        audio_file: FileTypes,
        data: dict,
        model_response: TranscriptionResponse,
        timeout: float,
        logging_obj: LiteLLMLoggingObj,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        client=None,
        max_retries=None,
        shared_session: Optional["ClientSession"] = None,
    ):
        """
        Async variant of `audio_transcriptions`.

        Raises:
            Exception: Re-raises any provider/client error after logging it.
        """
        try:
            openai_aclient: AsyncOpenAI = self._get_openai_client(  # type: ignore
                is_async=True,
                api_key=api_key,
                api_base=api_base,
                timeout=timeout,
                max_retries=max_retries,
                client=client,
                shared_session=shared_session,
            )

            ## LOGGING
            logging_obj.pre_call(
                input=None,
                api_key=openai_aclient.api_key,
                additional_args={
                    "api_base": openai_aclient._base_url._uri_reference,
                    "atranscription": True,
                    "complete_input_dict": data,
                },
            )
            headers, response = await self.make_openai_audio_transcriptions_request(
                openai_aclient=openai_aclient,
                data=data,
                timeout=timeout,
            )
            logging_obj.model_call_details["response_headers"] = headers
            if isinstance(response, BaseModel):
                stringified_response = response.model_dump()
            else:
                # srt/vtt string response: recover duration for cost calculation.
                duration = extract_duration_from_srt_or_vtt(response)
                stringified_response = TranscriptionResponse(text=response).model_dump()
                stringified_response["_audio_transcription_duration"] = duration

            ## LOGGING
            logging_obj.post_call(
                input=get_audio_file_name(audio_file),
                api_key=api_key,
                additional_args={"complete_input_dict": data},
                original_response=stringified_response,
            )
            # Extract the actual model from data instead of hardcoding "whisper-1"
            actual_model = data.get("model", "whisper-1")
            hidden_params = {"model": actual_model, "custom_llm_provider": "openai"}
            return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
        except Exception as e:
            ## LOGGING
            # Fix: previously passed the *builtin* `input` function here,
            # which logged "<built-in function input>" instead of the file name.
            logging_obj.post_call(
                input=get_audio_file_name(audio_file),
                api_key=api_key,
                original_response=str(e),
            )
            raise e

View File

@@ -0,0 +1,150 @@
from typing import List, Optional, Union
from httpx import Headers, Response
from litellm.llms.base_llm.audio_transcription.transformation import (
AudioTranscriptionRequestData,
BaseAudioTranscriptionConfig,
)
from litellm.llms.base_llm.chat.transformation import BaseLLMException
from litellm.secret_managers.main import get_secret_str
from litellm.types.llms.openai import (
AllMessageValues,
OpenAIAudioTranscriptionOptionalParams,
)
from litellm.types.utils import FileTypes, TranscriptionResponse
from ..common_utils import OpenAIError
class OpenAIWhisperAudioTranscriptionConfig(BaseAudioTranscriptionConfig):
    """Audio transcription config for OpenAI's `whisper-1` model family."""

    def get_complete_url(
        self,
        api_base: Optional[str],
        api_key: Optional[str],
        model: str,
        optional_params: dict,
        litellm_params: dict,
        stream: Optional[bool] = None,
    ) -> str:
        """
        OPTIONAL

        Build the complete `/audio/transcriptions` URL for the request.

        Appends `v1/audio/transcriptions` to `api_base`, skipping the extra
        `/v1` segment when the base already ends with one. An empty/None
        `api_base` yields a bare `/v1/audio/transcriptions` path, matching
        prior behavior.
        """
        # Normalize: drop any trailing slash so the suffix joins cleanly.
        api_base = api_base.rstrip("/") if api_base else ""
        # "".endswith("/v1") is False, so no separate emptiness guard is needed
        # (removed the redundant `api_base and ...` check and dead `or ""`).
        if api_base.endswith("/v1"):
            return f"{api_base}/audio/transcriptions"
        return f"{api_base}/v1/audio/transcriptions"

    def get_supported_openai_params(
        self, model: str
    ) -> List[OpenAIAudioTranscriptionOptionalParams]:
        """Return the supported OpenAI params for the `whisper-1` models."""
        return [
            "language",
            "prompt",
            "response_format",
            "temperature",
            "timestamp_granularities",
        ]

    def map_openai_params(
        self,
        non_default_params: dict,
        optional_params: dict,
        model: str,
        drop_params: bool,
    ) -> dict:
        """
        Copy supported OpenAI params into `optional_params`; unsupported keys
        are silently dropped (regardless of `drop_params`).
        """
        supported_params = self.get_supported_openai_params(model)
        for k, v in non_default_params.items():
            if k in supported_params:
                optional_params[k] = v
        return optional_params

    def validate_environment(
        self,
        headers: dict,
        model: str,
        messages: List[AllMessageValues],
        optional_params: dict,
        litellm_params: dict,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
    ) -> dict:
        """Attach the Bearer auth header, falling back to OPENAI_API_KEY."""
        api_key = api_key or get_secret_str("OPENAI_API_KEY")
        headers.update({"Authorization": f"Bearer {api_key}"})
        return headers

    def transform_audio_transcription_request(
        self,
        model: str,
        audio_file: FileTypes,
        optional_params: dict,
        litellm_params: dict,
    ) -> AudioTranscriptionRequestData:
        """
        Build the transcription request payload.

        `response_format` is upgraded to `verbose_json` when unset or set to
        `text`/`json`, because only `verbose_json` returns `duration`, which
        is needed for cost calculation.
        """
        data = {"model": model, "file": audio_file, **optional_params}
        if "response_format" not in data or data["response_format"] in ("text", "json"):
            data["response_format"] = "verbose_json"  # ensures 'duration' is received - used for cost calculation
        return AudioTranscriptionRequestData(
            data=data,
        )

    def get_error_class(
        self, error_message: str, status_code: int, headers: Union[dict, Headers]
    ) -> BaseLLMException:
        """Wrap provider errors in an OpenAIError."""
        return OpenAIError(
            status_code=status_code,
            message=error_message,
            headers=headers,
        )

    def transform_audio_transcription_response(
        self,
        raw_response: Response,
    ) -> TranscriptionResponse:
        """
        Parse the raw HTTP response into a TranscriptionResponse.

        Raises:
            ValueError: If the body isn't JSON, or if the JSON shares no keys
                with TranscriptionResponse's fields.
        """
        try:
            raw_response_json = raw_response.json()
        except Exception as e:
            raise ValueError(
                f"Error transforming response to json: {str(e)}\nResponse: {raw_response.text}"
            )
        if any(
            key in raw_response_json
            for key in TranscriptionResponse.model_fields.keys()
        ):
            return TranscriptionResponse(**raw_response_json)
        # Fix: previously raised ValueError with a 2-tuple of args, producing
        # a confusing tuple repr; format a single readable message instead.
        raise ValueError(
            "Invalid response format. Received response does not match the "
            f"expected format. Got: {raw_response_json}"
        )
)