chore: initial public snapshot for github upload
@@ -0,0 +1,7 @@
"""Azure Text-to-Speech module"""

from .transformation import AzureAVATextToSpeechConfig

__all__ = [
    "AzureAVATextToSpeechConfig",
]
@@ -0,0 +1,504 @@
"""
Azure AVA (Cognitive Services) Text-to-Speech transformation

Maps the OpenAI TTS spec to the Azure Cognitive Services TTS API
"""

from typing import TYPE_CHECKING, Any, Coroutine, Dict, Optional, Tuple, Union
from urllib.parse import urlparse

import httpx

import litellm
from litellm.llms.base_llm.text_to_speech.transformation import (
    BaseTextToSpeechConfig,
    TextToSpeechRequestData,
)
from litellm.secret_managers.main import get_secret_str

if TYPE_CHECKING:
    from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
    from litellm.types.llms.openai import HttpxBinaryResponseContent
else:
    LiteLLMLoggingObj = Any
    HttpxBinaryResponseContent = Any


class AzureAVATextToSpeechConfig(BaseTextToSpeechConfig):
    """
    Configuration for Azure AVA (Cognitive Services) Text-to-Speech

    Reference: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech
    """

    # Default voice and Azure endpoint domains
    DEFAULT_VOICE = "en-US-AriaNeural"
    COGNITIVE_SERVICES_DOMAIN = "api.cognitive.microsoft.com"
    TTS_SPEECH_DOMAIN = "tts.speech.microsoft.com"
    TTS_ENDPOINT_PATH = "/cognitiveservices/v1"

    # Voice name mappings from OpenAI voices to Azure voices
    VOICE_MAPPINGS = {
        "alloy": "en-US-JennyNeural",
        "echo": "en-US-GuyNeural",
        "fable": "en-GB-RyanNeural",
        "onyx": "en-US-DavisNeural",
        "nova": "en-US-AmberNeural",
        "shimmer": "en-US-AriaNeural",
    }

    # Response format mappings from OpenAI to Azure
    FORMAT_MAPPINGS = {
        "mp3": "audio-24khz-48kbitrate-mono-mp3",
        "opus": "ogg-48khz-16bit-mono-opus",
        "aac": "audio-24khz-48kbitrate-mono-mp3",  # Azure doesn't offer AAC; fall back to MP3
        "flac": "audio-24khz-48kbitrate-mono-mp3",  # Azure doesn't offer FLAC; fall back to MP3
        "wav": "riff-24khz-16bit-mono-pcm",
        "pcm": "raw-24khz-16bit-mono-pcm",
    }

    def dispatch_text_to_speech(
        self,
        model: str,
        input: str,
        voice: Optional[Union[str, Dict]],
        optional_params: Dict,
        litellm_params_dict: Dict,
        logging_obj: "LiteLLMLoggingObj",
        timeout: Union[float, httpx.Timeout],
        extra_headers: Optional[Dict[str, Any]],
        base_llm_http_handler: Any,
        aspeech: bool,
        api_base: Optional[str],
        api_key: Optional[str],
        **kwargs: Any,
    ) -> Union[
        "HttpxBinaryResponseContent",
        Coroutine[Any, Any, "HttpxBinaryResponseContent"],
    ]:
        """
        Dispatch method to handle Azure AVA TTS requests

        This method encapsulates Azure-specific credential resolution and parameter handling

        Args:
            base_llm_http_handler: The BaseLLMHTTPHandler instance from main.py
        """
        # Resolve api_base from multiple sources
        api_base = (
            api_base
            or litellm_params_dict.get("api_base")
            or litellm.api_base
            or get_secret_str("AZURE_API_BASE")
        )

        # Resolve api_key from multiple sources (Azure-specific)
        api_key = (
            api_key
            or litellm_params_dict.get("api_key")
            or litellm.api_key
            or litellm.azure_key
            or get_secret_str("AZURE_OPENAI_API_KEY")
            or get_secret_str("AZURE_API_KEY")
        )

        # Convert voice to a string if it's a dict (for Azure AVA, voice must be a string)
        voice_str: Optional[str] = None
        if isinstance(voice, str):
            voice_str = voice
        elif isinstance(voice, dict):
            # Extract the voice name from the dict if present
            voice_str = voice.get("name") if voice else None

        litellm_params_dict.update(
            {
                "api_key": api_key,
                "api_base": api_base,
            }
        )

        # Call the text_to_speech_handler
        response = base_llm_http_handler.text_to_speech_handler(
            model=model,
            input=input,
            voice=voice_str,
            text_to_speech_provider_config=self,
            text_to_speech_optional_params=optional_params,
            custom_llm_provider="azure",
            litellm_params=litellm_params_dict,
            logging_obj=logging_obj,
            timeout=timeout,
            extra_headers=extra_headers,
            client=None,
            _is_async=aspeech,
        )

        return response

    def get_supported_openai_params(self, model: str) -> list:
        """
        Azure AVA TTS supports these OpenAI parameters

        Note: Azure also supports additional SSML-specific parameters (style,
        styledegree, role) which can be passed but are not part of the OpenAI spec
        """
        return ["voice", "response_format", "speed"]

    def _convert_speed_to_azure_rate(self, speed: float) -> str:
        """
        Convert OpenAI speed value to Azure SSML prosody rate percentage

        Args:
            speed: OpenAI speed value (0.25-4.0, default 1.0)

        Returns:
            Azure rate string with percentage (e.g., "+50%", "-50%", "+0%")

        Examples:
            speed=1.0 -> "+0%" (default)
            speed=2.0 -> "+100%"
            speed=0.5 -> "-50%"
        """
        rate_percentage = int((speed - 1.0) * 100)
        return f"{rate_percentage:+d}%"

    def _build_express_as_element(
        self,
        content: str,
        style: Optional[str] = None,
        styledegree: Optional[str] = None,
        role: Optional[str] = None,
    ) -> str:
        """
        Build mstts:express-as element with optional style, styledegree, and role attributes

        Args:
            content: The inner content to wrap
            style: Speaking style (e.g., "cheerful", "sad", "angry")
            styledegree: Style intensity (0.01 to 2)
            role: Voice role (e.g., "Girl", "Boy", "SeniorFemale", "SeniorMale")

        Returns:
            Content wrapped in mstts:express-as if any attributes provided, otherwise raw content
        """
        if not (style or styledegree or role):
            return content

        express_as_attrs = []
        if style:
            express_as_attrs.append(f"style='{style}'")
        if styledegree:
            express_as_attrs.append(f"styledegree='{styledegree}'")
        if role:
            express_as_attrs.append(f"role='{role}'")

        express_as_attrs_str = " ".join(express_as_attrs)
        return f"<mstts:express-as {express_as_attrs_str}>{content}</mstts:express-as>"

    def _get_voice_language(
        self,
        voice_name: Optional[str],
        explicit_lang: Optional[str] = None,
    ) -> Optional[str]:
        """
        Get the language for the voice element's xml:lang attribute

        Args:
            voice_name: The Azure voice name (e.g., "en-US-AriaNeural")
            explicit_lang: Explicitly provided language code (takes precedence)

        Returns:
            Language code if available (e.g., "es-ES"), or None

        Examples:
            - explicit_lang="es-ES" → "es-ES" (explicit takes precedence)
            - voice_name="en-US-AriaNeural", explicit_lang=None → None (use default from voice)
            - voice_name="en-US-AvaMultilingualNeural", explicit_lang="fr-FR" → "fr-FR"
        """
        # If an explicit language is provided, use it (for multilingual voices)
        if explicit_lang:
            return explicit_lang

        # For non-multilingual voices we don't set xml:lang on the voice element:
        # the voice name already encodes the language (e.g., en-US-AriaNeural)
        return None

    def map_openai_params(
        self,
        model: str,
        optional_params: Dict,
        voice: Optional[Union[str, Dict]] = None,
        drop_params: bool = False,
        kwargs: Dict = {},
    ) -> Tuple[Optional[str], Dict]:
        """
        Map OpenAI parameters to Azure AVA TTS parameters
        """
        mapped_params = {}

        ##########################################################
        # Map voice
        # OpenAI treats voice as a required param, so it is not in optional_params
        ##########################################################
        mapped_voice: Optional[str] = None
        if isinstance(voice, str):
            if voice in self.VOICE_MAPPINGS:
                mapped_voice = self.VOICE_MAPPINGS[voice]
            else:
                # Assume it's already an Azure voice name and use it directly
                mapped_voice = voice

        # Map response format
        if "response_format" in optional_params:
            format_name = optional_params["response_format"]
            if format_name in self.FORMAT_MAPPINGS:
                mapped_params["output_format"] = self.FORMAT_MAPPINGS[format_name]
            else:
                # Try to use it directly as an Azure format
                mapped_params["output_format"] = format_name
        else:
            # Default to MP3
            mapped_params["output_format"] = "audio-24khz-48kbitrate-mono-mp3"

        # Map speed (OpenAI: 0.25-4.0, Azure: prosody rate)
        if "speed" in optional_params:
            speed = optional_params["speed"]
            if speed is not None:
                mapped_params["rate"] = self._convert_speed_to_azure_rate(speed=speed)

        # Pass through Azure-specific SSML parameters
        if "style" in kwargs:
            mapped_params["style"] = kwargs["style"]

        if "styledegree" in kwargs:
            mapped_params["styledegree"] = kwargs["styledegree"]

        if "role" in kwargs:
            mapped_params["role"] = kwargs["role"]

        if "lang" in kwargs:
            mapped_params["lang"] = kwargs["lang"]

        return mapped_voice, mapped_params
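
    # Illustrative mapping (follows directly from the logic above):
    #   voice="alloy", optional_params={"response_format": "mp3", "speed": 1.5}
    #   -> ("en-US-JennyNeural",
    #       {"output_format": "audio-24khz-48kbitrate-mono-mp3", "rate": "+50%"})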

    def validate_environment(
        self,
        headers: dict,
        model: str,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
    ) -> dict:
        """
        Validate Azure environment and set up authentication headers
        """
        validated_headers = headers.copy()

        # Azure AVA TTS requires either:
        # 1. an Ocp-Apim-Subscription-Key header, or
        # 2. an Authorization: Bearer <token> header
        # Token-based auth is handled by the token handler, which adds the token later.
        if api_key:
            # If a subscription key is provided, use it directly
            validated_headers["Ocp-Apim-Subscription-Key"] = api_key

        # Content-Type for SSML
        validated_headers["Content-Type"] = "application/ssml+xml"

        # User-Agent
        validated_headers["User-Agent"] = "litellm"

        return validated_headers
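
    # Example (derived from the method above): passing api_key="<subscription-key>"
    # yields, in addition to any caller-supplied headers:
    #   {"Ocp-Apim-Subscription-Key": "<subscription-key>",
    #    "Content-Type": "application/ssml+xml",
    #    "User-Agent": "litellm"}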

    def get_complete_url(
        self,
        model: str,
        api_base: Optional[str],
        litellm_params: dict,
    ) -> str:
        """
        Get the complete URL for Azure AVA TTS request

        Azure TTS endpoint format:
        https://{region}.tts.speech.microsoft.com/cognitiveservices/v1
        """
        if api_base is None:
            raise ValueError(
                f"api_base is required for Azure AVA TTS. "
                f"Format: https://{{region}}.{self.COGNITIVE_SERVICES_DOMAIN} or "
                f"https://{{region}}.{self.TTS_SPEECH_DOMAIN}"
            )

        # Remove trailing slash and parse URL
        api_base = api_base.rstrip("/")
        parsed_url = urlparse(api_base)
        hostname = parsed_url.hostname or ""

        # Check if it's a Cognitive Services endpoint (convert to TTS endpoint)
        if self._is_cognitive_services_endpoint(hostname=hostname):
            region = self._extract_region_from_hostname(
                hostname=hostname, domain=self.COGNITIVE_SERVICES_DOMAIN
            )
            return self._build_tts_url(region=region)

        # Check if it's already a TTS endpoint
        if self._is_tts_endpoint(hostname=hostname):
            if not api_base.endswith(self.TTS_ENDPOINT_PATH):
                return f"{api_base}{self.TTS_ENDPOINT_PATH}"
            return api_base

        # Assume it's a custom endpoint, append the path
        return f"{api_base}{self.TTS_ENDPOINT_PATH}"

    def _is_cognitive_services_endpoint(self, hostname: str) -> bool:
        """Check if hostname is a Cognitive Services endpoint"""
        return hostname == self.COGNITIVE_SERVICES_DOMAIN or hostname.endswith(
            f".{self.COGNITIVE_SERVICES_DOMAIN}"
        )

    def _is_tts_endpoint(self, hostname: str) -> bool:
        """Check if hostname is a TTS endpoint"""
        return hostname == self.TTS_SPEECH_DOMAIN or hostname.endswith(
            f".{self.TTS_SPEECH_DOMAIN}"
        )

    def _extract_region_from_hostname(self, hostname: str, domain: str) -> str:
        """
        Extract region from hostname

        Examples:
            eastus.api.cognitive.microsoft.com -> eastus
            api.cognitive.microsoft.com -> ""
        """
        if hostname.endswith(f".{domain}"):
            return hostname[: -len(f".{domain}")]
        return ""

    def _build_tts_url(self, region: str) -> str:
        """Build the complete TTS URL with region"""
        if region:
            return f"https://{region}.{self.TTS_SPEECH_DOMAIN}{self.TTS_ENDPOINT_PATH}"
        return f"https://{self.TTS_SPEECH_DOMAIN}{self.TTS_ENDPOINT_PATH}"

    def is_ssml_input(self, input: str) -> bool:
        """
        Returns True if the input looks like SSML, False otherwise

        Per https://www.w3.org/TR/speech-synthesis/, SSML documents are rooted in a
        <speak> element, so we detect a <speak> tag anywhere in the input.
        """
        return "<speak>" in input or "<speak " in input

    def transform_text_to_speech_request(
        self,
        model: str,
        input: str,
        voice: Optional[str],
        optional_params: Dict,
        litellm_params: Dict,
        headers: dict,
    ) -> TextToSpeechRequestData:
        """
        Transform OpenAI TTS request to Azure AVA TTS SSML format

        Note: optional_params should already be mapped via map_openai_params in main.py

        Supports Azure-specific SSML features:
        - style: Speaking style (e.g., "cheerful", "sad", "angry")
        - styledegree: Style intensity (0.01 to 2)
        - role: Voice role (e.g., "Girl", "Boy", "SeniorFemale", "SeniorMale")
        - lang: Language code for multilingual voices (e.g., "es-ES", "fr-FR")

        Auto-detects SSML:
        - If input contains <speak>, it's passed through as-is without transformation

        Returns:
            TextToSpeechRequestData: Contains SSML body and Azure-specific headers
        """
        # Get voice (already mapped in main.py, or use the default)
        azure_voice = voice or self.DEFAULT_VOICE

        # Get output format (already mapped in main.py)
        output_format = optional_params.get(
            "output_format", "audio-24khz-48kbitrate-mono-mp3"
        )
        headers["X-Microsoft-OutputFormat"] = output_format

        # Auto-detect SSML: if input contains <speak>, pass it through as-is
        # (similar to Vertex AI behavior)
        if self.is_ssml_input(input=input):
            return TextToSpeechRequestData(
                ssml_body=input,
                headers=headers,
            )

        # Build SSML from plain text
        rate = optional_params.get("rate", "0%")
        style = optional_params.get("style")
        styledegree = optional_params.get("styledegree")
        role = optional_params.get("role")
        lang = optional_params.get("lang")

        # Escape XML special characters in the input text
        escaped_input = (
            input.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
            .replace("'", "&apos;")
        )

        # Determine if we need the mstts namespace (for the express-as element)
        use_mstts = style or role or styledegree

        # Build the xmlns attributes
        if use_mstts:
            xmlns = "xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts'"
        else:
            xmlns = "xmlns='http://www.w3.org/2001/10/synthesis'"

        # Build the inner content with prosody
        prosody_content = f"<prosody rate='{rate}'>{escaped_input}</prosody>"

        # Wrap in mstts:express-as if style, styledegree, or role is specified
        voice_content = self._build_express_as_element(
            content=prosody_content,
            style=style,
            styledegree=styledegree,
            role=role,
        )

        # Build the voice element with an optional xml:lang attribute
        voice_lang = self._get_voice_language(
            voice_name=azure_voice,
            explicit_lang=lang,
        )
        voice_lang_attr = f" xml:lang='{voice_lang}'" if voice_lang else ""

        ssml_body = f"""<speak version='1.0' {xmlns} xml:lang='en-US'>
    <voice name='{azure_voice}'{voice_lang_attr}>
        {voice_content}
    </voice>
</speak>"""

        return TextToSpeechRequestData(
            ssml_body=ssml_body,
            headers=headers,
        )
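
    # Illustrative output for input="Hello world", voice="en-US-JennyNeural",
    # optional_params={"rate": "+50%", "style": "cheerful"} (attributes wrapped
    # here for readability; actual whitespace follows the template above):
    #   <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis'
    #          xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'>
    #       <voice name='en-US-JennyNeural'>
    #           <mstts:express-as style='cheerful'><prosody rate='+50%'>Hello world</prosody></mstts:express-as>
    #       </voice>
    #   </speak>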

    def transform_text_to_speech_response(
        self,
        model: str,
        raw_response: httpx.Response,
        logging_obj: "LiteLLMLoggingObj",
    ) -> "HttpxBinaryResponseContent":
        """
        Transform Azure AVA TTS response to standard format
        """
        from litellm.types.llms.openai import HttpxBinaryResponseContent

        # Azure returns the audio data directly in the response body;
        # wrap it in HttpxBinaryResponseContent for a consistent return type
        return HttpxBinaryResponseContent(raw_response)
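

# Example usage (an illustrative sketch, not part of this module's public surface;
# it assumes litellm routes an "azure/"-prefixed TTS model string through this config):
#
#   import litellm
#
#   audio = litellm.speech(
#       model="azure/tts-1",
#       input="Hello from Azure TTS",
#       voice="alloy",  # mapped to en-US-JennyNeural by VOICE_MAPPINGS
#       api_base="https://eastus.api.cognitive.microsoft.com",
#       api_key="<azure-speech-key>",
#   )
#   audio.stream_to_file("hello.mp3")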