chore: initial public snapshot for github upload

This commit is contained in:
Your Name
2026-03-26 20:06:14 +08:00
commit 0e5ecd930e
3497 changed files with 1586236 additions and 0 deletions

View File

@@ -0,0 +1,12 @@
"""Azure AI OCR module."""
from .common_utils import get_azure_ai_ocr_config
from .document_intelligence.transformation import (
AzureDocumentIntelligenceOCRConfig,
)
from .transformation import AzureAIOCRConfig
__all__ = [
"AzureAIOCRConfig",
"AzureDocumentIntelligenceOCRConfig",
"get_azure_ai_ocr_config",
]

View File

@@ -0,0 +1,52 @@
"""
Common utilities for Azure AI OCR providers.
This module provides routing logic to determine which OCR configuration to use
based on the model name.
"""
from typing import TYPE_CHECKING, Optional
from litellm._logging import verbose_logger
if TYPE_CHECKING:
from litellm.llms.base_llm.ocr.transformation import BaseOCRConfig
def get_azure_ai_ocr_config(model: str) -> Optional["BaseOCRConfig"]:
"""
Determine which Azure AI OCR configuration to use based on the model name.
Azure AI supports multiple OCR services:
- Azure Document Intelligence: azure_ai/doc-intelligence/<model>
- Mistral OCR (via Azure AI): azure_ai/<model>
Args:
model: The model name (e.g., "azure_ai/doc-intelligence/prebuilt-read",
"azure_ai/pixtral-12b-2409")
Returns:
OCR configuration instance for the specified model
Examples:
>>> get_azure_ai_ocr_config("azure_ai/doc-intelligence/prebuilt-read")
<AzureDocumentIntelligenceOCRConfig object>
>>> get_azure_ai_ocr_config("azure_ai/pixtral-12b-2409")
<AzureAIOCRConfig object>
"""
from litellm.llms.azure_ai.ocr.document_intelligence.transformation import (
AzureDocumentIntelligenceOCRConfig,
)
from litellm.llms.azure_ai.ocr.transformation import AzureAIOCRConfig
# Check for Azure Document Intelligence models
if "doc-intelligence" in model or "documentintelligence" in model:
verbose_logger.debug(
f"Routing {model} to Azure Document Intelligence OCR config"
)
return AzureDocumentIntelligenceOCRConfig()
# Default to Mistral-based OCR for other azure_ai models
verbose_logger.debug(f"Routing {model} to Azure AI (Mistral) OCR config")
return AzureAIOCRConfig()

View File

@@ -0,0 +1,4 @@
"""Azure Document Intelligence OCR module."""
from .transformation import AzureDocumentIntelligenceOCRConfig
__all__ = ["AzureDocumentIntelligenceOCRConfig"]

View File

@@ -0,0 +1,698 @@
"""
Azure Document Intelligence OCR transformation implementation.
Azure Document Intelligence (formerly Form Recognizer) provides advanced document analysis capabilities.
This implementation transforms between Mistral OCR format and Azure Document Intelligence API v4.0.
Note: Azure Document Intelligence API is async - POST returns 202 Accepted with Operation-Location header.
The operation location must be polled until the analysis completes.
"""
import asyncio
import re
import time
from typing import Any, Dict, Optional
import httpx
from litellm._logging import verbose_logger
from litellm.constants import (
AZURE_DOCUMENT_INTELLIGENCE_API_VERSION,
AZURE_DOCUMENT_INTELLIGENCE_DEFAULT_DPI,
AZURE_OPERATION_POLLING_TIMEOUT,
)
from litellm.llms.base_llm.ocr.transformation import (
BaseOCRConfig,
DocumentType,
OCRPage,
OCRPageDimensions,
OCRRequestData,
OCRResponse,
OCRUsageInfo,
)
from litellm.secret_managers.main import get_secret_str
class AzureDocumentIntelligenceOCRConfig(BaseOCRConfig):
"""
Azure Document Intelligence OCR transformation configuration.
Supports Azure Document Intelligence v4.0 (2024-11-30) API.
Model route: azure_ai/doc-intelligence/<model>
Supported models:
- prebuilt-layout: Extracts text with markdown, tables, and structure (closest to Mistral OCR)
- prebuilt-read: Basic text extraction optimized for reading
- prebuilt-document: General document analysis
Reference: https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/
"""
def __init__(self) -> None:
super().__init__()
def get_supported_ocr_params(self, model: str) -> list:
"""
Get supported OCR parameters for Azure Document Intelligence.
Azure DI has minimal optional parameters compared to Mistral OCR.
Most Mistral-specific params are ignored during transformation.
"""
return []
def validate_environment(
self,
headers: Dict,
model: str,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
litellm_params: Optional[dict] = None,
**kwargs,
) -> Dict:
"""
Validate environment and return headers for Azure Document Intelligence.
Authentication uses Ocp-Apim-Subscription-Key header.
"""
# Get API key from environment if not provided
if api_key is None:
api_key = get_secret_str("AZURE_DOCUMENT_INTELLIGENCE_API_KEY")
if api_key is None:
raise ValueError(
"Missing Azure Document Intelligence API Key - Set AZURE_DOCUMENT_INTELLIGENCE_API_KEY environment variable or pass api_key parameter"
)
# Validate API base/endpoint is provided
if api_base is None:
api_base = get_secret_str("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
if api_base is None:
raise ValueError(
"Missing Azure Document Intelligence Endpoint - Set AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT environment variable or pass api_base parameter"
)
headers = {
"Ocp-Apim-Subscription-Key": api_key,
"Content-Type": "application/json",
**headers,
}
return headers
def get_complete_url(
self,
api_base: Optional[str],
model: str,
optional_params: dict,
litellm_params: Optional[dict] = None,
**kwargs,
) -> str:
"""
Get complete URL for Azure Document Intelligence endpoint.
Format: {endpoint}/documentintelligence/documentModels/{modelId}:analyze?api-version=2024-11-30
Note: API version 2024-11-30 uses /documentintelligence/ path (not /formrecognizer/)
Args:
api_base: Azure Document Intelligence endpoint (e.g., https://your-resource.cognitiveservices.azure.com)
model: Model ID (e.g., "prebuilt-layout", "prebuilt-read")
optional_params: Optional parameters
Returns: Complete URL for Azure DI analyze endpoint
"""
if api_base is None:
api_base = get_secret_str("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
if api_base is None:
raise ValueError(
"Missing Azure Document Intelligence Endpoint - Set AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT environment variable or pass api_base parameter"
)
# Ensure no trailing slash
api_base = api_base.rstrip("/")
# Extract model ID from full model path if needed
# Model can be "prebuilt-layout" or "azure_ai/doc-intelligence/prebuilt-layout"
model_id = model
if "/" in model:
# Extract the last part after the last slash
model_id = model.split("/")[-1]
# Azure Document Intelligence analyze endpoint
# Note: API version 2024-11-30+ uses /documentintelligence/ (not /formrecognizer/)
return f"{api_base}/documentintelligence/documentModels/{model_id}:analyze?api-version={AZURE_DOCUMENT_INTELLIGENCE_API_VERSION}"
def _extract_base64_from_data_uri(self, data_uri: str) -> str:
"""
Extract base64 content from a data URI.
Args:
data_uri: Data URI like "data:application/pdf;base64,..."
Returns:
Base64 string without the data URI prefix
"""
# Match pattern: data:[<mediatype>][;base64],<data>
match = re.match(r"data:([^;]+)(?:;base64)?,(.+)", data_uri)
if match:
return match.group(2)
return data_uri
def transform_ocr_request(
self,
model: str,
document: DocumentType,
optional_params: dict,
headers: dict,
**kwargs,
) -> OCRRequestData:
"""
Transform OCR request to Azure Document Intelligence format.
Mistral OCR format:
{
"document": {
"type": "document_url",
"document_url": "https://example.com/doc.pdf"
}
}
Azure DI format:
{
"urlSource": "https://example.com/doc.pdf"
}
OR
{
"base64Source": "base64_encoded_content"
}
Args:
model: Model name
document: Document dict from user (Mistral format)
optional_params: Already mapped optional parameters
headers: Request headers
Returns:
OCRRequestData with JSON data
"""
verbose_logger.debug(
f"Azure Document Intelligence transform_ocr_request - model: {model}"
)
if not isinstance(document, dict):
raise ValueError(f"Expected document dict, got {type(document)}")
# Extract document URL from Mistral format
doc_type = document.get("type")
document_url = None
if doc_type == "document_url":
document_url = document.get("document_url", "")
elif doc_type == "image_url":
document_url = document.get("image_url", "")
else:
raise ValueError(
f"Invalid document type: {doc_type}. Must be 'document_url' or 'image_url'"
)
if not document_url:
raise ValueError("Document URL is required")
# Build Azure DI request
data: Dict[str, Any] = {}
# Check if it's a data URI (base64)
if document_url.startswith("data:"):
# Extract base64 content
base64_content = self._extract_base64_from_data_uri(document_url)
data["base64Source"] = base64_content
verbose_logger.debug("Using base64Source for Azure Document Intelligence")
else:
# Regular URL
data["urlSource"] = document_url
verbose_logger.debug("Using urlSource for Azure Document Intelligence")
# Azure DI doesn't support most Mistral-specific params
# Ignore pages, include_image_base64, etc.
return OCRRequestData(data=data, files=None)
def _extract_page_markdown(self, page_data: Dict[str, Any]) -> str:
"""
Extract text from Azure DI page and format as markdown.
Azure DI provides text in 'lines' array. We concatenate them with newlines.
Args:
page_data: Azure DI page object
Returns:
Markdown-formatted text
"""
lines = page_data.get("lines", [])
if not lines:
return ""
# Extract text content from each line
text_lines = [line.get("content", "") for line in lines]
# Join with newlines to preserve structure
return "\n".join(text_lines)
def _convert_dimensions(
self, width: float, height: float, unit: str
) -> OCRPageDimensions:
"""
Convert Azure DI dimensions to pixels.
Azure DI provides dimensions in inches. We convert to pixels using configured DPI.
Args:
width: Width in specified unit
height: Height in specified unit
unit: Unit of measurement (e.g., "inch")
Returns:
OCRPageDimensions with pixel values
"""
# Convert to pixels using configured DPI
dpi = AZURE_DOCUMENT_INTELLIGENCE_DEFAULT_DPI
if unit == "inch":
width_px = int(width * dpi)
height_px = int(height * dpi)
else:
# If unit is not inches, assume it's already in pixels
width_px = int(width)
height_px = int(height)
return OCRPageDimensions(width=width_px, height=height_px, dpi=dpi)
@staticmethod
def _check_timeout(start_time: float, timeout_secs: int) -> None:
"""
Check if operation has timed out.
Args:
start_time: Start time of the operation
timeout_secs: Timeout duration in seconds
Raises:
TimeoutError: If operation has exceeded timeout
"""
if time.time() - start_time > timeout_secs:
raise TimeoutError(
f"Azure Document Intelligence operation polling timed out after {timeout_secs} seconds"
)
@staticmethod
def _get_retry_after(response: httpx.Response) -> int:
"""
Get retry-after duration from response headers.
Args:
response: HTTP response
Returns:
Retry-after duration in seconds (default: 2)
"""
retry_after = int(response.headers.get("retry-after", "2"))
verbose_logger.debug(f"Retry polling after: {retry_after} seconds")
return retry_after
@staticmethod
def _check_operation_status(response: httpx.Response) -> str:
"""
Check Azure DI operation status from response.
Args:
response: HTTP response from operation endpoint
Returns:
Operation status string
Raises:
ValueError: If operation failed or status is unknown
"""
try:
result = response.json()
status = result.get("status")
verbose_logger.debug(f"Azure DI operation status: {status}")
if status == "succeeded":
return "succeeded"
elif status == "failed":
error_msg = result.get("error", {}).get("message", "Unknown error")
raise ValueError(
f"Azure Document Intelligence analysis failed: {error_msg}"
)
elif status in ["running", "notStarted"]:
return "running"
else:
raise ValueError(f"Unknown operation status: {status}")
except Exception as e:
if "succeeded" in str(e) or "failed" in str(e):
raise
# If we can't parse JSON, something went wrong
raise ValueError(f"Failed to parse Azure DI operation response: {e}")
def _poll_operation_sync(
self,
operation_url: str,
headers: Dict[str, str],
timeout_secs: int,
) -> httpx.Response:
"""
Poll Azure Document Intelligence operation until completion (sync).
Azure DI POST returns 202 with Operation-Location header.
We need to poll that URL until status is "succeeded" or "failed".
Args:
operation_url: The Operation-Location URL to poll
headers: Request headers (including auth)
timeout_secs: Total timeout in seconds
Returns:
Final response with completed analysis
"""
from litellm.llms.custom_httpx.http_handler import _get_httpx_client
client = _get_httpx_client()
start_time = time.time()
verbose_logger.debug(f"Polling Azure DI operation: {operation_url}")
while True:
self._check_timeout(start_time=start_time, timeout_secs=timeout_secs)
# Poll the operation status
response = client.get(url=operation_url, headers=headers)
# Check operation status
status = self._check_operation_status(response=response)
if status == "succeeded":
return response
elif status == "running":
# Wait before polling again
retry_after = self._get_retry_after(response=response)
time.sleep(retry_after)
async def _poll_operation_async(
self,
operation_url: str,
headers: Dict[str, str],
timeout_secs: int,
) -> httpx.Response:
"""
Poll Azure Document Intelligence operation until completion (async).
Args:
operation_url: The Operation-Location URL to poll
headers: Request headers (including auth)
timeout_secs: Total timeout in seconds
Returns:
Final response with completed analysis
"""
import litellm
from litellm.llms.custom_httpx.http_handler import get_async_httpx_client
client = get_async_httpx_client(llm_provider=litellm.LlmProviders.AZURE_AI)
start_time = time.time()
verbose_logger.debug(f"Polling Azure DI operation (async): {operation_url}")
while True:
self._check_timeout(start_time=start_time, timeout_secs=timeout_secs)
# Poll the operation status
response = await client.get(url=operation_url, headers=headers)
# Check operation status
status = self._check_operation_status(response=response)
if status == "succeeded":
return response
elif status == "running":
# Wait before polling again
retry_after = self._get_retry_after(response=response)
await asyncio.sleep(retry_after)
def transform_ocr_response(
self,
model: str,
raw_response: httpx.Response,
logging_obj: Any,
**kwargs,
) -> OCRResponse:
"""
Transform Azure Document Intelligence response to Mistral OCR format.
Handles async operation polling: If response is 202 Accepted, polls Operation-Location
until analysis completes.
Azure DI response (after polling):
{
"status": "succeeded",
"analyzeResult": {
"content": "Full document text...",
"pages": [
{
"pageNumber": 1,
"width": 8.5,
"height": 11,
"unit": "inch",
"lines": [{"content": "text", "boundingBox": [...]}]
}
]
}
}
Mistral OCR format:
{
"pages": [
{
"index": 0,
"markdown": "extracted text",
"dimensions": {"width": 816, "height": 1056, "dpi": 96}
}
],
"model": "azure_ai/doc-intelligence/prebuilt-layout",
"usage_info": {"pages_processed": 1},
"object": "ocr"
}
Args:
model: Model name
raw_response: Raw HTTP response from Azure DI (may be 202 Accepted)
logging_obj: Logging object
Returns:
OCRResponse in Mistral format
"""
try:
# Check if we got 202 Accepted (async operation started)
if raw_response.status_code == 202:
verbose_logger.debug(
"Azure DI returned 202 Accepted, polling operation..."
)
# Get Operation-Location header
operation_url = raw_response.headers.get("Operation-Location")
if not operation_url:
raise ValueError(
"Azure Document Intelligence returned 202 but no Operation-Location header found"
)
# Get headers for polling (need auth)
poll_headers = {
"Ocp-Apim-Subscription-Key": raw_response.request.headers.get(
"Ocp-Apim-Subscription-Key", ""
)
}
# Get timeout from kwargs or use default
timeout_secs = AZURE_OPERATION_POLLING_TIMEOUT
# Poll until operation completes
raw_response = self._poll_operation_sync(
operation_url=operation_url,
headers=poll_headers,
timeout_secs=timeout_secs,
)
# Now parse the completed response
response_json = raw_response.json()
verbose_logger.debug(
f"Azure Document Intelligence response status: {response_json.get('status')}"
)
# Check if request succeeded
status = response_json.get("status")
if status != "succeeded":
raise ValueError(
f"Azure Document Intelligence analysis failed with status: {status}"
)
# Extract analyze result
analyze_result = response_json.get("analyzeResult", {})
azure_pages = analyze_result.get("pages", [])
# Transform pages to Mistral format
mistral_pages = []
for azure_page in azure_pages:
page_number = azure_page.get("pageNumber", 1)
index = page_number - 1 # Convert to 0-based index
# Extract markdown text
markdown = self._extract_page_markdown(azure_page)
# Convert dimensions
width = azure_page.get("width", 8.5)
height = azure_page.get("height", 11)
unit = azure_page.get("unit", "inch")
dimensions = self._convert_dimensions(
width=width, height=height, unit=unit
)
# Build OCR page
ocr_page = OCRPage(
index=index, markdown=markdown, dimensions=dimensions
)
mistral_pages.append(ocr_page)
# Build usage info
usage_info = OCRUsageInfo(
pages_processed=len(mistral_pages), doc_size_bytes=None
)
# Return Mistral OCR response
return OCRResponse(
pages=mistral_pages,
model=model,
usage_info=usage_info,
object="ocr",
)
except Exception as e:
verbose_logger.error(
f"Error parsing Azure Document Intelligence response: {e}"
)
raise e
async def async_transform_ocr_response(
self,
model: str,
raw_response: httpx.Response,
logging_obj: Any,
**kwargs,
) -> OCRResponse:
"""
Async transform Azure Document Intelligence response to Mistral OCR format.
Handles async operation polling: If response is 202 Accepted, polls Operation-Location
until analysis completes using async polling.
Args:
model: Model name
raw_response: Raw HTTP response from Azure DI (may be 202 Accepted)
logging_obj: Logging object
Returns:
OCRResponse in Mistral format
"""
try:
# Check if we got 202 Accepted (async operation started)
if raw_response.status_code == 202:
verbose_logger.debug(
"Azure DI returned 202 Accepted, polling operation (async)..."
)
# Get Operation-Location header
operation_url = raw_response.headers.get("Operation-Location")
if not operation_url:
raise ValueError(
"Azure Document Intelligence returned 202 but no Operation-Location header found"
)
# Get headers for polling (need auth)
poll_headers = {
"Ocp-Apim-Subscription-Key": raw_response.request.headers.get(
"Ocp-Apim-Subscription-Key", ""
)
}
# Get timeout from kwargs or use default
timeout_secs = AZURE_OPERATION_POLLING_TIMEOUT
# Poll until operation completes (async)
raw_response = await self._poll_operation_async(
operation_url=operation_url,
headers=poll_headers,
timeout_secs=timeout_secs,
)
# Now parse the completed response
response_json = raw_response.json()
verbose_logger.debug(
f"Azure Document Intelligence response status: {response_json.get('status')}"
)
# Check if request succeeded
status = response_json.get("status")
if status != "succeeded":
raise ValueError(
f"Azure Document Intelligence analysis failed with status: {status}"
)
# Extract analyze result
analyze_result = response_json.get("analyzeResult", {})
azure_pages = analyze_result.get("pages", [])
# Transform pages to Mistral format
mistral_pages = []
for azure_page in azure_pages:
page_number = azure_page.get("pageNumber", 1)
index = page_number - 1 # Convert to 0-based index
# Extract markdown text
markdown = self._extract_page_markdown(azure_page)
# Convert dimensions
width = azure_page.get("width", 8.5)
height = azure_page.get("height", 11)
unit = azure_page.get("unit", "inch")
dimensions = self._convert_dimensions(
width=width, height=height, unit=unit
)
# Build OCR page
ocr_page = OCRPage(
index=index, markdown=markdown, dimensions=dimensions
)
mistral_pages.append(ocr_page)
# Build usage info
usage_info = OCRUsageInfo(
pages_processed=len(mistral_pages), doc_size_bytes=None
)
# Return Mistral OCR response
return OCRResponse(
pages=mistral_pages,
model=model,
usage_info=usage_info,
object="ocr",
)
except Exception as e:
verbose_logger.error(
f"Error parsing Azure Document Intelligence response (async): {e}"
)
raise e

View File

@@ -0,0 +1,281 @@
"""
Azure AI OCR transformation implementation.
"""
from typing import Dict, Optional
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.prompt_templates.image_handling import (
async_convert_url_to_base64,
convert_url_to_base64,
)
from litellm.llms.base_llm.ocr.transformation import DocumentType, OCRRequestData
from litellm.llms.mistral.ocr.transformation import MistralOCRConfig
from litellm.secret_managers.main import get_secret_str
class AzureAIOCRConfig(MistralOCRConfig):
"""
Azure AI OCR transformation configuration.
Azure AI uses Mistral's OCR API but with a different endpoint format.
Inherits transformation logic from MistralOCRConfig since they use the same format.
Reference: Azure AI Foundry OCR documentation
Important: Azure AI only supports base64 data URIs (data:image/..., data:application/pdf;base64,...).
Regular URLs are not supported.
"""
def __init__(self) -> None:
super().__init__()
def validate_environment(
self,
headers: Dict,
model: str,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
litellm_params: Optional[dict] = None,
**kwargs,
) -> Dict:
"""
Validate environment and return headers for Azure AI OCR.
Azure AI uses Bearer token authentication with AZURE_AI_API_KEY.
"""
# Get API key from environment if not provided
if api_key is None:
api_key = get_secret_str("AZURE_AI_API_KEY")
if api_key is None:
raise ValueError(
"Missing Azure AI API Key - A call is being made to Azure AI but no key is set either in the environment variables or via params"
)
# Validate API base is provided
if api_base is None:
api_base = get_secret_str("AZURE_AI_API_BASE")
if api_base is None:
raise ValueError(
"Missing Azure AI API Base - Set AZURE_AI_API_BASE environment variable or pass api_base parameter"
)
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
**headers,
}
return headers
def get_complete_url(
self,
api_base: Optional[str],
model: str,
optional_params: dict,
litellm_params: Optional[dict] = None,
**kwargs,
) -> str:
"""
Get complete URL for Azure AI OCR endpoint.
Azure AI endpoint format: https://<api_base>/providers/mistral/azure/ocr
Args:
api_base: Azure AI API base URL
model: Model name (not used in URL construction)
optional_params: Optional parameters
Returns: Complete URL for Azure AI OCR endpoint
"""
if api_base is None:
raise ValueError(
"Missing Azure AI API Base - Set AZURE_AI_API_BASE environment variable or pass api_base parameter"
)
# Ensure no trailing slash
api_base = api_base.rstrip("/")
# Azure AI OCR endpoint format
return f"{api_base}/providers/mistral/azure/ocr"
def _convert_url_to_data_uri_sync(self, url: str) -> str:
"""
Synchronously convert a URL to a base64 data URI.
Azure AI OCR doesn't have internet access, so we need to fetch URLs
and convert them to base64 data URIs.
Args:
url: The URL to convert
Returns:
Base64 data URI string
"""
verbose_logger.debug(
f"Azure AI OCR: Converting URL to base64 data URI (sync): {url}"
)
# Fetch and convert to base64 data URI
# convert_url_to_base64 already returns a full data URI like "data:image/jpeg;base64,..."
data_uri = convert_url_to_base64(url=url)
verbose_logger.debug(
f"Azure AI OCR: Converted URL to data URI (length: {len(data_uri)})"
)
return data_uri
async def _convert_url_to_data_uri_async(self, url: str) -> str:
"""
Asynchronously convert a URL to a base64 data URI.
Azure AI OCR doesn't have internet access, so we need to fetch URLs
and convert them to base64 data URIs.
Args:
url: The URL to convert
Returns:
Base64 data URI string
"""
verbose_logger.debug(
f"Azure AI OCR: Converting URL to base64 data URI (async): {url}"
)
# Fetch and convert to base64 data URI asynchronously
# async_convert_url_to_base64 already returns a full data URI like "data:image/jpeg;base64,..."
data_uri = await async_convert_url_to_base64(url=url)
verbose_logger.debug(
f"Azure AI OCR: Converted URL to data URI (length: {len(data_uri)})"
)
return data_uri
def transform_ocr_request(
self,
model: str,
document: DocumentType,
optional_params: dict,
headers: dict,
**kwargs,
) -> OCRRequestData:
"""
Transform OCR request for Azure AI, converting URLs to base64 data URIs (sync).
Azure AI OCR doesn't have internet access, so we automatically fetch
any URLs and convert them to base64 data URIs synchronously.
Args:
model: Model name
document: Document dict from user
optional_params: Already mapped optional parameters
headers: Request headers
**kwargs: Additional arguments
Returns:
OCRRequestData with JSON data
"""
verbose_logger.debug(
f"Azure AI OCR transform_ocr_request (sync) - model: {model}"
)
if not isinstance(document, dict):
raise ValueError(f"Expected document dict, got {type(document)}")
# Check if we need to convert URL to base64
doc_type = document.get("type")
transformed_document = document.copy()
if doc_type == "document_url":
document_url = document.get("document_url", "")
# If it's not already a data URI, convert it
if document_url and not document_url.startswith("data:"):
verbose_logger.debug(
"Azure AI OCR: Converting document URL to base64 data URI (sync)"
)
data_uri = self._convert_url_to_data_uri_sync(url=document_url)
transformed_document["document_url"] = data_uri
elif doc_type == "image_url":
image_url = document.get("image_url", "")
# If it's not already a data URI, convert it
if image_url and not image_url.startswith("data:"):
verbose_logger.debug(
"Azure AI OCR: Converting image URL to base64 data URI (sync)"
)
data_uri = self._convert_url_to_data_uri_sync(url=image_url)
transformed_document["image_url"] = data_uri
# Call parent's transform to build the request
return super().transform_ocr_request(
model=model,
document=transformed_document,
optional_params=optional_params,
headers=headers,
**kwargs,
)
async def async_transform_ocr_request(
self,
model: str,
document: DocumentType,
optional_params: dict,
headers: dict,
**kwargs,
) -> OCRRequestData:
"""
Transform OCR request for Azure AI, converting URLs to base64 data URIs (async).
Azure AI OCR doesn't have internet access, so we automatically fetch
any URLs and convert them to base64 data URIs asynchronously.
Args:
model: Model name
document: Document dict from user
optional_params: Already mapped optional parameters
headers: Request headers
**kwargs: Additional arguments
Returns:
OCRRequestData with JSON data
"""
verbose_logger.debug(
f"Azure AI OCR async_transform_ocr_request - model: {model}"
)
if not isinstance(document, dict):
raise ValueError(f"Expected document dict, got {type(document)}")
# Check if we need to convert URL to base64
doc_type = document.get("type")
transformed_document = document.copy()
if doc_type == "document_url":
document_url = document.get("document_url", "")
# If it's not already a data URI, convert it
if document_url and not document_url.startswith("data:"):
verbose_logger.debug(
"Azure AI OCR: Converting document URL to base64 data URI (async)"
)
data_uri = await self._convert_url_to_data_uri_async(url=document_url)
transformed_document["document_url"] = data_uri
elif doc_type == "image_url":
image_url = document.get("image_url", "")
# If it's not already a data URI, convert it
if image_url and not image_url.startswith("data:"):
verbose_logger.debug(
"Azure AI OCR: Converting image URL to base64 data URI (async)"
)
data_uri = await self._convert_url_to_data_uri_async(url=image_url)
transformed_document["image_url"] = data_uri
# Call parent's transform to build the request
return super().transform_ocr_request(
model=model,
document=transformed_document,
optional_params=optional_params,
headers=headers,
**kwargs,
)