lijiaoqiao/llm-gateway-competitors/litellm-wheel-src/litellm/proxy/ocr_endpoints/endpoints.py

#### OCR Endpoints #####

import json
from typing import Any, Dict, Optional, cast

import orjson
from fastapi import APIRouter, Depends, Request, Response, UploadFile
from fastapi.responses import ORJSONResponse

from litellm._logging import verbose_proxy_logger
from litellm.ocr.main import convert_file_document_to_url_document, get_mime_type
from litellm.proxy._types import *
from litellm.proxy.auth.user_api_key_auth import UserAPIKeyAuth, user_api_key_auth
from litellm.proxy.common_request_processing import ProxyBaseLLMRequestProcessing

router = APIRouter()


def _build_document_from_upload(
    file_content: bytes,
    filename: Optional[str],
    content_type: Optional[str],
) -> Dict[str, str]:
    """
    Wrap uploaded bytes in a "file" document and convert it to a URL document.

    The MIME type is taken from ``content_type`` with any ``;``-separated
    parameters dropped. When that yields nothing, or only the generic
    "application/octet-stream", and a filename is available, the type is
    looked up with ``get_mime_type(filename)`` instead. If no type can be
    determined at all, "application/octet-stream" is used.

    Args:
        file_content: Raw bytes of the uploaded file.
        filename: Name of the uploaded file, if the client supplied one.
        content_type: Content type reported for the upload, if any.

    Returns:
        The result of ``convert_file_document_to_url_document`` for the
        assembled file document (a Mistral-format document dict carrying a
        base64 data URI, per this module's usage).
    """
    generic_mime = "application/octet-stream"

    # Keep only the media type itself, e.g. "application/pdf; x=y" -> "application/pdf".
    resolved: Optional[str] = None
    if content_type:
        resolved = content_type.partition(";")[0].strip()

    # A missing or generic type carries no information; prefer the filename.
    if filename and (not resolved or resolved == generic_mime):
        resolved = get_mime_type(filename)

    file_document = {
        "type": "file",
        "file": file_content,
        "mime_type": resolved if resolved else generic_mime,
    }
    return convert_file_document_to_url_document(file_document)


async def _parse_multipart_form(request: Request) -> Dict[str, Any]:
    """
    Extract OCR data from a multipart form request.

    Awaits ``request.form()`` (per this module's design this reuses a form
    already parsed by auth middleware when one is cached on the request),
    reads the uploaded 'file' field in full and converts it into a document
    via ``_build_document_from_upload``. Every other form field is copied
    into the result; string values that are valid JSON are decoded first.

    Args:
        request: The incoming request carrying multipart form data.

    Returns:
        A dict with 'document', plus 'model' and any other OCR params that
        were supplied as form fields.

    Raises:
        ValueError: If the form cannot be parsed, the 'file' field is missing
            or not file-like, or the uploaded file is empty.
    """
    try:
        form = await request.form()
    except Exception as e:
        # Chain the underlying error so the real parsing failure stays
        # visible in the traceback instead of being reported as incidental.
        raise ValueError(
            f"Failed to parse multipart form data: {str(e)}. "
            "When using curl with --form/-F, do NOT set the Content-Type header "
            "manually — curl will set it automatically with the required boundary."
        ) from e

    uploaded_file = form.get("file")
    # request.form() may return either a FastAPI or Starlette UploadFile
    # depending on middleware; check both via isinstance (FastAPI's UploadFile
    # is a subclass of Starlette's) and fall back to duck-type check.
    if uploaded_file is None or (
        not isinstance(uploaded_file, UploadFile) and not hasattr(uploaded_file, "read")
    ):
        raise ValueError(
            "Multipart OCR request must include a 'file' field with the document to process"
        )

    # Purely a typing aid: the check above only guarantees a file-like object.
    uploaded_file = cast(UploadFile, uploaded_file)

    # Seek to start in case the file was already partially read by middleware
    await uploaded_file.seek(0)
    file_content = await uploaded_file.read()
    if not file_content:
        raise ValueError("Uploaded file is empty")

    document = _build_document_from_upload(
        file_content=file_content,
        filename=uploaded_file.filename,
        content_type=uploaded_file.content_type,
    )

    data: Dict[str, Any] = {"document": document}

    for field_name, field_value in form.items():
        # 'file' is already consumed above; a client-supplied 'document'
        # field must never override the document built from the upload.
        if field_name in ("file", "document"):
            continue
        # Try to parse JSON values (e.g. pages=[0,1,2]). Note that scalar
        # literals such as "3" or "true" are decoded to int/bool as well.
        if isinstance(field_value, str):
            try:
                data[field_name] = json.loads(field_value)
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass: not JSON,
                # keep the raw string (e.g. a plain model name).
                data[field_name] = field_value
        else:
            data[field_name] = field_value

    verbose_proxy_logger.debug(
        f"OCR multipart form request parsed - model: {data.get('model')}, "
        f"document_type: {document['type']}, "
        f"filename: {uploaded_file.filename}"
    )

    return data


async def _parse_ocr_request(request: Request) -> Dict[str, Any]:
    """
    Parse an OCR request, supporting both JSON and multipart form data.

    JSON body (existing behavior):
        {
            "model": "mistral/mistral-ocr-latest",
            "document": {"type": "document_url", "document_url": "https://..."}
        }

    Multipart form data (new):
        - file: the uploaded file
        - model: model name (form field)
        - Any other OCR params as form fields (pages, include_image_base64, etc.)

    Args:
        request: The incoming request.

    Returns:
        A dict suitable for passing to the OCR processing pipeline.

    Raises:
        ValueError: If the body is empty, is not valid JSON, is valid JSON
            but not an object, or carries a document of type 'file'.
    """
    content_type = request.headers.get("content-type", "")

    if "multipart/form-data" in content_type.lower():
        return await _parse_multipart_form(request)

    # --- JSON body (existing behavior) ---
    try:
        body = await request.body()
    except RuntimeError:
        # Body stream was consumed by auth middleware (e.g., form parsing).
        body = b""

    if not body:
        # The body may be empty because the auth middleware already parsed
        # it as form data (e.g., _read_request_body called request.form()).
        # Check if form data is available.
        if getattr(request, "_form", None) is not None:
            verbose_proxy_logger.debug(
                "OCR request body is empty but form data is available from middleware — "
                "processing as multipart form."
            )
            return await _parse_multipart_form(request)

        raise ValueError(
            "Empty request body. For file uploads, use multipart/form-data content type "
            "with a file field. When using curl with --form/-F, do NOT set the Content-Type "
            "header manually."
        )

    try:
        data = orjson.loads(body)
    except orjson.JSONDecodeError as e:
        # Chain the decoder error so its position details stay in the traceback.
        raise ValueError(
            f"Invalid JSON in request body: {e}. "
            "Ensure the request body is valid JSON with Content-Type: application/json, "
            "or use multipart/form-data for file uploads."
        ) from e

    # Valid JSON is not necessarily an object (it may be a list, string,
    # number, ...). Reject those here with a clear message instead of
    # returning a non-dict in violation of this function's contract.
    if not isinstance(data, dict):
        raise ValueError(
            "Invalid OCR request body: expected a JSON object with 'model' and "
            f"'document' fields, got {type(data).__name__}."
        )

    # Security: reject type="file" documents received via JSON.
    # The "file" document type is designed for local SDK usage where the
    # caller and the process share a filesystem.  In the proxy context the
    # caller is remote, so allowing a file-path string would let an
    # authenticated user read arbitrary files from the server's filesystem.
    # File uploads must go through multipart/form-data instead.
    doc = data.get("document")
    if isinstance(doc, dict) and doc.get("type") == "file":
        raise ValueError(
            "document type 'file' is not supported through the JSON API. "
            "To upload a local file, use multipart/form-data with a 'file' field. "
            "For JSON requests, use 'document_url' or 'image_url' document types."
        )

    return data


@router.post(
    "/v1/ocr",
    dependencies=[Depends(user_api_key_auth)],
    response_class=ORJSONResponse,
    tags=["ocr"],
)
@router.post(
    "/ocr",
    dependencies=[Depends(user_api_key_auth)],
    response_class=ORJSONResponse,
    tags=["ocr"],
)
async def ocr(
    request: Request,
    fastapi_response: Response,
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
    # Raw docstring: the curl examples end lines with a backslash, which a
    # regular string literal would swallow together with the newline.
    r"""
    OCR endpoint for extracting text from documents and images.

    Supports two input modes:

    **1. JSON body** (Mistral OCR API compatible):
    ```bash
    curl -X POST "http://localhost:4000/v1/ocr" \
        -H "Authorization: Bearer sk-1234" \
        -H "Content-Type: application/json" \
        -d '{
            "model": "mistral-ocr",
            "document": {
                "type": "document_url",
                "document_url": "https://arxiv.org/pdf/2201.04234"
            }
        }'
    ```

    **2. Multipart form file upload**:
    ```bash
    curl -X POST "http://localhost:4000/v1/ocr" \
        -H "Authorization: Bearer sk-1234" \
        -F "model=mistral-ocr" \
        -F "file=@document.pdf"
    ```
    """
    # Imported at call time rather than module import time
    # (presumably to avoid a circular import with proxy_server — confirm).
    from litellm.proxy.proxy_server import (
        general_settings,
        llm_router,
        proxy_config,
        proxy_logging_obj,
        select_data_generator,
        user_api_base,
        user_max_tokens,
        user_model,
        user_request_timeout,
        user_temperature,
        version,
    )

    # Initialised before the try block so the exception handler below always
    # has a dict to work with, even when request parsing itself fails.
    data: dict = {}
    try:
        # Parse request body (JSON or multipart form)
        data = await _parse_ocr_request(request)

        # Process request using ProxyBaseLLMRequestProcessing
        processor = ProxyBaseLLMRequestProcessing(data=data)

        return await processor.base_process_llm_request(
            request=request,
            fastapi_response=fastapi_response,
            user_api_key_dict=user_api_key_dict,
            route_type="aocr",
            proxy_logging_obj=proxy_logging_obj,
            llm_router=llm_router,
            general_settings=general_settings,
            proxy_config=proxy_config,
            select_data_generator=select_data_generator,
            model=None,
            user_model=user_model,
            user_temperature=user_temperature,
            user_request_timeout=user_request_timeout,
            user_max_tokens=user_max_tokens,
            user_api_base=user_api_base,
            version=version,
        )
    except Exception as e:
        # Build a fresh processor: the one in the try block does not exist
        # if parsing failed before it was created.
        processor = ProxyBaseLLMRequestProcessing(data=data)
        # The shared handler returns the exception to raise to the client.
        raise await processor._handle_llm_api_exception(
            e=e,
            user_api_key_dict=user_api_key_dict,
            proxy_logging_obj=proxy_logging_obj,
            version=version,
        )