API Reference
Functions#
generate_chat_completion#
Generate a non-streaming chat completion.
async def generate_chat_completion( request: ChatCompletionRequest | dict ) -> ChatCompletionResponse
generate_chat_completion_stream#
Generate a streaming chat completion.
async def generate_chat_completion_stream( request: ChatCompletionRequest | dict, stream_options: StreamOptions | dict | None = None ) -> AsyncIterable[ChatCompletionChunk]
Request Types#
ChatCompletionRequest#
Main request object for chat completions. Accepts either typed objects or dictionaries.
@dataclass class ChatCompletionRequest: provider: CompletionsProvider # LLM provider (openai, anthropic, google, anthropic_vertex) api_key: str # API key for authentication model: str # Model name messages: ConversationHistory # Conversation history temperature: Optional[float] = None # Sampling temperature (0.0–1.0) tools: Optional[list[ToolDefinition]] = None # Tool definitions for function calling tool_choice: Optional[ToolChoice] = None # How model chooses tools timeout: Optional[float] = None # Request timeout in seconds max_tokens: Optional[int] = None # Maximum response tokens response_schema: Optional[Dict[str, Any]] = None # JSON Schema for structured JSON output retry: Optional[RetryConfiguration] = None # Retry configuration fallbacks: Optional[List[FallbackRequest]] = None # Fallback provider chain provider_kwargs: Optional[ProviderKwargs] = None # Provider-specific arguments keyed by provider name
FallbackRequest#
Fallback provider configuration. Unspecified fields inherit from the original request.
@dataclass class FallbackRequest: provider: Optional[CompletionsProvider] = None # Override provider api_key: Optional[str] = None # Override API key model: Optional[str] = None # Override model messages: Optional[ConversationHistory] = None # Override messages temperature: Optional[float] = None # Override temperature tools: Optional[list[ToolDefinition]] = None # Override tools tool_choice: Optional[ToolChoice] = None # Override tool choice timeout: Optional[float] = None # Override timeout max_tokens: Optional[int] = None # Override max tokens response_schema: Optional[Dict[str, Any]] = None # Override response schema retry: Optional[RetryConfiguration] = None # Override retry config provider_kwargs: Optional[ProviderKwargs] = None # Override provider kwargs
ProviderKwargs#
Provider-specific keyword arguments, keyed by provider name. Use this for provider-specific parameters that are not normalized in ChatCompletionRequest.
class ProviderKwargs(TypedDict, total=False): openai: OpenAIKwargs anthropic: AnthropicKwargs google: GoogleKwargs anthropic_vertex: AnthropicVertexKwargs alibaba: AlibabaKwargs
OpenAIKwargs#
OpenAI-specific arguments passed to the chat completions API.
class OpenAIKwargs(TypedDict, total=False): service_tier: Literal["auto", "default", "flex", "scale", "priority"] reasoning_effort: Literal["low", "medium", "high"]
See Also: https://platform.openai.com/docs/api-reference/chat/create
AnthropicKwargs#
Anthropic-specific arguments passed to the messages API.
class AnthropicKwargs(TypedDict, total=False): thinking: ThinkingConfigParam # Set {"type": "enabled", "budget_tokens": 10000}
See Also: https://docs.anthropic.com/en/api/messages
GoogleKwargs#
Google Gemini-specific arguments passed to GenerateContentConfig.
class GoogleKwargs(TypedDict, total=False): thinking_config: ThinkingConfigDict # Set {"thinking_budget": 10000, "include_thoughts": True}
See Also: https://ai.google.dev/api/generate-content
AnthropicVertexKwargs#
Anthropic Vertex-specific arguments for Anthropic models via Google Cloud.
class AnthropicVertexKwargs(TypedDict, total=False): thinking: ThinkingConfigParam project_id: str region: str service_account_credentials: Dict[str, Any]
See Also: https://docs.anthropic.com/en/api/messages
AlibabaKwargs#
Alibaba Qwen (DashScope)-specific arguments, passed via the OpenAI-compatible DashScope endpoint.
class AlibabaKwargs(TypedDict, total=False): base_url: str # Override regional endpoint (default: Singapore intl) enable_search: bool # Enable Qwen's built-in web search
Regional endpoints:
- Singapore (default): https://dashscope-intl.aliyuncs.com/compatible-mode/v1
- Virginia (US): https://dashscope-us.aliyuncs.com/compatible-mode/v1
- Beijing (CN): https://dashscope.aliyuncs.com/compatible-mode/v1
See Also: https://help.aliyun.com/zh/model-studio/getting-started/models
RetryConfiguration#
Configuration for automatic retry with exponential backoff.
@dataclass class RetryConfiguration: enabled: bool = True # Enable retry max_retries: int = 3 # Maximum retry attempts retry_delay: float = 1.0 # Initial delay in seconds backoff_multiplier: float = 2.0 # Exponential backoff multiplier
StreamOptions#
Configuration for streaming behavior, including sentence-based streaming for TTS.
@dataclass class StreamOptions: stream_sentences: bool = False # Stream by sentences instead of tokens clean_sentences: bool = True # Clean markdown for TTS min_sentence_length: int = 6 # Minimum sentence length punctuation_marks: Optional[list[str]] = None # Custom punctuation marks punctuation_language: Optional[str] = None # Language: en, zh, ko, ja, es, fr, it, de
Response Types#
ChatCompletionResponse#
Response from a chat completion request.
@dataclass class ChatCompletionResponse: message: AssistantMessage # The assistant's response finish_reason: str # Why generation stopped (stop, length, tool_calls) usage: Optional[dict[str, Any]] = None # Token usage statistics provider: Optional[CompletionsProvider] = None # Provider that generated this response model: Optional[str] = None # Model that generated this response
Message Types#
UserMessage#
A message from the user.
@dataclass class UserMessage: content: str # Message content cache_breakpoint: Optional[CacheBreakpoint] = None # Anthropic cache breakpoint
AssistantMessage#
A response message from the assistant, containing text and/or tool calls.
@dataclass class AssistantMessage: content: Optional[str] = None # Text content tool_calls: Optional[List[ToolCall]] = None # Tool calls made cache_breakpoint: Optional[CacheBreakpoint] = None # Anthropic cache breakpoint thought_signature: Optional[bytes] = None # Google Gemini thought signature
SystemMessage#
A system prompt message. Multiple system messages are collapsed into one for Anthropic and Google.
@dataclass class SystemMessage: content: str # System prompt content cache_breakpoint: Optional[CacheBreakpoint] = None # Anthropic cache breakpoint
ToolResultMessage#
The result of a tool call, passed back to the model.
@dataclass class ToolResultMessage: tool_call_id: str # ID of the tool call content: dict[str, Any] # Result JSON payload name: Optional[str] = None # Function name cache_breakpoint: Optional[CacheBreakpoint] = None # Anthropic cache breakpoint
Tool Types#
ToolDefinition#
Definition of a tool that the model can call.
@dataclass class ToolDefinition: type: Literal["function"] # Always "function" function: FunctionDefinition # Function definition cache_breakpoint: Optional[CacheBreakpoint] = None # Anthropic cache breakpoint
FunctionDefinition#
Definition of a function including its name, description, and parameter schema.
@dataclass class FunctionDefinition: name: str # Function name description: str # What the function does parameters: Dict[str, Any] # JSON schema for parameters strict: Optional[bool] = None # Strict validation (OpenAI only)
ToolCall#
A tool call made by the model, containing the function name and arguments.
@dataclass class ToolCall: id: str # Unique identifier type: Literal["function"] # Always "function" function: FunctionCall # Function call details index: Optional[int] = None # Index for ordering multiple calls thought_signature: Optional[bytes] = None # Google Gemini thought signature
FunctionCall#
The function name and arguments for a tool call.
@dataclass class FunctionCall: arguments: dict[str, Any] # Arguments passed to function name: str # Function name
Streaming Types#
ChatCompletionChunk#
Union of all streaming chunk types. Use chunk.type to determine the kind.
ChatCompletionChunk = Union[ AssistantMessageDeltaChunk, # type: "content_delta" AssistantMessageSentenceChunk, # type: "content_sentence" ToolCallChunk, # type: "tool_call" FinishReasonChunk, # type: "finish_reason" UsageChunk, # type: "usage" FinalResponseChunk # type: "response" ]
AssistantMessageDeltaChunk#
An incremental text token from the streaming response.
@dataclass class AssistantMessageDeltaChunk: content: str # Incremental text token @property def type(self) -> str: # Returns "content_delta"
AssistantMessageSentenceChunk#
A complete sentence extracted from the streaming response.
@dataclass class AssistantMessageSentenceChunk: sentence: str # Complete sentence @property def type(self) -> str: # Returns "content_sentence"
ToolCallChunk#
A complete tool call compiled from streamed parts.
@dataclass class ToolCallChunk: tool_call: ToolCall # Complete tool call @property def type(self) -> str: # Returns "tool_call"
FinalResponseChunk#
The complete response assembled from all streamed chunks.
@dataclass class FinalResponseChunk: response: ChatCompletionResponse # Complete response assembled from stream @property def type(self) -> str: # Returns "response"
Enums#
CompletionsProvider#
Supported LLM providers.
class CompletionsProvider(StrEnum): OPENAI = "openai" ANTHROPIC = "anthropic" GOOGLE = "google" ANTHROPIC_VERTEX = "anthropic_vertex" ALIBABA = "alibaba"
ToolChoice#
Strategy for how the model should choose tools.
ToolChoice = Union[Literal["none", "auto", "required"], str] # "auto" - Let the model decide # "none" - Don't allow tool calls # "required" - Require at least one tool call # "<name>" - Force a specific tool by name
Cache Types#
CacheBreakpoint#
Anthropic prompt caching breakpoint. Maximum of 4 per request.
@dataclass class CacheBreakpoint: ttl: Literal["5m", "1h"] # Cache time-to-live
Utility Functions#
serialize_conversation#
Convert a conversation history to a list of dictionaries for storage.
def serialize_conversation( conversation: ConversationHistory ) -> list[dict[str, Any]]
deserialize_conversation#
Convert a list of dictionaries back to typed message objects.
def deserialize_conversation( serialized_conversation: list[dict[str, Any]] ) -> ConversationHistory
