API Reference
Functions#
generate_chat_completion#
Generate a non-streaming chat completion.
async def generate_chat_completion( request: ChatCompletionRequest | dict ) -> ChatCompletionResponse
generate_chat_completion_stream#
Generate a streaming chat completion.
async def generate_chat_completion_stream( request: ChatCompletionRequest | dict, stream_options: StreamOptions | dict | None = None ) -> AsyncIterable[ChatCompletionChunk]
Request Types#
ChatCompletionRequest#
Main request object for chat completions. Accepts either typed objects or dictionaries.
@dataclass class ChatCompletionRequest: provider: CompletionsProvider # LLM provider (openai, anthropic, google, anthropic_vertex) api_key: str # API key for authentication model: str # Model name messages: ConversationHistory # Conversation history temperature: Optional[float] = None # Sampling temperature (0.0–1.0) tools: Optional[list[ToolDefinition]] = None # Tool definitions for function calling tool_choice: Optional[ToolChoice] = None # How model chooses tools timeout: Optional[float] = None # Request timeout in seconds max_tokens: Optional[int] = None # Maximum response tokens response_schema: Optional[Dict[str, Any]] = None # JSON Schema for structured JSON output retry: Optional[RetryConfiguration] = None # Retry configuration fallbacks: Optional[List[FallbackRequest]] = None # Fallback provider chain provider_kwargs: Optional[ProviderKwargs] = None # Provider-specific arguments keyed by provider name
FallbackRequest#
Fallback provider configuration. Unspecified fields inherit from the original request.
@dataclass class FallbackRequest: provider: Optional[CompletionsProvider] = None # Override provider api_key: Optional[str] = None # Override API key model: Optional[str] = None # Override model messages: Optional[ConversationHistory] = None # Override messages temperature: Optional[float] = None # Override temperature tools: Optional[list[ToolDefinition]] = None # Override tools tool_choice: Optional[ToolChoice] = None # Override tool choice timeout: Optional[float] = None # Override timeout max_tokens: Optional[int] = None # Override max tokens response_schema: Optional[Dict[str, Any]] = None # Override response schema retry: Optional[RetryConfiguration] = None # Override retry config provider_kwargs: Optional[ProviderKwargs] = None # Override provider kwargs
ProviderKwargs#
Provider-specific keyword arguments, keyed by provider name. Use this for provider-specific parameters that are not normalized in ChatCompletionRequest.
class ProviderKwargs(TypedDict, total=False): openai: OpenAIKwargs anthropic: AnthropicKwargs google: GoogleKwargs anthropic_vertex: AnthropicVertexKwargs alibaba: AlibabaKwargs
OpenAIKwargs#
OpenAI-specific arguments passed to the chat completions API.
class OpenAIKwargs(TypedDict, total=False): service_tier: Literal["auto", "default", "flex", "scale", "priority"] reasoning_effort: Literal["low", "medium", "high"]
See Also: https://platform.openai.com/docs/api-reference/chat/create
AnthropicKwargs#
Anthropic-specific arguments passed to the messages API.
class AnthropicKwargs(TypedDict, total=False): thinking: ThinkingConfigParam # Set {"type": "enabled", "budget_tokens": 10000}
See Also: https://docs.anthropic.com/en/api/messages
GoogleKwargs#
Google Gemini-specific arguments passed to GenerateContentConfig.
class GoogleKwargs(TypedDict, total=False): thinking_config: ThinkingConfigDict # Set {"thinking_budget": 10000, "include_thoughts": True}
See Also: https://ai.google.dev/api/generate-content
AnthropicVertexKwargs#
Anthropic Vertex-specific arguments for Anthropic models via Google Cloud.
class AnthropicVertexKwargs(TypedDict, total=False): thinking: ThinkingConfigParam project_id: str region: str service_account_credentials: Dict[str, Any]
See Also: https://docs.anthropic.com/en/api/messages
AlibabaKwargs#
Alibaba Qwen (DashScope)-specific arguments, passed via the OpenAI-compatible DashScope endpoint.
class AlibabaKwargs(TypedDict, total=False): base_url: str # Override regional endpoint (default: Singapore intl) enable_search: bool # Enable Qwen's built-in web search
Regional endpoints:
- Singapore (default): https://dashscope-intl.aliyuncs.com/compatible-mode/v1
- Virginia (US): https://dashscope-us.aliyuncs.com/compatible-mode/v1
- Beijing (CN): https://dashscope.aliyuncs.com/compatible-mode/v1
See Also: https://help.aliyun.com/zh/model-studio/getting-started/models
RetryConfiguration#
Configuration for automatic retry with exponential backoff.
@dataclass class RetryConfiguration: enabled: bool = True # Enable retry max_retries: int = 3 # Maximum retry attempts retry_delay: float = 1.0 # Initial delay in seconds backoff_multiplier: float = 2.0 # Exponential backoff multiplier
StreamOptions#
Configuration for streaming behavior, including sentence-based streaming for TTS.
@dataclass class StreamOptions: stream_sentences: bool = False # Stream by sentences instead of tokens clean_sentences: bool = True # Clean markdown for TTS min_sentence_length: int = 6 # Minimum sentence length punctuation_marks: Optional[list[str]] = None # Custom punctuation marks punctuation_language: Optional[str] = None # Language: en, zh, ko, ja, es, fr, it, de
Response Types#
ChatCompletionResponse#
Response from a chat completion request.
@dataclass class ChatCompletionResponse: message: AssistantMessage # The assistant's response finish_reason: str # Why generation stopped (stop, length, tool_calls) usage: Optional[dict[str, Any]] = None # Token usage statistics provider: Optional[CompletionsProvider] = None # Provider that generated this response model: Optional[str] = None # Model that generated this response
Message Types#
UserMessage#
A message from the user.
@dataclass class UserMessage: content: str # Message content cache_breakpoint: Optional[CacheBreakpoint] = None # Anthropic cache breakpoint
AssistantMessage#
A response message from the assistant, containing text and/or tool calls.
@dataclass class AssistantMessage: content: Optional[str] = None # Text content tool_calls: Optional[List[ToolCall]] = None # Tool calls made cache_breakpoint: Optional[CacheBreakpoint] = None # Anthropic cache breakpoint thought_signature: Optional[bytes] = None # Google Gemini thought signature
SystemMessage#
A system prompt message. Multiple system messages are collapsed into one for Anthropic and Google.
@dataclass class SystemMessage: content: str # System prompt content cache_breakpoint: Optional[CacheBreakpoint] = None # Anthropic cache breakpoint
ToolResultMessage#
The result of a tool call, passed back to the model.
@dataclass class ToolResultMessage: tool_call_id: str # ID of the tool call content: dict[str, Any] # Result JSON payload name: Optional[str] = None # Function name cache_breakpoint: Optional[CacheBreakpoint] = None # Anthropic cache breakpoint
Tool Types#
ToolDefinition#
Definition of a tool that the model can call.
@dataclass class ToolDefinition: type: Literal["function"] # Always "function" function: FunctionDefinition # Function definition cache_breakpoint: Optional[CacheBreakpoint] = None # Anthropic cache breakpoint
FunctionDefinition#
Definition of a function including its name, description, and parameter schema.
@dataclass class FunctionDefinition: name: str # Function name description: str # What the function does parameters: Dict[str, Any] # JSON schema for parameters strict: Optional[bool] = None # Strict validation (OpenAI only)
ToolCall#
A tool call made by the model, containing the function name and arguments.
@dataclass class ToolCall: id: str # Unique identifier type: Literal["function"] # Always "function" function: FunctionCall # Function call details index: Optional[int] = None # Index for ordering multiple calls thought_signature: Optional[bytes] = None # Google Gemini thought signature
FunctionCall#
The function name and arguments for a tool call.
@dataclass class FunctionCall: arguments: dict[str, Any] # Arguments passed to function name: str # Function name
Streaming Types#
ChatCompletionChunk#
Union of all streaming chunk types. Use chunk.type to determine the kind.
ChatCompletionChunk = Union[ AssistantMessageDeltaChunk, # type: "content_delta" AssistantMessageSentenceChunk, # type: "content_sentence" ToolCallChunk, # type: "tool_call" FinishReasonChunk, # type: "finish_reason" UsageChunk, # type: "usage" FinalResponseChunk # type: "response" ]
AssistantMessageDeltaChunk#
An incremental text token from the streaming response.
@dataclass class AssistantMessageDeltaChunk: content: str # Incremental text token @property def type(self) -> str: # Returns "content_delta"
AssistantMessageSentenceChunk#
A complete sentence extracted from the streaming response.
@dataclass class AssistantMessageSentenceChunk: sentence: str # Complete sentence @property def type(self) -> str: # Returns "content_sentence"
ToolCallChunk#
A complete tool call compiled from streamed parts.
@dataclass class ToolCallChunk: tool_call: ToolCall # Complete tool call @property def type(self) -> str: # Returns "tool_call"
FinalResponseChunk#
The complete response assembled from all streamed chunks.
@dataclass class FinalResponseChunk: response: ChatCompletionResponse # Complete response assembled from stream @property def type(self) -> str: # Returns "response"
Enums#
CompletionsProvider#
Supported LLM providers.
class CompletionsProvider(StrEnum): OPENAI = "openai" ANTHROPIC = "anthropic" GOOGLE = "google" ANTHROPIC_VERTEX = "anthropic_vertex" ALIBABA = "alibaba"
ToolChoice#
Strategy for how the model should choose tools.
ToolChoice = Union[Literal["none", "auto", "required"], str] # "auto" - Let the model decide # "none" - Don't allow tool calls # "required" - Require at least one tool call # "<name>" - Force a specific tool by name
Cache Types#
CacheBreakpoint#
Anthropic prompt caching breakpoint. Maximum of 4 per request.
@dataclass class CacheBreakpoint: ttl: Literal["5m", "1h"] # Cache time-to-live
Utility Functions#
serialize_conversation#
Convert a conversation history to a list of dictionaries for storage.
def serialize_conversation( conversation: ConversationHistory ) -> list[dict[str, Any]]
deserialize_conversation#
Convert a list of dictionaries back to typed message objects.
def deserialize_conversation( serialized_conversation: list[dict[str, Any]] ) -> ConversationHistory
