haimaker - Getting Started
Call 100+ LLMs using the OpenAI Input/Output Format
- Translate inputs to provider's
completion,embedding, andimage_generationendpoints - Consistent output, text responses will always be available at
['choices'][0]['message']['content'] - Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - Router
- Track spend & set budgets per project haimaker Proxy Server
How to use haimaker
haimaker python SDK - Python Client to call 100+ LLMs, load balance, cost tracking
tip
Use haimaker Python SDK if you want to use haimaker in your python code
Typically used by developers building llm projects
- haimaker SDK gives you a unified interface to access multiple LLMs (100+ LLMs)
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - Router
haimaker Python SDK
Basic usage
# currently haimaker uses litellm's client library
pip install litellm
- OpenAI
- Anthropic
- xAI
- VertexAI
- NVIDIA
- HuggingFace
- Azure OpenAI
- Ollama
- Openrouter
- Novita AI
from litellm import completion
import os
## set ENV variables
os.environ["OPENAI_API_KEY"] = "your-api-key"
response = completion(
model="openai/gpt-4o",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
from litellm import completion
import os
## set ENV variables
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
response = completion(
model="anthropic/claude-3-sonnet-20240229",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
from litellm import completion
import os
## set ENV variables
os.environ["XAI_API_KEY"] = "your-api-key"
response = completion(
model="xai/grok-2-latest",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
from litellm import completion
import os
# auth: run 'gcloud auth application-default'
os.environ["VERTEXAI_PROJECT"] = "hardy-device-386718"
os.environ["VERTEXAI_LOCATION"] = "us-central1"
response = completion(
model="vertex_ai/gemini-1.5-pro",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
from litellm import completion
import os
## set ENV variables
os.environ["NVIDIA_NIM_API_KEY"] = "nvidia_api_key"
os.environ["NVIDIA_NIM_API_BASE"] = "nvidia_nim_endpoint_url"
response = completion(
model="nvidia_nim/<model_name>",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
from litellm import completion
import os
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
# e.g. Call 'WizardLM/WizardCoder-Python-34B-V1.0' hosted on HF Inference endpoints
response = completion(
model="huggingface/WizardLM/WizardCoder-Python-34B-V1.0",
messages=[{ "content": "Hello, how are you?","role": "user"}],
api_base="https://my-endpoint.huggingface.cloud"
)
print(response)
from litellm import completion
import os
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
# azure call
response = completion(
"azure/<your_deployment_name>",
messages = [{ "content": "Hello, how are you?","role": "user"}]
)
from litellm import completion
response = completion(
model="ollama/llama2",
messages = [{ "content": "Hello, how are you?","role": "user"}],
api_base="http://localhost:11434"
)
from litellm import completion
import os
## set ENV variables
os.environ["OPENROUTER_API_KEY"] = "openrouter_api_key"
response = completion(
model="openrouter/google/palm-2-chat-bison",
messages = [{ "content": "Hello, how are you?","role": "user"}],
)
from litellm import completion
import os
## set ENV variables. Visit https://novita.ai/settings/key-management to get your API key
os.environ["NOVITA_API_KEY"] = "novita-api-key"
response = completion(
model="novita/deepseek/deepseek-r1",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
Response Format (OpenAI Format)
{
"id": "chatcmpl-565d891b-a42e-4c39-8d14-82a1f5208885",
"created": 1734366691,
"model": "claude-3-sonnet-20240229",
"object": "chat.completion",
"system_fingerprint": null,
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hello! As an AI language model, I don't have feelings, but I'm operating properly and ready to assist you with any questions or tasks you may have. How can I help you today?",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"usage": {
"completion_tokens": 43,
"prompt_tokens": 13,
"total_tokens": 56,
"completion_tokens_details": null,
"prompt_tokens_details": {
"audio_tokens": null,
"cached_tokens": 0
},
"cache_creation_input_tokens": 0,
"cache_read_input_tokens": 0
}
}
Streaming
Set stream=True in the completion args.
- OpenAI
- Anthropic
- xAI
- VertexAI
- NVIDIA
- HuggingFace
- Azure OpenAI
- Ollama
- Openrouter
- Novita AI
from litellm import completion
import os
## set ENV variables
os.environ["OPENAI_API_KEY"] = "your-api-key"
response = completion(
model="openai/gpt-4o",
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
from litellm import completion
import os
## set ENV variables
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
response = completion(
model="anthropic/claude-3-sonnet-20240229",
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
from litellm import completion
import os
## set ENV variables
os.environ["XAI_API_KEY"] = "your-api-key"
response = completion(
model="xai/grok-2-latest",
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
from litellm import completion
import os
# auth: run 'gcloud auth application-default'
os.environ["VERTEX_PROJECT"] = "hardy-device-386718"
os.environ["VERTEX_LOCATION"] = "us-central1"
response = completion(
model="vertex_ai/gemini-1.5-pro",
messages=[{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
from litellm import completion
import os
## set ENV variables
os.environ["NVIDIA_NIM_API_KEY"] = "nvidia_api_key"
os.environ["NVIDIA_NIM_API_BASE"] = "nvidia_nim_endpoint_url"
response = completion(
model="nvidia_nim/<model_name>",
messages=[{ "content": "Hello, how are you?","role": "user"}]
stream=True,
)
from litellm import completion
import os
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
# e.g. Call 'WizardLM/WizardCoder-Python-34B-V1.0' hosted on HF Inference endpoints
response = completion(
model="huggingface/WizardLM/WizardCoder-Python-34B-V1.0",
messages=[{ "content": "Hello, how are you?","role": "user"}],
api_base="https://my-endpoint.huggingface.cloud",
stream=True,
)
print(response)
from litellm import completion
import os
## set ENV variables
os.environ["AZURE_API_KEY"] = ""
os.environ["AZURE_API_BASE"] = ""
os.environ["AZURE_API_VERSION"] = ""
# azure call
response = completion(
"azure/<your_deployment_name>",
messages = [{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
from litellm import completion
response = completion(
model="ollama/llama2",
messages = [{ "content": "Hello, how are you?","role": "user"}],
api_base="http://localhost:11434",
stream=True,
)
from litellm import completion
import os
## set ENV variables
os.environ["OPENROUTER_API_KEY"] = "openrouter_api_key"
response = completion(
model="openrouter/google/palm-2-chat-bison",
messages = [{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
from litellm import completion
import os
## set ENV variables. Visit https://novita.ai/settings/key-management to get your API key
os.environ["NOVITA_API_KEY"] = "novita_api_key"
response = completion(
model="novita/deepseek/deepseek-r1",
messages = [{ "content": "Hello, how are you?","role": "user"}],
stream=True,
)
Streaming Response Format (OpenAI Format)
{
"id": "chatcmpl-2be06597-eb60-4c70-9ec5-8cd2ab1b4697",
"created": 1734366925,
"model": "claude-3-sonnet-20240229",
"object": "chat.completion.chunk",
"system_fingerprint": null,
"choices": [
{
"finish_reason": null,
"index": 0,
"delta": {
"content": "Hello",
"role": "assistant",
"function_call": null,
"tool_calls": null,
"audio": null
},
"logprobs": null
}
]
}
Exception handling
haimaker maps exceptions across all supported providers to the OpenAI exceptions. All our exceptions inherit from OpenAI's exception types, so any error-handling you have for that, should work out of the box with haimaker.
from openai.error import OpenAIError
from litellm import completion
os.environ["ANTHROPIC_API_KEY"] = "bad-key"
try:
# some code
completion(model="claude-instant-1", messages=[{"role": "user", "content": "Hey, how's it going?"}])
except OpenAIError as e:
print(e)
Logging Observability - Log LLM Input/Output (Docs)
haimaker exposes pre defined callbacks to send data to Lunary, MLflow, Langfuse, Helicone, Promptlayer, Traceloop, Slack
from litellm import completion
## set env variables for logging tools (API key set up is not required when using MLflow)
os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" # get your public key at https://app.lunary.ai/settings
os.environ["HELICONE_API_KEY"] = "your-helicone-key"
os.environ["LANGFUSE_PUBLIC_KEY"] = ""
os.environ["LANGFUSE_SECRET_KEY"] = ""
os.environ["OPENAI_API_KEY"]
# set callbacks
litellm.success_callback = ["lunary", "mlflow", "langfuse", "helicone"] # log input/output to lunary, mlflow, langfuse, helicone
#openai call
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
Track Costs, Usage, Latency for streaming
Use a callback function for this - more info on custom callbacks: https://docs.haimaker.ai/docs/observability/custom_callback
import litellm
# track_cost_callback
def track_cost_callback(
kwargs, # kwargs to completion
completion_response, # response from completion
start_time, end_time # start/end time
):
try:
response_cost = kwargs.get("response_cost", 0)
print("streaming response_cost", response_cost)
except:
pass
# set callback
litellm.success_callback = [track_cost_callback] # set custom callback function
# litellm.completion() call
response = completion(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": "Hi 👋 - i'm openai"
}
],
stream=True
)