Skip to main content

Prompt Caching

Supported Providers (those with examples on this page): OpenAI, Anthropic, Deepseek

haimaker follows the OpenAI prompt caching usage object format:

"usage": {
  "prompt_tokens": 2006,
  "completion_tokens": 300,
  "total_tokens": 2306,
  "prompt_tokens_details": {
    "cached_tokens": 1920
  },
  "completion_tokens_details": {
    "reasoning_tokens": 0
  },
  "cache_creation_input_tokens": 0
}
  • prompt_tokens: All prompt tokens including cache-miss and cache-hit input tokens.
  • completion_tokens: Output tokens generated by the model.
  • total_tokens: Sum of prompt_tokens + completion_tokens.
  • prompt_tokens_details.cached_tokens: Tokens that were a cache-hit for that call.
  • cache_creation_input_tokens (Anthropic only): Number of tokens that were written to the cache on this call (Anthropic charges for cache writes).

Quick Start

Note: OpenAI caching is only available for prompts containing 1024 tokens or more.

Python

from openai import OpenAI

client = OpenAI(
    api_key="YOUR_API_KEY",
    base_url="https://api.haimaker.ai/v1",
)

# Send the identical request twice: the first call gives the provider a chance
# to cache the prompt, and the second call is the one expected to hit the cache.
for _ in range(2):
    response = client.chat.completions.create(
        model="openai/gpt-4o",
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        # Repeated 400 times so the prompt is long enough to be
                        # cacheable (OpenAI requires at least 1024 tokens).
                        "text": "Here is the full text of a complex legal agreement" * 400,
                    }
                ],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What are the key terms and conditions in this agreement?",
                    }
                ],
            },
        ],
        temperature=0.2,
        max_tokens=10,
    )

# The loop has finished, so `response` is the result of the second call.
print("response=", response)
print("response.usage=", response.usage)

# On second call, cached_tokens should be > 0
assert response.usage.prompt_tokens_details.cached_tokens > 0

Anthropic Example

Anthropic charges for cache writes.

Specify the content to cache with "cache_control": {"type": "ephemeral"}.

If you pass "cache_control" to any other LLM provider, it will be ignored.

from openai import OpenAI

client = OpenAI(
    base_url="https://api.haimaker.ai/v1",
    api_key="YOUR_API_KEY",
)

# Short instruction block; it carries no cache marker.
instructions_block = {
    "type": "text",
    "text": "You are an AI assistant tasked with analyzing legal documents.",
}

# Large document block, tagged with cache_control to mark it as the content
# to cache.
agreement_block = {
    "type": "text",
    "text": "Here is the full text of a complex legal agreement" * 400,
    "cache_control": {"type": "ephemeral"},
}

conversation = [
    {"role": "system", "content": [instructions_block, agreement_block]},
    {
        "role": "user",
        "content": "what are the key terms and conditions in this agreement?",
    },
]

response = client.chat.completions.create(
    model="anthropic/claude-3-5-sonnet-20240620",
    messages=conversation,
)

print(response.usage)

Deepseek Example

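Works the same as OpenAI: no "cache_control" marker is needed. In the example below, the second request repeats the leading messages of the first, so the usage printed for the second response should show cached tokens.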

from openai import OpenAI

client = OpenAI(
    base_url="https://api.haimaker.ai/v1",
    api_key="YOUR_API_KEY",
)

# Turns common to both requests: the system prompt followed by four
# question/answer pairs.
shared_turns = [
    {
        "role": "system",
        "content": "You are a history expert. The user will provide a series of questions, and your answers should be concise and start with `Answer:`",
    },
    {
        "role": "user",
        "content": "In what year did Qin Shi Huang unify the six states?",
    },
    {"role": "assistant", "content": "Answer: 221 BC"},
    {"role": "user", "content": "Who was the founder of the Han Dynasty?"},
    {"role": "assistant", "content": "Answer: Liu Bang"},
    {"role": "user", "content": "Who was the last emperor of the Tang Dynasty?"},
    {"role": "assistant", "content": "Answer: Li Zhu"},
    {
        "role": "user",
        "content": "Who was the founding emperor of the Ming Dynasty?",
    },
    {"role": "assistant", "content": "Answer: Zhu Yuanzhang"},
]

# The two conversations differ only in their final user question.
messages_1 = [
    *shared_turns,
    {
        "role": "user",
        "content": "Who was the founding emperor of the Qing Dynasty?",
    },
]

messages_2 = [
    *shared_turns,
    {"role": "user", "content": "When did the Shang Dynasty fall?"},
]

model_name = "deepseek/deepseek-chat"
response_1 = client.chat.completions.create(model=model_name, messages=messages_1)
response_2 = client.chat.completions.create(model=model_name, messages=messages_2)

# Only the second response's usage is printed.
print(response_2.usage)