diff --git a/integrations/llms/gemini.mdx b/integrations/llms/gemini.mdx
index 1888c44b..624d3804 100644
--- a/integrations/llms/gemini.mdx
+++ b/integrations/llms/gemini.mdx
@@ -1003,6 +1003,188 @@ curl --location 'https://api.portkey.ai/v1/chat/completions' \
 ---
+## Explicit context caching
+
+Google Gemini supports [context caching](https://ai.google.dev/gemini-api/docs/caching) to reduce costs and latency for repeated prompts with large amounts of context. You can explicitly create a cache and then reference it in subsequent inference requests.
+
+### Step 1: Create a context cache
+
+Use the Gemini `cachedContents` endpoint through Portkey to create a cache:
+
+```sh
+curl 'https://api.portkey.ai/v1/cached_contents' \
+-H 'x-portkey-provider: google' \
+-H 'Content-Type: application/json' \
+-H 'x-portkey-api-key: {{your_api_key}}' \
+-H 'Authorization: {{your_gemini_api_key}}' \
+-H 'x-portkey-custom-host: https://generativelanguage.googleapis.com/v1beta' \
+-d '{
+  "model": "models/gemini-1.5-pro-001",
+  "displayName": "my-cache-display-name",
+  "contents": [{
+    "role": "user",
+    "parts": [{
+      "text": "This is sample text to demonstrate explicit caching. (you need a minimum of 1024 tokens)"
+    }]
+  },
+  {
+    "role": "model",
+    "parts": [{
+      "text": "Thank you, I am your helpful assistant."
+    }]
+  }]
+}'
+```
+
+```python
+import requests
+
+url = "https://api.portkey.ai/v1/cached_contents"
+headers = {
+    "x-portkey-provider": "google",
+    "Content-Type": "application/json",
+    "x-portkey-api-key": "PORTKEY_API_KEY",
+    "Authorization": "GEMINI_API_KEY",
+    "x-portkey-custom-host": "https://generativelanguage.googleapis.com/v1beta"
+}
+
+payload = {
+    "model": "models/gemini-1.5-pro-001",
+    "displayName": "my-cache-display-name",
+    "contents": [{
+        "role": "user",
+        "parts": [{
+            "text": "This is sample text to demonstrate explicit caching. (you need a minimum of 1024 tokens)"
+        }]
+    },
+    {
+        "role": "model",
+        "parts": [{
+            "text": "Thank you, I am your helpful assistant."
+        }]
+    }]
+}
+
+response = requests.post(url, headers=headers, json=payload)
+print(response.json())
+# Save the cache name from the response for use in step 2
+```
+
+```javascript
+const response = await fetch("https://api.portkey.ai/v1/cached_contents", {
+  method: "POST",
+  headers: {
+    "x-portkey-provider": "google",
+    "Content-Type": "application/json",
+    "x-portkey-api-key": "PORTKEY_API_KEY",
+    "Authorization": "GEMINI_API_KEY",
+    "x-portkey-custom-host": "https://generativelanguage.googleapis.com/v1beta"
+  },
+  body: JSON.stringify({
+    model: "models/gemini-1.5-pro-001",
+    displayName: "my-cache-display-name",
+    contents: [{
+      role: "user",
+      parts: [{
+        text: "This is sample text to demonstrate explicit caching. (you need a minimum of 1024 tokens)"
+      }]
+    },
+    {
+      role: "model",
+      parts: [{
+        text: "Thank you, I am your helpful assistant."
+      }]
+    }]
+  })
+});
+
+const data = await response.json();
+console.log(data);
+// Save the cache name from the response for use in step 2
+```
+
+Context caching requires a minimum of **1024 tokens** in the cached content. The cache has a default TTL (time-to-live) which you can configure using the `ttl` parameter.
+
+### Step 2: Use the cache in inference requests
+
+Once the cache is created, reference it in your chat completion requests using the `cached_content` parameter. Pass the cache name returned from step 1 (e.g., `cachedContents/abc123`):
+
+```sh
+curl 'https://api.portkey.ai/v1/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'x-portkey-api-key: {{your_api_key}}' \
+-H 'x-portkey-provider: google' \
+-H 'Authorization: {{your_gemini_api_key}}' \
+-d '{
+  "model": "gemini-1.5-pro-001",
+  "cached_content": "cachedContents/abc123",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Based on the context I provided earlier, answer my question."
+    }
+  ]
+}'
+```
+
+```python
+from portkey_ai import Portkey
+
+portkey = Portkey(
+    api_key="PORTKEY_API_KEY",
+)
+
+completion = portkey.chat.completions.create(
+    model="@google/gemini-1.5-pro-001",
+    cached_content="cachedContents/abc123",
+    messages=[
+        {"role": "user", "content": "Based on the context I provided earlier, answer my question."}
+    ]
+)
+
+print(completion)
+```
+
+```javascript
+import Portkey from 'portkey-ai';
+
+const portkey = new Portkey({
+  apiKey: "PORTKEY_API_KEY",
+});
+
+const completion = await portkey.chat.completions.create({
+  model: "@google/gemini-1.5-pro-001",
+  cached_content: "cachedContents/abc123",
+  messages: [
+    { role: "user", content: "Based on the context I provided earlier, answer my question." }
+  ]
+});
+
+console.log(completion);
+```
+
+The model used in the inference request **must match** the model used when creating the cache.
+
+For more details on context caching options like TTL configuration and cache management, refer to the [Google Gemini context caching documentation](https://ai.google.dev/gemini-api/docs/caching).
+
+---
+
 ## Thought Signatures (Tool Calling Verification)