Integration & Best Practices
The most common use case for our API is augmenting a Large Language Model (LLM) with long-term memory. This guide demonstrates the best practices for injecting retrieved context into your prompts and handling our asynchronous architecture.
1. The RAG Pattern (Prompt Injection)
When a user sends a message, you should retrieve relevant memories before calling your LLM. Once the LLM generates a response, you save the interaction back to our API so the memory graph updates for the next turn.
# Python example: requests for the memory API, google-genai SDK for the LLM.
import requests
from google import genai
from google.genai import types
# NOTE: hard-coded key for illustration only — load it from an environment
# variable (e.g. os.environ["GEMINI_API_KEY"]) in real deployments.
gemini_client = genai.Client(api_key="your_gemini_api_key")
def chat_with_memory(user_message: str) -> str:
    """Answer *user_message* with Gemini, grounded in retrieved memories.

    Flow: search the memory API for relevant context, inject it into the
    system instruction, generate the reply, then persist the interaction.

    Assumes ``MEMORY_API_URL`` and ``HEADERS`` are defined at module level.

    Returns the generated reply text.
    """
    # 1. Retrieve relevant historical context. Best-effort: fall back to an
    #    empty context if the memory API is down or slow, so a memory outage
    #    never blocks the chat itself. (The original let any network/JSON
    #    error propagate and crash the whole turn.)
    past_memories = []
    try:
        search_res = requests.post(
            f"{MEMORY_API_URL}/memories/search",
            json={"query": user_message, "limit": 3},
            headers=HEADERS,
            timeout=10,  # never hang the user on a slow memory lookup
        )
        search_res.raise_for_status()
        past_memories = search_res.json().get("memories", [])
    except (requests.RequestException, ValueError):
        pass  # proceed without historical context
    context_string = "\n".join(f"- {m['summary']}" for m in past_memories)

    # 2. Set up the System Instruction
    system_instruction = f"You are a helpful AI assistant. Past context:\n{context_string}"

    # 3. Generate response
    response = gemini_client.models.generate_content(
        model="gemini-2.5-flash",
        contents=user_message,
        config=types.GenerateContentConfig(system_instruction=system_instruction),
    )

    # 4. Save interaction. NOTE: requests.post is synchronous — this call
    #    blocks until the API acknowledges, so it is NOT truly fire-and-forget.
    #    For that, run it in a background thread or task queue (see section 2).
    #    Either way, a failed save must never fail the user-facing reply.
    try:
        requests.post(
            f"{MEMORY_API_URL}/chat",
            json={"content": user_message},
            headers=HEADERS,
            timeout=10,
        )
    except requests.RequestException:
        pass  # best-effort save
    return response.text
// JavaScript example using the official Google Generative AI SDK.
import { GoogleGenerativeAI } from "@google/generative-ai";
// NOTE: hard-coded key for illustration only — load it from an environment
// variable (e.g. process.env.GEMINI_API_KEY) in real deployments.
const genAI = new GoogleGenerativeAI("your_gemini_api_key");
const model = genAI.getGenerativeModel({ model: "gemini-2.5-flash" });
/**
 * Answer `userMessage` with Gemini, grounded in memories retrieved from the
 * memory API. Assumes `MEMORY_API_URL` and `HEADERS` are in scope.
 *
 * @param {string} userMessage - The user's current chat message.
 * @returns {Promise<string>} The generated reply text.
 */
async function chatWithMemory(userMessage) {
  // 1. Retrieve relevant historical context. Default to an empty list so a
  //    failed request or a missing "memories" field can't crash the chat
  //    (the original `const { memories } = ...` threw a TypeError on
  //    `memories.map` when the field was absent).
  let memories = [];
  try {
    const searchRes = await fetch(MEMORY_API_URL + '/memories/search', {
      method: 'POST',
      headers: HEADERS,
      body: JSON.stringify({ query: userMessage, limit: 3 })
    });
    if (searchRes.ok) {
      memories = (await searchRes.json()).memories ?? [];
    }
  } catch (err) {
    // Best effort: proceed without context if the memory API is unreachable.
    console.error('memory search failed:', err);
  }
  const contextString = memories.map(m => `- ${m.summary}`).join('\n');

  // 2. Generate response with context
  const prompt = `You are a helpful AI assistant. Past context:\n${contextString}\n\nUser: ${userMessage}`;
  const result = await model.generateContent(prompt);

  // 3. Save interaction (fire & forget). The promise is deliberately not
  //    awaited, but it MUST carry a .catch — otherwise a network failure
  //    becomes an unhandled promise rejection, which terminates Node by
  //    default. A failed save must never fail the user-facing reply.
  fetch(MEMORY_API_URL + '/chat', {
    method: 'POST',
    headers: HEADERS,
    body: JSON.stringify({ content: userMessage })
  }).catch(err => console.error('memory save failed:', err));

  return result.response.text();
}
2. "Fire and Forget" Async Processing
Because our API is completely asynchronous, you do not need to make your user wait for the memory
graph to update. Generate the LLM reply, return it to the user immediately, and send the transcript to our
/chat endpoint in the background. Note that in Python, requests.post is a blocking call — to get true
fire-and-forget behavior, dispatch the save from a background thread, an asyncio task, or a task queue
(e.g. Celery) rather than calling it inline. In JavaScript, simply skip the await on the fetch call,
but always attach a .catch handler so a failed save does not become an unhandled promise rejection.
3. Handling Rate Limits
Default Limit: 60 requests per minute per user. Requests beyond this limit receive an HTTP 429
(Too Many Requests) response; retry them with exponential backoff, as the helpers below demonstrate.
import time
def make_api_call_with_retry(endpoint: str, payload: dict, max_retries: int = 3):
    """POST *payload* to *endpoint*, retrying on HTTP 429 with exponential backoff.

    Backoff is 1s, 2s, 4s, ... per attempt, or the server's ``Retry-After``
    hint when one is provided. Assumes ``HEADERS`` is defined at module level.

    Returns the decoded JSON body of the first non-429 response.

    Raises:
        RuntimeError: if every attempt was rate-limited. (The original fell
            off the loop in this case and silently returned ``None``, which
            callers would only discover later as an AttributeError.)
    """
    for attempt in range(max_retries):
        response = requests.post(endpoint, json=payload, headers=HEADERS, timeout=10)
        if response.status_code == 429:
            # Prefer the server-specified delay; otherwise back off exponentially.
            retry_after = response.headers.get("Retry-After")
            time.sleep(float(retry_after) if retry_after else 2 ** attempt)
            continue
        return response.json()
    raise RuntimeError(f"Rate limited after {max_retries} attempts: {endpoint}")
/**
 * POST `payload` to `endpoint`, retrying on HTTP 429 with exponential
 * backoff (1s, 2s, 4s, ...). Assumes `HEADERS` is in scope.
 *
 * @param {string} endpoint - Full URL to POST to.
 * @param {object} payload - JSON-serializable request body.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @returns {Promise<any>} The decoded JSON body of the first non-429 response.
 * @throws {Error} When every attempt was rate-limited. (The original fell
 *   off the loop and silently resolved to `undefined` in this case.)
 */
async function makeApiCallWithRetry(endpoint, payload, maxRetries = 3) {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: HEADERS,
      body: JSON.stringify(payload)
    });
    if (response.status === 429) {
      // Back off before the next attempt: 2^attempt seconds.
      await new Promise(resolve => setTimeout(resolve, Math.pow(2, attempt) * 1000));
      continue;
    }
    return response.json();
  }
  throw new Error(`Rate limited after ${maxRetries} attempts: ${endpoint}`);
}