本章我们将实现智能缓存策略、Token 优化和流式处理,显著降低 API 成本并提升响应速度。
目标
- 实现多级缓存系统
- 优化 Token 使用
- 流式处理优化
- 成本监控与分析
缓存架构
┌─────────────────────────────────────────────────────────────┐
│                     Cache Architecture                      │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  User Query ──→ L1 (Memory) ──→ L2 (Disk) ──→ LLM API       │
│                      ↓              ↓                       │
│                 ┌──────────┐   ┌──────────┐                 │
│                 │ Semantic │   │   File   │                 │
│                 │  Match   │   │  Cache   │                 │
│                 └──────────┘   └──────────┘                 │
│                                                             │
└─────────────────────────────────────────────────────────────┘
缓存接口
// src/cache/types.ts
/**
 * A cached value together with the bookkeeping metadata the caches keep
 * beside it (timestamps, access counters and tags).
 */
export interface CacheEntry<T> {
// Key the entry was stored under.
key: string;
// The cached payload.
value: T;
// Creation time; the caches in this chapter fill it from Date.now().
createdAt: number;
// Time after which the entry is treated as expired (creation time + TTL).
expiresAt: number;
// Number of successful reads recorded for this entry; starts at 0.
accessCount: number;
// Time of the most recent access; used to pick the eviction victim in MemoryCache.
lastAccessed: number;
// Labels matched by invalidateTags() to drop groups of entries at once.
tags: string[];
}
/** Per-call options accepted by `Cache.set`. */
export interface CacheOptions {
ttl?: number; // Time to live in ms; each cache applies its own default when omitted
// Tags to attach to the entry so it can be removed via invalidateTags().
tags?: string[];
// NOTE(review): not read by any cache implementation shown in this chapter.
priority?: number;
}
/** Snapshot of a cache's counters as returned by `Cache.getStats()`. */
export interface CacheStats {
// Number of lookups that returned a value.
hits: number;
// Number of lookups that found nothing (or only an expired entry).
misses: number;
// hits / (hits + misses); MemoryCache reports 0 when there have been no lookups.
hitRate: number;
// Number of entries currently stored.
size: number;
// Estimated memory footprint; MemoryCache derives a rough figure from the
// JSON-serialized entry length, other caches may report 0.
memoryUsage: number;
}
/**
 * Contract shared by every cache level. All data operations are asynchronous
 * so that implementations may perform I/O (e.g. an embedding request).
 */
export interface Cache<T> {
// Identifier of the cache level; CacheManager uses it to order levels.
name: string;
// Resolves to the cached value, or undefined on a miss.
get(key: string): Promise<T | undefined>;
// Stores a value under the key, optionally with a TTL and tags.
set(key: string, value: T, options?: CacheOptions): Promise<void>;
// Removes the entry stored under the key, if any.
delete(key: string): Promise<void>;
// Removes every entry.
clear(): Promise<void>;
// Resolves to true when a usable entry exists for the key.
has(key: string): Promise<boolean>;
// Returns the current counters for this cache.
getStats(): CacheStats;
// Removes every entry carrying at least one of the tags; resolves to the number removed.
invalidateTags(tags: string[]): Promise<number>;
}
内存缓存
// src/cache/MemoryCache.ts
import { Cache, CacheEntry, CacheOptions, CacheStats } from './types.js';
import { EventEmitter } from 'events';
/** Construction options for `MemoryCache`; every field is optional. */
interface MemoryCacheOptions {
maxSize?: number; // Maximum number of entries (MemoryCache defaults to 1000)
maxMemoryMB?: number; // Maximum memory footprint in MB (defaults to 100; NOTE(review): never read by MemoryCache, so not enforced)
defaultTTL?: number; // Default TTL in ms used when set() is called without a ttl (defaults to 5 minutes)
}
/**
 * In-process cache backed by a `Map`, with a per-entry TTL, eviction of the
 * least recently accessed entry once `maxSize` entries are stored, and
 * tag-based invalidation.
 *
 * Emits `'hit'`, `'set'` and `'delete'` (each with the key) and `'clear'`.
 * A background timer sweeps expired entries every 60 seconds; call
 * `destroy()` to stop it.
 *
 * Note: `maxMemoryMB` is stored but not enforced.
 */
export class MemoryCache<T> extends EventEmitter implements Cache<T> {
  public name = 'memory';
  private store = new Map<string, CacheEntry<T>>();
  private options: Required<MemoryCacheOptions>;
  private stats = { hits: 0, misses: 0 };
  private cleanupInterval: NodeJS.Timeout;

  /**
   * @param options Size/TTL limits; any omitted (or `undefined`) field falls
   *   back to its default (1000 entries, 100 MB, 5 minutes).
   */
  constructor(options: MemoryCacheOptions = {}) {
    super();
    // Resolve each field with `??` so an explicit `undefined` cannot clobber a default.
    this.options = {
      maxSize: options.maxSize ?? 1000,
      maxMemoryMB: options.maxMemoryMB ?? 100,
      defaultTTL: options.defaultTTL ?? 5 * 60 * 1000,
    };
    // Periodically sweep expired entries.
    this.cleanupInterval = setInterval(() => this.cleanup(), 60000);
    // A forgotten cache must not keep the Node.js process alive.
    this.cleanupInterval.unref();
  }

  /**
   * Looks up a key, counting a hit or a miss.
   * @returns The stored value, or `undefined` when the key is absent or expired.
   */
  async get(key: string): Promise<T | undefined> {
    const entry = this.store.get(key);
    if (!entry) {
      this.stats.misses++;
      return undefined;
    }
    const now = Date.now();
    // Expired entries are dropped lazily and counted as misses.
    if (entry.expiresAt < now) {
      this.store.delete(key);
      this.stats.misses++;
      return undefined;
    }
    entry.accessCount++;
    entry.lastAccessed = now;
    this.stats.hits++;
    this.emit('hit', key);
    return entry.value;
  }

  /**
   * Stores a value, evicting the least recently accessed entry when a new
   * key would exceed `maxSize`.
   * @param options `ttl` (ms, defaults to `defaultTTL`) and `tags`.
   */
  async set(key: string, value: T, options: CacheOptions = {}): Promise<void> {
    // Overwriting an existing key does not grow the store, so it must not
    // push an unrelated entry out.
    if (!this.store.has(key) && this.store.size >= this.options.maxSize) {
      this.evictLRU();
    }
    const now = Date.now();
    const ttl = options.ttl ?? this.options.defaultTTL;
    const entry: CacheEntry<T> = {
      key,
      value,
      createdAt: now,
      expiresAt: now + ttl,
      accessCount: 0,
      lastAccessed: now,
      tags: options.tags ?? [],
    };
    this.store.set(key, entry);
    this.emit('set', key);
  }

  /** Removes the key (if present) and emits `'delete'`. */
  async delete(key: string): Promise<void> {
    this.store.delete(key);
    this.emit('delete', key);
  }

  /** Removes every entry and emits `'clear'`. */
  async clear(): Promise<void> {
    this.store.clear();
    this.emit('clear');
  }

  /**
   * Reports whether a non-expired entry exists. Does not touch the hit/miss
   * counters or the entry's access metadata.
   */
  async has(key: string): Promise<boolean> {
    const entry = this.store.get(key);
    if (!entry) return false;
    if (entry.expiresAt < Date.now()) {
      this.store.delete(key);
      return false;
    }
    return true;
  }

  /** Returns hit/miss counters, entry count and a rough memory estimate. */
  getStats(): CacheStats {
    const total = this.stats.hits + this.stats.misses;
    return {
      hits: this.stats.hits,
      misses: this.stats.misses,
      hitRate: total > 0 ? this.stats.hits / total : 0,
      size: this.store.size,
      memoryUsage: this.estimateMemoryUsage(),
    };
  }

  /**
   * Removes every entry carrying at least one of the given tags.
   * @returns Number of entries removed.
   */
  async invalidateTags(tags: string[]): Promise<number> {
    let count = 0;
    const tagSet = new Set(tags);
    // Deleting from a Map while iterating it is well-defined.
    for (const [key, entry] of this.store) {
      if (entry.tags.some(t => tagSet.has(t))) {
        this.store.delete(key);
        count++;
      }
    }
    return count;
  }

  /** Drops every entry whose expiry time has passed. */
  private cleanup(): void {
    const now = Date.now();
    for (const [key, entry] of this.store) {
      if (entry.expiresAt < now) {
        this.store.delete(key);
      }
    }
  }

  /** Removes the entry with the smallest `lastAccessed` timestamp. */
  private evictLRU(): void {
    let oldestKey: string | null = null;
    let oldestTime = Infinity;
    for (const [key, entry] of this.store) {
      if (entry.lastAccessed < oldestTime) {
        oldestTime = entry.lastAccessed;
        oldestKey = key;
      }
    }
    // Compare against null explicitly: the empty string is a valid key.
    if (oldestKey !== null) {
      this.store.delete(oldestKey);
    }
  }

  /**
   * Rough estimate based on the JSON length of each entry, counted as two
   * bytes per character. Entries that cannot be serialized (circular
   * references, BigInt, ...) contribute nothing instead of throwing.
   */
  private estimateMemoryUsage(): number {
    let total = 0;
    for (const entry of this.store.values()) {
      try {
        total += (JSON.stringify(entry)?.length ?? 0) * 2;
      } catch {
        // Unserializable value: leave it out of the estimate.
      }
    }
    return total;
  }

  /** Stops the cleanup timer and drops all entries. */
  destroy(): void {
    clearInterval(this.cleanupInterval);
    this.store.clear();
  }
}
语义缓存
// src/cache/SemanticCache.ts
import { Cache, CacheEntry, CacheOptions } from './types.js';
import { LLMClient } from '../llm/Client.js';
/** Configuration for `SemanticCache`. */
interface SemanticCacheOptions {
similarityThreshold: number; // Similarity threshold: a cached entry matches only when its cosine similarity to the query is strictly greater than this
// Model name passed as `model` to the LLM client's embeddings() call.
embeddingModel: string;
}
/**
 * Cache that matches keys by meaning rather than by exact text: every key is
 * turned into an embedding via the LLM client, and a lookup returns the
 * stored value whose embedding has the highest cosine similarity to the
 * query, provided it is strictly above `similarityThreshold`.
 *
 * Every `get`, `has` and `set` issues one embedding request.
 */
export class SemanticCache<T> implements Cache<T> {
  /** TTL in ms applied when `set` is called without `options.ttl`. */
  private static readonly DEFAULT_TTL_MS = 300000;

  public name = 'semantic';
  private entries: Array<{
    embedding: number[];
    entry: CacheEntry<T>;
  }> = [];
  private options: SemanticCacheOptions;
  private llm: LLMClient;
  private stats = { hits: 0, misses: 0 };

  /**
   * @param llm Client used to generate embeddings.
   * @param options Similarity threshold and embedding model name.
   */
  constructor(llm: LLMClient, options: SemanticCacheOptions) {
    this.llm = llm;
    this.options = options;
  }

  /**
   * Returns the value of the most similar non-expired entry, or `undefined`
   * when nothing is similar enough. Counts a hit or a miss.
   */
  async get(key: string): Promise<T | undefined> {
    const match = await this.findBestMatch(key);
    if (match === undefined) {
      this.stats.misses++;
      return undefined;
    }
    match.accessCount++;
    match.lastAccessed = Date.now();
    this.stats.hits++;
    return match.value;
  }

  /**
   * Stores a value under the key's embedding. An existing entry with the
   * same key is replaced so that stale values cannot shadow the new one.
   * @param options `ttl` (ms, defaults to 5 minutes) and `tags`.
   */
  async set(key: string, value: T, options: CacheOptions = {}): Promise<void> {
    const embedding = await this.generateEmbedding(key);
    const now = Date.now();
    const entry: CacheEntry<T> = {
      key,
      value,
      createdAt: now,
      // `??` rather than `||` so an explicit ttl of 0 is honoured.
      expiresAt: now + (options.ttl ?? SemanticCache.DEFAULT_TTL_MS),
      accessCount: 0,
      lastAccessed: now,
      tags: options.tags ?? [],
    };
    this.removeEntry(key);
    this.entries.push({ embedding, entry });
  }

  /** Removes the entry stored under exactly this key. */
  async delete(key: string): Promise<void> {
    this.removeEntry(key);
  }

  /** Removes every entry. */
  async clear(): Promise<void> {
    this.entries = [];
  }

  /**
   * Reports whether a lookup for the key would return a value. Does not
   * affect the hit/miss counters.
   */
  async has(key: string): Promise<boolean> {
    const match = await this.findBestMatch(key);
    return match !== undefined && match.value !== undefined;
  }

  /** Returns hit/miss counters and entry count; memory usage is not estimated (always 0). */
  getStats() {
    const total = this.stats.hits + this.stats.misses;
    return {
      hits: this.stats.hits,
      misses: this.stats.misses,
      hitRate: total > 0 ? this.stats.hits / total : 0,
      size: this.entries.length,
      memoryUsage: 0,
    };
  }

  /**
   * Removes every entry carrying at least one of the given tags.
   * @returns Number of entries removed.
   */
  async invalidateTags(tags: string[]): Promise<number> {
    const tagSet = new Set(tags);
    const before = this.entries.length;
    this.entries = this.entries.filter(
      item => !item.entry.tags.some(t => tagSet.has(t)),
    );
    return before - this.entries.length;
  }

  /**
   * Purges expired entries, then returns the remaining entry most similar to
   * the key, if its similarity is strictly above the threshold.
   */
  private async findBestMatch(key: string): Promise<CacheEntry<T> | undefined> {
    const queryEmbedding = await this.generateEmbedding(key);
    const now = Date.now();
    // Drop expired entries first so that an expired near-duplicate cannot
    // hide a still-valid match.
    this.entries = this.entries.filter(item => item.entry.expiresAt >= now);

    let best: CacheEntry<T> | undefined;
    let bestSimilarity = this.options.similarityThreshold;
    for (const item of this.entries) {
      const similarity = this.cosineSimilarity(queryEmbedding, item.embedding);
      if (similarity > bestSimilarity) {
        best = item.entry;
        bestSimilarity = similarity;
      }
    }
    return best;
  }

  /** Requests an embedding for the text from the LLM client. */
  private async generateEmbedding(text: string): Promise<number[]> {
    const response = await this.llm.embeddings({
      model: this.options.embeddingModel,
      input: text,
    });
    return response.embedding;
  }

  /**
   * Cosine similarity of two vectors. Returns 0 (never NaN) when the vectors
   * are empty, differ in length, or either has zero magnitude.
   */
  private cosineSimilarity(a: number[], b: number[]): number {
    if (a.length === 0 || a.length !== b.length) return 0;
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      const x = a[i] ?? 0;
      const y = b[i] ?? 0;
      dotProduct += x * y;
      normA += x * x;
      normB += y * y;
    }
    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    return denominator === 0 ? 0 : dotProduct / denominator;
  }

  /** Drops every entry whose key equals `key` exactly. */
  private removeEntry(key: string): void {
    this.entries = this.entries.filter(e => e.entry.key !== key);
  }
}
多级缓存管理器
// src/cache/CacheManager.ts
import { Cache, CacheOptions } from './types.js';
/**
 * Chains several `Cache` levels. Reads walk the levels in priority order and
 * copy a hit back into the faster levels; writes go to every level.
 */
export class CacheManager<T> {
  /** Priority given to caches whose name is not listed in `PRIORITIES`. */
  private static readonly DEFAULT_PRIORITY = 99;

  /**
   * Lookup order by cache name (lower is consulted first). A `Map` is used
   * instead of an object literal so that names such as `toString` or
   * `constructor` cannot resolve to inherited `Object.prototype` members.
   */
  private static readonly PRIORITIES: ReadonlyMap<string, number> = new Map([
    ['memory', 1],
    ['disk', 2],
    ['semantic', 3],
  ]);

  private caches: Cache<T>[] = [];

  /** Registers a cache level and keeps the levels sorted by priority (memory, disk, semantic, others). */
  addCache(cache: Cache<T>): void {
    this.caches.push(cache);
    this.caches.sort((a, b) => this.getPriority(a) - this.getPriority(b));
  }

  /**
   * Looks the key up level by level and returns the first value found.
   * On a hit at level i the value is written back to levels 0..i-1.
   * The write-back calls `set` without options, so those levels apply their
   * own default TTL and the entry carries no tags there.
   */
  async get(key: string): Promise<T | undefined> {
    for (let i = 0; i < this.caches.length; i++) {
      const level = this.caches[i];
      if (level === undefined) continue;
      const value = await level.get(key);
      if (value === undefined) continue;
      for (const faster of this.caches.slice(0, i)) {
        await faster.set(key, value);
      }
      return value;
    }
    return undefined;
  }

  /** Writes the value to every level, one after another, in priority order. */
  async set(key: string, value: T, options: CacheOptions = {}): Promise<void> {
    for (const cache of this.caches) {
      await cache.set(key, value, options);
    }
  }

  /**
   * Intended to remove every entry whose key matches `pattern`.
   *
   * Not implemented: the `Cache` interface offers no way to enumerate keys,
   * so nothing is removed and the result is always 0.
   */
  async invalidate(pattern: RegExp): Promise<number> {
    void pattern;
    return 0;
  }

  /**
   * Invalidates the tags on every level.
   * @returns Total number of entries removed, summed over all levels.
   */
  async invalidateTags(tags: string[]): Promise<number> {
    let total = 0;
    for (const cache of this.caches) {
      total += await cache.invalidateTags(tags);
    }
    return total;
  }

  /** Returns `{ name, stats }` for each level, in priority order. */
  getStats() {
    return this.caches.map(c => ({
      name: c.name,
      stats: c.getStats(),
    }));
  }

  /** Maps a cache to its lookup priority based on its `name`. */
  private getPriority(cache: Cache<T>): number {
    return CacheManager.PRIORITIES.get(cache.name) ?? CacheManager.DEFAULT_PRIORITY;
  }
}
Token 优化
// src/optimization/TokenOptimizer.ts
/** Configuration for `TokenOptimizer`. */
export interface TokenOptimizerOptions {
// Size of the model context window, in (estimated) tokens.
maxContextTokens: number;
// Tokens held back from the window; the history budget is maxContextTokens - reserveTokens.
reserveTokens: number;
// NOTE(review): not read by the TokenOptimizer code shown in this chapter.
compressionRatio: number;
}
/**
 * Heuristics for keeping prompts inside a token budget: token estimation,
 * history compression, file truncation and compact prompt assembly.
 *
 * All sizes are based on `estimateTokens`, a character-count approximation,
 * not on a real tokenizer.
 */
export class TokenOptimizer {
  /** Characters assumed per token by `estimateTokens`. */
  private static readonly CHARS_PER_TOKEN = 4;
  /** Lines kept from the start / end of a file by `smartTruncate`. */
  private static readonly HEAD_LINES = 50;
  private static readonly TAIL_LINES = 30;
  /** Tool results longer than this many characters are shortened. */
  private static readonly TOOL_RESULT_MAX_CHARS = 500;
  /** Number of trailing messages kept verbatim when early history is summarized. */
  private static readonly RECENT_MESSAGES = 10;
  /** Maximum number of files listed by `buildEfficientPrompt`. */
  private static readonly MAX_PROMPT_FILES = 10;

  private options: TokenOptimizerOptions;

  constructor(options: TokenOptimizerOptions) {
    this.options = options;
  }

  /**
   * Estimates the token count of a text (simplified).
   * Rough rule: about 4 characters per token, rounded up.
   */
  estimateTokens(text: string): number {
    return Math.ceil(text.length / TokenOptimizer.CHARS_PER_TOKEN);
  }

  /**
   * Shrinks a message history towards the budget
   * `maxContextTokens - reserveTokens`.
   *
   * 1. If the history already fits, it is returned unchanged.
   * 2. Otherwise long tool results are shortened, oldest first, so that the
   *    most recent tool output stays intact for as long as possible.
   * 3. If that is not enough, everything but the system message and the
   *    last 10 messages is replaced by a one-line summary.
   *
   * The input array and its messages are never mutated.
   */
  compressMessages(messages: Message[]): Message[] {
    const availableTokens = this.options.maxContextTokens - this.options.reserveTokens;
    let currentTokens = this.recalculateTokens(messages);
    if (currentTokens <= availableTokens) {
      return messages;
    }

    // Strategy 1: shorten old tool results.
    const compressed = [...messages];
    for (let i = 0; i < compressed.length && currentTokens > availableTokens; i++) {
      const message = compressed[i];
      if (message === undefined || message.role !== 'tool') continue;
      const shortened = this.summarizeToolResult(message.content);
      if (shortened === message.content) continue;
      compressed[i] = { ...message, content: shortened };
      // Adjust the running total instead of re-scanning the whole history.
      currentTokens += this.estimateTokens(shortened) - this.estimateTokens(message.content);
    }

    // Strategy 2: summarize early messages.
    if (currentTokens > availableTokens) {
      return this.summarizeEarlyMessages(compressed, availableTokens);
    }
    return compressed;
  }

  /**
   * Truncates file content so that its estimated size is at most `maxTokens`.
   *
   * Content that already fits is returned unchanged. For files longer than
   * 80 lines, the first 50 and last 30 lines are kept around a
   * "lines truncated" marker when that result fits the budget. In every
   * other case (few but very long lines, or a head/tail that is still too
   * large) the text is cut by characters, keeping the start and the end
   * around a "chars truncated" marker.
   */
  smartTruncate(content: string, maxTokens: number): string {
    if (this.estimateTokens(content) <= maxTokens) {
      return content;
    }
    const lines = content.split('\n');
    const keptLines = TokenOptimizer.HEAD_LINES + TokenOptimizer.TAIL_LINES;
    // Only slice by lines when head and tail cannot overlap; otherwise lines
    // would be duplicated and the omitted count would be negative.
    if (lines.length > keptLines) {
      const byLines = [
        ...lines.slice(0, TokenOptimizer.HEAD_LINES),
        `\n... (${lines.length - keptLines} lines truncated) ...\n`,
        ...lines.slice(-TokenOptimizer.TAIL_LINES),
      ].join('\n');
      if (this.estimateTokens(byLines) <= maxTokens) {
        return byLines;
      }
    }
    return this.truncateByChars(content, maxTokens);
  }

  /**
   * Builds a compact prompt: a one-line system preamble, up to 10 file
   * names, a summary of the history (if any) and the user query.
   */
  buildEfficientPrompt(context: {
    files?: string[];
    query: string;
    history?: Message[];
  }): string {
    const parts: string[] = [];
    // System prompt (kept short).
    parts.push('You are Claude Code, an AI coding assistant.');
    // Relevant files, capped to limit prompt size.
    if (context.files) {
      parts.push('\nRelevant files:');
      for (const file of context.files.slice(0, TokenOptimizer.MAX_PROMPT_FILES)) {
        parts.push(`- ${file}`);
      }
    }
    // History summary.
    if (context.history && context.history.length > 0) {
      parts.push('\nConversation summary:');
      parts.push(this.summarizeHistory(context.history));
    }
    // User query.
    parts.push(`\nUser: ${context.query}`);
    return parts.join('\n');
  }

  /**
   * Cuts `content` by characters so that its estimate is at most
   * `maxTokens`, keeping roughly 60% of the budget from the start and the
   * rest from the end. Assumes `content` is longer than the budget.
   */
  private truncateByChars(content: string, maxTokens: number): string {
    // `|| 0` turns a NaN budget into "keep nothing".
    const budget = Math.max(0, Math.floor(maxTokens) * TokenOptimizer.CHARS_PER_TOKEN) || 0;
    const marker = (omitted: number): string => `\n... (${omitted} chars truncated) ...\n`;
    // Size the marker with the largest number it could show, so the final
    // result can only be shorter than planned.
    const keep = budget - marker(content.length).length;
    if (keep <= 0) {
      return content.slice(0, budget);
    }
    const headChars = Math.ceil(keep * 0.6);
    const tailChars = keep - headChars;
    return (
      content.slice(0, headChars) +
      marker(content.length - keep) +
      content.slice(content.length - tailChars)
    );
  }

  /**
   * Shortens a tool result to its first 500 characters plus a note saying
   * how much was dropped. Returns the content unchanged when shortening
   * would not actually make it smaller.
   */
  private summarizeToolResult(content: string): string {
    const limit = TokenOptimizer.TOOL_RESULT_MAX_CHARS;
    if (content.length <= limit) {
      return content;
    }
    const shortened = content.substring(0, limit) + `... (${content.length - limit} chars more)`;
    return shortened.length < content.length ? shortened : content;
  }

  /**
   * Produces a one-line summary listing up to five distinct source file
   * names (ts/js/py/java/go/rs) mentioned in the messages.
   */
  private summarizeHistory(messages: Message[]): string {
    const files = new Set<string>();
    for (const msg of messages) {
      const fileMatches = msg.content.match(/\b\w+\.(ts|js|py|java|go|rs)\b/g);
      fileMatches?.forEach(f => files.add(f));
    }
    return `Files discussed: ${Array.from(files).slice(0, 5).join(', ')}`;
  }

  /**
   * Keeps the first system message and the last 10 messages, replacing
   * everything earlier with a single summary message.
   *
   * The system message is emitted once, even when it falls inside the last
   * 10 messages. `maxTokens` is accepted for symmetry with the caller but is
   * not enforced: the result is not guaranteed to fit the budget.
   */
  private summarizeEarlyMessages(messages: Message[], maxTokens: number): Message[] {
    void maxTokens;
    const recentCount = TokenOptimizer.RECENT_MESSAGES;
    const systemMsg = messages.find(m => m.role === 'system');
    const recentMessages = messages.slice(-recentCount).filter(m => m !== systemMsg);
    const earlyMessages = messages.slice(0, -recentCount).filter(m => m !== systemMsg);

    const result: Message[] = [];
    if (systemMsg) result.push(systemMsg);
    if (earlyMessages.length > 0) {
      result.push({
        role: 'system',
        content: `[Earlier conversation summarized: ${this.summarizeHistory(earlyMessages)}]`,
        timestamp: Date.now(),
      });
    }
    result.push(...recentMessages);
    return result;
  }

  /** Sums the estimated token count of every message's content. */
  private recalculateTokens(messages: Message[]): number {
    return messages.reduce((sum, m) => sum + this.estimateTokens(m.content), 0);
  }
}
成本监控
// src/optimization/CostTracker.ts
/** One recorded LLM API call as stored by `CostTracker`. */
export interface APICall {
// When the call happened; compared against Date.now()-based cutoffs, so epoch milliseconds.
timestamp: number;
// Model name; used to look up the price and to group statistics.
model: string;
// Tokens sent to the model.
inputTokens: number;
// Tokens produced by the model.
outputTokens: number;
// Cost of the call; computed by CostTracker.track() from the token counts and the model's price.
cost: number;
// Latency of the call; the report prints the average with an "ms" suffix.
duration: number;
// Whether the call was served from a cache.
cacheHit: boolean;
}
/**
 * Records API calls and derives cost, token and latency statistics.
 *
 * Prices are expressed per 1M tokens. Calls for a model without a known
 * price are costed with a fallback of 3 (input) / 15 (output).
 */
export class CostTracker {
  /** Price applied to models missing from the price table. */
  private static readonly FALLBACK_PRICE: { input: number; output: number } = {
    input: 3,
    output: 15,
  };

  private calls: APICall[] = [];

  // A Map (not an object literal) so that model names such as "constructor"
  // or "__proto__" cannot resolve to inherited Object.prototype members.
  private modelPrices = new Map<string, { input: number; output: number }>([
    ['claude-opus-4-6', { input: 15, output: 75 }], // per 1M tokens
    ['claude-sonnet-4-6', { input: 3, output: 15 }],
    ['claude-haiku-4-5', { input: 0.25, output: 1.25 }],
  ]);

  /**
   * @param priceOverrides Optional per-model prices (per 1M tokens) that are
   *   added to, or replace, the built-in price table.
   */
  constructor(priceOverrides: Record<string, { input: number; output: number }> = {}) {
    for (const [model, price] of Object.entries(priceOverrides)) {
      this.modelPrices.set(model, price);
    }
  }

  /** Records a call, computing its cost from the token counts and the model's price. */
  track(call: Omit<APICall, 'cost'>): void {
    const price = this.modelPrices.get(call.model) ?? CostTracker.FALLBACK_PRICE;
    const inputCost = (call.inputTokens / 1_000_000) * price.input;
    const outputCost = (call.outputTokens / 1_000_000) * price.output;
    this.calls.push({
      ...call,
      cost: inputCost + outputCost,
    });
  }

  /**
   * Aggregates the recorded calls.
   * @param timeRange `'session'` covers every recorded call; `'day'` and
   *   `'week'` cover calls from the last 24 hours / 7 days.
   * @returns Totals, cache hit rate (0..1), average latency and a per-model
   *   breakdown. Rates and averages are 0 when no call is in range.
   */
  getStats(timeRange: 'session' | 'day' | 'week' = 'session') {
    const cutoff = this.getCutoffTime(timeRange);
    const relevant = this.calls.filter(c => c.timestamp >= cutoff);

    let totalTokens = 0;
    let totalCost = 0;
    let totalDuration = 0;
    let cacheHits = 0;
    for (const call of relevant) {
      totalTokens += call.inputTokens + call.outputTokens;
      totalCost += call.cost;
      totalDuration += call.duration;
      if (call.cacheHit) cacheHits++;
    }

    const hasCalls = relevant.length > 0;
    return {
      totalCalls: relevant.length,
      totalTokens,
      totalCost,
      cacheHitRate: hasCalls ? cacheHits / relevant.length : 0,
      averageLatency: hasCalls ? totalDuration / relevant.length : 0,
      byModel: this.groupByModel(relevant),
    };
  }

  /** Returns the earliest timestamp included in a range (0 means "everything"). */
  private getCutoffTime(range: string): number {
    const now = Date.now();
    switch (range) {
      case 'day': return now - 24 * 60 * 60 * 1000;
      case 'week': return now - 7 * 24 * 60 * 60 * 1000;
      default: return 0;
    }
  }

  /** Sums calls, tokens and cost per model name. */
  private groupByModel(calls: APICall[]) {
    const grouped = new Map<string, { calls: number; tokens: number; cost: number }>();
    for (const call of calls) {
      let bucket = grouped.get(call.model);
      if (bucket === undefined) {
        bucket = { calls: 0, tokens: 0, cost: 0 };
        grouped.set(call.model, bucket);
      }
      bucket.calls++;
      bucket.tokens += call.inputTokens + call.outputTokens;
      bucket.cost += call.cost;
    }
    // Object.fromEntries defines own properties, so unusual model names stay safe.
    const result: Record<string, { calls: number; tokens: number; cost: number }> =
      Object.fromEntries(grouped);
    return result;
  }

  /** Renders the session statistics as a plain-text report. */
  exportReport(): string {
    const stats = this.getStats('session');
    const modelLines = Object.entries(stats.byModel).map(([model, s]) =>
      ` ${model}: ${s.calls} calls, ${s.tokens.toLocaleString()} tokens, $${s.cost.toFixed(4)}`
    );
    return [
      'API Usage Report',
      '================',
      `Total Calls: ${stats.totalCalls}`,
      `Total Tokens: ${stats.totalTokens.toLocaleString()}`,
      `Total Cost: $${stats.totalCost.toFixed(4)}`,
      `Cache Hit Rate: ${(stats.cacheHitRate * 100).toFixed(1)}%`,
      `Avg Latency: ${stats.averageLatency.toFixed(0)}ms`,
      'By Model:',
      modelLines.join('\n'),
    ].join('\n').trim();
  }
}
本章小结
- ✓ 多级缓存系统(Memory → Semantic)
- ✓ Token 优化策略
- ✓ 成本监控与分析
- ✓ 智能压缩与摘要
下一步: Ch12: MCP 协议 - Model Context Protocol 实现。