Prompt Caching for LLMs

The idea is:

The ability to set/configure prompt caching for the LLMs that support it, as well as the ability to reference that cached prompt in future LLM calls.

My use case:

Referencing a static knowledge base, reusing large system prompts in workflows, etc.

I think it would be beneficial to add this because:

It would improve latency and reduce costs.

Any resources to support this?

Public API docs from Gemini (context caching), Anthropic (prompt caching), etc.
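
For reference, here's roughly what a cached request looks like against the Anthropic API directly. This is just a minimal sketch using the official @anthropic-ai/sdk; the model name and prompt text are placeholders:

import Anthropic from '@anthropic-ai/sdk';

const client = new Anthropic(); // reads ANTHROPIC_API_KEY from the environment

async function cachedCall() {
    // The system prompt is sent as content blocks; `cache_control` marks the block
    // as cacheable, so later calls that reuse the same prefix read it from the cache.
    const response = await client.messages.create({
        model: 'claude-3-5-sonnet-20241022',
        max_tokens: 1024,
        system: [
            {
                type: 'text',
                text: 'Large static knowledge base / instructions go here (min. ~1024 tokens for most models)...',
                cache_control: { type: 'ephemeral' },
            },
        ],
        messages: [{ role: 'user', content: 'A question about the knowledge base' }],
    });

    // Cache effectiveness shows up in the usage block:
    // usage.cache_creation_input_tokens on the first call, usage.cache_read_input_tokens afterwards
    console.log(response.usage);
}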

Are you willing to work on this?

I’m willing to help test.

Could somebody from the team please implement this, at least for Anthropic? Here's a draft implementation of the node:

/* eslint-disable n8n-nodes-base/node-dirname-against-convention */

import { ChatAnthropic } from '@langchain/anthropic';
import type { LLMResult } from '@langchain/core/outputs';
import {
    NodeConnectionType,
    type INodePropertyOptions,
    type INodeProperties,
    type ISupplyDataFunctions,
    type INodeType,
    type INodeTypeDescription,
    type SupplyData,
} from 'n8n-workflow';

import { getConnectionHintNoticeField } from '@utils/sharedFields';
import { makeN8nLlmFailedAttemptHandler } from '../n8nLlmFailedAttemptHandler';
import { N8nLlmTracing } from '../N8nLlmTracing';

const modelField: INodeProperties = {
    displayName: 'Model',
    name: 'model',
    type: 'options',
    options: [
        {
            name: 'Claude 3.5 Sonnet (20241022)',
            value: 'claude-3-5-sonnet-20241022',
        },
        {
            name: 'Claude 3.5 Haiku (20241022)',
            value: 'claude-3-5-haiku-20241022',
        },
        {
            name: 'Claude 3 Opus (20240229)',
            value: 'claude-3-opus-20240229',
        },
        {
            name: 'Claude 3 Haiku (20240307)',
            value: 'claude-3-haiku-20240307',
        },
    ],
    description: 'The model which will generate the completion.',
    default: 'claude-3-5-sonnet-20241022',
};

export class LmChatAnthropic implements INodeType {
    description: INodeTypeDescription = {
        displayName: 'Anthropic Chat Model',
        name: 'lmChatAnthropic',
        icon: 'file:anthropic.svg',
        group: ['transform'],
        version: [1, 1.1, 1.2],
        defaultVersion: 1.2,
        description: 'Language Model Anthropic with System Message Caching',
        defaults: {
            name: 'Anthropic Chat Model',
        },
        credentials: [
            {
                name: 'anthropicApi',
                required: true,
            },
        ],
        inputs: [],
        outputs: [NodeConnectionType.AiLanguageModel],
        outputNames: ['Model'],
        properties: [
            getConnectionHintNoticeField([NodeConnectionType.AiChain, NodeConnectionType.AiAgent]),
            {
                displayName: 'System Message',
                name: 'systemMessage',
                type: 'string',
                typeOptions: {
                    rows: 4,
                },
                default: '',
                description: 'System message that will be cached and reused across calls',
                placeholder: 'Enter a detailed system message (min. 1024 tokens for most models)',
            },
            {
                displayName: 'Cache System Message',
                name: 'cacheSystem',
                type: 'boolean',
                default: true,
                description: 'Enable caching for the system message to reduce API costs',
            },
            modelField,
            {
                displayName: 'Options',
                name: 'options',
                placeholder: 'Add Option',
                description: 'Additional options to add',
                type: 'collection',
                default: {},
                options: [
                    {
                        displayName: 'Maximum Tokens',
                        name: 'maxTokensToSample',
                        default: 4096,
                        description: 'The maximum number of tokens to generate',
                        type: 'number',
                    },
                    {
                        displayName: 'Temperature',
                        name: 'temperature',
                        default: 0.7,
                        typeOptions: { maxValue: 1, minValue: 0, numberPrecision: 1 },
                        description: 'Controls response randomness (0 = deterministic, 1 = creative)',
                        type: 'number',
                    }
                ],
            },
        ],
    };

    async supplyData(this: ISupplyDataFunctions, itemIndex: number): Promise<SupplyData> {
        const credentials = await this.getCredentials('anthropicApi');
        const modelName = this.getNodeParameter('model', itemIndex) as string;
        const systemMessage = this.getNodeParameter('systemMessage', itemIndex, '') as string;
        const cacheSystem = this.getNodeParameter('cacheSystem', itemIndex, true) as boolean;
        const options = this.getNodeParameter('options', itemIndex, {}) as {
            maxTokensToSample?: number;
            temperature?: number;
        };

        // Create system message with cache control if enabled
        const system = systemMessage ? [
            {
                type: 'text',
                text: systemMessage,
                cache_control: cacheSystem ? { type: 'ephemeral' } : undefined
            }
        ] : [];

        const tokensUsageParser = (llmOutput: LLMResult['llmOutput']) => {
            const usage = (llmOutput?.usage as {
                input_tokens: number;
                output_tokens: number;
                cache_creation_input_tokens?: number;
                cache_read_input_tokens?: number;
            }) ?? {
                input_tokens: 0,
                output_tokens: 0,
                cache_creation_input_tokens: 0,
                cache_read_input_tokens: 0,
            };

            return {
                completionTokens: usage.output_tokens,
                promptTokens: usage.input_tokens,
                totalTokens: usage.input_tokens + usage.output_tokens,
                cacheCreationTokens: usage.cache_creation_input_tokens || 0,
                cacheReadTokens: usage.cache_read_input_tokens || 0,
            };
        };

        const model = new ChatAnthropic({
            anthropicApiKey: credentials.apiKey as string,
            modelName,
            maxTokens: options.maxTokensToSample,
            temperature: options.temperature,
            // NOTE: ChatAnthropic has no `system` constructor option. Passing the cached
            // system block via `invocationKwargs` (extra params forwarded to messages.create)
            // is an assumption here and needs verifying against the installed
            // @langchain/anthropic version; a system message supplied by the calling chain
            // will take precedence over it.
            invocationKwargs: system.length > 0 ? { system } : undefined,
            callbacks: [new N8nLlmTracing(this, { tokensUsageParser })],
            onFailedAttempt: makeN8nLlmFailedAttemptHandler(this),
        });

        return {
            response: model,
        };
    }
}
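
One note on the LangChain side: as far as I can tell, recent @langchain/anthropic versions also let you attach cache_control directly to message content blocks, so if wiring the cached system block through the constructor turns out to be awkward, the cached system prompt could instead travel with the messages. Rough sketch (model name and prompt text are placeholders):

import { ChatAnthropic } from '@langchain/anthropic';
import { HumanMessage, SystemMessage } from '@langchain/core/messages';

const model = new ChatAnthropic({
    model: 'claude-3-5-sonnet-20241022',
    maxTokens: 1024,
});

async function cachedInvoke() {
    // The system message carries the cache_control marker in its content blocks,
    // so it is written to the cache on the first call and read from it afterwards.
    const response = await model.invoke([
        new SystemMessage({
            content: [
                {
                    type: 'text',
                    text: 'Large static system prompt / knowledge base...',
                    cache_control: { type: 'ephemeral' },
                },
            ],
        }),
        new HumanMessage('A question about the knowledge base'),
    ]);
    return response;
}

Either way, the usage parser above should surface cache_creation_input_tokens / cache_read_input_tokens, so it's easy to confirm the cache is actually being hit.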

Joining this request! Shouldn’t this be on GitHub?