Chapter 2 - Providers and Prompt Caching¶
Companion to book/ch02_providers.md. Runs top-to-bottom in Google Colab in mock mode with no API key required.
# Clone the repo (skip if already present - Colab keeps files across runs in one session)
import os

if not os.path.exists("crafting-agentic-swarms"):
    !git clone https://github.com/TheAiSingularity/crafting-agentic-swarms.git
%cd crafting-agentic-swarms
!pip install -e ".[dev]" --quiet
!pip install matplotlib plotly ipywidgets --quiet
import os

try:
    from google.colab import userdata
    key = userdata.get("ANTHROPIC_API_KEY")
    if not key:
        raise KeyError("ANTHROPIC_API_KEY not set in Colab secrets")
    os.environ["ANTHROPIC_API_KEY"] = key
    print("Using real API (key from Colab secrets).")
except Exception:
    os.environ.setdefault("SWARM_MOCK", "true")
    print("Running in mock mode (no API key needed).")
What you'll build here¶
- Drive the same prompt through three providers (Anthropic, OpenAI, LiteLLM) with one function call, and compare what comes back.
- Visualize latency, input tokens, and estimated cost per provider as side-by-side bar charts.
- Simulate prompt-cache warmup over 10 calls and see the cache-read curve that only Anthropic exposes for real.
- Understand the tradeoff between a universal router (simplicity) and provider-native features (caching).
1. One call, three providers¶
swarm.core.client.call_agent does automatic routing based on the model string. claude-* hits AsyncAnthropic, gpt-* and o3/o4 hit AsyncOpenAI, everything else goes through litellm.acompletion. You never change caller code - only the model ID.
import asyncio, time

from swarm.core.client import call_agent

SAME_PROMPT = "Summarize the purpose of a retry budget in one sentence."
SYSTEM = "You are a concise technical writer."
TARGETS = [
    ("anthropic", "claude-haiku-4-5-20251001"),
    ("openai", "gpt-4o-mini"),
    ("litellm", "gemini/gemini-2.5-flash-8b"),
]

async def probe(provider_label, model):
    t0 = time.monotonic()
    text, rec = await call_agent(
        agent_id=f"probe_{provider_label}",
        role="worker",
        task_id="ch02",
        system=SYSTEM,
        prompt=SAME_PROMPT,
        model=model,
        max_tokens=120,
    )
    wall_ms = int((time.monotonic() - t0) * 1000)
    return {
        "provider": provider_label,
        "model": model,
        "tokens_in": rec.input_tokens,
        "tokens_out": rec.output_tokens,
        "cost_usd": rec.cost_usd,
        "latency_ms": rec.latency_ms or wall_ms,
        "text": text[:120] + ("..." if len(text) > 120 else ""),
    }

results = [await probe(p, m) for p, m in TARGETS]
for r in results:
    print(f"{r['provider']:10s} {r['model']:32s} in={r['tokens_in']:4d} out={r['tokens_out']:4d} ${r['cost_usd']:.6f} {r['latency_ms']}ms")
    print(f"  -> {r['text']}\n")
2. Side-by-side comparison¶
import matplotlib.pyplot as plt

providers = [r["provider"] for r in results]
tokens_in = [r["tokens_in"] for r in results]
latency = [r["latency_ms"] for r in results]
cost = [r["cost_usd"] for r in results]

fig, axes = plt.subplots(1, 3, figsize=(12, 3.5))
colors = ["#3b82f6", "#10b981", "#f59e0b"]
axes[0].bar(providers, tokens_in, color=colors)
axes[0].set_title("input tokens")
axes[0].set_ylabel("tokens")
axes[1].bar(providers, latency, color=colors)
axes[1].set_title("latency (ms)")
axes[2].bar(providers, cost, color=colors)
axes[2].set_title("cost (USD)")
axes[2].ticklabel_format(axis="y", style="sci", scilimits=(-4, -4))
for ax in axes:
    ax.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()
Mock numbers: same fixture produces identical tokens-in across providers, and compute_cost returns zero for a model whose pricing isn't in MODEL_PRICING (e.g. the (mock) suffixed IDs). Real API mode shows the actual spread: Gemini Flash 8B usually wins on cost, Haiku wins on latency when used with caching, OpenAI varies.
3. Prompt cache warmup¶
Anthropic's prompt caching rewards you for keeping a stable system block and static_doc across calls. The first call pays full input pricing; subsequent calls pay only cache_read pricing for the cached prefix.
Mock mode doesn't simulate cache_read_tokens out of the box, so we construct a hypothetical series here. It follows the shape you see in production: first call warms the cache, then reads plateau at the static size.
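For orientation, here is a sketch of the request shape that opts into Anthropic caching. No API call is made; the field names (`system` content blocks with a `cache_control` marker) follow Anthropic's public Messages API documentation, and `STATIC_DOC` is a placeholder, not something from this repo:

```python
# Sketch of an Anthropic Messages API request body with a cached system prefix.
# The cache_control marker tells the API to cache everything up to and including
# that block; later calls with an identical prefix bill at cache-read rates.
STATIC_DOC = "SPEC: ... (large, stable document) ..."

request_body = {
    "model": "claude-sonnet-4-6",
    "max_tokens": 80,
    "system": [
        {"type": "text", "text": "You are a QA engineer."},
        {
            "type": "text",
            "text": STATIC_DOC,
            "cache_control": {"type": "ephemeral"},  # cache breakpoint
        },
    ],
    "messages": [{"role": "user", "content": "List one risk in the spec."}],
}

print(request_body["system"][1]["cache_control"])
```

This is the structure `static_doc` maps onto under the hood: the stable document goes last in the system array, so the breakpoint caches the whole prefix.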
# Hypothetical warmup: 3000-token system prompt, 10 calls.
STATIC_SIZE = 3000
rows = []
for i in range(10):
    cache_read = 0 if i == 0 else STATIC_SIZE
    cache_write = STATIC_SIZE if i == 0 else 0
    fresh_input = 120
    rows.append({"call": i + 1, "bucket": "cache_read", "tokens": cache_read})
    rows.append({"call": i + 1, "bucket": "cache_write", "tokens": cache_write})
    rows.append({"call": i + 1, "bucket": "fresh_input", "tokens": fresh_input})
print(f"Built {len(rows)} rows for plotting")
import plotly.express as px
import pandas as pd
df = pd.DataFrame(rows)
fig = px.bar(df, x="call", y="tokens", color="bucket", title="Cache warmup across 10 calls (hypothetical)")
fig.update_layout(barmode="stack", xaxis=dict(dtick=1))
fig.show()
Read the stacked bars: the first call is pure cache_write (expensive - around 25% more than input). Every subsequent call pays cache_read, which is typically 90% cheaper than a fresh input token. That differential is the entire point of prompt caching.
4. Cost implications¶
from swarm.core.models import MODEL_PRICING
p = MODEL_PRICING["claude-sonnet-4-6"]
print("Sonnet pricing (per 1M tokens, USD):")
print(f" input ${p['input']:6.2f}")
print(f" output ${p['output']:6.2f}")
print(f" cache_read ${p['cache_read']:6.2f} (90% off input)")
print(f" cache_write ${p['cache_write']:6.2f} (25% premium on input)")
# What does the 10-call warmup cost?
no_cache = 10 * (STATIC_SIZE + 120) * p["input"] / 1_000_000
with_cache = (STATIC_SIZE * p["cache_write"] + 9 * STATIC_SIZE * p["cache_read"] + 10 * 120 * p["input"]) / 1_000_000
print(f"\nNo caching: ${no_cache:.6f}")
print(f"With caching: ${with_cache:.6f} ({(1 - with_cache / no_cache) * 100:.1f}% saved)")
Caching pays off whenever the stable prefix is used more than once. Break-even arrives at the very first cache read: the write premium costs an extra 0.25x of one input pass, and the first 90%-off read saves 0.9x, so by the second call you are already ahead. More repeats, more savings. This is why the book pushes every large agent context - system prompt, large docs, memory index - through static_doc.
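You can check that break-even arithmetic directly. A small sketch with normalized prices (fresh input = 1.0 per pass of the stable prefix, write = 1.25x, read = 0.1x), independent of any provider's absolute rates:

```python
# Normalized prices: one fresh-input pass of the stable prefix costs 1.0.
WRITE = 1.25   # cache-write premium (+25%)
READ = 0.10    # cache-read rate (90% off)

def cost_without_cache(calls: int) -> float:
    # Every call re-sends the full prefix at input pricing.
    return calls * 1.0

def cost_with_cache(calls: int) -> float:
    # First call writes the prefix; the remaining calls read it.
    return WRITE + (calls - 1) * READ

for calls in range(1, 5):
    a, b = cost_without_cache(calls), cost_with_cache(calls)
    print(f"{calls} call(s): no-cache={a:.2f}  cached={b:.2f}  "
          f"{'cached wins' if b < a else 'no-cache wins'}")
```

At one call caching loses (1.25 vs 1.00); at two calls it already wins (1.35 vs 2.00), and the gap widens linearly from there.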
5. Real-API gate¶
if os.environ.get("SWARM_MOCK") != "true":
    big_doc = "SPEC:\n" + "The system shall return results in under 200ms. " * 200
    for turn in range(3):
        _, rec = await call_agent(
            agent_id=f"warmup_{turn}",
            role="worker",
            task_id="ch02_cache",
            system="You are a QA engineer.",
            prompt=f"Turn {turn}: list one risk in the spec.",
            static_doc=big_doc,
            model="claude-sonnet-4-6",
            max_tokens=80,
        )
        print(f"turn {turn}: write={rec.cache_write_tokens} read={rec.cache_read_tokens} in={rec.input_tokens}")
else:
    print("Skipped (mock mode). In real mode, watch cache_write on turn 0 and cache_read on turns 1-2.")
6. How the router picks a backend¶
The dispatch logic in swarm.core.providers.detect_provider is intentionally trivial - just string prefix matching. Here's what each model string resolves to:
from swarm.core.providers import detect_provider, MODEL_REGISTRY

samples = [
    "claude-haiku-4-5-20251001",
    "claude-opus-4-6",
    "gpt-4o-mini",
    "o3",
    "gemini/gemini-2.5-flash",
    "groq/llama-3.1-8b-instant",
    "ollama/qwen3:8b",
    "mistral/mistral-large-latest",
]
for m in samples:
    print(f"  {m:40s} -> {detect_provider(m)}")
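The same prefix dispatch can be sketched in a few lines. This is not the library's actual source; the prefixes below are assumptions matched to the routing rules described above (claude-* to Anthropic, gpt-*/o3/o4 to OpenAI, everything else to litellm):

```python
# Toy prefix router - a sketch, not swarm.core.providers.detect_provider itself.
def detect_provider_sketch(model: str) -> str:
    if model.startswith("claude-"):
        return "anthropic"
    if model.startswith(("gpt-", "o3", "o4")):
        return "openai"
    return "litellm"  # everything else falls through to litellm.acompletion

for m in ["claude-opus-4-6", "gpt-4o-mini", "o3", "gemini/gemini-2.5-flash"]:
    print(m, "->", detect_provider_sketch(m))
```

The deliberate simplicity is the point: routing on the model string means no config files, no registry lookups on the hot path, and no caller changes when a new provider is added.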
7. Model registry tiers¶
import plotly.express as px
import pandas as pd

rows = []
for name, meta in MODEL_REGISTRY.items():
    rows.append({
        "model": name,
        "provider": meta.provider,
        "tier": meta.tier,
        "context_k": meta.context_k,
        "caching": meta.supports_caching,
    })
df_models = pd.DataFrame(rows)
print(df_models.groupby(["provider", "tier"]).size().unstack(fill_value=0))
fig = px.scatter(
df_models, x="context_k", y="provider", color="tier",
symbol="caching", hover_name="model",
title="Model registry - context window vs provider",
)
fig.update_traces(marker=dict(size=14))
fig.update_xaxes(type="log", title="context window (K tokens, log scale)")
fig.show()
Hover any dot to see the exact model. Circle marker = supports caching, diamond = no caching. The only dots with cache support are Anthropic. That is the provider lock-in cost of caching: you lose it when you move to any other backend.
8. Switching presets¶
from swarm.core.models import MODELS, ALL_PRESETS

for preset_name, mapping in ALL_PRESETS.items():
    workers = mapping.get("worker", "?")
    orch = mapping.get("orchestrator", "?")
    print(f"  {preset_name:16s} orchestrator={orch:40s} worker={workers}")
# In your code: `SwarmConfig(model_overrides=PRESET_OPENAI)` flips every
# role to a matching OpenAI model in one line, no call_agent changes.
9. Cost across presets for a fixed workload¶
from swarm.core.models import compute_cost
from swarm.core.records import CallRecord
# Simulated workload: 10 worker calls, each 500 in + 300 out tokens.
workload = 10
tokens_in, tokens_out = 500, 300
rows = []
for preset_name, mapping in ALL_PRESETS.items():
    worker_model = mapping["worker"]
    rec = CallRecord(
        agent_id="w",
        role="worker",
        model=worker_model,
        task_id="w",
        input_tokens=tokens_in,
        output_tokens=tokens_out,
    )
    per = compute_cost(rec)
    rows.append({"preset": preset_name, "model": worker_model, "total_usd": per * workload})
df_cost = pd.DataFrame(rows).sort_values("total_usd")
print(df_cost.to_string(index=False))
fig = px.bar(df_cost, x="preset", y="total_usd", hover_data=["model"],
title="Worker-role cost for 10 calls across presets")
fig.show()
Local (Ollama) reports $0 because we don't meter electricity. Groq's 8B is the cheapest real-API option; Claude Opus is roughly 50x more expensive per output token than Haiku. The preset pattern lets you flip the entire swarm's cost profile with one line.
Takeaways¶
- call_agent dispatches by model string; callers don't change when you change providers.
- Tokens and pricing are apples-to-oranges across providers - always compute dollars before comparing.
- Prompt caching is the highest-leverage optimization in this book. Pay the write premium once, then read cheaply forever.
- Presets let you re-target the whole swarm in one line - useful for A/B-ing provider cost.