Skip to content

Examples

Self-contained scripts for running MAC on standard benchmarks. Each script includes data loading, metric definition, and the full training loop.


Smoke Test

The fastest way to verify your setup. Runs all 6 configuration modes (2 prompt styles × 3 worker/MAC deployment setups) on a tiny dataset in 2–4 minutes.

mini_test.py (full source)
#!/usr/bin/env python3
"""
Mini smoke-test — run MAC through every config mode with tiny data.

No external datasets needed.  Each config runs 1 epoch on 4 train / 2
holdout examples so a full sweep finishes in minutes, not hours.

Configs tested (when all flags are supplied):

  1. API worker  + API MAC   — adapt style
  2. API worker  + API MAC   — custom style
  3. vLLM worker + API MAC   — adapt style   (needs --vllm-url + --vllm-model)
  4. vLLM worker + API MAC   — custom style   (needs --vllm-url + --vllm-model)
  5. Fully local (vLLM only) — adapt style   (needs --vllm-url + --vllm-model)
  6. Fully local (vLLM only) — custom style   (needs --vllm-url + --vllm-model)

Usage
-----
# API-only (configs 1-2):
python examples/mini_test.py

# All six configs:
python examples/mini_test.py \\
    --vllm-url http://localhost:8000/v1 \\
    --vllm-model Qwen/Qwen3-8B

# Override the cloud models:
python examples/mini_test.py \\
    --worker-model gpt-4o-mini --mac-model gpt-5.2
"""

import sys, os, time, argparse, traceback

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from mac import Example, MAC

# ---------------------------------------------------------------------------
# Inline data — trivial arithmetic so even small models can score > 0
# ---------------------------------------------------------------------------

# Tiny fixed training set: four one-step arithmetic problems.
TRAIN = [
    Example(input="What is 12 + 7?", output="19"),
    Example(input="What is 25 - 8?", output="17"),
    Example(input="What is 6 * 9?", output="54"),
    Example(input="What is 100 / 4?", output="25"),
]

# Two unseen problems used to score each config after training.
HOLDOUT = [
    Example(input="What is 15 + 23?", output="38"),
    Example(input="What is 9 * 7?", output="63"),
]

# Plain-language description of the task, passed to every MAC config below.
TASK_DESCRIPTION = (
    "Solve the arithmetic problem. Return ONLY the numeric answer, "
    "no explanation, no units."
)
# Label for the kind of constitution rules MAC should learn.
RULE_TYPE = "arithmetic reasoning rules"

# Custom-style prompt template; MAC injects learned rules at {{CONSTITUTION_BLOCK}}.
CUSTOM_PROMPT = """\
You are a calculator.  Solve the arithmetic problem step by step.

{{CONSTITUTION_BLOCK}}

Return your answer as JSON: {"answer": "<number>"}"""


def numeric_match(prediction, gold):
    """Metric: 1.0 if the numbers match, else 0.0.

    Compares numerically when both sides parse as floats — tolerating a
    trailing period and thousands separators in the prediction (models
    often emit "1,000." for gold "1000") — and falls back to an exact
    trimmed string comparison otherwise.
    """
    try:
        pred_num = float(str(prediction).strip().rstrip(".").replace(",", ""))
        gold_num = float(str(gold).strip().replace(",", ""))
        return 1.0 if pred_num == gold_num else 0.0
    except (ValueError, TypeError):
        # Non-numeric output: require exact (trimmed) string equality.
        return 1.0 if str(prediction).strip() == str(gold).strip() else 0.0


# ---------------------------------------------------------------------------
# Config builders
# ---------------------------------------------------------------------------

def _base_kwargs(mac_model, epochs=1, batch_size=2):
    """Shared MAC constructor kwargs used by every config variant."""
    return {
        "mac_model": mac_model,
        "num_epochs": epochs,
        "batch_size": batch_size,
        "task_description": TASK_DESCRIPTION,
        "rule_type": RULE_TYPE,
    }


def build_configs(args):
    """Return list of (name, MAC-constructor-kwargs) tuples.

    Always emits the two API-only configs; adds the four vLLM-backed
    configs only when --vllm-url was supplied.
    """
    shared = _base_kwargs(args.mac_model, args.epochs, args.batch_size)

    configs = [
        # 1) API worker + API MAC — adapt style.
        ("API worker + API MAC (adapt)",
         dict(model=args.worker_model, **shared)),
        # 2) API worker + API MAC — custom prompt style.
        ("API worker + API MAC (custom)",
         dict(model=args.worker_model, task_prompt=CUSTOM_PROMPT, **shared)),
    ]

    if not args.vllm_url:
        return configs

    vllm_model = args.vllm_model
    vllm_url = args.vllm_url

    # 3) vLLM-hosted worker, cloud MAC agent — adapt style.
    configs.append((
        "vLLM worker + API MAC (adapt)",
        dict(model=vllm_model, base_url=vllm_url, **shared),
    ))

    # 4) vLLM-hosted worker, cloud MAC agent — custom prompt style.
    configs.append((
        "vLLM worker + API MAC (custom)",
        dict(model=vllm_model, base_url=vllm_url,
             task_prompt=CUSTOM_PROMPT, **shared),
    ))

    # 5-6) Fully local: worker AND MAC agent both on the vLLM endpoint.
    for label, extra in (
        ("Fully local (adapt)", {}),
        ("Fully local (custom)", {"task_prompt": CUSTOM_PROMPT}),
    ):
        local = _base_kwargs(vllm_model, args.epochs, args.batch_size)
        local["mac_base_url"] = vllm_url
        configs.append((
            label,
            dict(model=vllm_model, base_url=vllm_url, **extra, **local),
        ))

    return configs


# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------

def run_one(name, mac_kwargs):
    """Run a single config and report its outcome.

    Returns a (name, status, score, n_rules, elapsed_seconds) tuple.
    Status is "PASS" unless any step raised, in which case the traceback
    is printed and the tuple carries "FAIL" with zeroed score/rules.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"  CONFIG: {name}")
    print(banner)
    started = time.time()
    try:
        optimized = MAC(**mac_kwargs).compile(
            trainset=TRAIN, holdout=HOLDOUT, metric=numeric_match)
        elapsed = time.time() - started

        optimized.overview()
        rule_count = len(optimized.rules)
        score = 0.0
        if optimized.result:
            score = optimized.result.holdout_metrics.get("f1", 0.0)

        # Quick inference sanity check on a problem outside both splits.
        answer = optimized("What is 3 + 4?")
        print(f"\n  Inference check:  '3 + 4' -> '{answer}'")

        return (name, "PASS", score, rule_count, elapsed)

    except Exception:
        traceback.print_exc()
        return (name, "FAIL", 0.0, 0, time.time() - started)


def main():
    """CLI entry point: build every config, run each, print a summary table.

    Exits with status 1 if any config failed, so CI can gate on the result.
    """
    parser = argparse.ArgumentParser(
        description="Mini smoke-test — run MAC through every config mode",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--worker-model", default="gpt-4o-mini",
                        help="API worker model (default: gpt-4o-mini)")
    parser.add_argument("--mac-model", default="gpt-4o",
                        help="API MAC-agent model (default: gpt-4o)")
    parser.add_argument("--vllm-url", default=None,
                        help="vLLM base URL (e.g. http://localhost:8000/v1)")
    parser.add_argument("--vllm-model", default="Qwen/Qwen3-8B",
                        help="Model name served by vLLM (default: Qwen/Qwen3-8B)")
    parser.add_argument("--epochs", type=int, default=1,
                        help="Training epochs per config (default: 1)")
    parser.add_argument("--batch-size", type=int, default=2,
                        help="Batch size per config (default: 2)")
    args = parser.parse_args()

    # 2 API-only configs, plus 4 vLLM configs when --vllm-url is given.
    configs = build_configs(args)

    # Header: what is about to run, and with which models/endpoints.
    print(f"\nMini smoke-test: {len(configs)} configs, "
          f"{len(TRAIN)} train / {len(HOLDOUT)} holdout, "
          f"{args.epochs} epoch(s), batch_size={args.batch_size}")
    print(f"Worker model : {args.worker_model}")
    print(f"MAC model    : {args.mac_model}")
    if args.vllm_url:
        print(f"vLLM endpoint: {args.vllm_url}  ({args.vllm_model})")

    # Run configs sequentially; run_one never raises (failures -> "FAIL").
    results = []
    for name, kw in configs:
        results.append(run_one(name, kw))

    # --- Summary table ------------------------------------------------------
    print(f"\n\n{'='*70}")
    print("  SUMMARY")
    print(f"{'='*70}")
    print(f"  {'Config':<40} {'Status':>6}  {'Score':>6}  {'Rules':>5}  {'Time':>7}")
    print(f"  {'-'*40} {'-'*6}  {'-'*6}  {'-'*5}  {'-'*7}")
    for name, status, score, n_rules, elapsed in results:
        tag = "PASS" if status == "PASS" else "FAIL"
        print(f"  {name:<40} {tag:>6}  {score:>5.1%}  {n_rules:>5}  {elapsed:>6.1f}s")

    n_pass = sum(1 for r in results if r[1] == "PASS")
    n_fail = len(results) - n_pass
    print(f"\n  {n_pass} passed, {n_fail} failed out of {len(results)} configs")

    # Non-zero exit code when anything failed (useful for CI gating).
    if n_fail:
        sys.exit(1)


if __name__ == "__main__":
    main()

GSM8K (Math Reasoning)

Grade-school math word problems. MAC learns rules for step-by-step arithmetic, unit tracking, and answer verification.

"""
GSM8K — adapt style (Mode 2: MAC auto-adapts prompt)

MAC learns constitution rules on GSM8K math problems.
Prompt is automatically adapted by the MAC agent.

Usage:
    # OpenAI models (Config C / D)
    python examples/gsm8k_adapt.py --worker-model gpt-4o-mini --mac-model gpt-5.2

    # vLLM worker + OpenAI MAC (Config A)
    python examples/gsm8k_adapt.py \\
        --worker-model Qwen/Qwen3-8B --worker-base-url http://localhost:8000/v1 \\
        --mac-model gpt-5.2

    # All vLLM (Config B)
    python examples/gsm8k_adapt.py \\
        --worker-model Qwen/Qwen3-8B --worker-base-url http://localhost:8000/v1 \\
        --mac-model Qwen/Qwen3-8B --mac-base-url http://localhost:8000/v1
"""

import sys, os, re, random, argparse
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from mac import Example, MAC


def load_gsm8k(n_train=80, n_holdout=16, seed=42):
    """Load a deterministically shuffled GSM8K slice, split train/holdout.

    Gold answers are taken from the text after '####' in the solution,
    with thousands separators removed; if the marker is missing the
    whole answer text is used as-is.
    """
    from datasets import load_dataset

    rows = list(load_dataset("gsm8k", "main", split="test"))
    # Seed the global RNG so the shuffle (and split) is reproducible.
    random.seed(seed)
    random.shuffle(rows)
    pool = rows[:n_train + n_holdout]

    examples = []
    for row in pool:
        found = re.search(r'####\s*(.+)', row["answer"])
        gold = found.group(1).strip().replace(",", "") if found else row["answer"].strip()
        examples.append(Example(input=row["question"], output=gold))

    return examples[:n_train], examples[n_train:]


def numeric_match(prediction, gold):
    """Metric: 1.0 if the numbers match, else 0.0.

    Compares numerically when both sides parse as floats — tolerating a
    trailing period and thousands separators in the prediction (models
    often emit "1,000." while load_gsm8k strips commas from gold) — and
    falls back to an exact trimmed string comparison otherwise.
    """
    try:
        pred_num = float(str(prediction).strip().rstrip('.').replace(",", ""))
        gold_num = float(str(gold).strip().replace(",", ""))
        return 1.0 if pred_num == gold_num else 0.0
    except (ValueError, TypeError):
        # Non-numeric output: require exact (trimmed) string equality.
        return 1.0 if str(prediction).strip() == str(gold).strip() else 0.0


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="GSM8K — adapt style")
    # Worker = the student model being optimized; MAC = the controller agents.
    parser.add_argument("--worker-model", default="gpt-4o-mini",
                        help="Worker (student) model. e.g. gpt-4o-mini, Qwen/Qwen3-8B")
    parser.add_argument("--worker-base-url", default=None,
                        help="Base URL for worker (e.g. http://localhost:8000/v1 for vLLM)")
    parser.add_argument("--mac-model", default="gpt-5.2",
                        help="MAC controller model (decision/proposer/editor agents)")
    parser.add_argument("--mac-base-url", default=None,
                        help="Base URL for MAC model (if hosted via vLLM)")
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--train", type=int, default=80)
    parser.add_argument("--holdout", type=int, default=16)
    parser.add_argument("--output", default="gsm8k_adapt_rules.json")
    args = parser.parse_args()

    train, holdout = load_gsm8k(args.train, args.holdout)
    print(f"GSM8K: {len(train)} train / {len(holdout)} holdout")

    # Adapt style: no task_prompt is supplied, so the MAC agent derives the
    # worker prompt itself from task_description.
    mac_kwargs = dict(
        mac_model=args.mac_model,
        num_epochs=args.epochs,
        batch_size=4,
        provider="openai",
        task_description="Solve the math word problem step by step. Return only the final numeric answer.",
        rule_type="math reasoning rules",
    )
    # Optional vLLM endpoints; omitted kwargs fall back to MAC's defaults.
    if args.worker_base_url:
        mac_kwargs["base_url"] = args.worker_base_url
    if args.mac_base_url:
        mac_kwargs["mac_base_url"] = args.mac_base_url

    compiler = MAC(model=args.worker_model, **mac_kwargs)
    optimized = compiler.compile(trainset=train, holdout=holdout, metric=numeric_match)

    # Show the learned constitution and persist it for later reuse.
    optimized.overview()
    print(f"\nRules learned: {len(optimized.rules)}")
    for i, r in enumerate(optimized.rules, 1):
        print(f"  {i}. {r}")

    optimized.save(args.output)
    print(f"\nSaved to {args.output}")
"""
GSM8K — custom style (Mode 1: user-supplied prompt with {{CONSTITUTION_BLOCK}})

MAC learns constitution rules on GSM8K math problems.
You supply the full prompt; MAC injects learned rules at {{CONSTITUTION_BLOCK}}.

Usage:
    # OpenAI models (Config C / D)
    python examples/gsm8k_custom.py --worker-model gpt-4o-mini --mac-model gpt-5.2

    # vLLM worker + OpenAI MAC (Config A)
    python examples/gsm8k_custom.py \\
        --worker-model Qwen/Qwen3-8B --worker-base-url http://localhost:8000/v1 \\
        --mac-model gpt-5.2

    # All vLLM (Config B)
    python examples/gsm8k_custom.py \\
        --worker-model Qwen/Qwen3-8B --worker-base-url http://localhost:8000/v1 \\
        --mac-model Qwen/Qwen3-8B --mac-base-url http://localhost:8000/v1
"""

import sys, os, re, random, argparse
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from mac import Example, MAC

# Mode-1 (custom style) prompt template; MAC injects the learned rules
# at the {{CONSTITUTION_BLOCK}} placeholder.
TASK_PROMPT = """\
You are an expert math tutor solving grade-school math word problems.
Think step by step. Show your work clearly.

{{CONSTITUTION_BLOCK}}

Return your answer as JSON: {"reasoning": "...", "answer": "<number>"}"""


def load_gsm8k(n_train=80, n_holdout=16, seed=42):
    """Load a deterministically shuffled GSM8K slice, split train/holdout.

    Gold answers are taken from the text after '####' in the solution,
    with thousands separators removed; if the marker is missing the
    whole answer text is used as-is.
    """
    from datasets import load_dataset

    rows = list(load_dataset("gsm8k", "main", split="test"))
    # Seed the global RNG so the shuffle (and split) is reproducible.
    random.seed(seed)
    random.shuffle(rows)
    pool = rows[:n_train + n_holdout]

    examples = []
    for row in pool:
        found = re.search(r'####\s*(.+)', row["answer"])
        gold = found.group(1).strip().replace(",", "") if found else row["answer"].strip()
        examples.append(Example(input=row["question"], output=gold))

    return examples[:n_train], examples[n_train:]


def numeric_match(prediction, gold):
    """Metric: 1.0 if the numbers match, else 0.0.

    Compares numerically when both sides parse as floats — tolerating a
    trailing period and thousands separators in the prediction (models
    often emit "1,000." while load_gsm8k strips commas from gold) — and
    falls back to an exact trimmed string comparison otherwise.
    """
    try:
        pred_num = float(str(prediction).strip().rstrip('.').replace(",", ""))
        gold_num = float(str(gold).strip().replace(",", ""))
        return 1.0 if pred_num == gold_num else 0.0
    except (ValueError, TypeError):
        # Non-numeric output: require exact (trimmed) string equality.
        return 1.0 if str(prediction).strip() == str(gold).strip() else 0.0


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="GSM8K — custom prompt style")
    # Worker = the student model being optimized; MAC = the controller agents.
    parser.add_argument("--worker-model", default="gpt-4o-mini",
                        help="Worker (student) model. e.g. gpt-4o-mini, Qwen/Qwen3-8B")
    parser.add_argument("--worker-base-url", default=None,
                        help="Base URL for worker (e.g. http://localhost:8000/v1 for vLLM)")
    parser.add_argument("--mac-model", default="gpt-5.2",
                        help="MAC controller model (decision/proposer/editor agents)")
    parser.add_argument("--mac-base-url", default=None,
                        help="Base URL for MAC model (if hosted via vLLM)")
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--train", type=int, default=80)
    parser.add_argument("--holdout", type=int, default=16)
    parser.add_argument("--output", default="gsm8k_custom_rules.json")
    args = parser.parse_args()

    train, holdout = load_gsm8k(args.train, args.holdout)
    print(f"GSM8K: {len(train)} train / {len(holdout)} holdout")

    # Custom style: task_prompt supplies the full worker prompt; MAC only
    # fills in the {{CONSTITUTION_BLOCK}} placeholder with learned rules.
    mac_kwargs = dict(
        mac_model=args.mac_model,
        num_epochs=args.epochs,
        batch_size=4,
        provider="openai",
        task_prompt=TASK_PROMPT,
        task_description="Solve grade-school math word problems step by step.",
        rule_type="math reasoning rules",
    )
    # Optional vLLM endpoints; omitted kwargs fall back to MAC's defaults.
    if args.worker_base_url:
        mac_kwargs["base_url"] = args.worker_base_url
    if args.mac_base_url:
        mac_kwargs["mac_base_url"] = args.mac_base_url

    compiler = MAC(model=args.worker_model, **mac_kwargs)
    optimized = compiler.compile(trainset=train, holdout=holdout, metric=numeric_match)

    # Show the learned constitution and persist it for later reuse.
    optimized.overview()
    print(f"\nRules learned: {len(optimized.rules)}")
    for i, r in enumerate(optimized.rules, 1):
        print(f"  {i}. {r}")

    optimized.save(args.output)
    print(f"\nSaved to {args.output}")

HotpotQA (Multi-Hop QA)

Questions requiring reasoning across multiple Wikipedia paragraphs. MAC learns rules for entity tracking, bridge reasoning, and answer extraction.

"""
HotpotQA — adapt style (Mode 2: MAC auto-adapts prompt)

MAC learns constitution rules on HotpotQA multi-hop questions.
Prompt is automatically adapted by the MAC agent.

Usage:
    # OpenAI models (Config C / D)
    python examples/hotpotqa_adapt.py --worker-model gpt-4o-mini --mac-model gpt-5.2

    # vLLM worker + OpenAI MAC (Config A)
    python examples/hotpotqa_adapt.py \\
        --worker-model Qwen/Qwen3-8B --worker-base-url http://localhost:8000/v1 \\
        --mac-model gpt-5.2

    # All vLLM (Config B)
    python examples/hotpotqa_adapt.py \\
        --worker-model Qwen/Qwen3-8B --worker-base-url http://localhost:8000/v1 \\
        --mac-model Qwen/Qwen3-8B --mac-base-url http://localhost:8000/v1
"""

import sys, os, random, argparse
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from mac import Example, MAC


def load_hotpotqa(n_train=80, n_holdout=16, seed=42):
    """Load a deterministically shuffled HotpotQA (distractor) slice.

    Returns (train, holdout) lists of Example(question -> answer).
    """
    from datasets import load_dataset

    rows = list(load_dataset("hotpot_qa", "distractor", split="validation"))
    # Seed the global RNG so the shuffle (and split) is reproducible.
    random.seed(seed)
    random.shuffle(rows)
    selected = rows[:n_train + n_holdout]
    examples = [
        Example(input=row["question"], output=row["answer"])
        for row in selected
    ]
    return examples[:n_train], examples[n_train:]


def token_f1(prediction, gold):
    """Token-level F1 between prediction and gold (case-insensitive).

    Uses multiset (bag-of-tokens) overlap, as in the standard SQuAD /
    HotpotQA evaluation scripts: each token counts once per occurrence,
    so a prediction cannot inflate its score by repeating a gold token
    (the previous set-based version collapsed duplicates).
    """
    from collections import Counter

    pred_counts = Counter(str(prediction).lower().split())
    gold_counts = Counter(str(gold).lower().split())
    if not gold_counts:
        return 1.0 if not pred_counts else 0.0
    if not pred_counts:
        return 0.0
    # Size of the multiset intersection = true-positive token count.
    tp = sum((pred_counts & gold_counts).values())
    if tp == 0:
        return 0.0
    prec = tp / sum(pred_counts.values())
    rec = tp / sum(gold_counts.values())
    return 2 * prec * rec / (prec + rec)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="HotpotQA — adapt style")
    # Worker = the student model being optimized; MAC = the controller agents.
    parser.add_argument("--worker-model", default="gpt-4o-mini",
                        help="Worker (student) model. e.g. gpt-4o-mini, Qwen/Qwen3-8B")
    parser.add_argument("--worker-base-url", default=None,
                        help="Base URL for worker (e.g. http://localhost:8000/v1 for vLLM)")
    parser.add_argument("--mac-model", default="gpt-5.2",
                        help="MAC controller model (decision/proposer/editor agents)")
    parser.add_argument("--mac-base-url", default=None,
                        help="Base URL for MAC model (if hosted via vLLM)")
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--train", type=int, default=80)
    parser.add_argument("--holdout", type=int, default=16)
    parser.add_argument("--output", default="hotpotqa_adapt_rules.json")
    args = parser.parse_args()

    train, holdout = load_hotpotqa(args.train, args.holdout)
    print(f"HotpotQA: {len(train)} train / {len(holdout)} holdout")

    # Adapt style: no task_prompt is supplied, so the MAC agent derives the
    # worker prompt itself from task_description.
    mac_kwargs = dict(
        mac_model=args.mac_model,
        num_epochs=args.epochs,
        batch_size=4,
        provider="openai",
        task_description="Answer the multi-hop question concisely. Return only the answer.",
        rule_type="multi-hop QA reasoning rules",
    )
    # Optional vLLM endpoints; omitted kwargs fall back to MAC's defaults.
    if args.worker_base_url:
        mac_kwargs["base_url"] = args.worker_base_url
    if args.mac_base_url:
        mac_kwargs["mac_base_url"] = args.mac_base_url

    compiler = MAC(model=args.worker_model, **mac_kwargs)
    optimized = compiler.compile(trainset=train, holdout=holdout, metric=token_f1)

    # Show the learned constitution and persist it for later reuse.
    optimized.overview()
    print(f"\nRules learned: {len(optimized.rules)}")
    for i, r in enumerate(optimized.rules, 1):
        print(f"  {i}. {r}")

    optimized.save(args.output)
    print(f"\nSaved to {args.output}")
"""
HotpotQA — custom style (Mode 1: user-supplied prompt with {{CONSTITUTION_BLOCK}})

MAC learns constitution rules on HotpotQA multi-hop questions.
You supply the full prompt; MAC injects learned rules at {{CONSTITUTION_BLOCK}}.

Usage:
    # OpenAI models (Config C / D)
    python examples/hotpotqa_custom.py --worker-model gpt-4o-mini --mac-model gpt-5.2

    # vLLM worker + OpenAI MAC (Config A)
    python examples/hotpotqa_custom.py \\
        --worker-model Qwen/Qwen3-8B --worker-base-url http://localhost:8000/v1 \\
        --mac-model gpt-5.2

    # All vLLM (Config B)
    python examples/hotpotqa_custom.py \\
        --worker-model Qwen/Qwen3-8B --worker-base-url http://localhost:8000/v1 \\
        --mac-model Qwen/Qwen3-8B --mac-base-url http://localhost:8000/v1
"""

import sys, os, random, argparse
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from mac import Example, MAC

# Mode-1 (custom style) prompt template; MAC injects the learned rules
# at the {{CONSTITUTION_BLOCK}} placeholder.
TASK_PROMPT = """\
You are an expert question-answering system specializing in multi-hop reasoning.
Break complex questions into sub-questions, reason through each hop, then combine.

{{CONSTITUTION_BLOCK}}

Return your answer as JSON: {"reasoning": "...", "answer": "<concise answer>"}"""


def load_hotpotqa(n_train=80, n_holdout=16, seed=42):
    """Load a deterministically shuffled HotpotQA (distractor) slice.

    Returns (train, holdout) lists of Example(question -> answer).
    """
    from datasets import load_dataset

    rows = list(load_dataset("hotpot_qa", "distractor", split="validation"))
    # Seed the global RNG so the shuffle (and split) is reproducible.
    random.seed(seed)
    random.shuffle(rows)
    selected = rows[:n_train + n_holdout]
    examples = [
        Example(input=row["question"], output=row["answer"])
        for row in selected
    ]
    return examples[:n_train], examples[n_train:]


def token_f1(prediction, gold):
    """Token-level F1 between prediction and gold (case-insensitive).

    Uses multiset (bag-of-tokens) overlap, as in the standard SQuAD /
    HotpotQA evaluation scripts: each token counts once per occurrence,
    so a prediction cannot inflate its score by repeating a gold token
    (the previous set-based version collapsed duplicates).
    """
    from collections import Counter

    pred_counts = Counter(str(prediction).lower().split())
    gold_counts = Counter(str(gold).lower().split())
    if not gold_counts:
        return 1.0 if not pred_counts else 0.0
    if not pred_counts:
        return 0.0
    # Size of the multiset intersection = true-positive token count.
    tp = sum((pred_counts & gold_counts).values())
    if tp == 0:
        return 0.0
    prec = tp / sum(pred_counts.values())
    rec = tp / sum(gold_counts.values())
    return 2 * prec * rec / (prec + rec)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="HotpotQA — custom prompt style")
    # Worker = the student model being optimized; MAC = the controller agents.
    parser.add_argument("--worker-model", default="gpt-4o-mini",
                        help="Worker (student) model. e.g. gpt-4o-mini, Qwen/Qwen3-8B")
    parser.add_argument("--worker-base-url", default=None,
                        help="Base URL for worker (e.g. http://localhost:8000/v1 for vLLM)")
    parser.add_argument("--mac-model", default="gpt-5.2",
                        help="MAC controller model (decision/proposer/editor agents)")
    parser.add_argument("--mac-base-url", default=None,
                        help="Base URL for MAC model (if hosted via vLLM)")
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--train", type=int, default=80)
    parser.add_argument("--holdout", type=int, default=16)
    parser.add_argument("--output", default="hotpotqa_custom_rules.json")
    args = parser.parse_args()

    train, holdout = load_hotpotqa(args.train, args.holdout)
    print(f"HotpotQA: {len(train)} train / {len(holdout)} holdout")

    # Custom style: task_prompt supplies the full worker prompt; MAC only
    # fills in the {{CONSTITUTION_BLOCK}} placeholder with learned rules.
    mac_kwargs = dict(
        mac_model=args.mac_model,
        num_epochs=args.epochs,
        batch_size=4,
        provider="openai",
        task_prompt=TASK_PROMPT,
        task_description="Answer multi-hop questions by reasoning through sub-questions.",
        rule_type="multi-hop QA reasoning rules",
    )
    # Optional vLLM endpoints; omitted kwargs fall back to MAC's defaults.
    if args.worker_base_url:
        mac_kwargs["base_url"] = args.worker_base_url
    if args.mac_base_url:
        mac_kwargs["mac_base_url"] = args.mac_base_url

    compiler = MAC(model=args.worker_model, **mac_kwargs)
    optimized = compiler.compile(trainset=train, holdout=holdout, metric=token_f1)

    # Show the learned constitution and persist it for later reuse.
    optimized.overview()
    print(f"\nRules learned: {len(optimized.rules)}")
    for i, r in enumerate(optimized.rules, 1):
        print(f"  {i}. {r}")

    optimized.save(args.output)
    print(f"\nSaved to {args.output}")

HoVer (Fact Verification)

Multi-hop fact verification against Wikipedia evidence. MAC learns rules about evidence sufficiency, claim decomposition, and contradiction detection.

"""
HoVer — adapt style (Mode 2: MAC auto-adapts prompt)

MAC learns constitution rules on HoVer fact verification (yes/no).
Prompt is automatically adapted by the MAC agent.

Usage:
    # OpenAI models (Config C / D)
    python examples/hover_adapt.py --worker-model gpt-4o-mini --mac-model gpt-5.2

    # vLLM worker + OpenAI MAC (Config A)
    python examples/hover_adapt.py \\
        --worker-model Qwen/Qwen3-8B --worker-base-url http://localhost:8000/v1 \\
        --mac-model gpt-5.2

    # All vLLM (Config B)
    python examples/hover_adapt.py \\
        --worker-model Qwen/Qwen3-8B --worker-base-url http://localhost:8000/v1 \\
        --mac-model Qwen/Qwen3-8B --mac-base-url http://localhost:8000/v1
"""

import sys, os, random, argparse
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from mac import Example, MAC


def load_hover(n_train=80, n_holdout=16, seed=42):
    """Load yes/no questions from HotpotQA as a HoVer proxy.

    Filters the distractor validation split down to binary-answer rows,
    shuffles deterministically, and returns (train, holdout) Examples
    with lowercased "yes"/"no" gold outputs.
    """
    from datasets import load_dataset

    ds = load_dataset("hotpot_qa", "distractor", split="validation")
    candidates = [r for r in ds if str(r["answer"]).lower() in ("yes", "no")]
    # Seed the global RNG so the shuffle (and split) is reproducible.
    random.seed(seed)
    random.shuffle(candidates)
    chosen = candidates[:n_train + n_holdout]
    examples = [
        Example(input=r["question"], output=r["answer"].lower())
        for r in chosen
    ]
    return examples[:n_train], examples[n_train:]


def exact_match(prediction, gold):
    """Binary metric: 1.0 on a case-insensitive trimmed exact match, else 0.0."""
    def _norm(value):
        return str(value).strip().lower()

    return float(_norm(prediction) == _norm(gold))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="HoVer — adapt style")
    # Worker = the student model being optimized; MAC = the controller agents.
    parser.add_argument("--worker-model", default="gpt-4o-mini",
                        help="Worker (student) model. e.g. gpt-4o-mini, Qwen/Qwen3-8B")
    parser.add_argument("--worker-base-url", default=None,
                        help="Base URL for worker (e.g. http://localhost:8000/v1 for vLLM)")
    parser.add_argument("--mac-model", default="gpt-5.2",
                        help="MAC controller model (decision/proposer/editor agents)")
    parser.add_argument("--mac-base-url", default=None,
                        help="Base URL for MAC model (if hosted via vLLM)")
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--train", type=int, default=80)
    parser.add_argument("--holdout", type=int, default=16)
    parser.add_argument("--output", default="hover_adapt_rules.json")
    args = parser.parse_args()

    train, holdout = load_hover(args.train, args.holdout)
    print(f"HoVer: {len(train)} train / {len(holdout)} holdout")

    # Adapt style: no task_prompt is supplied, so the MAC agent derives the
    # worker prompt itself from task_description.
    mac_kwargs = dict(
        mac_model=args.mac_model,
        num_epochs=args.epochs,
        batch_size=4,
        provider="openai",
        task_description="Determine whether the claim is true or false. Answer 'yes' or 'no'.",
        rule_type="fact verification rules",
    )
    # Optional vLLM endpoints; omitted kwargs fall back to MAC's defaults.
    if args.worker_base_url:
        mac_kwargs["base_url"] = args.worker_base_url
    if args.mac_base_url:
        mac_kwargs["mac_base_url"] = args.mac_base_url

    compiler = MAC(model=args.worker_model, **mac_kwargs)
    optimized = compiler.compile(trainset=train, holdout=holdout, metric=exact_match)

    # Show the learned constitution and persist it for later reuse.
    optimized.overview()
    print(f"\nRules learned: {len(optimized.rules)}")
    for i, r in enumerate(optimized.rules, 1):
        print(f"  {i}. {r}")

    optimized.save(args.output)
    print(f"\nSaved to {args.output}")
"""
HoVer — custom style (Mode 1: user-supplied prompt with {{CONSTITUTION_BLOCK}})

MAC learns constitution rules on HoVer fact verification (yes/no).
You supply the full prompt; MAC injects learned rules at {{CONSTITUTION_BLOCK}}.

Usage:
    # OpenAI models (Config C / D)
    python examples/hover_custom.py --worker-model gpt-4o-mini --mac-model gpt-5.2

    # vLLM worker + OpenAI MAC (Config A)
    python examples/hover_custom.py \\
        --worker-model Qwen/Qwen3-8B --worker-base-url http://localhost:8000/v1 \\
        --mac-model gpt-5.2

    # All vLLM (Config B)
    python examples/hover_custom.py \\
        --worker-model Qwen/Qwen3-8B --worker-base-url http://localhost:8000/v1 \\
        --mac-model Qwen/Qwen3-8B --mac-base-url http://localhost:8000/v1
"""

import sys, os, random, argparse
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from mac import Example, MAC

# Mode-1 (custom style) prompt template; MAC injects the learned rules
# at the {{CONSTITUTION_BLOCK}} placeholder.
TASK_PROMPT = """\
You are a fact verification expert. Given a claim, determine whether it is
SUPPORTED ("yes") or NOT SUPPORTED ("no") based on your knowledge.
Reason carefully before deciding.

{{CONSTITUTION_BLOCK}}

Return your answer as JSON: {"reasoning": "...", "answer": "yes" or "no"}"""


def load_hover(n_train=80, n_holdout=16, seed=42):
    """Load yes/no questions from HotpotQA as a HoVer proxy.

    Filters the distractor validation split down to binary-answer rows,
    shuffles deterministically, and returns (train, holdout) Examples
    with lowercased "yes"/"no" gold outputs.
    """
    from datasets import load_dataset

    ds = load_dataset("hotpot_qa", "distractor", split="validation")
    candidates = [r for r in ds if str(r["answer"]).lower() in ("yes", "no")]
    # Seed the global RNG so the shuffle (and split) is reproducible.
    random.seed(seed)
    random.shuffle(candidates)
    chosen = candidates[:n_train + n_holdout]
    examples = [
        Example(input=r["question"], output=r["answer"].lower())
        for r in chosen
    ]
    return examples[:n_train], examples[n_train:]


def exact_match(prediction, gold):
    """Binary metric: 1.0 on a case-insensitive trimmed exact match, else 0.0."""
    def _norm(value):
        return str(value).strip().lower()

    return float(_norm(prediction) == _norm(gold))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="HoVer — custom prompt style")
    # Worker = the student model being optimized; MAC = the controller agents.
    parser.add_argument("--worker-model", default="gpt-4o-mini",
                        help="Worker (student) model. e.g. gpt-4o-mini, Qwen/Qwen3-8B")
    parser.add_argument("--worker-base-url", default=None,
                        help="Base URL for worker (e.g. http://localhost:8000/v1 for vLLM)")
    parser.add_argument("--mac-model", default="gpt-5.2",
                        help="MAC controller model (decision/proposer/editor agents)")
    parser.add_argument("--mac-base-url", default=None,
                        help="Base URL for MAC model (if hosted via vLLM)")
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--train", type=int, default=80)
    parser.add_argument("--holdout", type=int, default=16)
    parser.add_argument("--output", default="hover_custom_rules.json")
    args = parser.parse_args()

    train, holdout = load_hover(args.train, args.holdout)
    print(f"HoVer: {len(train)} train / {len(holdout)} holdout")

    # Custom style: task_prompt supplies the full worker prompt; MAC only
    # fills in the {{CONSTITUTION_BLOCK}} placeholder with learned rules.
    mac_kwargs = dict(
        mac_model=args.mac_model,
        num_epochs=args.epochs,
        batch_size=4,
        provider="openai",
        task_prompt=TASK_PROMPT,
        task_description="Determine whether claims are supported or not supported.",
        rule_type="fact verification rules",
    )
    # Optional vLLM endpoints; omitted kwargs fall back to MAC's defaults.
    if args.worker_base_url:
        mac_kwargs["base_url"] = args.worker_base_url
    if args.mac_base_url:
        mac_kwargs["mac_base_url"] = args.mac_base_url

    compiler = MAC(model=args.worker_model, **mac_kwargs)
    optimized = compiler.compile(trainset=train, holdout=holdout, metric=exact_match)

    # Show the learned constitution and persist it for later reuse.
    optimized.overview()
    print(f"\nRules learned: {len(optimized.rules)}")
    for i, r in enumerate(optimized.rules, 1):
        print(f"  {i}. {r}")

    optimized.save(args.output)
    print(f"\nSaved to {args.output}")