Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions fastchat/llm_judge/compute_agreement.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,97 @@ def get_mt_bench_agreement(data, judge1, judge2, ban):
raise Exception("Unsupported judges.")


def compute_position_bias(g1_winners, g2_winners):
    """Compute position bias rate and direction from paired judgments.

    Each pair comes from judging the same question twice: game 1 with the
    models in the original order and game 2 with the positions swapped.
    A judge with no position bias gives consistent verdicts across both.

    Args:
        g1_winners: list of winners from game 1 (original order).
        g2_winners: list of winners from game 2 (swapped order).

    Returns:
        bias_rate: fraction of pairs where g1 and g2 disagree.
        direction: "first" if judge favors position A, "second" if position B,
            "none" if balanced or no bias detected.

    Raises:
        ValueError: if the two lists have different lengths. Previously zip()
            silently truncated the shorter list while the denominator used
            len(g1_winners), yielding a wrong bias rate.
    """
    if len(g1_winners) != len(g2_winners):
        raise ValueError(
            "g1_winners and g2_winners must have the same length: "
            f"{len(g1_winners)} != {len(g2_winners)}"
        )
    total = len(g1_winners)
    if total == 0:
        return 0.0, "none"

    disagree = 0
    favor_first = 0  # judge picked position A in both games
    favor_second = 0  # judge picked position B in both games
    for g1, g2 in zip(g1_winners, g2_winners):
        if g1 != g2:
            disagree += 1
        # In game 1, model_1 sits in position A; in game 2, model_2 does.
        # g1=model_1 and g2=model_2 therefore means the judge always chose
        # whichever answer was shown first.
        if g1 == "model_1" and g2 == "model_2":
            favor_first += 1
        elif g1 == "model_2" and g2 == "model_1":
            favor_second += 1

    bias_rate = disagree / total
    if favor_first > favor_second:
        direction = "first"
    elif favor_second > favor_first:
        direction = "second"
    else:
        direction = "none"

    return bias_rate, direction


def compute_cohens_kappa(g1_winners, g2_winners):
    """Compute Cohen's kappa treating game1 and game2 as two raters.

    Categories are derived from the data itself (normally model_1, model_2,
    tie). The previous hard-coded category list dropped any unexpected label
    from the chance-agreement term while still counting it in the observed
    agreement, which skewed kappa when such labels slipped through filtering.

    Args:
        g1_winners: list of winners from game 1.
        g2_winners: list of winners from game 2.

    Returns:
        kappa: Cohen's kappa coefficient (0.0 for empty input).

    Raises:
        ValueError: if the two lists have different lengths (zip() would
            silently truncate while the denominator used the full length).
    """
    if len(g1_winners) != len(g2_winners):
        raise ValueError(
            "g1_winners and g2_winners must have the same length: "
            f"{len(g1_winners)} != {len(g2_winners)}"
        )
    total = len(g1_winners)
    if total == 0:
        return 0.0

    # Observed agreement
    agree = sum(1 for g1, g2 in zip(g1_winners, g2_winners) if g1 == g2)
    p_o = agree / total

    # Expected agreement by chance, over every category either rater used
    categories = set(g1_winners) | set(g2_winners)
    p_e = sum(
        (g1_winners.count(cat) / total) * (g2_winners.count(cat) / total)
        for cat in categories
    )

    # p_e == 1.0 means both raters are constant and identical: agreement is
    # trivially perfect, and the general formula would divide by zero.
    if p_e == 1.0:
        return 1.0

    kappa = (p_o - p_e) / (1.0 - p_e)
    return kappa


def interpret_kappa(kappa):
    """Map a kappa coefficient to a label on the Landis & Koch scale."""
    # Exclusive upper bounds paired with their labels; anything at or
    # above the last bound falls into the final bucket.
    buckets = (
        (0.0, "poor"),
        (0.21, "slight"),
        (0.41, "fair"),
        (0.61, "moderate"),
        (0.81, "substantial"),
    )
    for upper_bound, label in buckets:
        if kappa < upper_bound:
            return label
    return "almost perfect"


def run_mt_bench_agreement(judges, votefiles):
# votes[i]: List of votes
votes = []
Expand Down
73 changes: 69 additions & 4 deletions fastchat/llm_judge/show_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
import argparse
import pandas as pd

from fastchat.llm_judge.compute_agreement import (
compute_position_bias,
compute_cohens_kappa,
interpret_kappa,
)


def display_result_single(args):
if args.input_file is None:
Expand Down Expand Up @@ -91,6 +97,57 @@ def display_result_pairwise(args):
# print(df.sort_values(by="loss_rate", ascending=True))
print(df.sort_values(by="win_rate_adjusted", ascending=False))

# Print one-line consistency summary
g1_winners = df_all["g1_winner"].tolist()
g2_winners = df_all["g2_winner"].tolist()
bias_rate, _ = compute_position_bias(g1_winners, g2_winners)
kappa = compute_cohens_kappa(g1_winners, g2_winners)
print(
f"\n[Consistency] position bias: {bias_rate:.1%}, "
f"Cohen's kappa: {kappa:.3f} ({interpret_kappa(kappa)})"
)


def display_consistency_metrics(args):
    """Display detailed position bias and Cohen's kappa metrics.

    Reads the pairwise judgment file (args.input_file, or the default path
    derived from args.bench_name and args.judge_model), drops rows where
    either game errored, and prints agreement rate, position bias, and
    Cohen's kappa with its Landis & Koch interpretation.

    Args:
        args: parsed argparse namespace with input_file, bench_name, and
            judge_model attributes.
    """
    if args.input_file is None:
        input_file = (
            f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair.jsonl"
        )
    else:
        input_file = args.input_file

    print(f"Input file: {input_file}")
    df_all = pd.read_json(input_file, lines=True)
    # Rows where either game failed carry no consistency signal.
    df_all = df_all[(df_all["g1_winner"] != "error") & (df_all["g2_winner"] != "error")]

    g1_winners = df_all["g1_winner"].tolist()
    g2_winners = df_all["g2_winner"].tolist()
    total = len(g1_winners)

    if total == 0:
        print("No valid judgments found.")
        return

    # Position bias: how often swapping the answer order flips the verdict
    bias_rate, direction = compute_position_bias(g1_winners, g2_winners)

    # Cohen's kappa: chance-corrected agreement between the two games
    kappa = compute_cohens_kappa(g1_winners, g2_winners)
    interpretation = interpret_kappa(kappa)

    # Simple (uncorrected) agreement rate
    agree = sum(1 for g1, g2 in zip(g1_winners, g2_winners) if g1 == g2)
    agree_rate = agree / total

    # No placeholders in the banner, so a plain string (was an f-string)
    print("\n########## Consistency Metrics ##########")
    print(f"Total pairs: {total}")
    print(f"Agreement rate: {agree_rate:.1%} ({agree}/{total})")
    print(f"Position bias rate: {bias_rate:.1%}")
    print(f"Position bias direction: {direction}")
    print(f"Cohen's kappa: {kappa:.3f}")
    print(f"Interpretation: {interpretation} (Landis & Koch)")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
Expand All @@ -117,14 +174,22 @@ def display_result_pairwise(args):
"`single` runs single answer grading."
),
)
parser.add_argument(
"--show-consistency",
action="store_true",
help="Show detailed position bias and Cohen's kappa metrics.",
)
args = parser.parse_args()

if args.mode == "single":
if args.show_consistency:
display_consistency_metrics(args)
elif args.mode == "single":
display_result_func = display_result_single
print(f"Mode: {args.mode}")
display_result_func(args)
else:
if args.mode == "pairwise-all":
args.baseline_model = None
display_result_func = display_result_pairwise

print(f"Mode: {args.mode}")
display_result_func(args)
print(f"Mode: {args.mode}")
display_result_func(args)
113 changes: 113 additions & 0 deletions tests/test_consistency_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Unit tests for position bias and Cohen's kappa metrics."""
import unittest

from fastchat.llm_judge.compute_agreement import (
compute_position_bias,
compute_cohens_kappa,
interpret_kappa,
)


class TestPositionBias(unittest.TestCase):
    def test_no_bias(self):
        """Identical verdicts across both games imply zero bias."""
        winners = ["model_1", "model_2", "tie"]
        self.assertEqual(
            compute_position_bias(winners, list(winners)), (0.0, "none")
        )

    def test_full_first_position_bias(self):
        """A judge that always picks position A shows full 'first' bias."""
        rate, direction = compute_position_bias(["model_1"] * 3, ["model_2"] * 3)
        self.assertEqual((rate, direction), (1.0, "first"))

    def test_full_second_position_bias(self):
        """A judge that always picks position B shows full 'second' bias."""
        rate, direction = compute_position_bias(["model_2"] * 2, ["model_1"] * 2)
        self.assertEqual((rate, direction), (1.0, "second"))

    def test_mixed_bias(self):
        """Half the pairs disagree, but the two directions cancel out."""
        game1 = ["model_1", "model_2", "model_1", "model_2"]
        game2 = ["model_1", "model_2", "model_2", "model_1"]
        rate, direction = compute_position_bias(game1, game2)
        self.assertAlmostEqual(rate, 0.5)
        # one pair favors position A (idx 2), one favors position B (idx 3)
        self.assertEqual(direction, "none")

    def test_empty_input(self):
        """No data means no measurable bias."""
        self.assertEqual(compute_position_bias([], []), (0.0, "none"))


class TestCohensKappa(unittest.TestCase):
    def test_perfect_agreement(self):
        """Identical rating sequences yield kappa of exactly 1."""
        ratings = ["model_1", "model_2", "tie", "model_1"]
        self.assertAlmostEqual(compute_cohens_kappa(ratings, list(ratings)), 1.0)

    def test_no_agreement_disjoint(self):
        """Raters using disjoint categories: p_o = p_e = 0, so kappa = 0."""
        self.assertAlmostEqual(
            compute_cohens_kappa(["model_1"] * 3, ["model_2"] * 3), 0.0
        )

    def test_below_chance_agreement(self):
        """Systematic disagreement drives kappa below zero."""
        rater1 = ["model_1", "model_2", "tie", "model_1", "model_2", "tie"]
        rater2 = ["model_2", "tie", "model_1", "model_2", "tie", "model_1"]
        self.assertLess(compute_cohens_kappa(rater1, rater2), 0.0)

    def test_chance_agreement(self):
        """Observed agreement equal to chance agreement gives kappa ~ 0."""
        # p_o = 2/4 = 0.5 and p_e = 0.5 * 0.5 + 0.5 * 0.5 = 0.5
        rater1 = ["model_1", "model_2", "model_1", "model_2"]
        rater2 = ["model_2", "model_1", "model_1", "model_2"]
        self.assertAlmostEqual(compute_cohens_kappa(rater1, rater2), 0.0)

    def test_empty_input(self):
        """Empty input is defined to produce kappa of 0."""
        self.assertEqual(compute_cohens_kappa([], []), 0.0)

    def test_all_same_category(self):
        """Two constant, identical raters agree perfectly."""
        ties = ["tie", "tie", "tie"]
        self.assertAlmostEqual(compute_cohens_kappa(ties, ties), 1.0)


class TestInterpretKappa(unittest.TestCase):
    def test_scale(self):
        """Mid-bucket values map to the expected Landis & Koch labels."""
        cases = [
            (-0.1, "poor"),
            (0.1, "slight"),
            (0.3, "fair"),
            (0.5, "moderate"),
            (0.7, "substantial"),
            (0.9, "almost perfect"),
        ]
        for value, label in cases:
            with self.subTest(kappa=value):
                self.assertEqual(interpret_kappa(value), label)

    def test_boundaries(self):
        """Each threshold value belongs to the next-higher bucket."""
        boundary_cases = [
            (0.0, "slight"),
            (0.21, "fair"),
            (0.41, "moderate"),
            (0.61, "substantial"),
            (0.81, "almost perfect"),
        ]
        for value, label in boundary_cases:
            with self.subTest(kappa=value):
                self.assertEqual(interpret_kappa(value), label)


if __name__ == "__main__":
unittest.main()
Loading