def compute_position_bias(g1_winners, g2_winners):
    """Measure position bias from paired (original/swapped) judgments.

    Args:
        g1_winners: winners from game 1 (models presented in original order).
        g2_winners: winners from game 2 (models presented in swapped order).

    Returns:
        A ``(bias_rate, direction)`` tuple: the fraction of pairs whose two
        games disagree, and ``"first"`` / ``"second"`` / ``"none"`` depending
        on which presentation position the judge systematically favors.
    """
    n_pairs = len(g1_winners)
    if n_pairs == 0:
        return 0.0, "none"

    paired = list(zip(g1_winners, g2_winners))
    flips = sum(1 for a, b in paired if a != b)

    # A judge that always prefers position A picks model_1 in game 1 and
    # model_2 in game 2 (model_2 sits in position A after the swap); the
    # mirrored pattern indicates a preference for position B.
    toward_a = sum(1 for a, b in paired if a == "model_1" and b == "model_2")
    toward_b = sum(1 for a, b in paired if a == "model_2" and b == "model_1")

    if toward_a > toward_b:
        leaning = "first"
    elif toward_b > toward_a:
        leaning = "second"
    else:
        leaning = "none"

    return flips / n_pairs, leaning
+ """ + categories = ["model_1", "model_2", "tie"] + total = len(g1_winners) + if total == 0: + return 0.0 + + # Observed agreement + agree = sum(1 for g1, g2 in zip(g1_winners, g2_winners) if g1 == g2) + p_o = agree / total + + # Expected agreement by chance + p_e = 0.0 + for cat in categories: + p1 = sum(1 for g in g1_winners if g == cat) / total + p2 = sum(1 for g in g2_winners if g == cat) / total + p_e += p1 * p2 + + if p_e == 1.0: + return 1.0 + + kappa = (p_o - p_e) / (1.0 - p_e) + return kappa + + +def interpret_kappa(kappa): + """Interpret kappa using the Landis & Koch scale.""" + if kappa < 0.0: + return "poor" + elif kappa < 0.21: + return "slight" + elif kappa < 0.41: + return "fair" + elif kappa < 0.61: + return "moderate" + elif kappa < 0.81: + return "substantial" + else: + return "almost perfect" + + def run_mt_bench_agreement(judges, votefiles): # votes[i]: List of votes votes = [] diff --git a/fastchat/llm_judge/show_result.py b/fastchat/llm_judge/show_result.py index f20801b11..458541256 100644 --- a/fastchat/llm_judge/show_result.py +++ b/fastchat/llm_judge/show_result.py @@ -5,6 +5,12 @@ import argparse import pandas as pd +from fastchat.llm_judge.compute_agreement import ( + compute_position_bias, + compute_cohens_kappa, + interpret_kappa, +) + def display_result_single(args): if args.input_file is None: @@ -91,6 +97,57 @@ def display_result_pairwise(args): # print(df.sort_values(by="loss_rate", ascending=True)) print(df.sort_values(by="win_rate_adjusted", ascending=False)) + # Print one-line consistency summary + g1_winners = df_all["g1_winner"].tolist() + g2_winners = df_all["g2_winner"].tolist() + bias_rate, _ = compute_position_bias(g1_winners, g2_winners) + kappa = compute_cohens_kappa(g1_winners, g2_winners) + print( + f"\n[Consistency] position bias: {bias_rate:.1%}, " + f"Cohen's kappa: {kappa:.3f} ({interpret_kappa(kappa)})" + ) + + +def display_consistency_metrics(args): + """Display detailed position bias and Cohen's kappa 
metrics.""" + if args.input_file is None: + input_file = ( + f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair.jsonl" + ) + else: + input_file = args.input_file + + print(f"Input file: {input_file}") + df_all = pd.read_json(input_file, lines=True) + df_all = df_all[(df_all["g1_winner"] != "error") & (df_all["g2_winner"] != "error")] + + g1_winners = df_all["g1_winner"].tolist() + g2_winners = df_all["g2_winner"].tolist() + total = len(g1_winners) + + if total == 0: + print("No valid judgments found.") + return + + # Position bias + bias_rate, direction = compute_position_bias(g1_winners, g2_winners) + + # Cohen's kappa + kappa = compute_cohens_kappa(g1_winners, g2_winners) + interpretation = interpret_kappa(kappa) + + # Simple agreement + agree = sum(1 for g1, g2 in zip(g1_winners, g2_winners) if g1 == g2) + agree_rate = agree / total + + print(f"\n########## Consistency Metrics ##########") + print(f"Total pairs: {total}") + print(f"Agreement rate: {agree_rate:.1%} ({agree}/{total})") + print(f"Position bias rate: {bias_rate:.1%}") + print(f"Position bias direction: {direction}") + print(f"Cohen's kappa: {kappa:.3f}") + print(f"Interpretation: {interpretation} (Landis & Koch)") + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -117,14 +174,22 @@ def display_result_pairwise(args): "`single` runs single answer grading." 
"""Unit tests for position bias and Cohen's kappa metrics."""
import unittest

from fastchat.llm_judge.compute_agreement import (
    compute_position_bias,
    compute_cohens_kappa,
    interpret_kappa,
)


class TestPositionBias(unittest.TestCase):
    def test_no_bias(self):
        """Identical verdicts in both games -> zero bias, no direction."""
        verdicts = ["model_1", "model_2", "tie"]
        rate, direction = compute_position_bias(verdicts, list(verdicts))
        self.assertEqual(rate, 0.0)
        self.assertEqual(direction, "none")

    def test_full_first_position_bias(self):
        """Judge always picks whatever sits in position A -> 'first'."""
        rate, direction = compute_position_bias(["model_1"] * 3, ["model_2"] * 3)
        self.assertEqual(rate, 1.0)
        self.assertEqual(direction, "first")

    def test_full_second_position_bias(self):
        """Judge always picks whatever sits in position B -> 'second'."""
        rate, direction = compute_position_bias(["model_2"] * 2, ["model_1"] * 2)
        self.assertEqual(rate, 1.0)
        self.assertEqual(direction, "second")

    def test_mixed_bias(self):
        """Half the pairs flip, one flip each way -> balanced, 'none'."""
        g1 = ["model_1", "model_2", "model_1", "model_2"]
        g2 = ["model_1", "model_2", "model_2", "model_1"]
        rate, direction = compute_position_bias(g1, g2)
        self.assertAlmostEqual(rate, 0.5)
        self.assertEqual(direction, "none")

    def test_empty_input(self):
        self.assertEqual(compute_position_bias([], []), (0.0, "none"))


class TestCohensKappa(unittest.TestCase):
    def test_perfect_agreement(self):
        verdicts = ["model_1", "model_2", "tie", "model_1"]
        self.assertAlmostEqual(compute_cohens_kappa(verdicts, list(verdicts)), 1.0)

    def test_no_agreement_disjoint(self):
        """Disjoint categories: p_o = 0 and p_e = 0 -> kappa = 0."""
        self.assertAlmostEqual(
            compute_cohens_kappa(["model_1"] * 3, ["model_2"] * 3), 0.0
        )

    def test_below_chance_agreement(self):
        """Systematic disagreement across all categories -> kappa < 0."""
        g1 = ["model_1", "model_2", "tie", "model_1", "model_2", "tie"]
        g2 = ["model_2", "tie", "model_1", "model_2", "tie", "model_1"]
        self.assertLess(compute_cohens_kappa(g1, g2), 0.0)

    def test_chance_agreement(self):
        """p_o = 0.5 equals p_e = 0.5 -> kappa ~ 0."""
        g1 = ["model_1", "model_2", "model_1", "model_2"]
        g2 = ["model_2", "model_1", "model_1", "model_2"]
        self.assertAlmostEqual(compute_cohens_kappa(g1, g2), 0.0)

    def test_empty_input(self):
        self.assertEqual(compute_cohens_kappa([], []), 0.0)

    def test_all_same_category(self):
        """Unanimous single category on both sides -> kappa = 1."""
        self.assertAlmostEqual(compute_cohens_kappa(["tie"] * 3, ["tie"] * 3), 1.0)


class TestInterpretKappa(unittest.TestCase):
    def test_scale(self):
        for value, label in [
            (-0.1, "poor"),
            (0.1, "slight"),
            (0.3, "fair"),
            (0.5, "moderate"),
            (0.7, "substantial"),
            (0.9, "almost perfect"),
        ]:
            self.assertEqual(interpret_kappa(value), label)

    def test_boundaries(self):
        for value, label in [
            (0.0, "slight"),
            (0.21, "fair"),
            (0.41, "moderate"),
            (0.61, "substantial"),
            (0.81, "almost perfect"),
        ]:
            self.assertEqual(interpret_kappa(value), label)


if __name__ == "__main__":
    unittest.main()