Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions fastchat/llm_judge/compute_agreement.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,97 @@ def get_mt_bench_agreement(data, judge1, judge2, ban):
raise Exception("Unsupported judges.")


def compute_position_bias(g1_winners, g2_winners):
    """Compute position bias rate and direction from paired judgments.

    Each pair comes from judging the same question twice: game 1 with the
    models in the original order and game 2 with the positions swapped.
    A judge with no position bias gives consistent verdicts across both.

    Args:
        g1_winners: list of winners from game 1 (original order).
        g2_winners: list of winners from game 2 (swapped order).

    Returns:
        bias_rate: fraction of pairs where g1 and g2 disagree.
        direction: "first" if judge favors position A, "second" if position B,
            "none" if balanced or no bias detected.

    Raises:
        ValueError: if the two lists have different lengths. Previously zip()
            silently truncated the shorter list while the denominator used
            len(g1_winners), yielding a wrong bias rate.
    """
    if len(g1_winners) != len(g2_winners):
        raise ValueError(
            "g1_winners and g2_winners must have the same length: "
            f"{len(g1_winners)} != {len(g2_winners)}"
        )
    total = len(g1_winners)
    if total == 0:
        return 0.0, "none"

    disagree = 0
    favor_first = 0  # judge picked position A in both games
    favor_second = 0  # judge picked position B in both games
    for g1, g2 in zip(g1_winners, g2_winners):
        if g1 != g2:
            disagree += 1
        # In game 1, model_1 sits in position A; in game 2, model_2 does.
        # g1=model_1 and g2=model_2 therefore means the judge always chose
        # whichever answer was shown first.
        if g1 == "model_1" and g2 == "model_2":
            favor_first += 1
        elif g1 == "model_2" and g2 == "model_1":
            favor_second += 1

    bias_rate = disagree / total
    if favor_first > favor_second:
        direction = "first"
    elif favor_second > favor_first:
        direction = "second"
    else:
        direction = "none"

    return bias_rate, direction


def compute_cohens_kappa(g1_winners, g2_winners):
    """Compute Cohen's kappa treating game1 and game2 as two raters.

    Categories are derived from the data itself (normally model_1, model_2,
    tie). The previous hard-coded category list dropped any unexpected label
    from the chance-agreement term while still counting it in the observed
    agreement, which skewed kappa when such labels slipped through filtering.

    Args:
        g1_winners: list of winners from game 1.
        g2_winners: list of winners from game 2.

    Returns:
        kappa: Cohen's kappa coefficient (0.0 for empty input).

    Raises:
        ValueError: if the two lists have different lengths (zip() would
            silently truncate while the denominator used the full length).
    """
    if len(g1_winners) != len(g2_winners):
        raise ValueError(
            "g1_winners and g2_winners must have the same length: "
            f"{len(g1_winners)} != {len(g2_winners)}"
        )
    total = len(g1_winners)
    if total == 0:
        return 0.0

    # Observed agreement
    agree = sum(1 for g1, g2 in zip(g1_winners, g2_winners) if g1 == g2)
    p_o = agree / total

    # Expected agreement by chance, over every category either rater used
    categories = set(g1_winners) | set(g2_winners)
    p_e = sum(
        (g1_winners.count(cat) / total) * (g2_winners.count(cat) / total)
        for cat in categories
    )

    # p_e == 1.0 means both raters are constant and identical: agreement is
    # trivially perfect, and the general formula would divide by zero.
    if p_e == 1.0:
        return 1.0

    kappa = (p_o - p_e) / (1.0 - p_e)
    return kappa


def interpret_kappa(kappa):
    """Map a kappa coefficient to a label on the Landis & Koch scale."""
    # Exclusive upper bounds paired with their labels; anything at or
    # above the last bound falls into the final bucket.
    buckets = (
        (0.0, "poor"),
        (0.21, "slight"),
        (0.41, "fair"),
        (0.61, "moderate"),
        (0.81, "substantial"),
    )
    for upper_bound, label in buckets:
        if kappa < upper_bound:
            return label
    return "almost perfect"


def run_mt_bench_agreement(judges, votefiles):
# votes[i]: List of votes
votes = []
Expand Down
73 changes: 69 additions & 4 deletions fastchat/llm_judge/show_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
import argparse
import pandas as pd

from fastchat.llm_judge.compute_agreement import (
compute_position_bias,
compute_cohens_kappa,
interpret_kappa,
)


def display_result_single(args):
if args.input_file is None:
Expand Down Expand Up @@ -91,6 +97,57 @@ def display_result_pairwise(args):
# print(df.sort_values(by="loss_rate", ascending=True))
print(df.sort_values(by="win_rate_adjusted", ascending=False))

# Print one-line consistency summary
g1_winners = df_all["g1_winner"].tolist()
g2_winners = df_all["g2_winner"].tolist()
bias_rate, _ = compute_position_bias(g1_winners, g2_winners)
kappa = compute_cohens_kappa(g1_winners, g2_winners)
print(
f"\n[Consistency] position bias: {bias_rate:.1%}, "
f"Cohen's kappa: {kappa:.3f} ({interpret_kappa(kappa)})"
)


def display_consistency_metrics(args):
    """Display detailed position bias and Cohen's kappa metrics.

    Reads the pairwise judgment file (args.input_file, or the default path
    derived from args.bench_name and args.judge_model), drops rows where
    either game errored, and prints agreement rate, position bias, and
    Cohen's kappa with its Landis & Koch interpretation.

    Args:
        args: parsed argparse namespace with input_file, bench_name, and
            judge_model attributes.
    """
    if args.input_file is None:
        input_file = (
            f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair.jsonl"
        )
    else:
        input_file = args.input_file

    print(f"Input file: {input_file}")
    df_all = pd.read_json(input_file, lines=True)
    # Rows where either game failed carry no consistency signal.
    df_all = df_all[(df_all["g1_winner"] != "error") & (df_all["g2_winner"] != "error")]

    g1_winners = df_all["g1_winner"].tolist()
    g2_winners = df_all["g2_winner"].tolist()
    total = len(g1_winners)

    if total == 0:
        print("No valid judgments found.")
        return

    # Position bias: how often swapping the answer order flips the verdict
    bias_rate, direction = compute_position_bias(g1_winners, g2_winners)

    # Cohen's kappa: chance-corrected agreement between the two games
    kappa = compute_cohens_kappa(g1_winners, g2_winners)
    interpretation = interpret_kappa(kappa)

    # Simple (uncorrected) agreement rate
    agree = sum(1 for g1, g2 in zip(g1_winners, g2_winners) if g1 == g2)
    agree_rate = agree / total

    # No placeholders in the banner, so a plain string (was an f-string)
    print("\n########## Consistency Metrics ##########")
    print(f"Total pairs: {total}")
    print(f"Agreement rate: {agree_rate:.1%} ({agree}/{total})")
    print(f"Position bias rate: {bias_rate:.1%}")
    print(f"Position bias direction: {direction}")
    print(f"Cohen's kappa: {kappa:.3f}")
    print(f"Interpretation: {interpretation} (Landis & Koch)")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
Expand All @@ -117,14 +174,22 @@ def display_result_pairwise(args):
"`single` runs single answer grading."
),
)
parser.add_argument(
"--show-consistency",
action="store_true",
help="Show detailed position bias and Cohen's kappa metrics.",
)
args = parser.parse_args()

if args.mode == "single":
if args.show_consistency:
display_consistency_metrics(args)
elif args.mode == "single":
display_result_func = display_result_single
print(f"Mode: {args.mode}")
display_result_func(args)
else:
if args.mode == "pairwise-all":
args.baseline_model = None
display_result_func = display_result_pairwise

print(f"Mode: {args.mode}")
display_result_func(args)
print(f"Mode: {args.mode}")
display_result_func(args)
113 changes: 113 additions & 0 deletions tests/test_consistency_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Unit tests for position bias and Cohen's kappa metrics."""
import unittest

from fastchat.llm_judge.compute_agreement import (
compute_position_bias,
compute_cohens_kappa,
interpret_kappa,
)


class TestPositionBias(unittest.TestCase):
    def test_no_bias(self):
        """Identical verdicts across both games imply zero bias."""
        winners = ["model_1", "model_2", "tie"]
        self.assertEqual(
            compute_position_bias(winners, list(winners)), (0.0, "none")
        )

    def test_full_first_position_bias(self):
        """A judge that always picks position A shows full 'first' bias."""
        rate, direction = compute_position_bias(["model_1"] * 3, ["model_2"] * 3)
        self.assertEqual((rate, direction), (1.0, "first"))

    def test_full_second_position_bias(self):
        """A judge that always picks position B shows full 'second' bias."""
        rate, direction = compute_position_bias(["model_2"] * 2, ["model_1"] * 2)
        self.assertEqual((rate, direction), (1.0, "second"))

    def test_mixed_bias(self):
        """Half the pairs disagree, but the two directions cancel out."""
        game1 = ["model_1", "model_2", "model_1", "model_2"]
        game2 = ["model_1", "model_2", "model_2", "model_1"]
        rate, direction = compute_position_bias(game1, game2)
        self.assertAlmostEqual(rate, 0.5)
        # one pair favors position A (idx 2), one favors position B (idx 3)
        self.assertEqual(direction, "none")

    def test_empty_input(self):
        """No data means no measurable bias."""
        self.assertEqual(compute_position_bias([], []), (0.0, "none"))


class TestCohensKappa(unittest.TestCase):
    def test_perfect_agreement(self):
        """Identical rating sequences yield kappa of exactly 1."""
        ratings = ["model_1", "model_2", "tie", "model_1"]
        self.assertAlmostEqual(compute_cohens_kappa(ratings, list(ratings)), 1.0)

    def test_no_agreement_disjoint(self):
        """Raters using disjoint categories: p_o = p_e = 0, so kappa = 0."""
        self.assertAlmostEqual(
            compute_cohens_kappa(["model_1"] * 3, ["model_2"] * 3), 0.0
        )

    def test_below_chance_agreement(self):
        """Systematic disagreement drives kappa below zero."""
        rater1 = ["model_1", "model_2", "tie", "model_1", "model_2", "tie"]
        rater2 = ["model_2", "tie", "model_1", "model_2", "tie", "model_1"]
        self.assertLess(compute_cohens_kappa(rater1, rater2), 0.0)

    def test_chance_agreement(self):
        """Observed agreement equal to chance agreement gives kappa ~ 0."""
        # p_o = 2/4 = 0.5 and p_e = 0.5 * 0.5 + 0.5 * 0.5 = 0.5
        rater1 = ["model_1", "model_2", "model_1", "model_2"]
        rater2 = ["model_2", "model_1", "model_1", "model_2"]
        self.assertAlmostEqual(compute_cohens_kappa(rater1, rater2), 0.0)

    def test_empty_input(self):
        """Empty input is defined to produce kappa of 0."""
        self.assertEqual(compute_cohens_kappa([], []), 0.0)

    def test_all_same_category(self):
        """Two constant, identical raters agree perfectly."""
        ties = ["tie", "tie", "tie"]
        self.assertAlmostEqual(compute_cohens_kappa(ties, ties), 1.0)


class TestInterpretKappa(unittest.TestCase):
    def test_scale(self):
        """Mid-bucket values map to the expected Landis & Koch labels."""
        cases = [
            (-0.1, "poor"),
            (0.1, "slight"),
            (0.3, "fair"),
            (0.5, "moderate"),
            (0.7, "substantial"),
            (0.9, "almost perfect"),
        ]
        for value, label in cases:
            with self.subTest(kappa=value):
                self.assertEqual(interpret_kappa(value), label)

    def test_boundaries(self):
        """Each threshold value belongs to the next-higher bucket."""
        boundary_cases = [
            (0.0, "slight"),
            (0.21, "fair"),
            (0.41, "moderate"),
            (0.61, "substantial"),
            (0.81, "almost perfect"),
        ]
        for value, label in boundary_cases:
            with self.subTest(kappa=value):
                self.assertEqual(interpret_kappa(value), label)


if __name__ == "__main__":
unittest.main()
Loading