"""
LLM as a pairwise judge. This script takes in two dataframes with prompts/responses,
uses AWS Bedrock's Claude as the LLM judge,
and prints out a win/tie/lose table.

https://aws.amazon.com/blogs/aws/anthropics-claude-3-sonnet-foundation-model-is-now-available-in-amazon-bedrock/
"""
import json
import boto3
import numpy as np
import pandas as pd
from typing import Optional
from botocore.exceptions import ClientError


class PairwiseBedRockLLMJudgeModule:
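    """
    Pairwise LLM-as-a-judge built on AWS Bedrock's Claude: each prompt is judged
    twice (original and swapped response order) to reduce position bias, and the
    two verdicts are combined into a win/tie/lose result.
    """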

    default_system_prompt = '''
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.

Instruction: {prompt}

Model Outputs: Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.

"model_identifier": "1", "output": """{response1}""" "model_identifier": "2", "output": """{response2}"""

Task: Evaluate the models on the basis of the quality and relevance of their results, and select the model that generated the best result. Reply with the identifier of the best model. Our evaluation will only take into account the first character of your answer, so make sure it contains only one of the identifiers and nothing else (no quotation marks, no spaces, no new lines, ...).
'''

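    # the judge is asked to reply with just a model identifier ("1" or "2");
    # only the first character of its answer is meant to carry the verdict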
    def __init__(
        self,
        prompt_col_name: str = "prompts",
        response1_col_name: str = "responses1",
        response2_col_name: str = "responses2",
        system_prompt: Optional[str] = None,
        model_id: str = "anthropic.claude-3-sonnet-20240229-v1:0",
        max_tokens: int = 512,
        temperature: float = 0.1,
    ):
        self.prompt_col_name = prompt_col_name
        self.response1_col_name = response1_col_name
        self.response2_col_name = response2_col_name
        self.model_id = model_id
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.system_prompt = system_prompt if system_prompt is not None else self.default_system_prompt

    def __call__(self, features):
        df_responses = self.generate_responses(features)
        df_responses = self.calculate_result(df_responses)
        return df_responses

    def generate_responses(self, features):
        """
        prompt/response1/response2 are pass-through columns saved for interpretability;
        the main goal is to obtain the judge's responses (with and without swapped
        positions, to account for position bias)
        """
        prompts = []
        responses1 = []
        responses2 = []
        judge_responses = []
        judge_responses_swapped_position = []
        for feature in features:
            prompt = feature[self.prompt_col_name]
            response1 = feature[self.response1_col_name]
            response2 = feature[self.response2_col_name]

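            # judge the same pair twice: once as given and once with the responses
            # swapped, so a consistent preference is separated from position bias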
            judge_prompt = self.system_prompt.format(
                prompt=prompt, response1=response1, response2=response2
            )
            judge_swapped_position_prompt = self.system_prompt.format(
                prompt=prompt, response1=response2, response2=response1
            )
            judge_response = self.call_bedrock(judge_prompt, self.model_id)
            judge_responses.append(judge_response)

            judge_response_swapped_position = self.call_bedrock(judge_swapped_position_prompt, self.model_id)
            judge_responses_swapped_position.append(judge_response_swapped_position)

            prompts.append(prompt)
            responses1.append(response1)
            responses2.append(response2)

        responses = {
            "prompts": prompts,
            "responses1": responses1,
            "responses2": responses2,
            "judge_responses": judge_responses,
            "judge_responses_swapped_position": judge_responses_swapped_position,
        }
        df_responses = pd.DataFrame(responses)
        return df_responses

    @staticmethod
    def calculate_result(df_responses):
        """Calculate the win/tie/lose result from the LLM judge's responses."""
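        # judge answers are the model identifiers "1" or "2" (as requested by the
        # prompt). A "win" means responses2 was preferred in both the original and
        # the swapped ordering, "lose" means responses1 was preferred in both, and
        # a "tie" means the verdict flipped with position (i.e. position bias).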
        conditions = [
            (df_responses['judge_responses'] > df_responses['judge_responses_swapped_position']),
            (df_responses['judge_responses'] == df_responses['judge_responses_swapped_position']),
            (df_responses['judge_responses'] < df_responses['judge_responses_swapped_position'])
        ]
        choices = ['win', 'tie', 'lose']
        df_responses['result'] = np.select(conditions, choices, default='error')
        return df_responses

    def call_bedrock(
        self,
        prompt,
        model_id
    ):
        """
        References
        ----------
        https://docs.aws.amazon.com/code-library/latest/ug/python_3_bedrock-runtime_code_examples.html#anthropic_claude
        """
        client = boto3.client("bedrock-runtime", region_name="us-east-1")

        # Format the request payload using the model's native structure;
        # note different model variants may have different native structures,
        # this one is for Claude.
        native_request = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "messages": [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": prompt}],
                }
            ],
        }

        request = json.dumps(native_request)
        try:
            response = client.invoke_model(modelId=model_id, body=request)
        except (ClientError, Exception) as e:
            print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")
            exit(1)

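        # Bedrock returns the model output as a JSON body; Claude's reply text
        # lives in the first element of the "content" list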
        model_response = json.loads(response["body"].read())
        response_text = model_response["content"][0]["text"]
        return response_text


if __name__ == "__main__":
    # model completions/answers; we treat prediction1's responses as the baseline/reference model
    prediction1_path = "prediction_instruction_3B_model"
    prediction2_path = "prediction_dpo_model_v7"
    llm_judge_response_path = "llm_judge_responses_v7.parquet"

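    # join on the prompt so each row pairs the two models' responses to the same prompt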
    df_prediction1 = pd.read_parquet(prediction1_path).rename(columns={"responses": "responses1"})
    df_prediction2 = pd.read_parquet(prediction2_path).rename(columns={"responses": "responses2"})
    df_prediction = df_prediction1.merge(df_prediction2, on=["prompts"])
    examples = df_prediction.to_dict("records")
    pairwise_judge = PairwiseBedRockLLMJudgeModule()
    df_responses = pairwise_judge(examples)
    df_responses.to_parquet(llm_judge_response_path, index=False)
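    # the win/tie/lose counts, read from the perspective of the second (candidate) model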
    print(df_responses["result"].value_counts())