Path: blob/master/deep_learning/llm/rlhf/claude_judge.py
"""1LLM as pairwise judge. This script takes in two dataframe with prompt/responses,2uses AWS bedrock's claude as LLM judge,3prints out a win/tie/lose table45https://aws.amazon.com/blogs/aws/anthropics-claude-3-sonnet-foundation-model-is-now-available-in-amazon-bedrock/6"""7import json8import boto39import numpy as np10import pandas as pd11from typing import Optional12from botocore.exceptions import ClientError131415class PairwiseBedRockLLMJudgeModule:1617default_system_prompt = '''18I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.1920Instruction: {prompt}2122Model Outputs: Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.2324"model_identifier": "1", "output": """{response1}""" "model_identifier": "2", "output": """{response2}"""2526Task Evaluate the models on the basis of the quality and relevance of their results, and select the model that generated the best result. Reply with the identifier of the best model. Our evaluation will only take into account the first character of your answer, so make sure it contains only one of the identifiers and nothing else (no quotation marks, no spaces, no new lines, ...).27'''2829def __init__(30self,31prompt_col_name: str = "prompts",32response1_col_name: str = "responses1",33response2_col_name: str = "responses2",34system_prompt: Optional[str] = None,35model_id: str = "anthropic.claude-3-sonnet-20240229-v1:0",36max_tokens: int = 512,37temperature: float = 0.1,38):39self.prompt_col_name = prompt_col_name40self.response1_col_name = response1_col_name41self.response2_col_name = response2_col_name42self.model_id = model_id43self.max_tokens = max_tokens44self.temperature = temperature45self.system_prompt = system_prompt if system_prompt is not None else self.default_system_prompt4647def __call__(self, features):48df_responses = self.generate_responses(features)49df_responses = self.calculate_result(df_responses)50return df_responses5152def generate_responses(self, features):53"""54prompt/response1/response2 are basically pass through column saved for interpretability55our main goal is to obtain judge's response (swapped position to account for position bias)56"""57prompts = []58responses1 = []59responses2 = []60judge_responses = []61judge_responses_swapped_position = []62for feature in features:63prompt = feature[self.prompt_col_name]64response1 = feature[self.response1_col_name]65response2 = feature[self.response2_col_name]6667judge_prompt = self.system_prompt.format(68prompt=prompt, response1=response1, response2=response269)70judge_swapped_position_prompt = self.system_prompt.format(71prompt=prompt, response1=response2, response2=response172)73judge_response = self.call_bedrock(judge_prompt, self.model_id)74judge_responses.append(judge_response)7576judge_response_swapped_position = self.call_bedrock(judge_swapped_position_prompt, self.model_id)77judge_responses_swapped_position.append(judge_response_swapped_position)7879prompts.append(prompt)80responses1.append(response1)81responses2.append(response2)8283responses = {84"prompts": prompts,85"responses1": responses1,86"responses2": responses2,87"judge_responses": judge_responses,88"judge_responses_swapped_position": judge_responses_swapped_position89}90df_responses = pd.DataFrame(responses)91return 
df_responses9293@staticmethod94def calculate_result(df_responses):95"""calculate win/tie/loss result from LLM judge's response"""96conditions = [97(df_responses['judge_responses'] > df_responses['judge_responses_swapped_position']),98(df_responses['judge_responses'] == df_responses['judge_responses_swapped_position']),99(df_responses['judge_responses'] < df_responses['judge_responses_swapped_position'])100]101choices = ['win', 'tie', 'lose']102df_responses['result'] = np.select(conditions, choices, default='error')103return df_responses104105def call_bedrock(106self,107prompt,108model_id109):110"""111References112----------113https://docs.aws.amazon.com/code-library/latest/ug/python_3_bedrock-runtime_code_examples.html#anthropic_claude114"""115client = boto3.client("bedrock-runtime", region_name="us-east-1")116117# Format the request payload using the model's native structure,118# note different model variants may have different native structure,119# this is for claude120native_request = {121"anthropic_version": "bedrock-2023-05-31",122"max_tokens": self.max_tokens,123"temperature": self.temperature,124"messages": [125{126"role": "user",127"content": [{"type": "text", "text": prompt}],128}129],130}131132request = json.dumps(native_request)133try:134response = client.invoke_model(modelId=model_id, body=request)135except (ClientError, Exception) as e:136print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")137exit(1)138139model_response = json.loads(response["body"].read())140response_text = model_response["content"][0]["text"]141return response_text142143144145if __name__ == "__main__":146# model completion/answer, we treat prediction1's as our baseline/reference model147prediction1_path = "prediction_instruction_3B_model"148prediction2_path = "prediction_dpo_model_v7"149llm_judge_response_path = "llm_judge_responses_v7.parquet"150151df_prediction1 = pd.read_parquet(prediction1_path).rename(columns={"responses": "responses1"})152df_prediction2 = pd.read_parquet(prediction2_path).rename(columns={"responses": "responses2"})153df_prediction = df_prediction1.merge(df_prediction2, on=["prompts"])154examples = df_prediction.to_dict("records")155pairwise_judge = PairwiseBedRockLLMJudgeModule()156df_responses = pairwise_judge(examples)157df_responses.to_parquet(llm_judge_response_path, index=False)158print(df_responses["result"].value_counts())159160161
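

# --------------------------------------------------------------------------------
# Optional post-processing sketch (not part of the original script): once the
# win/tie/lose counts are available, one common convention for summarizing a
# pairwise comparison is an adjusted win rate that counts each tie as half a win.
# The helper below is a hypothetical addition illustrating that idea; it only
# assumes the "result" column produced by calculate_result above.
def adjusted_win_rate(df_responses: pd.DataFrame) -> float:
    """Candidate model's win rate against the baseline, counting ties as half a win."""
    counts = df_responses["result"].value_counts()
    wins = counts.get("win", 0)
    ties = counts.get("tie", 0)
    losses = counts.get("lose", 0)
    total = wins + ties + losses
    return (wins + 0.5 * ties) / total if total > 0 else float("nan")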