Path: blob/master/deep_learning/llm/rlhf/claude_judge.py
"""1LLM as pairwise judge. This script takes in two dataframe with prompt/responses,2uses AWS bedrock's claude as LLM judge,3prints out a win/tie/lose table45https://aws.amazon.com/blogs/aws/anthropics-claude-3-sonnet-foundation-model-is-now-available-in-amazon-bedrock/6"""7import json8import boto39import numpy as np10import pandas as pd11from typing import Optional12from botocore.exceptions import ClientError131415class PairwiseBedRockLLMJudgeModule:1617default_system_prompt = '''18I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.1920Instruction: {prompt}2122Model Outputs: Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.2324"model_identifier": "1", "output": """{response1}""" "model_identifier": "2", "output": """{response2}"""2526Task Evaluate the models on the basis of the quality and relevance of their results, and select the model that generated the best result. Reply with the identifier of the best model. Our evaluation will only take into account the first character of your answer, so make sure it contains only one of the identifiers and nothing else (no quotation marks, no spaces, no new lines, ...).27'''2829def __init__(30self,31prompt_col_name: str = "prompts",32response1_col_name: str = "responses1",33response2_col_name: str = "responses2",34system_prompt: Optional[str] = None,35model_id: str = "anthropic.claude-3-sonnet-20240229-v1:0",36max_tokens: int = 512,37temperature: float = 0.1,38):39self.prompt_col_name = prompt_col_name40self.response1_col_name = response1_col_name41self.response2_col_name = response2_col_name42self.model_id = model_id43self.max_tokens = max_tokens44self.temperature = temperature45self.system_prompt = system_prompt if system_prompt is not None else self.default_system_prompt4647def __call__(self, features):48df_responses = self.generate_responses(features)49df_responses = self.calculate_result(df_responses)50return df_responses5152def generate_responses(self, features):53"""54prompt/response1/response2 are basically pass through column saved for interpretability55our main goal is to obtain judge's response (swapped position to account for position bias)56"""57prompts = []58responses1 = []59responses2 = []60judge_responses = []61judge_responses_swapped_position = []62for feature in features:63prompt = feature[self.prompt_col_name]64response1 = feature[self.response1_col_name]65response2 = feature[self.response2_col_name]6667judge_prompt = self.system_prompt.format(68prompt=prompt, response1=response1, response2=response269)70judge_swapped_position_prompt = self.system_prompt.format(71prompt=prompt, response1=response2, response2=response172)73judge_response = self.call_bedrock(judge_prompt, self.model_id)74judge_responses.append(judge_response)7576judge_response_swapped_position = self.call_bedrock(judge_swapped_position_prompt, self.model_id)77judge_responses_swapped_position.append(judge_response_swapped_position)7879prompts.append(prompt)80responses1.append(response1)81responses2.append(response2)8283responses = {84"prompts": prompts,85"responses1": responses1,86"responses2": responses2,87"judge_responses": judge_responses,88"judge_responses_swapped_position": judge_responses_swapped_position89}90df_responses = pd.DataFrame(responses)91return 
df_responses9293@staticmethod94def calculate_result(df_responses):95"""calculate win/tie/loss result from LLM judge's response"""96conditions = [97(df_responses['judge_responses'] > df_responses['judge_responses_swapped_position']),98(df_responses['judge_responses'] == df_responses['judge_responses_swapped_position']),99(df_responses['judge_responses'] < df_responses['judge_responses_swapped_position'])100]101choices = ['win', 'tie', 'lose']102df_responses['result'] = np.select(conditions, choices, default='error')103return df_responses104105def call_bedrock(106self,107prompt,108model_id109):110"""111References112----------113https://docs.aws.amazon.com/code-library/latest/ug/python_3_bedrock-runtime_code_examples.html#anthropic_claude114"""115client = boto3.client("bedrock-runtime", region_name="us-east-1")116117# Format the request payload using the model's native structure,118# note different model variants may have different native structure,119# this is for claude120native_request = {121"anthropic_version": "bedrock-2023-05-31",122"max_tokens": self.max_tokens,123"temperature": self.temperature,124"messages": [125{126"role": "user",127"content": [{"type": "text", "text": prompt}],128}129],130}131132request = json.dumps(native_request)133try:134response = client.invoke_model(modelId=model_id, body=request)135except (ClientError, Exception) as e:136print(f"ERROR: Can't invoke '{model_id}'. Reason: {e}")137exit(1)138139model_response = json.loads(response["body"].read())140response_text = model_response["content"][0]["text"]141return response_text142143144145if __name__ == "__main__":146# model completion/answer, we treat prediction1's as our baseline/reference model147prediction1_path = "prediction_instruction_3B_model"148prediction2_path = "prediction_dpo_model_v7"149llm_judge_response_path = "llm_judge_responses_v7.parquet"150151df_prediction1 = pd.read_parquet(prediction1_path).rename(columns={"responses": "responses1"})152df_prediction2 = pd.read_parquet(prediction2_path).rename(columns={"responses": "responses2"})153df_prediction = df_prediction1.merge(df_prediction2, on=["prompts"])154examples = df_prediction.to_dict("records")155pairwise_judge = PairwiseBedRockLLMJudgeModule()156df_responses = pairwise_judge(examples)157df_responses.to_parquet(llm_judge_response_path, index=False)158print(df_responses["result"].value_counts())159160161
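

# --------------------------------------------------------------------------------
# Optional post-processing sketch (not part of the original script): once the
# win/tie/lose counts are available, one common convention for summarizing a
# pairwise comparison is an adjusted win rate that counts each tie as half a win.
# The helper below is a hypothetical addition illustrating that idea; it only
# assumes the "result" column produced by calculate_result above.
def adjusted_win_rate(df_responses: pd.DataFrame) -> float:
    """Candidate model's win rate against the baseline, counting ties as half a win."""
    counts = df_responses["result"].value_counts()
    wins = counts.get("win", 0)
    ties = counts.get("tie", 0)
    losses = counts.get("lose", 0)
    total = wins + ties + losses
    return (wins + 0.5 * ties) / total if total > 0 else float("nan")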