Evaluating prompts at scale with Prompt Management and Prompt Flows for Amazon Bedrock
Setup
pip3 install boto3 botocore matplotlib -qU --user
Workflow Diagrams
Prompt evaluation logic flow
flowchart LR
    A[Prompts] --> B((LLM invocation))
    B --> C[Outputs]
    A --> D{LLM-as-a-judge evaluation}
    C --> D
    D --> E[Evaluation results]
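Before building any flows, it can help to see this judge pattern in isolation. The snippet below is a minimal sketch (not part of the later flow setup): one model answers a prompt and a second model grades the answer via the Bedrock Converse API. The model IDs match the ones used later in the Configuration step; the sample prompt and grading instruction are made up for illustration.

import boto3

# Minimal LLM-as-a-judge sketch using the Bedrock Converse API.
# Region and model IDs are assumptions; adjust to models enabled in your account.
bedrock_runtime = boto3.client("bedrock-runtime", region_name="us-east-1")

def converse(model_id, text):
    # Send a single user turn and return the model's text completion.
    response = bedrock_runtime.converse(
        modelId=model_id,
        messages=[{"role": "user", "content": [{"text": text}]}],
        inferenceConfig={"maxTokens": 1000, "temperature": 0},
    )
    return response["output"]["message"]["content"][0]["text"]

prompt = "Explain the difference between precision and recall in two sentences."
answer = converse("amazon.titan-text-premier-v1:0", prompt)
judgement = converse(
    "anthropic.claude-3-sonnet-20240229-v1:0",
    f"Rate the following answer to the prompt on a 0-100 scale and justify briefly.\n"
    f"Prompt: {prompt}\nAnswer: {answer}",
)
print(judgement)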
Simple prompt evaluation flow
flowchart TD
    A[Flow input] -->|document| B[Invoke]
    B -->|modelCompletion| C[Evaluate]
    A -->|document| C
    C -->|modelCompletion| D[Flow output]
Prompt evaluation flow at scale
flowchart TD
    A[Flow input] -->|document| B[S3 Retrieval]
    B -->|s3Content| C[Iterator]
    C -->|arrayItem| D[Invoker]
    D -->|modelCompletion| E[Evaluator]
    C -->|arrayItem| E
    E -->|modelCompletion| F[Collector]
    C -->|arraySize| F
    F -->|collectedArray| G[S3 Storage]
    A -->|document| G
    G -->|s3Uri| H[Flow output]
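The at-scale variant expects its prompts in S3 so the Iterator node can fan them out one at a time. A hedged sketch of the preparatory upload step is shown below; the bucket name, key, and the JSON array layout are placeholders and would need to match whatever the S3 Retrieval node in your flow is configured to read.

import boto3
import json

# Upload a batch of prompts for the at-scale flow to read via its S3 Retrieval node.
# Bucket name, key, and array layout are assumptions for illustration.
s3 = boto3.client("s3")
bucket = "my-prompt-eval-bucket"   # placeholder bucket
key = "prompt-batches/batch-001.json"

prompts = [
    "Summarize the main trade-offs between SQL and NoSQL databases.",
    "Tell me about machine learning.",
]

s3.put_object(
    Bucket=bucket,
    Key=key,
    Body=json.dumps(prompts).encode("utf-8"),
    ContentType="application/json",
)
print(f"Uploaded {len(prompts)} prompts to s3://{bucket}/{key}")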
Configuration
import boto3
import json
import tempfile
from datetime import datetime

# Adjust with your preferred region accordingly:
region = "us-east-1"

# Adjust with your preferred model IDs for invocations and evaluation.
# Note that some models are only available in certain regions:
modelInvokeId = "amazon.titan-text-premier-v1:0"
modelEvalId = "anthropic.claude-3-sonnet-20240229-v1:0"

# Create AWS service clients
bedrock_agent = boto3.client(service_name="bedrock-agent", region_name=region)
iam = boto3.client('iam')
sts = boto3.client('sts')

# Get current user or role information
try:
    user_info = iam.get_user()
    entity_type = "User"
    entity_name = user_info['User']['UserName']
except Exception:
    # If get_user() fails, we're probably running under an assumed role;
    # fall back to STS to identify the caller.
    caller_identity = sts.get_caller_identity()
    entity_type = "Role"
    entity_name = caller_identity['Arn'].split('/')[-1]

# Create a configuration dictionary
config = {
    "region": region,
    "modelInvokeId": modelInvokeId,
    "modelEvalId": modelEvalId,
    "timestamp": datetime.now().isoformat(),
    f"current_{entity_type.lower()}": entity_name
}

# Save configuration to a temporary file
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
    json.dump(config, temp_file, indent=2)
    temp_file_path = temp_file.name

# Print information about the user/role and services
print(f"Configuration saved to: {temp_file_path}")
print(f"Current {entity_type}: {entity_name}")
print(f"Bedrock Agent Client: {bedrock_agent}")
print(f"IAM Client: {iam}")

# You can use these as header values if needed
headers = {
    "X-Config-File": temp_file_path,
    "X-Entity-Type": entity_type,
    "X-Entity-Name": entity_name
}
print(f"Header values: {json.dumps(headers, indent=2)}")
Evaluation Template
You're an evaluator for the prompts and answers provided by a generative AI model.
Consider the input prompt in the <input> tags, the output answer in the <output> tags, the prompt evaluation criteria in the <prompt_criteria> tags, and the answer evaluation criteria in the <answer_criteria> tags.
<input>
{{input}}
</input>
<output>
{{output}}
</output>
<prompt_criteria>
- The prompt should be clear, direct, and detailed.
- The question, task, or goal should be well explained and grammatically correct.
- The prompt is better if it contains examples.
- The prompt is better if it specifies a role or sets a context.
- The prompt is better if it provides details about the format and tone of the expected answer.
</prompt_criteria>
<answer_criteria>
- The answers should be correct, well structured, and technically complete.
- The answers should not have any hallucinations, made up content, or toxic content.
- The answer should be grammatically correct.
- The answer should be fully aligned with the question or instruction in the prompt.
</answer_criteria>
Evaluate the answer the generative AI model provided in the <output> tags with a score from 0 to 100 according to the <answer_criteria> provided; any hallucinations, even if small, should dramatically impact the evaluation score.
Also evaluate the prompt passed to that generative AI model, provided in the <input> tags, with a score from 0 to 100 according to the <prompt_criteria> provided.
Respond only with a JSON object having:
- An 'answer-score' key with the score number you evaluated the answer with.
- A 'prompt-score' key with the score number you evaluated the prompt with.
- A 'justification' key with a justification for the two evaluations you provided for the answer and the prompt; make sure to explicitly include any errors or hallucinations in this part.
- An 'input' key with the content of the <input> tags.
- An 'output' key with the content of the <output> tags.
- A 'prompt-recommendations' key with recommendations for improving the prompt based on the evaluations performed.
Skip any preamble or any other text apart from the JSON in your answer.
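As a quick sanity check before registering the template, you can render it locally and confirm the judge's expected JSON keys parse cleanly. The snippet below is a hedged sketch: the template file name matches the one used in the next step, and the sample input, output, and judge response are made up purely for illustration.

import json

# Render the evaluation template locally by substituting the two variables,
# mimicking what Prompt Management does with {{input}} and {{output}}.
with open('03_ai_prompt_answer_evaluator.tmpl', 'r') as f:
    template = f.read()

rendered = (template
            .replace('{{input}}', 'What is the capital of France?')
            .replace('{{output}}', 'The capital of France is Paris.'))
print(rendered[:500])

# The judge is instructed to return JSON with these keys; a downstream parser
# could validate them like this (sample_response is illustrative only):
sample_response = json.dumps({
    "answer-score": 95,
    "prompt-score": 60,
    "justification": "Accurate answer; terse prompt.",
    "input": "What is the capital of France?",
    "output": "The capital of France is Paris.",
    "prompt-recommendations": "Add context, a role, and the expected format."
})
parsed = json.loads(sample_response)
expected_keys = {"answer-score", "prompt-score", "justification",
                 "input", "output", "prompt-recommendations"}
assert expected_keys.issubset(parsed.keys())
print("All expected keys present.")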
Create Evaluation Prompt
import boto3
import json
import os
from botocore.exceptions import ClientError

# Read region and model ID from environment variables or use defaults
region = os.environ.get('AWS_REGION', 'us-east-1')
modelEvalId = os.environ.get('MODEL_EVAL_ID', 'anthropic.claude-3-sonnet-20240229-v1:0')

# Create Bedrock agent client
bedrock_agent = boto3.client(service_name="bedrock-agent", region_name=region)

# Read the template file
try:
    with open('03_ai_prompt_answer_evaluator.tmpl', 'r') as file:
        template = file.read()
except FileNotFoundError:
    print("Error: Template file '03_ai_prompt_answer_evaluator.tmpl' not found.")
    exit(1)

# Create the prompt
try:
    response = bedrock_agent.create_prompt(
        name="prompt-evaluator",
        description="Prompt template for evaluating prompt responses with LLM-as-a-judge",
        variants=[
            {
                "inferenceConfiguration": {
                    "text": {
                        "maxTokens": 2000,
                        "temperature": 0,
                    }
                },
                "modelId": modelEvalId,
                "name": "variantOne",
                "templateConfiguration": {
                    "text": {
                        "inputVariables": [
                            {"name": "input"},
                            {"name": "output"}
                        ],
                        "text": template
                    }
                },
                "templateType": "TEXT"
            }
        ],
        defaultVariant="variantOne"
    )
    print(json.dumps(response, indent=2, default=str))

    promptEvalId = response["id"]
    promptEvalArn = response["arn"]
    promptEvalName = response["name"]
    print(f"Prompt ID: {promptEvalId}")
    print(f"Prompt ARN: {promptEvalArn}")
    print(f"Prompt Name: {promptEvalName}")

    # Create a new version of the prompt
    version_response = bedrock_agent.create_prompt_version(promptIdentifier=promptEvalId)
    print(json.dumps(version_response, indent=2, default=str))

    # Save prompt details for later use
    with open('prompt_details.json', 'w') as f:
        json.dump({
            "promptEvalId": promptEvalId,
            "promptEvalArn": promptEvalArn,
            "promptEvalName": promptEvalName
        }, f)
    print("Prompt details saved to 'prompt_details.json'")

except ClientError as e:
    error_code = e.response['Error']['Code']
    error_message = e.response['Error']['Message']
    if error_code == 'ConflictException' and 'already exists' in error_message:
        print("Error: A prompt with the name 'prompt-evaluator' already exists. "
              "Please use a different name or delete the existing prompt.")
    else:
        print(f"An error occurred while interacting with AWS: {error_code} - {error_message}")
except Exception as e:
    print(f"An unexpected error occurred: {str(e)}")
Evaluation Flow
Trust Policy
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "bedrock.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
Create Role
import boto3
import json
from botocore.exceptions import ClientError

# Create IAM client
iam = boto3.client('iam')

# Define the role name
role_name = 'AmazonBedrockExecutionRoleForAgentFlowEval'

# Define the trust relationship policy
trust_relationship = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "bedrock.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

# Define the role policy
role_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "bedrock:InvokeModel"
            ],
            "Resource": "*"
        }
    ]
}

try:
    # Create the IAM role
    create_role_response = iam.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(trust_relationship),
        Description='Execution role for Amazon Bedrock Agent Flow'
    )
    print("Role created successfully:")
    print(json.dumps(create_role_response, indent=2, default=str))

    # Attach the inline policy to the role
    iam.put_role_policy(
        RoleName=role_name,
        PolicyName='BedrockInvokeModelPolicy',
        PolicyDocument=json.dumps(role_policy)
    )
    print("Policy attached successfully")

    # Get the role ARN
    role_arn = create_role_response['Role']['Arn']
    print(f"Role ARN: {role_arn}")

    # Save role details for later use
    with open('role_details.json', 'w') as f:
        json.dump({
            "roleName": role_name,
            "roleArn": role_arn
        }, f)
    print("Role details saved to 'role_details.json'")

except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print(f"Error: The role '{role_name}' already exists.")
        # If the role exists, we can still get its ARN
        try:
            existing_role = iam.get_role(RoleName=role_name)
            role_arn = existing_role['Role']['Arn']
            print(f"Existing Role ARN: {role_arn}")

            # Save role details for later use
            with open('role_details.json', 'w') as f:
                json.dump({
                    "roleName": role_name,
                    "roleArn": role_arn
                }, f)
            print("Existing role details saved to 'role_details.json'")
        except ClientError as inner_e:
            print(f"Error retrieving existing role: {str(inner_e)}")
    else:
        print(f"An error occurred: {str(e)}")
except Exception as e:
    print(f"An unexpected error occurred: {str(e)}")
Flow Definition
{
"nodes": [
{
"name": "Start",
"type": "Input",
"configuration": {
"input": {}
},
"outputs": [
{
"name": "document",
"type": "String"
}
]
},
{
"name": "End",
"type": "Output",
"configuration": {
"output": {}
},
"inputs": [
{
"expression": "$.data",
"name": "document",
"type": "String"
}
]
},
{
"name": "Invoke",
"type": "Prompt",
"configuration": {
"prompt": {
"sourceConfiguration": {
"inline": {
"inferenceConfiguration": {
"text": {
"maxTokens": 2000,
"temperature": 0
}
},
"modelId": "$MODEL_INVOKE_ID",
"templateConfiguration": {
"text": {
"inputVariables": [
{
"name": "input"
}
],
"text": "{{input}}"
}
},
"templateType": "TEXT"
}
}
}
},
"inputs": [
{
"expression": "$.data",
"name": "input",
"type": "String"
}
],
"outputs": [
{
"name": "modelCompletion",
"type": "String"
}
]
},
{
"name": "Evaluate",
"type": "Prompt",
"configuration": {
"prompt": {
"sourceConfiguration": {
"resource": {
"promptArn": "$PROMPT_EVAL_ARN"
}
}
}
},
"inputs": [
{
"expression": "$.data",
"name": "input",
"type": "String"
},
{
"expression": "$.data",
"name": "output",
"type": "String"
}
],
"outputs": [
{
"name": "modelCompletion",
"type": "String"
}
]
}
],
"connections": [
{
"name": "StartToInvoke",
"source": "Start",
"target": "Invoke",
"type": "Data",
"configuration": {
"data": {
"sourceOutput": "document",
"targetInput": "input"
}
}
},
{
"name": "InvokeToEvaluate",
"source": "Invoke",
"target": "Evaluate",
"type": "Data",
"configuration": {
"data": {
"sourceOutput": "modelCompletion",
"targetInput": "output"
}
}
},
{
"name": "StartToEvaluate",
"source": "Start",
"target": "Evaluate",
"type": "Data",
"configuration": {
"data": {
"sourceOutput": "document",
"targetInput": "input"
}
}
},
{
"name": "EvaluateToEnd",
"source": "Evaluate",
"target": "End",
"type": "Data",
"configuration": {
"data": {
"sourceOutput": "modelCompletion",
"targetInput": "document"
}
}
}
]
}
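Before creating the flow, a quick local check that the definition is internally consistent can save a failed prepare step later. The helper below is a hedged sketch (the file name matches the one read by the creation script in the next step); it only verifies that every connection references nodes that exist in the definition.

import json

# Lightweight local consistency check for the flow definition:
# every connection's source and target must name a node in the "nodes" list.
with open('07_prompt_eval_flow_defn.json', 'r') as f:
    flow_definition = json.load(f)

node_names = {node["name"] for node in flow_definition["nodes"]}
for connection in flow_definition["connections"]:
    for endpoint in (connection["source"], connection["target"]):
        if endpoint not in node_names:
            raise ValueError(
                f"Connection '{connection['name']}' references unknown node '{endpoint}'"
            )
print(f"Flow definition looks consistent: {len(node_names)} nodes, "
      f"{len(flow_definition['connections'])} connections.")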
Create Flow
import boto3
import json
import os
from botocore.exceptions import ClientError

# Read AWS region from environment variable or use a default
region = os.environ.get('AWS_REGION', 'us-east-1')

# Create Bedrock agent client
bedrock_agent = boto3.client(service_name="bedrock-agent", region_name=region)

# Read model ID from environment variable or use a default
modelInvokeId = os.environ.get('MODEL_INVOKE_ID', 'amazon.titan-text-premier-v1:0')

# Read role details
try:
    with open('role_details.json', 'r') as f:
        role_details = json.load(f)
    role_arn = role_details['roleArn']
except FileNotFoundError:
    print("Error: 'role_details.json' not found. Please run the script to create the IAM role first.")
    exit(1)

# Read prompt details
try:
    with open('prompt_details.json', 'r') as f:
        prompt_details = json.load(f)
    promptEvalArn = prompt_details['promptEvalArn']
except FileNotFoundError:
    print("Error: 'prompt_details.json' not found. Please run the script to create the prompt first.")
    exit(1)

# Read flow definition
try:
    with open('07_prompt_eval_flow_defn.json', 'r') as f:
        flow_definition = json.load(f)
except FileNotFoundError:
    print("Error: '07_prompt_eval_flow_defn.json' not found. Please make sure the flow definition file exists.")
    exit(1)

# Replace placeholders in the flow definition
def replace_placeholders(obj, replacements):
    if isinstance(obj, dict):
        for k, v in obj.items():
            if isinstance(v, (dict, list)):
                replace_placeholders(v, replacements)
            elif isinstance(v, str):
                for placeholder, value in replacements.items():
                    if v == placeholder:
                        obj[k] = value
    elif isinstance(obj, list):
        for i, v in enumerate(obj):
            if isinstance(v, (dict, list)):
                replace_placeholders(v, replacements)
            elif isinstance(v, str):
                for placeholder, value in replacements.items():
                    if v == placeholder:
                        obj[i] = value

replacements = {
    "$MODEL_INVOKE_ID": modelInvokeId,
    "$PROMPT_EVAL_ARN": promptEvalArn
}
replace_placeholders(flow_definition, replacements)

# Create the flow
try:
    response = bedrock_agent.create_flow(
        name="prompt-eval-flow",
        description="Prompt Flow for evaluating prompts with LLM-as-a-judge.",
        executionRoleArn=role_arn,
        definition=flow_definition
    )
    print("Flow created successfully:")
    print(json.dumps(response, indent=2, default=str))

    flowEvalId = response["id"]
    flowEvalArn = response["arn"]
    flowEvalName = response["name"]

    # Save flow details for later use
    with open('flow_details.json', 'w') as f:
        json.dump({
            "flowId": flowEvalId,
            "flowArn": flowEvalArn,
            "flowName": flowEvalName
        }, f)
    print("Flow details saved to 'flow_details.json'")

except ClientError as e:
    print(f"An error occurred while interacting with AWS: {str(e)}")
except Exception as e:
    print(f"An unexpected error occurred: {str(e)}")
Prepare and Version the Flow
Prepare the Flow
import boto3
import json
import time

# Assuming you're using the same region as before
region = "us-east-1"
bedrock_agent = boto3.client(service_name="bedrock-agent", region_name=region)

# Read flow details
with open('flow_details.json', 'r') as f:
    flow_details = json.load(f)

flow_id = flow_details['flowId']

try:
    response = bedrock_agent.prepare_flow(
        flowIdentifier=flow_id
    )
    print("Flow preparation started:")
    print(json.dumps(response, indent=2, default=str))
except Exception as e:
    print(f"An error occurred while preparing the flow: {str(e)}")

# Poll the flow status until it's prepared
max_attempts = 30
attempts = 0
while attempts < max_attempts:
    try:
        response = bedrock_agent.get_flow(
            flowIdentifier=flow_id
        )
        status = response['status']
        print(f"Current flow status: {status}")

        if status == 'Prepared':
            print("Flow is prepared and ready to use!")
            break
        elif status == 'Failed':
            print("Flow preparation failed. Check the AWS console for more details.")
            break
    except Exception as e:
        print(f"An error occurred while checking flow status: {str(e)}")
        break

    time.sleep(10)  # Wait for 10 seconds before checking again
    attempts += 1

if attempts == max_attempts:
    print("Flow preparation timed out. Please check the AWS console for more information.")
Create Flow Version
import boto3
import json

region = "us-east-1"
bedrock_agent = boto3.client(service_name="bedrock-agent", region_name=region)

# Read flow details
with open('flow_details.json', 'r') as f:
    flow_details = json.load(f)

flow_id = flow_details['flowId']

try:
    response = bedrock_agent.create_flow_version(
        flowIdentifier=flow_id
    )
    print("Flow version created:")
    print(json.dumps(response, indent=2, default=str))

    # Update flow details with the new version
    flow_details['flowVersion'] = response['version']
    with open('flow_details.json', 'w') as f:
        json.dump(flow_details, f, indent=2)
    print("Flow details updated with new version.")
except Exception as e:
    print(f"An error occurred while creating the flow version: {str(e)}")
Create Flow Alias
import boto3
import json

region = "us-east-1"
bedrock_agent = boto3.client(service_name="bedrock-agent", region_name=region)

# Read flow details
with open('flow_details.json', 'r') as f:
    flow_details = json.load(f)

flow_id = flow_details['flowId']
flow_name = flow_details['flowName']
flow_version = flow_details.get('flowVersion', '1')  # Default to '1' if not set

try:
    response = bedrock_agent.create_flow_alias(
        flowIdentifier=flow_id,
        name=flow_name,
        description=f"Alias for {flow_name}",
        routingConfiguration=[
            {
                "flowVersion": flow_version
            }
        ]
    )
    print("Flow alias created:")
    print(json.dumps(response, indent=2, default=str))

    # Update flow details with the alias information
    flow_details['flowAliasId'] = response['id']
    flow_details['flowAliasArn'] = response['arn']
    with open('flow_details.json', 'w') as f:
        json.dump(flow_details, f, indent=2)
    print("Flow details updated with alias information.")
except Exception as e:
    print(f"An error occurred while creating the flow alias: {str(e)}")
List of Prompts and Prompt Flows
- Prompts:
  - Evaluation Prompt: "prompt-evaluator" (created in 04_create_evaluation_prompt.py)
- Prompt Flows:
  - Simple Prompt Evaluation Flow: "prompt-eval-flow" (created in 08_create_flow.py)
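To confirm these resources exist in your account (and to recover their identifiers later), the bedrock-agent client exposes list operations. A minimal sketch, assuming the same region as the rest of this walkthrough:

import boto3

# List the prompts and flows currently registered in this region.
bedrock_agent = boto3.client(service_name="bedrock-agent", region_name="us-east-1")

prompts = bedrock_agent.list_prompts(maxResults=20)
for prompt in prompts.get("promptSummaries", []):
    print(f"Prompt: {prompt['name']} (id: {prompt['id']})")

flows = bedrock_agent.list_flows(maxResults=20)
for flow in flows.get("flowSummaries", []):
    print(f"Flow: {flow['name']} (id: {flow['id']}, status: {flow['status']})")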
Evaluation
Good Prompt
You are an expert data scientist with years of experience in machine learning and statistical analysis. A junior data analyst has approached you for help with a problem they're facing. They have a dataset of customer information and purchase history, and they want to predict future purchasing behavior. Please provide a detailed, step-by-step guide on how to approach this problem. Include the following in your response:
1. Initial data exploration steps
2. Feature engineering suggestions
3. Potential machine learning models to consider
4. Model evaluation metrics
5. Tips for interpreting the results
Use clear, concise language and explain any technical terms you use. Your goal is to educate the junior analyst and give them a solid foundation to start their project.
Bad Prompt
Tell me about machine learning.
Prompts Evaluation
import boto3
import json
import time
from botocore.exceptions import ClientError

# Assuming you're using the same region as before
region = "us-east-1"
bedrock_agent_runtime = boto3.client(service_name='bedrock-agent-runtime', region_name=region)

# Read flow details
with open('flow_details.json', 'r') as f:
    flow_details = json.load(f)

flow_id = flow_details['flowId']
flow_alias_id = flow_details.get('flowAliasId')  # Use the alias if available

def invoke_flow(input_text, timeout=300):  # 5 minutes timeout
    try:
        response = bedrock_agent_runtime.invoke_flow(
            flowIdentifier=flow_id,
            flowAliasIdentifier=flow_alias_id,
            inputs=[
                {
                    "content": {
                        "document": input_text
                    },
                    "nodeName": "Start",
                    "nodeOutputName": "document"
                }
            ]
        )

        event_stream = response["responseStream"]
        result = ""
        start_time = time.time()

        for event in event_stream:
            if time.time() - start_time > timeout:
                print(f"Flow invocation timed out after {timeout} seconds.")
                return None
            if "flowOutputEvent" in event:
                result += event["flowOutputEvent"]["content"]["document"]

        return json.loads(result)
    except ClientError as e:
        error_code = e.response['Error']['Code']
        error_message = e.response['Error']['Message']
        print(f"AWS Error: {error_code} - {error_message}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None

def read_prompt(file_path):
    with open(file_path, 'r') as file:
        return file.read().strip()

# Test the flow with good and bad prompts
good_prompt = read_prompt('prompts/evaluation_good_prompt.tmpl')
bad_prompt = read_prompt('prompts/evaluation_bad_prompt.tmpl')

for prompt_type, prompt in [("Good", good_prompt), ("Bad", bad_prompt)]:
    print(f"\nTesting {prompt_type} Prompt:")
    print("-" * 40)
    print(f"Prompt: {prompt[:100]}...")  # Print first 100 characters of the prompt
    print("Invoking flow... This may take a few minutes.")

    result = invoke_flow(prompt)

    if result:
        print("\nFlow invocation result:")
        print(f"Prompt Score: {result.get('prompt-score', 'N/A')}")
        print(f"Answer Score: {result.get('answer-score', 'N/A')}")
        print(f"\nJustification: {result.get('justification', 'N/A')[:200]}...")  # First 200 characters
        print(f"\nPrompt Recommendations: {result.get('prompt-recommendations', 'N/A')[:200]}...")  # First 200 characters
    else:
        print("Failed to get a result from the flow.")

    print("\n" + "=" * 50 + "\n")
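Since matplotlib was installed in the Setup step, you can also chart the judge's scores to compare prompts side by side. The following is a minimal sketch, assuming you collect the results returned by invoke_flow into a dictionary keyed by prompt label; the numbers shown are placeholders, not real evaluation output.

import matplotlib.pyplot as plt

# Example structure: label -> result dict returned by invoke_flow().
# The scores below are placeholders; substitute the results you actually collected.
results = {
    "Good": {"prompt-score": 90, "answer-score": 85},
    "Bad": {"prompt-score": 30, "answer-score": 70},
}

labels = list(results.keys())
prompt_scores = [results[label].get("prompt-score", 0) for label in labels]
answer_scores = [results[label].get("answer-score", 0) for label in labels]

# Grouped bar chart: one pair of bars per prompt.
x = range(len(labels))
width = 0.35
plt.bar([i - width / 2 for i in x], prompt_scores, width, label="Prompt score")
plt.bar([i + width / 2 for i in x], answer_scores, width, label="Answer score")
plt.xticks(list(x), labels)
plt.ylim(0, 100)
plt.ylabel("Score (0-100)")
plt.title("LLM-as-a-judge scores by prompt")
plt.legend()
plt.show()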