import concurrent.futures
import functools
import json

from tqdm import tqdm

def evaluate_predictions_with_grader(
    predictions,
    grader_func=combined_grader,
):
    """Score each prediction with `grader_func` and return (metrics, scored results)."""
    results = []

    # Accept a single prediction dict as well as a list of predictions.
    if isinstance(predictions, dict):
        predictions = [predictions]

    def run_grading(pred):
        # The grader receives the model output wrapped in a dict plus the original input item.
        model_prediction = {"output_text": pred["model_prediction"]}
        item = pred["input"]
        score = grader_func(model_prediction, item)
        result = pred.copy()
        result["score"] = score
        return result

    if len(predictions) == 1:
        # Skip the thread pool for a single prediction.
        result = run_grading(predictions[0])
        results.append(result)
    else:
        # Grade predictions concurrently (useful when the grader makes API calls).
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(run_grading, pred) for pred in predictions]
            for future in tqdm(
                concurrent.futures.as_completed(futures),
                total=len(futures),
                desc="Grading predictions",
            ):
                results.append(future.result())

    total = len(results)
    # Scores may be fractional, so "accuracy" here is the mean score.
    correct = sum(r["score"] for r in results)
    accuracy = correct / total if total else 0.0
    metrics = {
        "total_samples": total,
        "accuracy": accuracy,
    }
    print(metrics)
    return metrics, results
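
# Minimal usage sketch (illustrative only): a toy prediction list and a stand-in
# exact-match grader. The notebook's `combined_grader` and real prediction files
# come from earlier steps; the "reference_answer" key below is a hypothetical field.
example_preds = [
    {"input": {"reference_answer": "pneumonia"}, "model_prediction": "pneumonia"},
]

def exact_match_grader(model_prediction, item):
    # Stand-in grader: 1.0 if the output text matches the reference exactly, else 0.0.
    return float(model_prediction["output_text"].strip() == item["reference_answer"])

example_metrics, example_scored = evaluate_predictions_with_grader(
    example_preds, grader_func=exact_match_grader
)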

def run_prediction_evaluation(
    model_name="o4-mini",
    reasoning_effort="medium",
    prompt_type="simple",
    subset="train",
    grader_func=combined_grader,
    num_runs=3,
):
    """Grade each run's saved predictions and write scored results, metrics, and scores to disk."""
    # Derive a filesystem-friendly grader name for the output filenames;
    # functools.partial wrappers also include the wrapped model grader's name.
    if isinstance(grader_func, functools.partial):
        name = grader_func.func.__name__
        mg = grader_func.keywords["model_grader"]
        mg_name = mg["name"]
        name = f"{name}_{mg_name}"
    else:
        name = getattr(grader_func, "__name__", type(grader_func).__name__)
    grader_func_name = name.replace(" ", "_").replace(":", "_").replace("/", "_").replace(",", "_")

    for i in range(num_runs):
        preds_path = f"data/rft/predictions/{subset}_{prompt_type}_{model_name}_{reasoning_effort}_predictions_run{i+1}.json"
        with open(preds_path, "r") as f:
            preds = json.load(f)

        metrics, results_with_scores = evaluate_predictions_with_grader(preds, grader_func=grader_func)

        out_prefix = f"data/rft/predictions/{subset}_{prompt_type}_{model_name}_{reasoning_effort}_{grader_func_name}_predictions_run_{i+1}"

        # Save the scored results
        with open(f"{out_prefix}_scored.json", "w") as f:
            json.dump(results_with_scores, f, indent=2)

        # Save the metrics
        with open(f"{out_prefix}_metrics.json", "w") as f:
            json.dump(metrics, f, indent=2)

        # Save the scores (if present in results_with_scores)
        scores = [item.get("score") for item in results_with_scores if "score" in item]
        with open(f"{out_prefix}_scores.json", "w") as f:
            json.dump(scores, f, indent=2)
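
# Example invocation with the default arguments shown above (assumes the unscored
# prediction files from the earlier generation step exist under data/rft/predictions/);
# uncomment to run:
# run_prediction_evaluation(
#     model_name="o4-mini",
#     reasoning_effort="medium",
#     prompt_type="simple",
#     subset="train",
#     grader_func=combined_grader,
#     num_runs=3,
# )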

def load_predictions(
    model_name="o4-mini",
    reasoning_effort="medium",
    prompt_type="simple",
    subset="train",
    grader_func_name="clinical_phrase_grader",
    num_runs=3,
):
    """Load previously scored predictions and metrics for each run; missing files yield None entries."""
    all_predictions = []
    all_metrics = []
    for run in range(1, num_runs + 1):
        pred_path = f"data/rft/predictions/{subset}_{prompt_type}_{model_name}_{reasoning_effort}_{grader_func_name}_predictions_run_{run}_scored.json"
        metrics_path = f"data/rft/predictions/{subset}_{prompt_type}_{model_name}_{reasoning_effort}_{grader_func_name}_predictions_run_{run}_metrics.json"

        try:
            with open(pred_path, "r") as f:
                predictions = json.load(f)
        except FileNotFoundError:
            predictions = None

        try:
            with open(metrics_path, "r") as f:
                metrics = json.load(f)
        except FileNotFoundError:
            metrics = None

        all_predictions.append(predictions)
        all_metrics.append(metrics)
    return all_predictions, all_metrics
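
# Example: load the scored runs written by run_prediction_evaluation above
# (entries are None for runs whose files are missing); uncomment to run:
# all_preds, all_metrics = load_predictions(
#     model_name="o4-mini",
#     reasoning_effort="medium",
#     prompt_type="simple",
#     subset="train",
#     grader_func_name="clinical_phrase_grader",
#     num_runs=3,
# )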