# Section marker for step 3 of the script. `banner` is defined elsewhere;
# presumably it prints a heading — confirm against its definition.
banner("STEP 3 — Building the analysis DataFrame")
def process_example(ex):
    """Flatten one raw example into a single row of analysis features.

    The example's trajectory, model patch and metadata are passed through
    the module's helper functions, and the results are collected into a flat
    dict: identity fields copied from ``ex``, message counts (total and per
    role), patch size figures, a token count over the joined trajectory
    text, selected metadata fields, and the tool names under ``_tools``.

    Args:
        ex: A mapping-like object; only its ``.get`` method is used.

    Returns:
        A dict with one entry per output column, in a fixed key order.

    NOTE(review): the return shapes of the helpers are defined elsewhere in
    the file; this function assumes ``parse_patch`` yields exactly five
    values and that ``role_counts`` / ``normalize_metadata`` return objects
    with a dict-style ``.get`` — confirm against their definitions.
    """
    messages = normalize_trajectory(ex.get("trajectory"))
    per_role = role_counts(messages)
    # The last two values returned by parse_patch are not needed here.
    n_files, n_added, n_deleted, _files, _exts = parse_patch(ex.get("model_patch"))
    metadata = normalize_metadata(ex.get("metadata"))
    transcript = "\n".join(map(message_text, messages))

    # Identity / label fields taken straight from the example.
    row = {
        "instance_id": ex.get("instance_id"),
        "repo": ex.get("repo"),
    }
    language = ex.get("language")
    if not language:
        # Missing or empty language falls back to a placeholder.
        language = "unknown"
    row["language"] = language.lower()
    row["license"] = ex.get("license")
    row["resolved"] = ex.get("resolved")
    row["agent"] = ex.get("_agent")
    row["model"] = ex.get("_model")

    # Trajectory size, overall and broken down by message role.
    row["n_messages"] = len(messages)
    row["n_system"] = per_role.get("system", 0)
    row["n_user"] = per_role.get("user", 0)
    row["n_assistant"] = per_role.get("assistant", 0)
    row["n_tool"] = per_role.get("tool", 0)

    # Patch size figures; churn is additions plus deletions.
    row["patch_files"] = n_files
    row["patch_add"] = n_added
    row["patch_del"] = n_deleted
    row["patch_churn"] = n_added + n_deleted

    row["traj_tokens"] = count_tokens(transcript)

    # Fields lifted from the normalized metadata.
    row["category"] = metadata.get("category")
    row["meta_files"] = metadata.get("num_modified_files")
    row["meta_lines"] = metadata.get("num_modified_lines")

    row["_tools"] = extract_tool_names(messages)
    return row
# Turn every raw example into a flat feature row, then tabulate the rows.
records = list(map(process_example, raw_rows))
df = pd.DataFrame(records)

# Derived label columns: `is_resolved` is True where the label equals 1;
# `known_label` is True where the label is 0 or 1 and False otherwise.
df["is_resolved"] = df["resolved"].eq(1)
df["known_label"] = df["resolved"].isin([0, 1])

print(f"DataFrame: {df.shape[0]} rows x {df.shape[1]} cols")
print("\nNumeric summary:")
# Descriptive statistics for the size-related columns, rounded to 1 decimal.
print(
    df.loc[
        :,
        ["n_messages", "n_assistant", "n_tool",
         "patch_files", "patch_churn", "traj_tokens"],
    ]
    .describe()
    .round(1)
)