def is_success(row):
    """Return True when *row* records a successful trace.

    A row counts as successful when either:

    * its ``"result"`` field is a string that, lower-cased, equals one of
      ``resolved``, ``success``, ``pass``, ``passed`` or ``correct``; or
    * its ``"reward"`` field converts with ``float()`` to a value >= 1.0.

    A missing, ``None`` or non-numeric reward counts as failure.  A
    non-string ``"result"`` (for example a number or a bool) is ignored
    instead of raising ``AttributeError``, so the reward check decides.

    Args:
        row: Mapping-like object exposing ``.get()``.

    Returns:
        bool: whether the row is considered successful.
    """
    result = row.get("result")
    # Only strings can carry a textual verdict; anything else falls through
    # to the numeric reward check below.
    if isinstance(result, str) and result.lower() in (
        "resolved",
        "success",
        "pass",
        "passed",
        "correct",
    ):
        return True

    try:
        return float(row.get("reward")) >= 1.0
    except (TypeError, ValueError):
        # TypeError: reward is None / unsupported type.
        # ValueError: reward is a string that is not a number.
        return False
# Export a small SFT file: stream rows, keep the successful ones whose trace
# has at least two non-blank turns, and write each as one JSON line.
out_path = "agenttrove_clean_sft.jsonl"
SCAN = 1500  # upper bound on rows pulled from the stream
KEEP = 200  # stop once this many traces have been written
scanned = 0
kept = 0
print(f"\n⏳ Scanning up to {SCAN} rows, keeping up to {KEEP} successful traces…")
with open(out_path, "w") as sink:
    stream = load_dataset(REPO, split="train", streaming=True)
    for row in itertools.islice(stream, SCAN):
        scanned += 1
        if is_success(row):
            # Drop turns whose text is empty or whitespace-only.
            messages = [
                {"from": role, "value": text}
                for role, text in normalize_turns(row[TRACE_KEY])
                if text.strip()
            ]
            if len(messages) >= 2:
                record = {
                    "conversations": messages,
                    "source": row.get("original_source"),
                    "teacher": row.get("original_teacher"),
                }
                sink.write(json.dumps(record) + "\n")
                kept += 1
                if kept >= KEEP:
                    break
print(f"✅ Scanned {scanned} rows → wrote {kept} clean traces to '{out_path}'")
def search_traces(keyword=None, source=None, limit=3, scan=3000):
    """Stream the dataset and print the first traces matching the filters.

    Args:
        keyword: Optional substring searched case-insensitively in the
            space-joined turn contents of each trace.  A falsy value
            disables the keyword filter.
        source: Optional value the row's ``original_source`` field must
            equal.  A falsy value disables the source filter.
        limit: Maximum number of matching traces to render.  A
            non-positive limit renders nothing and does not open the
            stream.
        scan: Maximum number of rows read from the stream.

    Returns:
        None.  Each match is rendered with
        ``render_trace(row, max_chars=300)``; a hint is printed when no
        row in the scanned window matched.
    """
    if limit <= 0:
        # The limit is only checked after a render, so without this guard a
        # zero/negative limit would still print one trace.
        return

    # Lower-case the keyword once instead of once per scanned row.
    needle = keyword.lower() if keyword else None

    hits = 0
    rows = load_dataset(REPO, split="train", streaming=True)
    for row in itertools.islice(rows, scan):
        if source and row.get("original_source") != source:
            continue
        if needle is not None:
            blob = " ".join(text for _, text in normalize_turns(row[TRACE_KEY]))
            if needle not in blob.lower():
                continue
        render_trace(row, max_chars=300)
        hits += 1
        if hits >= limit:
            break

    if hits == 0:
        print("No matches in the scanned window — try increasing `scan`.")
# Demo: show up to two traces whose original_source is "nl2bash".
print("\n🔍 Searching for 'nl2bash' source traces:")
search_traces(source="nl2bash", limit=2, scan=4000)

# Closing message with follow-up suggestions, one per line.
print("\n🎉 Tutorial complete! Next ideas:")
for idea in (
    " • Increase N / SCAN for bigger analyses.",
    " • Filter by original_source (swesmith, codeforces, r2egym…) for a domain SFT set.",
    " • Feed agenttrove_clean_sft.jsonl into Axolotl / LLaMA-Factory for fine-tuning.",
):
    print(idea)
# Credit: Source link

























