# Video-R1 / merge_data.py
# Merge the pool_* chunk files listed below into a single JSON array
# (merged_train.json) for training.

import json
from pathlib import Path
from typing import Iterator, Dict

# -----------------------------
# Inputs
# -----------------------------
files = [
    "pool_multiple_choice_chunk_01.json",
    "pool_multiple_choice_chunk_02.json",
    "pool_multiple_choice_chunk_03.json",
    "pool_multiple_choice_chunk_04.json",
    "pool_numerical_chunk_01.json",
    "pool_numerical_chunk_02.json",
    "pool_numerical_chunk_03.json",
    "pool_regression_chunk_01.json",
]
out_path = Path("merged_train.json")
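
# If more chunk files are added later, the same list could be built by globbing
# for the shared naming pattern (a sketch only; the explicit list above is what
# this script uses):
#   files = sorted(str(p) for p in Path(".").glob("pool_*_chunk_*.json"))
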
# -----------------------------
# Read records from JSON/JSONL
# -----------------------------
def iter_records(path: Path) -> Iterator[Dict]:
    """
    Yield records from a file that may be:
      - a single JSON array,
      - a single JSON object, or
      - JSONL (one JSON object per line).
    Whole-file JSON is tried first; JSONL is the fallback.
    """
    text = path.read_text(encoding="utf-8")
    # Try whole-file JSON first (array or object)
    try:
        data = json.loads(text)
        if isinstance(data, list):
            for rec in data:
                yield rec
        elif isinstance(data, dict):
            yield data
        else:
            raise ValueError(f"Unsupported top-level JSON type in {path}")
    except json.JSONDecodeError:
        # Fallback: treat as JSONL, one record per non-empty line
        for i, line in enumerate(text.splitlines(), 1):
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {i} in {path}: {e}") from e
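
# Both on-disk shapes below are handled by iter_records (schematic record
# placeholders only, not the actual chunk contents):
#
#   JSON array file:   [ {...}, {...}, ... ]
#   JSONL file:        {...}
#                      {...}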

# -----------------------------
# Merge & write single JSON file
# -----------------------------
out_path.parent.mkdir(parents=True, exist_ok=True)

count = 0
with out_path.open("w", encoding="utf-8") as out:
    # Stream records into one JSON array so the whole dataset never has to be
    # held in memory; a comma is written before every record after the first.
    out.write("[\n")
    first = True
    for fp in files:
        for rec in iter_records(Path(fp)):
            if not first:
                out.write(",\n")
            out.write(json.dumps(rec, ensure_ascii=False))
            first = False
            count += 1
    out.write("\n]")
print(f"✓ Wrote {count} records to {out_path.resolve()}")
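
# Optional sanity check (a minimal sketch): re-open the merged file and confirm
# it parses back as one JSON array with the expected record count. json.load
# reads the whole file into memory, which is fine for a quick verification.
with out_path.open(encoding="utf-8") as f:
    reloaded = json.load(f)
if not isinstance(reloaded, list) or len(reloaded) != count:
    raise RuntimeError(f"Merged file check failed: expected {count} records")
print(f"✓ Sanity check: {out_path} parses back into {count} records")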