"""Merge several JSON / JSONL record files into a single JSON array file."""

import json
from pathlib import Path
from typing import Dict, Iterable, Iterator, Union

# -----------------------------
# Inputs
# -----------------------------
files = [
    "pool_multiple_choice_chunk_01.json",
    "pool_multiple_choice_chunk_02.json",
    "pool_multiple_choice_chunk_03.json",
    "pool_multiple_choice_chunk_04.json",
    "pool_numerical_chunk_01.json",
    "pool_numerical_chunk_02.json",
    "pool_numerical_chunk_03.json",
    "pool_regression_chunk_01.json",
]
out_path = Path("merged_train.json")


# -----------------------------
# Read records from JSON/JSONL
# -----------------------------
def iter_records(path: Path) -> Iterator[Dict]:
    """Yield the records stored in ``path``.

    The file may be:
      - a single JSON array (each element is yielded),
      - a single JSON object (yielded as one record), or
      - JSONL (one JSON value per non-blank line).

    Args:
        path: File to read. Decoded as UTF-8; a leading BOM is tolerated.

    Yields:
        Each decoded record, in file order.

    Raises:
        ValueError: If the whole file parses as JSON but the top-level value
            is neither a list nor a dict, or if a non-blank line fails to
            parse in the JSONL fallback.
    """
    # "utf-8-sig" strips a leading BOM if present; otherwise it decodes
    # exactly like "utf-8". With plain "utf-8" the BOM stays in the text and
    # json.loads() rejects it in both the whole-file and the per-line pass.
    text = path.read_text(encoding="utf-8-sig")

    # Try whole-file JSON first (array or object). Only the parse itself is
    # guarded, so the except clause cannot intercept anything else.
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Not a single JSON document: fall through to the JSONL pass below.
        pass
    else:
        if isinstance(data, list):
            yield from data
        elif isinstance(data, dict):
            yield data
        else:
            raise ValueError(f"Unsupported top-level JSON type in {path}")
        return

    # Fallback: treat as JSONL. Line numbers are 1-based and count blank
    # lines, so they match what an editor shows.
    for i, line in enumerate(text.splitlines(), 1):
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON on line {i} in {path}: {e}") from e
        yield record


# -----------------------------
# Merge & write single JSON file
# -----------------------------
def merge_files(paths: Iterable[Union[str, Path]], dest: Path) -> int:
    """Write every record from ``paths`` into ``dest`` as one JSON array.

    Records are streamed one per line into a temporary sibling file
    (``<dest name>.tmp``), which replaces ``dest`` only after all inputs
    were read successfully. If anything fails, the temporary file is removed
    and an existing ``dest`` is left untouched.

    Args:
        paths: Input files, each readable by ``iter_records``.
        dest: Output file; its parent directory is created if missing.

    Returns:
        The number of records written.

    Raises:
        OSError: If an input cannot be read or the output cannot be written.
        ValueError: Propagated from ``iter_records`` for malformed input.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = dest.with_name(dest.name + ".tmp")
    count = 0
    try:
        with tmp_path.open("w", encoding="utf-8") as out:
            out.write("[\n")
            for fp in paths:
                for rec in iter_records(Path(fp)):
                    if count:
                        # Separator goes before every record except the first.
                        out.write(",\n")
                    out.write(json.dumps(rec, ensure_ascii=False))
                    count += 1
            out.write("\n]")
        # Swap the finished file into place in one step.
        tmp_path.replace(dest)
    except BaseException:
        # Do not leave a truncated, invalid JSON file behind.
        try:
            tmp_path.unlink()
        except FileNotFoundError:
            pass
        raise
    return count


def main() -> None:
    """Merge the configured ``files`` into ``out_path`` and report the count."""
    count = merge_files(files, out_path)
    print(f"✓ Wrote {count} records to {out_path.resolve()}")


if __name__ == "__main__":
    main()