|
|
import json |
|
|
from pathlib import Path |
|
|
from typing import Iterator, Dict |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Destination of the merged output; a relative path, so it is resolved
# against the current working directory when opened.
out_path = Path("merged_train.json")

# Input chunk files, listed in the order they are merged.  The names are
# relative paths, so they are looked up in the current working directory.
files = [
    # multiple-choice pool
    "pool_multiple_choice_chunk_01.json",
    "pool_multiple_choice_chunk_02.json",
    "pool_multiple_choice_chunk_03.json",
    "pool_multiple_choice_chunk_04.json",
    # numerical pool
    "pool_numerical_chunk_01.json",
    "pool_numerical_chunk_02.json",
    "pool_numerical_chunk_03.json",
    # regression pool
    "pool_regression_chunk_01.json",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def iter_records(path: Path) -> Iterator[Dict]:
    """Yield the records stored in *path*.

    The file may hold any of:

    - a single JSON array (each element is yielded in order),
    - a single JSON object (yielded as the only record), or
    - JSONL, one JSON value per line (blank lines are skipped).

    The whole file is read into memory before anything is yielded.

    Args:
        path: File to read.  Decoded as UTF-8; a leading byte-order mark
            is tolerated.

    Yields:
        Each record, in file order.

    Raises:
        ValueError: If the file parses as one JSON document whose top-level
            value is neither an array nor an object, or if a non-blank line
            of a JSONL file is not valid JSON.
    """
    # "utf-8-sig" decodes plain UTF-8 as well, but drops a leading BOM that
    # json.loads would otherwise reject.
    text = path.read_text(encoding="utf-8-sig")

    # Keep the try body to the parse itself so that only a parse failure
    # triggers the JSONL fallback.
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        pass  # Not a single JSON document; handled as JSONL below.
    else:
        if isinstance(data, list):
            yield from data
        elif isinstance(data, dict):
            yield data
        else:
            raise ValueError(f"Unsupported top-level JSON type in {path}")
        return

    # Split on "\n" only.  str.splitlines() also breaks on U+2028, U+2029,
    # U+0085 and a few control characters; the first three are legal inside
    # JSON strings and are written unescaped by ensure_ascii=False encoders,
    # so splitlines() would cut such a record in half.  read_text() has
    # already normalised "\r\n" and "\r" to "\n".
    for i, line in enumerate(text.split("\n"), 1):
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON on line {i} in {path}: {e}") from e
        yield record
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Merge every record from the input chunks into one JSON array, written one
# record per line so the output stays diff-friendly.
out_path.parent.mkdir(parents=True, exist_ok=True)

# Stream into a sibling temp file and move it into place only on success.
# A failure part-way through (missing chunk, invalid JSON) then neither
# leaves a truncated, unparseable array behind nor clobbers an earlier
# good output.
tmp_path = out_path.with_name(out_path.name + ".tmp")

count = 0
try:
    with tmp_path.open("w", encoding="utf-8") as out:
        out.write("[\n")
        for fp in files:
            for rec in iter_records(Path(fp)):
                # Separator goes before every record except the first.
                if count:
                    out.write(",\n")
                out.write(json.dumps(rec, ensure_ascii=False))
                count += 1
        out.write("\n]")
except BaseException:
    # Best-effort cleanup of the partial temp file (also on Ctrl-C); the
    # original error is always re-raised.
    try:
        tmp_path.unlink()
    except OSError:
        pass
    raise

# Same directory, hence same filesystem: the rename replaces any existing
# output in one step.
tmp_path.replace(out_path)

print(f"✓ Wrote {count} records to {out_path.resolve()}")
|
|
|