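"""Merge several JSON/JSONL pool chunk files into a single JSON array.

Each input may be JSONL (one object per line), a JSON array, or a single
JSON object; all records are streamed into merged_train.json.
"""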
import json
from pathlib import Path
from typing import Iterator, Dict

# -----------------------------
# Inputs
# -----------------------------
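# Input chunk files, read in order; relative paths are resolved against the
# current working directory.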
files = [
    "pool_multiple_choice_chunk_01.json",
    "pool_multiple_choice_chunk_02.json",
    "pool_multiple_choice_chunk_03.json",
    "pool_multiple_choice_chunk_04.json",
    "pool_numerical_chunk_01.json",
    "pool_numerical_chunk_02.json",
    "pool_numerical_chunk_03.json",
    "pool_regression_chunk_01.json",
]

out_path = Path("merged_train.json")

# -----------------------------
# Read records from JSON/JSONL
# -----------------------------
def iter_records(path: Path) -> Iterator[Dict]:
    """
    Yields records from a file that can be:
      - JSONL (one JSON object per line), or
      - a single JSON array, or
      - a single JSON object.
    """
    text = path.read_text(encoding="utf-8")
    # Try whole-file JSON first (array or object)
    try:
        data = json.loads(text)
        if isinstance(data, list):
            for rec in data:
                yield rec
        elif isinstance(data, dict):
            yield data
        else:
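            # A plain ValueError is not a json.JSONDecodeError, so this error
            # propagates instead of triggering the JSONL fallback below.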
            raise ValueError(f"Unsupported top-level JSON type in {path}")
    except json.JSONDecodeError:
        # Fallback: treat as JSONL
        for i, line in enumerate(text.splitlines(), 1):
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {i} in {path}: {e}") from e

# -----------------------------
# Merge & write single JSON file
# -----------------------------
out_path.parent.mkdir(parents=True, exist_ok=True)

count = 0
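# Records are streamed straight to disk (brackets and commas written by hand)
# rather than collected into one in-memory list, so large pools stay cheap.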
with out_path.open("w", encoding="utf-8") as out:
    out.write("[\n")
    first = True
    for fp in files:
        for rec in iter_records(Path(fp)):
            if not first:
                out.write(",\n")
            out.write(json.dumps(rec, ensure_ascii=False))
            first = False
            count += 1
    out.write("\n]")

print(f"✓ Wrote {count} records to {out_path.resolve()}")