DingZhenDojoCat committed
Commit 7ed0fb5 · verified · 1 Parent(s): bb7f76d

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. .gitattributes +9 -0
  2. images/dataset.png +3 -0
  3. images/demo1.png +3 -0
  4. previous_version/Video-R1-main-previous/images/2B_curve.png +3 -0
  5. previous_version/Video-R1-main-previous/images/7B_curve.png +3 -0
  6. previous_version/Video-R1-main-previous/images/7B_nextqa.png +3 -0
  7. previous_version/Video-R1-main-previous/images/CATER_new_003595.gif +3 -0
  8. previous_version/Video-R1-main-previous/images/sample.png +3 -0
  9. previous_version/Video-R1-main-previous/src/distill_r1/create_hf_dataset.py +119 -0
  10. previous_version/Video-R1-main-previous/src/distill_r1/generate_scene_qa_pairs.ipynb +569 -0
  11. previous_version/Video-R1-main-previous/src/distill_r1/grpo_r1_distilled.jpg +3 -0
  12. previous_version/Video-R1-main-previous/src/distill_r1/query_r1.py +114 -0
  13. previous_version/Video-R1-main-previous/src/eval/prompts/geoqa_test_prompts.jsonl +0 -0
  14. previous_version/Video-R1-main-previous/src/eval/prompts/superclevr_test200_counting_problems.jsonl +200 -0
  15. previous_version/Video-R1-main-previous/src/eval/test_qwen2vl_counting_superclevr.py +136 -0
  16. previous_version/Video-R1-main-previous/src/eval/test_qwen2vl_geoqa.py +149 -0
  17. previous_version/Video-R1-main-previous/src/eval/test_qwen2vl_geoqa_multigpu.py +205 -0
  18. previous_version/Video-R1-main-previous/src/eval/test_qwen2vl_video_counting.py +141 -0
  19. previous_version/Video-R1-main-previous/src/qwen-vl-utils/.python-version +1 -0
  20. previous_version/Video-R1-main-previous/src/qwen-vl-utils/README.md +94 -0
  21. previous_version/Video-R1-main-previous/src/qwen-vl-utils/pyproject.toml +75 -0
  22. previous_version/Video-R1-main-previous/src/qwen-vl-utils/requirements-dev.lock +84 -0
  23. previous_version/Video-R1-main-previous/src/qwen-vl-utils/requirements.lock +32 -0
  24. previous_version/Video-R1-main-previous/src/qwen-vl-utils/src/qwen_vl_utils/__init__.py +7 -0
  25. previous_version/Video-R1-main-previous/src/qwen-vl-utils/src/qwen_vl_utils/vision_process.py +379 -0
  26. previous_version/Video-R1-main-previous/src/r1-v/temp_image.png +3 -0
  27. src/r1-v/.gitignore +178 -0
  28. src/r1-v/LICENSE +201 -0
  29. src/r1-v/Makefile +20 -0
  30. src/r1-v/setup.cfg +41 -0
  31. src/r1-v/setup.py +132 -0
  32. src/r1-v/src/open_r1/__init__.py +0 -0
  33. src/r1-v/src/open_r1/evaluate.py +85 -0
  34. src/r1-v/src/open_r1/generate.py +156 -0
  35. src/r1-v/src/open_r1/grpo-cot-72BEval.py +489 -0
  36. src/r1-v/src/open_r1/grpo-cot-LLMEval.py +552 -0
  37. src/r1-v/src/open_r1/grpo-cot-answerBERT-eval.py +429 -0
  38. src/r1-v/src/open_r1/grpo-cot-noDesEval.py +446 -0
  39. src/r1-v/src/open_r1/grpo-cot-noInfo.py +346 -0
  40. src/r1-v/src/open_r1/grpo-cot-qwenEval.py +523 -0
  41. src/r1-v/src/open_r1/grpo-cot-selfEval.py +457 -0
  42. src/r1-v/src/open_r1/grpo-cot-selfEvalConst.py +456 -0
  43. src/r1-v/src/open_r1/grpo-cot.py +351 -0
  44. src/r1-v/src/open_r1/grpo-description-LLMEval.py +579 -0
  45. src/r1-v/src/open_r1/grpo.py +318 -0
  46. src/r1-v/src/open_r1/grpo_vllm_caption.py +266 -0
  47. src/r1-v/src/open_r1/sft_video.py +304 -0
  48. src/r1-v/src/open_r1/trainer/__init__.py +12 -0
  49. src/r1-v/src/open_r1/trainer/vllm_grpo_trainer_modified_error.py +1061 -0
  50. src/r1-v/src/open_r1/trainer/vllm_grpo_trainer_modified_orig.py +935 -0
.gitattributes CHANGED
@@ -36,3 +36,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  images/curves.png filter=lfs diff=lfs merge=lfs -text
  images/demo2.png filter=lfs diff=lfs merge=lfs -text
  images/performance.png filter=lfs diff=lfs merge=lfs -text
+ images/dataset.png filter=lfs diff=lfs merge=lfs -text
+ images/demo1.png filter=lfs diff=lfs merge=lfs -text
+ previous_version/Video-R1-main-previous/src/r1-v/temp_image.png filter=lfs diff=lfs merge=lfs -text
+ previous_version/Video-R1-main-previous/src/distill_r1/grpo_r1_distilled.jpg filter=lfs diff=lfs merge=lfs -text
+ previous_version/Video-R1-main-previous/images/7B_nextqa.png filter=lfs diff=lfs merge=lfs -text
+ previous_version/Video-R1-main-previous/images/sample.png filter=lfs diff=lfs merge=lfs -text
+ previous_version/Video-R1-main-previous/images/CATER_new_003595.gif filter=lfs diff=lfs merge=lfs -text
+ previous_version/Video-R1-main-previous/images/2B_curve.png filter=lfs diff=lfs merge=lfs -text
+ previous_version/Video-R1-main-previous/images/7B_curve.png filter=lfs diff=lfs merge=lfs -text
images/dataset.png ADDED

Git LFS Details

  • SHA256: e0affaa1cf8d870c6a6ec41be54494e073c51987fe5ad424a8ee3437b1dcc116
  • Pointer size: 131 Bytes
  • Size of remote file: 589 kB
images/demo1.png ADDED

Git LFS Details

  • SHA256: 94c40671d8761915a8de02f4548f0e1715069aa8d171f08d5b27af3f2a715548
  • Pointer size: 132 Bytes
  • Size of remote file: 1.02 MB
previous_version/Video-R1-main-previous/images/2B_curve.png ADDED

Git LFS Details

  • SHA256: 8f5b2aaa2c501639cc570bf9f8b8a94dedf3e3d8f9b2ad2ef6e13d01478b733d
  • Pointer size: 131 Bytes
  • Size of remote file: 321 kB
previous_version/Video-R1-main-previous/images/7B_curve.png ADDED

Git LFS Details

  • SHA256: 38e42d31de8bf93659529b9334c1aa58c71d91fa55e1eeef7f4f6fece1ca4663
  • Pointer size: 131 Bytes
  • Size of remote file: 310 kB
previous_version/Video-R1-main-previous/images/7B_nextqa.png ADDED

Git LFS Details

  • SHA256: 99c0f930a3f67a870386ee16896b1f45a3c84dfd43b27dd4d128a8ae66406f19
  • Pointer size: 131 Bytes
  • Size of remote file: 334 kB
previous_version/Video-R1-main-previous/images/CATER_new_003595.gif ADDED

Git LFS Details

  • SHA256: 9ed0306a7a088e526eb2ccfb8e0f44d987fa48548248649f1cd4a270955634cd
  • Pointer size: 131 Bytes
  • Size of remote file: 777 kB
previous_version/Video-R1-main-previous/images/sample.png ADDED

Git LFS Details

  • SHA256: e616764501a3833e9035ccd48b79b19f23cc02c597cedde681edf0b63f27d09c
  • Pointer size: 131 Bytes
  • Size of remote file: 244 kB
previous_version/Video-R1-main-previous/src/distill_r1/create_hf_dataset.py ADDED
@@ -0,0 +1,119 @@
import json
import os
import random
from datasets import load_dataset
from tqdm import tqdm

random.seed(1234)
VAL_NUM = 5000


def create_r1_train_dataset(
    valid_pair_json,
    data_dir,
    img_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/trainA/",
):
    os.makedirs(data_dir, exist_ok=True)
    pairs = [json.loads(line) for line in open(valid_pair_json, "r")]
    mapped_pairs = []

    for idx, pair in tqdm(enumerate(pairs)):
        img_filename = pair["img_filename"]
        new_pair = {}
        try:
            new_pair["thinking"] = (
                pair["r1_response"]
                .split("<think>")[1]
                .split("</think>")[0]
                .replace("scene description", "image")
            )
        except Exception as e:
            print(f"Error processing pair response: ", pair["r1_response"])
            continue  # skip this pair
        # add index to distinguish the same image
        dataset_filename = (
            img_filename.split(".")[0] + "_" + str(idx) + "." + img_filename.split(".")[1]
        )
        if not os.path.exists(f"{data_dir}/{img_filename}"):
            os.system(f"cp {img_dir}/{img_filename} {data_dir}/{dataset_filename}")
        q, a = pair["q"], pair["a"]
        new_pair["problem"] = q
        # get the thinking path

        new_pair["thinking"] = "<think>" + new_pair["thinking"] + "</think>"
        new_pair["solution"] = f"<answer> {a} </answer>"
        new_pair["file_name"] = dataset_filename
        mapped_pairs.append(new_pair)
    with open(f"{data_dir}/metadata.jsonl", "w") as f:
        for pair in mapped_pairs:
            f.write(json.dumps(pair) + "\n")

    train_dataset = load_dataset(
        "imagefolder",
        data_dir=data_dir,
        split="train",
    )
    return train_dataset


def create_val_dataset(
    json_file,
    data_dir,
    val_num=VAL_NUM,
    image_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/valB",
):
    os.makedirs(data_dir, exist_ok=True)
    val = json.load(open(json_file))
    random.shuffle(val)
    val = val[:val_num]
    val_pairs = []
    for idx, pair in tqdm(enumerate(val)):
        q, a = pair["q"], pair["a"]
        img_filename = pair["img_filename"]
        # copy images to the DATA_DIR
        val_filename = (
            img_filename.split(".")[0] + f"_{idx}." + img_filename.split(".")[1]
        )
        if not os.path.exists(f"{data_dir}/{img_filename}"):
            os.system(f"cp {image_dir}/{img_filename} {data_dir}/{val_filename}")
        new_pair = {}
        new_pair["problem"] = q
        new_pair["solution"] = f"<answer> {a} </answer>"
        new_pair["file_name"] = val_filename
        val_pairs.append(new_pair)
    with open(f"{data_dir}/metadata.jsonl", "w") as f:
        for pair in val_pairs:
            f.write(json.dumps(pair) + "\n")
    val_dataset = load_dataset("imagefolder", data_dir=data_dir, split="train")
    return val_dataset


# valA split
VALA_DATA_DIR = "data/Clevr_CoGenT_ValA"
VALB_DATA_DIR = "data/Clevr_CoGenT_ValB"
valA_json = (
    "/home/lilei/Visual-R1/data/clever_counting_problems_clevr_cogent_v1.0_valA.json"
)
valB_json = (
    "/home/lilei/Visual-R1/data/clever_counting_problems_clevr_cogent_v1.0_valB.json"
)
TRAIN_DATADIR = "data/Clevr_CoGenT_TrainA_R1"
train_dataset = create_r1_train_dataset(
    "/home/lilei/Visual-R1/filter_results_v2/valid_pairs.jsonl",
    TRAIN_DATADIR,
)

# print(train_dataset)
valA_dataset = create_val_dataset(
    valA_json,
    VALA_DATA_DIR,
    image_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/valA",
)
valB_dataset = create_val_dataset(
    valB_json,
    VALB_DATA_DIR,
    image_dir="/home/lilei/Visual-R1/CLEVR_CoGenT_v1.0/images/valB",
)
valA_dataset.push_to_hub("MMInstruction/Clevr_CoGenT_ValA")
valB_dataset.push_to_hub("MMInstruction/Clevr_CoGenT_ValB")
train_dataset.push_to_hub("MMInstruction/Clevr_CoGenT_TrainA_R1")
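
For orientation, the script above relies on the Hugging Face `imagefolder` loader picking up the metadata.jsonl written next to the copied images (records are linked to images via the file_name column). A minimal sketch of reading one of the pushed splits back, assuming the push_to_hub calls above completed; the printed values are illustrative:

from datasets import load_dataset

# Load the validation split pushed by create_val_dataset();
# push_to_hub on a single Dataset keeps the "train" split name.
val_a = load_dataset("MMInstruction/Clevr_CoGenT_ValA", split="train")

# Each record carries the decoded image plus the "problem"/"solution"
# fields taken from metadata.jsonl.
example = val_a[0]
print(example["problem"])
print(example["solution"])  # e.g. "<answer> 3 </answer>"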
previous_version/Video-R1-main-previous/src/distill_r1/generate_scene_qa_pairs.ipynb ADDED
@@ -0,0 +1,569 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "3a704ea6-2e61-4aaa-97aa-416579c9bc13",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import json\n",
11
+ "import random"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 4,
17
+ "id": "c4920a8f-cddd-4063-8cab-215d238b5dad",
18
+ "metadata": {},
19
+ "outputs": [
20
+ {
21
+ "name": "stdout",
22
+ "output_type": "stream",
23
+ "text": [
24
+ "CLEVR_trainA_scenes.json CLEVR_valA_scenes.json CLEVR_valB_scenes.json\n"
25
+ ]
26
+ }
27
+ ],
28
+ "source": [
29
+ "!ls CLEVR_CoGenT_v1.0/scenes"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 15,
35
+ "id": "934fa005-3b2a-43ed-8a71-6a12b7579546",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "split = \"valB\"\n",
40
+ "clevr_train_json = f\"CLEVR_CoGenT_v1.0/scenes/CLEVR_{split}_scenes.json\"\n",
41
+ "train_qs = f\"CLEVR_CoGenT_v1.0/questions/CLEVR_{split}_questions.json\"\n",
42
+ "data = json.load(open(clevr_train_json))\n",
43
+ "qs = json.load(open(train_qs))"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 16,
49
+ "id": "1f0d6180-94c4-4aea-bd2b-8d5cfeb0aecb",
50
+ "metadata": {},
51
+ "outputs": [
52
+ {
53
+ "name": "stdout",
54
+ "output_type": "stream",
55
+ "text": [
56
+ "[{'pixel_coords': [343, 131, 11.278693199157715], 'size': 'small', 'color': 'green', 'material': 'metal', 'shape': 'sphere', '3d_coords': [0.9906095862388611, 2.083291530609131, 0.3499999940395355], 'rotation': 107.73596690369371}, {'pixel_coords': [396, 172, 9.857704162597656], 'size': 'small', 'color': 'cyan', 'material': 'rubber', 'shape': 'sphere', '3d_coords': [2.69626522064209, 1.5257188081741333, 0.3499999940395355], 'rotation': 305.3536122513589}, {'pixel_coords': [115, 182, 8.91348934173584], 'size': 'large', 'color': 'yellow', 'material': 'rubber', 'shape': 'cylinder', '3d_coords': [0.049163494259119034, -2.864100217819214, 0.699999988079071], 'rotation': 161.8370138842408}, {'pixel_coords': [203, 131, 10.548327445983887], 'size': 'large', 'color': 'purple', 'material': 'rubber', 'shape': 'cube', '3d_coords': [-0.4719269275665283, -0.5699371695518494, 0.699999988079071], 'rotation': 159.41862667811446}, {'pixel_coords': [253, 75, 13.141877174377441], 'size': 'large', 'color': 'red', 'material': 'rubber', 'shape': 'cube', '3d_coords': [-2.036878824234009, 2.222999334335327, 0.699999988079071], 'rotation': 37.40490732771224}]\n",
57
+ "len: 5\n"
58
+ ]
59
+ }
60
+ ],
61
+ "source": [
62
+ "print(data['scenes'][0]['objects'])\n",
63
+ "print(\"len: \", len(data['scenes'][0]['objects']))"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 17,
69
+ "id": "7c828ca4-08f9-4927-a745-224a95379c2f",
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "def object_info_to_description(object_list):\n",
74
+ " descriptions = []\n",
75
+ " random.shuffle(object_list)\n",
76
+ " for obj in object_list:\n",
77
+ " desc = f\"A {obj['size']} {obj['color']} {obj['material']} {obj['shape']}\"\n",
78
+ " desc += f\" rotated {obj['rotation']:.1f}° located at\"\n",
79
+ " desc += f\" 3D coordinates ({obj['3d_coords'][0]:.2f}, {obj['3d_coords'][1]:.2f}, {obj['3d_coords'][2]:.2f})\"\n",
80
+ " desc += f\" and pixel coordinates ({obj['pixel_coords'][0]}, {obj['pixel_coords'][1]}, {obj['pixel_coords'][2]:.2f})\"\n",
81
+ " descriptions.append(desc)\n",
82
+ " \n",
83
+ " final_description = \"Scene Description:\\n\"\n",
84
+ " for i, desc in enumerate(descriptions, 1):\n",
85
+ " final_description += f\"{desc}\\n\"\n",
86
+ " \n",
87
+ " return final_description"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 18,
93
+ "id": "cb048e25-d554-4bd7-bf11-878e071b5987",
94
+ "metadata": {},
95
+ "outputs": [
96
+ {
97
+ "data": {
98
+ "text/plain": [
99
+ "'Scene Description:\\nA large yellow rubber cylinder rotated 161.8° located at 3D coordinates (0.05, -2.86, 0.70) and pixel coordinates (115, 182, 8.91)\\nA large purple rubber cube rotated 159.4° located at 3D coordinates (-0.47, -0.57, 0.70) and pixel coordinates (203, 131, 10.55)\\nA large red rubber cube rotated 37.4° located at 3D coordinates (-2.04, 2.22, 0.70) and pixel coordinates (253, 75, 13.14)\\nA small green metal sphere rotated 107.7° located at 3D coordinates (0.99, 2.08, 0.35) and pixel coordinates (343, 131, 11.28)\\nA small cyan rubber sphere rotated 305.4° located at 3D coordinates (2.70, 1.53, 0.35) and pixel coordinates (396, 172, 9.86)\\n'"
100
+ ]
101
+ },
102
+ "execution_count": 18,
103
+ "metadata": {},
104
+ "output_type": "execute_result"
105
+ }
106
+ ],
107
+ "source": [
108
+ "object_info_to_description(data['scenes'][0]['objects'])"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 19,
114
+ "id": "ffacd5f3-e9a4-46ca-8c50-187ab12c9f1b",
115
+ "metadata": {},
116
+ "outputs": [],
117
+ "source": [
118
+ "img2obj_dict = {}\n",
119
+ "for scene in data['scenes']:\n",
120
+ " obj_list = scene['objects']\n",
121
+ " img2obj_dict[scene['image_filename']] = obj_list"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": 20,
127
+ "id": "db35f03c-1529-4776-bf4f-3bd44e960e5f",
128
+ "metadata": {},
129
+ "outputs": [
130
+ {
131
+ "data": {
132
+ "text/plain": [
133
+ "{'question_index': 0,\n",
134
+ " 'question_family_index': 29,\n",
135
+ " 'image_index': 0,\n",
136
+ " 'question': 'The big thing that is in front of the large rubber cube in front of the small thing that is behind the tiny matte ball is what color?',\n",
137
+ " 'answer': 'yellow',\n",
138
+ " 'image_filename': 'CLEVR_valB_000000.png',\n",
139
+ " 'split': 'valB',\n",
140
+ " 'program': [{'value_inputs': [], 'inputs': [], 'function': 'scene'},\n",
141
+ " {'value_inputs': ['small'], 'inputs': [0], 'function': 'filter_size'},\n",
142
+ " {'value_inputs': ['rubber'], 'inputs': [1], 'function': 'filter_material'},\n",
143
+ " {'value_inputs': ['sphere'], 'inputs': [2], 'function': 'filter_shape'},\n",
144
+ " {'value_inputs': [], 'inputs': [3], 'function': 'unique'},\n",
145
+ " {'value_inputs': ['behind'], 'inputs': [4], 'function': 'relate'},\n",
146
+ " {'value_inputs': ['small'], 'inputs': [5], 'function': 'filter_size'},\n",
147
+ " {'value_inputs': [], 'inputs': [6], 'function': 'unique'},\n",
148
+ " {'value_inputs': ['front'], 'inputs': [7], 'function': 'relate'},\n",
149
+ " {'value_inputs': ['large'], 'inputs': [8], 'function': 'filter_size'},\n",
150
+ " {'value_inputs': ['rubber'], 'inputs': [9], 'function': 'filter_material'},\n",
151
+ " {'value_inputs': ['cube'], 'inputs': [10], 'function': 'filter_shape'},\n",
152
+ " {'value_inputs': [], 'inputs': [11], 'function': 'unique'},\n",
153
+ " {'value_inputs': ['front'], 'inputs': [12], 'function': 'relate'},\n",
154
+ " {'value_inputs': ['large'], 'inputs': [13], 'function': 'filter_size'},\n",
155
+ " {'value_inputs': [], 'inputs': [14], 'function': 'unique'},\n",
156
+ " {'value_inputs': [], 'inputs': [15], 'function': 'query_color'}]}"
157
+ ]
158
+ },
159
+ "execution_count": 20,
160
+ "metadata": {},
161
+ "output_type": "execute_result"
162
+ }
163
+ ],
164
+ "source": [
165
+ "qs['questions'][0]"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": 21,
171
+ "id": "66b746fc-569c-4922-a442-79dbbc09e33b",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "random.shuffle(qs['questions'])\n",
176
+ "cnt = 0 \n",
177
+ "qa_pairs = [] \n",
178
+ "added_pair = set()\n",
179
+ "for qd in qs['questions']:\n",
180
+ " img_idx = qd['image_filename']\n",
181
+ " total_count = len(img2obj_dict[img_idx]) # object list length\n",
182
+ " desc = object_info_to_description(img2obj_dict[img_idx])\n",
183
+ " question, answer = qd['question'], qd['answer']\n",
184
+ " if 'how many' in question.lower() or 'number' in question.lower():\n",
185
+ " qa_pairs.append({\n",
186
+ " \"img_filename\": img_idx,\n",
187
+ " 'q': question,\n",
188
+ " 'a': answer,\n",
189
+ " 'description': desc \n",
190
+ " })\n",
191
+ " if img_idx not in added_pair:\n",
192
+ " qa_pairs.append({\n",
193
+ " \"img_filename\": img_idx,\n",
194
+ " 'q': \"How many items are there in the described scene?\",\n",
195
+ " 'a': total_count,\n",
196
+ " 'description': desc \n",
197
+ " })\n",
198
+ " added_pair.add(img_idx)\n"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 22,
204
+ "id": "c271fa7b-fed5-472f-a302-6ec203c4b787",
205
+ "metadata": {},
206
+ "outputs": [
207
+ {
208
+ "data": {
209
+ "text/plain": [
210
+ "59978"
211
+ ]
212
+ },
213
+ "execution_count": 22,
214
+ "metadata": {},
215
+ "output_type": "execute_result"
216
+ }
217
+ ],
218
+ "source": [
219
+ "len(qa_pairs)"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "execution_count": 23,
225
+ "id": "b0da8a70-c3f5-4e48-b384-3684933d72ef",
226
+ "metadata": {},
227
+ "outputs": [
228
+ {
229
+ "data": {
230
+ "text/plain": [
231
+ "14884"
232
+ ]
233
+ },
234
+ "execution_count": 23,
235
+ "metadata": {},
236
+ "output_type": "execute_result"
237
+ }
238
+ ],
239
+ "source": [
240
+ "len(added_pair)"
241
+ ]
242
+ },
243
+ {
244
+ "cell_type": "code",
245
+ "execution_count": 24,
246
+ "id": "c648587e-2ec0-427c-b594-f55dd187b4d9",
247
+ "metadata": {},
248
+ "outputs": [],
249
+ "source": [
250
+ "# save for later loading\n",
251
+ "with open(f\"clever_counting_problems_clevr_cogent_v1.0_{split}.json\", 'w') as fw:\n",
252
+ " json.dump( qa_pairs, fw, indent=4)"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": 20,
258
+ "id": "b3a8cbe4-4261-41d3-a481-43a0b1cc2795",
259
+ "metadata": {},
260
+ "outputs": [],
261
+ "source": [
262
+ "random.shuffle(qa_pairs)"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": 57,
268
+ "id": "d6dff4e7-65dd-4e82-82df-340ec2a57919",
269
+ "metadata": {},
270
+ "outputs": [
271
+ {
272
+ "data": {
273
+ "text/plain": [
274
+ "[{'img_filename': 'CLEVR_trainA_048403.png',\n",
275
+ " 'q': 'How many things are both on the right side of the big yellow rubber thing and left of the purple ball?',\n",
276
+ " 'a': '5',\n",
277
+ " 'description': 'Scene Description:\\nA large red rubber cylinder rotated 291.3° located at 3D coordinates (-0.89, -2.73, 0.70) and pixel coordinates (101, 152, 10.04)\\nA small purple metal sphere rotated 247.7° located at 3D coordinates (2.93, 0.87, 0.35) and pixel coordinates (379, 183, 9.66)\\nA large cyan rubber cylinder rotated 114.5° located at 3D coordinates (-2.40, 2.23, 0.70) and pixel coordinates (246, 82, 13.94)\\nA small red metal cylinder rotated 109.9° located at 3D coordinates (-0.95, 1.77, 0.35) and pixel coordinates (270, 113, 12.83)\\nA small red rubber cylinder rotated 343.7° located at 3D coordinates (-0.12, -0.74, 0.35) and pixel coordinates (209, 153, 10.82)\\nA large red rubber cylinder rotated 324.5° located at 3D coordinates (-2.71, -2.21, 0.70) and pixel coordinates (84, 119, 11.59)\\nA small red metal cylinder rotated 1.1° located at 3D coordinates (2.88, -0.12, 0.35) and pixel coordinates (342, 200, 9.12)\\nA small gray rubber cube rotated 144.9° located at 3D coordinates (0.79, 0.98, 0.35) and pixel coordinates (299, 145, 11.19)\\nA large yellow rubber cube rotated 90.0° located at 3D coordinates (-1.78, -0.31, 0.70) and pixel coordinates (180, 110, 12.05)\\n'},\n",
278
+ " {'img_filename': 'CLEVR_trainA_048403.png',\n",
279
+ " 'q': 'How many items are there in the described scene?',\n",
280
+ " 'a': 9,\n",
281
+ " 'description': 'Scene Description:\\nA large red rubber cylinder rotated 291.3° located at 3D coordinates (-0.89, -2.73, 0.70) and pixel coordinates (101, 152, 10.04)\\nA small purple metal sphere rotated 247.7° located at 3D coordinates (2.93, 0.87, 0.35) and pixel coordinates (379, 183, 9.66)\\nA large cyan rubber cylinder rotated 114.5° located at 3D coordinates (-2.40, 2.23, 0.70) and pixel coordinates (246, 82, 13.94)\\nA small red metal cylinder rotated 109.9° located at 3D coordinates (-0.95, 1.77, 0.35) and pixel coordinates (270, 113, 12.83)\\nA small red rubber cylinder rotated 343.7° located at 3D coordinates (-0.12, -0.74, 0.35) and pixel coordinates (209, 153, 10.82)\\nA large red rubber cylinder rotated 324.5° located at 3D coordinates (-2.71, -2.21, 0.70) and pixel coordinates (84, 119, 11.59)\\nA small red metal cylinder rotated 1.1° located at 3D coordinates (2.88, -0.12, 0.35) and pixel coordinates (342, 200, 9.12)\\nA small gray rubber cube rotated 144.9° located at 3D coordinates (0.79, 0.98, 0.35) and pixel coordinates (299, 145, 11.19)\\nA large yellow rubber cube rotated 90.0° located at 3D coordinates (-1.78, -0.31, 0.70) and pixel coordinates (180, 110, 12.05)\\n'}]"
282
+ ]
283
+ },
284
+ "execution_count": 57,
285
+ "metadata": {},
286
+ "output_type": "execute_result"
287
+ }
288
+ ],
289
+ "source": [
290
+ "qa_pairs[:2]"
291
+ ]
292
+ },
293
+ {
294
+ "cell_type": "code",
295
+ "execution_count": 26,
296
+ "id": "a6a66364-5b47-4138-91d6-a045404d21b1",
297
+ "metadata": {},
298
+ "outputs": [],
299
+ "source": [
300
+ "def query_r1(query='who are you?', model=\"deepseek-ai/DeepSeek-R1\"):\n",
301
+ " # Create the chat completion\n",
302
+ " response = client.chat.completions.create(\n",
303
+ " model=model,\n",
304
+ " messages=[\n",
305
+ " {'role': 'user', \n",
306
+ " 'content': query}\n",
307
+ " ],\n",
308
+ " stream=False,\n",
309
+ " )\n",
310
+ " # Print the response\n",
311
+ " return response.choices[0].message.content"
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": 44,
317
+ "id": "e5d5649f-c4e3-4f3f-b76e-7f7ed27f68e8",
318
+ "metadata": {},
319
+ "outputs": [],
320
+ "source": [
321
+ "def format_query(qa_dict):\n",
322
+ " query = \"Answer the question according to scene description.\\n\\n\"\n",
323
+ " query += qa_dict['description']\n",
324
+ " query += f\"\\nQuestion:\\n{qa_dict['q']}\"\n",
325
+ " return query \n",
326
+ " "
327
+ ]
328
+ },
329
+ {
330
+ "cell_type": "code",
331
+ "execution_count": 39,
332
+ "id": "7f568a4e-f217-464a-8329-bbefb64d9653",
333
+ "metadata": {},
334
+ "outputs": [
335
+ {
336
+ "name": "stdout",
337
+ "output_type": "stream",
338
+ "text": [
339
+ "<think>Okay, let's see. The user is asking how many items are there in the described scene. Let me go through the scene description step by step.\n",
340
+ "\n",
341
+ "So, the scene description lists each object with details like color, material, shape, rotation, 3D coordinates, and pixel coordinates. Each entry starts with \"A\" which usually indicates one item each. Let me count each one.\n",
342
+ "\n",
343
+ "First entry: \"A small green metal cylinder...\" That's one. Second: \"A small blue rubber cylinder...\" Second item. Third: \"A small cyan rubber cylinder...\" That's three. Fourth: \"A large cyan metal sphere...\" Four. Fifth: \"A large brown metal cube...\" Five. Sixth: \"A large yellow rubber cube...\" Six. Seventh: \"A large brown rubber cylinder...\" That's seven. \n",
344
+ "\n",
345
+ "Wait, did I miss any? Let me check again. The list has entries from \"A small green...\" up to the seventh one. Each sentence starts with \"A\", which suggests each is a separate item. No commas separating multiple items in a single entry. Each has different attributes and coordinates, so they must all be distinct. \n",
346
+ "\n",
347
+ "So the answer should be 7 items.\n",
348
+ "</think>\n",
349
+ "\n",
350
+ "There are 7 items in the described scene. Each entry corresponds to one distinct object, listed by their properties, coordinates, and rotations.\n",
351
+ "None\n"
352
+ ]
353
+ }
354
+ ],
355
+ "source": [
356
+ "debug_query = format_query(qa_pairs[0])\n",
357
+ "print(query_r1(debug_query))"
358
+ ]
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": 41,
363
+ "id": "cdc4231a-8ef4-4cf6-a575-d84ae7bbd0b5",
364
+ "metadata": {},
365
+ "outputs": [
366
+ {
367
+ "name": "stdout",
368
+ "output_type": "stream",
369
+ "text": [
370
+ "Answer the question accordingly to scene description.\n",
371
+ "\n",
372
+ "Scene Description:\n",
373
+ "A small green metal cylinder rotated 329.5° located at 3D coordinates (-2.49, -1.65, 0.35) and pixel coordinates (111, 132, 11.81)\n",
374
+ "A small blue rubber cylinder rotated 312.2° located at 3D coordinates (-1.73, -2.91, 0.35) and pixel coordinates (76, 163, 10.57)\n",
375
+ "A small cyan rubber cylinder rotated 48.4° located at 3D coordinates (-2.10, -0.22, 0.35) and pixel coordinates (172, 118, 12.41)\n",
376
+ "A large cyan metal sphere rotated 27.4° located at 3D coordinates (1.52, -1.26, 0.70) and pixel coordinates (247, 181, 9.33)\n",
377
+ "A large brown metal cube rotated 107.7° located at 3D coordinates (-0.73, 2.39, 0.70) and pixel coordinates (290, 92, 12.93)\n",
378
+ "A large yellow rubber cube rotated 288.2° located at 3D coordinates (0.52, 0.63, 0.70) and pixel coordinates (279, 130, 11.09)\n",
379
+ "A large brown rubber cylinder rotated 229.8° located at 3D coordinates (2.38, 0.38, 0.70) and pixel coordinates (343, 166, 9.77)\n",
380
+ "\n",
381
+ "Question:\n",
382
+ "How many items are there in the described scene?\n"
383
+ ]
384
+ }
385
+ ],
386
+ "source": [
387
+ "print(debug_query)"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": 42,
393
+ "id": "4cf90eb6-2cce-4e3d-8190-c44168a66dca",
394
+ "metadata": {},
395
+ "outputs": [
396
+ {
397
+ "data": {
398
+ "text/plain": [
399
+ "{'img_filename': 'CLEVR_train_044000.png',\n",
400
+ " 'q': 'How many rubber objects are either small blue spheres or small things?',\n",
401
+ " 'a': '2',\n",
402
+ " 'description': 'Scene Description:\\nA large purple rubber sphere rotated 78.4° located at 3D coordinates (2.27, 0.87, 0.70) and pixel coordinates (360, 156, 9.49)\\nA large gray metal cube rotated 152.7° located at 3D coordinates (2.79, -1.26, 0.70) and pixel coordinates (301, 213, 7.91)\\nA large purple metal sphere rotated 79.2° located at 3D coordinates (-2.66, -2.74, 0.70) and pixel coordinates (51, 126, 10.61)\\nA large blue rubber sphere rotated 279.5° located at 3D coordinates (1.31, 2.72, 0.70) and pixel coordinates (376, 112, 11.19)\\nA small brown rubber cube rotated 124.1° located at 3D coordinates (-2.49, 2.61, 0.35) and pixel coordinates (251, 82, 13.79)\\nA small green rubber sphere rotated 323.9° located at 3D coordinates (-2.02, 0.45, 0.35) and pixel coordinates (197, 109, 12.22)\\n'}"
403
+ ]
404
+ },
405
+ "execution_count": 42,
406
+ "metadata": {},
407
+ "output_type": "execute_result"
408
+ }
409
+ ],
410
+ "source": [
411
+ "qa_pairs[1]"
412
+ ]
413
+ },
414
+ {
415
+ "cell_type": "code",
416
+ "execution_count": 45,
417
+ "id": "33fcd4eb-1f33-47d4-a453-76ef00e6d5d3",
418
+ "metadata": {},
419
+ "outputs": [
420
+ {
421
+ "name": "stdout",
422
+ "output_type": "stream",
423
+ "text": [
424
+ "<think>Okay, let's tackle this question. The user wants to know how many rubber objects are either small blue spheres or small things. Hmm, first, I need to parse each part of the question correctly.\n",
425
+ "\n",
426
+ "Let's go back to the scene description. The objects are listed with their attributes: size, color, material, shape, rotation, 3D and pixel coordinates. The key here is to filter the objects based on the criteria given.\n",
427
+ "\n",
428
+ "The question has two parts: \"small blue spheres\" OR \"small things\". So any rubber object that is either a small blue sphere or any small thing (regardless of other attributes) counts. But wait, do both categories need to be rubber? Because the question says \"rubber objects are either...\". So rubber is the material, and the condition is either being a small blue sphere or a small thing. So first, all rubber objects, and among them, count those that are either small blue spheres or small (regardless of color or shape). Wait, no. Let me re-read the question.\n",
429
+ "\n",
430
+ "The question is: \"How many rubber objects are either small blue spheres or small things?\" So rubber is the material. Within all rubber objects, count the ones that are either (1) small blue spheres OR (2) small things. Wait, does (2) being small things mean that even if they're small and of any color or shape, but they must be rubber?\n",
431
+ "\n",
432
+ "Yes, because the entire set is rubber objects. So first, select all objects where material is rubber. Then, within those, count how many are either (1) small, blue, sphere, or (2) small (any color or shape). Wait, but the structure is \"either X or Y\", where X is \"small blue sphere\" and Y is \"small things\". But \"small things\" would include all small objects, regardless of color and shape. However, since we've already narrowed it to rubber objects, \"small things\" here would be small rubber objects, regardless of color and shape.\n",
433
+ "\n",
434
+ "But wait, the condition is within rubber objects. So for the first part, small blue spheres (must check size, color, shape) and for the second part, small things (size is small, any color and shape, but since material is already rubber, that's covered). But wait, does the OR merge the two conditions, leading to rubber objects that are either (small blue spheres) or (small any-color any-shape).\n",
435
+ "\n",
436
+ "So the combined condition is: object is rubber AND ( (is small AND blue AND sphere) OR (is small) ). Wait, but if the condition for the second part is just \"small things\", which would imply any small object. But the entire group is rubber objects. So it's rubber objects that are small blue spheres OR rubber objects that are small (regardless of color or shape).\n",
437
+ "\n",
438
+ "Wait, no. Let's parse the sentence again: \"rubber objects are either small blue spheres or small things\". The \"either/or\" applies to \"small blue spheres\" and \"small things\". So, each rubber object has to be either (a small blue sphere) or (a small thing). However, \"small things\" here might refer to any small object regardless of other attributes. So if a rubber object is small, regardless of color or shape, it counts. But then, the first condition (small blue sphere) would also satisfy being a small thing. Wait, so there's an overlap. But when dealing with OR conditions, we have to avoid double-counting. So, the actual count is the number of small rubber objects (since any small rubber object is covered by the second part, which includes all small rubber objects, whether blue spheres or not) plus any objects that are small blue spheres but not rubber? But no, the question specifies \"rubber objects\", so we can ignore non-rubber ones.\n",
439
+ "\n",
440
+ "Wait, perhaps the wording is: \"rubber objects that are either small blue spheres or small things\". So \"small things\" here must reference other attributes. Wait, maybe there's ambiguity here. If the user is grouping \"small things\" as a separate category, regardless of being the other attributes. Let me try to approach this step by step.\n",
441
+ "\n",
442
+ "First, list all the rubber objects from the scene description:\n",
443
+ "\n",
444
+ "Looking through the list:\n",
445
+ "\n",
446
+ "1. A large purple rubber sphere ... location etc.\n",
447
+ "So material rubber, large, purple, sphere.\n",
448
+ "\n",
449
+ "2. A large gray metal cube ... metal, so not rubber.\n",
450
+ "\n",
451
+ "3. A large purple metal sphere ... metal, not rubber.\n",
452
+ "\n",
453
+ "4. A large blue rubber sphere ... rubber, large, blue, sphere.\n",
454
+ "\n",
455
+ "5. A small brown rubber cube ... rubber, small, brown, cube.\n",
456
+ "\n",
457
+ "6. A small green rubber sphere ... rubber, small, green, sphere.\n",
458
+ "\n",
459
+ "So the rubber objects are items 1,4,5,6.\n",
460
+ "\n",
461
+ "Now, for each of these four rubber objects, check if they are either (small blue sphere) or (small things).\n",
462
+ "\n",
463
+ "Let's check each:\n",
464
+ "\n",
465
+ "1. Large purple rubber sphere. Size: large. So for the first condition (small blue sphere): no. For the second condition (small thing): size large, so no. Not included.\n",
466
+ "\n",
467
+ "4. Large blue rubber sphere. Large. So, even though it's blue and sphere, the size is large. So (small blue sphere: no, since size is large). (small thing: no.) So not included.\n",
468
+ "\n",
469
+ "5. Small brown rubber cube. Size is small. So regardless of color and shape (brown, cube), this is a small thing. So it meets the second condition. So count it.\n",
470
+ "\n",
471
+ "6. Small green rubber sphere. Size: small. So meets the second condition (small thing). But it's a sphere and green, not blue. So this counts under the second category.\n",
472
+ "\n",
473
+ "Now, what about the first condition: small blue spheres. Are there any rubber objects that are small blue spheres? The only blue rubber sphere is item 4, which is large. So none of the rubber objects are small blue spheres.\n",
474
+ "\n",
475
+ "So now the two options under the OR are:\n",
476
+ "\n",
477
+ "- small blue sphere (none found)\n",
478
+ "- small things (items 5 and 6), both are rubber.\n",
479
+ "\n",
480
+ "Therefore total count is 0 + 2 = 2.\n",
481
+ "\n",
482
+ "Wait, but the initial question is: number of rubber objects that are either (small blue spheres) or (small things). Since \"small blue spheres\" are a subset of \"small things\" (since if something is a small blue sphere, it is a small thing). Therefore if there were any such objects, they would be counted twice, but in this case, since there are none, the total is just the count of small things.\n",
483
+ "\n",
484
+ "But according to the data, the rubber objects that are small are two: item 5 (small brown cube) and item 6 (small green sphere). Neither is a blue sphere, so no overlap. Therefore, sum is 2.\n",
485
+ "\n",
486
+ "So the answer should be 2.\n",
487
+ "\n",
488
+ "I need to confirm again whether the OR condition includes the union, meaning any rubber object that is either condition. So if either condition is true, count it.\n",
489
+ "\n",
490
+ "Since both 5 and 6 are small, even though they aren't blue spheres, they belong to the second category (small things). The blue sphere (if it were small) would have been counted as well. But in this case, the valid counts are 5 and 6. So yes, answer is 2.\n",
491
+ "</think>\n",
492
+ "\n",
493
+ "The answer is 2. \n",
494
+ "\n",
495
+ "**Explanation:**\n",
496
+ "First, identify all rubber objects from the scene description:\n",
497
+ "1. Large purple rubber sphere (not small)\n",
498
+ "2. Large blue rubber sphere (not small)\n",
499
+ "3. Small brown rubber cube (small)\n",
500
+ "4. Small green rubber sphere (small)\n",
501
+ "\n",
502
+ "Next, apply the criteria:\n",
503
+ "- **Small blue spheres**: None of the rubber objects meet this (the only blue rubber sphere is large).\n",
504
+ "- **Small rubber objects (regardless of color/shape)**: The small brown rubber cube and small green rubber sphere qualify (2 objects).\n",
505
+ "\n",
506
+ "Thus, there are **2 rubber objects** that fit either criterion.\n"
507
+ ]
508
+ }
509
+ ],
510
+ "source": [
511
+ "debug_query1 = format_query(qa_pairs[1])\n",
512
+ "res1 = query_r1(debug_query1)"
513
+ ]
514
+ },
515
+ {
516
+ "cell_type": "code",
517
+ "execution_count": 47,
518
+ "id": "8e516bd0-f1e5-4898-88a3-3afcaf0ae34e",
519
+ "metadata": {},
520
+ "outputs": [
521
+ {
522
+ "data": {
523
+ "text/plain": [
524
+ "{'img_filename': 'CLEVR_train_044000.png',\n",
525
+ " 'q': 'How many rubber objects are either small blue spheres or small things?',\n",
526
+ " 'a': '2',\n",
527
+ " 'description': 'Scene Description:\\nA large purple rubber sphere rotated 78.4° located at 3D coordinates (2.27, 0.87, 0.70) and pixel coordinates (360, 156, 9.49)\\nA large gray metal cube rotated 152.7° located at 3D coordinates (2.79, -1.26, 0.70) and pixel coordinates (301, 213, 7.91)\\nA large purple metal sphere rotated 79.2° located at 3D coordinates (-2.66, -2.74, 0.70) and pixel coordinates (51, 126, 10.61)\\nA large blue rubber sphere rotated 279.5° located at 3D coordinates (1.31, 2.72, 0.70) and pixel coordinates (376, 112, 11.19)\\nA small brown rubber cube rotated 124.1° located at 3D coordinates (-2.49, 2.61, 0.35) and pixel coordinates (251, 82, 13.79)\\nA small green rubber sphere rotated 323.9° located at 3D coordinates (-2.02, 0.45, 0.35) and pixel coordinates (197, 109, 12.22)\\n'}"
528
+ ]
529
+ },
530
+ "execution_count": 47,
531
+ "metadata": {},
532
+ "output_type": "execute_result"
533
+ }
534
+ ],
535
+ "source": [
536
+ "qa_pairs[1]"
537
+ ]
538
+ },
539
+ {
540
+ "cell_type": "code",
541
+ "execution_count": null,
542
+ "id": "92784518-49e2-443d-9541-2785cbb944cf",
543
+ "metadata": {},
544
+ "outputs": [],
545
+ "source": []
546
+ }
547
+ ],
548
+ "metadata": {
549
+ "kernelspec": {
550
+ "display_name": "Python 3 (ipykernel)",
551
+ "language": "python",
552
+ "name": "python3"
553
+ },
554
+ "language_info": {
555
+ "codemirror_mode": {
556
+ "name": "ipython",
557
+ "version": 3
558
+ },
559
+ "file_extension": ".py",
560
+ "mimetype": "text/x-python",
561
+ "name": "python",
562
+ "nbconvert_exporter": "python",
563
+ "pygments_lexer": "ipython3",
564
+ "version": "3.12.2"
565
+ }
566
+ },
567
+ "nbformat": 4,
568
+ "nbformat_minor": 5
569
+ }
previous_version/Video-R1-main-previous/src/distill_r1/grpo_r1_distilled.jpg ADDED

Git LFS Details

  • SHA256: e0f6135ef837a375090b07e29a18fd2d5cb819100c73d5dc7ea63401f66caf59
  • Pointer size: 131 Bytes
  • Size of remote file: 304 kB
previous_version/Video-R1-main-previous/src/distill_r1/query_r1.py ADDED
@@ -0,0 +1,114 @@
import json
import random
import os
from openai import OpenAI
from tqdm import tqdm
import concurrent.futures
from typing import List, Dict, Optional
from datetime import datetime
from threading import Lock
import time
from prompt import R1_SYS_PROMPT

# Initialize the client
client = OpenAI(
    api_key=os.environ.get("SL_KEY", "YOUR_SILCONFLOW_KEY"),
    base_url="https://api.siliconflow.cn/v1",
)

# Create a lock for thread-safe file writing
file_lock = Lock()

def format_query(qa_dict: Dict, v2=False) -> str:
    query = "Answer the question according to scene description.\n\n"
    query += qa_dict["description"]
    query += f"\nQuestion:\n{qa_dict['q']}"
    if v2:
        query += "\nInstructions:\n"
        query += "1. Carefully analyze the scene description\n"
        query += "2. Provide your reasoning if necessary\n"
        query += "3. For the final answer, start a new line with '**The answer is: **' followed by your answer\n"
    return query

def write_to_jsonl(result: Dict, filename: str):
    """Thread-safe function to write a result to JSONL file"""
    with file_lock:
        with open(filename, 'a') as f:
            f.write(json.dumps(result) + '\n')

def query_r1(qa_pair: Dict, output_file: str, model: str = "deepseek-ai/DeepSeek-R1", v2=False) -> Optional[Dict]:
    query = format_query(qa_pair, v2=v2)
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": R1_SYS_PROMPT},
                {"role": "user", "content": query}],
            stream=False,
            max_tokens=4096
        )
        result = {
            **qa_pair,
            "r1_response": response.choices[0].message.content,
            "timestamp": datetime.now().isoformat()
        }
        # Write result immediately
        write_to_jsonl(result, output_file)
        time.sleep(4)
        return result
    except Exception as e:
        print(f"Error processing query: {e}")
        error_result = {
            **qa_pair,
            "error": str(e),
            "timestamp": datetime.now().isoformat()
        }
        write_to_jsonl(error_result, f"errors_{output_file}")
        time.sleep(10)
        return None

def process_qa_pairs_parallel(qa_pairs: List[Dict], output_file: str, max_workers: int = 10) -> List[Dict]:
    successful_count = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Create futures for all qa_pairs
        futures = [executor.submit(query_r1, qa_pair, output_file, v2="v2" in output_file) for qa_pair in qa_pairs]

        # Process results as they complete with progress bar
        results = []
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            try:
                result = future.result()
                if result is not None:
                    results.append(result)
                    successful_count += 1
            except Exception as e:
                print(f"Failed to process query: {e}")

    return results

if __name__ == "__main__":
    # Load and shuffle QA pairs
    random.seed(1234)
    qa_pairs = json.load(open("/home/lilei/Visual-R1/data/clever_counting_problems_clevr_cogent_v1.0_trainA.json"))
    random.shuffle(qa_pairs)
    qa_pairs = qa_pairs[:10000]
    # Create output filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"r1_results_clevr_cogent_v1.0_trainA_v2.jsonl"

    finished = set()
    with open(output_file, 'r') as f:
        for line in f:
            ins = json.loads(line)
            key = ins["img_filename"] + "-" + ins["q"] + "-" + str(ins["a"])
            finished.add(key)
    qa_pairs = [ins for ins in qa_pairs if ins["img_filename"] + "-" + ins["q"] + "-" + str(ins["a"]) not in finished]
    print("Finished: ", len(finished))
    print("Remaining: ", len(qa_pairs))
    # Process QA pairs in parallel
    r1_results = process_qa_pairs_parallel(qa_pairs, output_file)

    # Print final statistics
    print(f"Successfully processed {len(r1_results)} out of {len(qa_pairs)} queries")
    print(f"Results saved to {output_file}")
    print(f"Any errors were saved to errors_{output_file}")
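
As a quick illustration of the record shape format_query() expects (the same img_filename / q / a / description fields written by generate_scene_qa_pairs.ipynb above), a minimal sketch with made-up values:

# Hypothetical QA record; the field values are illustrative only.
qa_pair = {
    "img_filename": "CLEVR_trainA_000000.png",
    "q": "How many items are there in the described scene?",
    "a": 5,
    "description": "Scene Description:\nA small red rubber cube ...\n",
}

# v2=True appends the explicit answer-format instructions defined in format_query().
print(format_query(qa_pair, v2=True))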
previous_version/Video-R1-main-previous/src/eval/prompts/geoqa_test_prompts.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
previous_version/Video-R1-main-previous/src/eval/prompts/superclevr_test200_counting_problems.jsonl ADDED
@@ -0,0 +1,200 @@
1
+ {"image_path": "./images/superCLEVR_new_025000.png", "question": "How many different items are there in the image?", "ground_truth": 4}
2
+ {"image_path": "./images/superCLEVR_new_025001.png", "question": "How many different items are there in the image?", "ground_truth": 9}
3
+ {"image_path": "./images/superCLEVR_new_025002.png", "question": "How many different items are there in the image?", "ground_truth": 10}
4
+ {"image_path": "./images/superCLEVR_new_025003.png", "question": "How many different items are there in the image?", "ground_truth": 4}
5
+ {"image_path": "./images/superCLEVR_new_025004.png", "question": "How many different items are there in the image?", "ground_truth": 3}
6
+ {"image_path": "./images/superCLEVR_new_025005.png", "question": "How many different items are there in the image?", "ground_truth": 3}
7
+ {"image_path": "./images/superCLEVR_new_025006.png", "question": "How many different items are there in the image?", "ground_truth": 3}
8
+ {"image_path": "./images/superCLEVR_new_025007.png", "question": "How many different items are there in the image?", "ground_truth": 4}
9
+ {"image_path": "./images/superCLEVR_new_025008.png", "question": "How many different items are there in the image?", "ground_truth": 9}
10
+ {"image_path": "./images/superCLEVR_new_025009.png", "question": "How many different items are there in the image?", "ground_truth": 10}
11
+ {"image_path": "./images/superCLEVR_new_025010.png", "question": "How many different items are there in the image?", "ground_truth": 7}
12
+ {"image_path": "./images/superCLEVR_new_025011.png", "question": "How many different items are there in the image?", "ground_truth": 7}
13
+ {"image_path": "./images/superCLEVR_new_025012.png", "question": "How many different items are there in the image?", "ground_truth": 7}
14
+ {"image_path": "./images/superCLEVR_new_025013.png", "question": "How many different items are there in the image?", "ground_truth": 6}
15
+ {"image_path": "./images/superCLEVR_new_025014.png", "question": "How many different items are there in the image?", "ground_truth": 5}
16
+ {"image_path": "./images/superCLEVR_new_025015.png", "question": "How many different items are there in the image?", "ground_truth": 10}
17
+ {"image_path": "./images/superCLEVR_new_025016.png", "question": "How many different items are there in the image?", "ground_truth": 4}
18
+ {"image_path": "./images/superCLEVR_new_025017.png", "question": "How many different items are there in the image?", "ground_truth": 5}
19
+ {"image_path": "./images/superCLEVR_new_025018.png", "question": "How many different items are there in the image?", "ground_truth": 6}
20
+ {"image_path": "./images/superCLEVR_new_025019.png", "question": "How many different items are there in the image?", "ground_truth": 8}
21
+ {"image_path": "./images/superCLEVR_new_025020.png", "question": "How many different items are there in the image?", "ground_truth": 10}
22
+ {"image_path": "./images/superCLEVR_new_025021.png", "question": "How many different items are there in the image?", "ground_truth": 3}
23
+ {"image_path": "./images/superCLEVR_new_025022.png", "question": "How many different items are there in the image?", "ground_truth": 4}
24
+ {"image_path": "./images/superCLEVR_new_025023.png", "question": "How many different items are there in the image?", "ground_truth": 4}
25
+ {"image_path": "./images/superCLEVR_new_025024.png", "question": "How many different items are there in the image?", "ground_truth": 5}
26
+ {"image_path": "./images/superCLEVR_new_025025.png", "question": "How many different items are there in the image?", "ground_truth": 5}
27
+ {"image_path": "./images/superCLEVR_new_025026.png", "question": "How many different items are there in the image?", "ground_truth": 7}
28
+ {"image_path": "./images/superCLEVR_new_025027.png", "question": "How many different items are there in the image?", "ground_truth": 4}
29
+ {"image_path": "./images/superCLEVR_new_025028.png", "question": "How many different items are there in the image?", "ground_truth": 4}
30
+ {"image_path": "./images/superCLEVR_new_025029.png", "question": "How many different items are there in the image?", "ground_truth": 9}
31
+ {"image_path": "./images/superCLEVR_new_025030.png", "question": "How many different items are there in the image?", "ground_truth": 8}
32
+ {"image_path": "./images/superCLEVR_new_025031.png", "question": "How many different items are there in the image?", "ground_truth": 6}
33
+ {"image_path": "./images/superCLEVR_new_025032.png", "question": "How many different items are there in the image?", "ground_truth": 3}
34
+ {"image_path": "./images/superCLEVR_new_025033.png", "question": "How many different items are there in the image?", "ground_truth": 10}
35
+ {"image_path": "./images/superCLEVR_new_025034.png", "question": "How many different items are there in the image?", "ground_truth": 9}
36
+ {"image_path": "./images/superCLEVR_new_025035.png", "question": "How many different items are there in the image?", "ground_truth": 9}
37
+ {"image_path": "./images/superCLEVR_new_025036.png", "question": "How many different items are there in the image?", "ground_truth": 3}
38
+ {"image_path": "./images/superCLEVR_new_025037.png", "question": "How many different items are there in the image?", "ground_truth": 6}
39
+ {"image_path": "./images/superCLEVR_new_025038.png", "question": "How many different items are there in the image?", "ground_truth": 6}
40
+ {"image_path": "./images/superCLEVR_new_025039.png", "question": "How many different items are there in the image?", "ground_truth": 5}
41
+ {"image_path": "./images/superCLEVR_new_025040.png", "question": "How many different items are there in the image?", "ground_truth": 3}
42
+ {"image_path": "./images/superCLEVR_new_025041.png", "question": "How many different items are there in the image?", "ground_truth": 10}
43
+ {"image_path": "./images/superCLEVR_new_025042.png", "question": "How many different items are there in the image?", "ground_truth": 6}
44
+ {"image_path": "./images/superCLEVR_new_025043.png", "question": "How many different items are there in the image?", "ground_truth": 3}
45
+ {"image_path": "./images/superCLEVR_new_025044.png", "question": "How many different items are there in the image?", "ground_truth": 6}
46
+ {"image_path": "./images/superCLEVR_new_025045.png", "question": "How many different items are there in the image?", "ground_truth": 5}
47
+ {"image_path": "./images/superCLEVR_new_025046.png", "question": "How many different items are there in the image?", "ground_truth": 7}
48
+ {"image_path": "./images/superCLEVR_new_025047.png", "question": "How many different items are there in the image?", "ground_truth": 5}
49
+ {"image_path": "./images/superCLEVR_new_025048.png", "question": "How many different items are there in the image?", "ground_truth": 5}
50
+ {"image_path": "./images/superCLEVR_new_025049.png", "question": "How many different items are there in the image?", "ground_truth": 10}
51
+ {"image_path": "./images/superCLEVR_new_025050.png", "question": "How many different items are there in the image?", "ground_truth": 6}
52
+ {"image_path": "./images/superCLEVR_new_025051.png", "question": "How many different items are there in the image?", "ground_truth": 3}
53
+ {"image_path": "./images/superCLEVR_new_025052.png", "question": "How many different items are there in the image?", "ground_truth": 7}
54
+ {"image_path": "./images/superCLEVR_new_025053.png", "question": "How many different items are there in the image?", "ground_truth": 9}
55
+ {"image_path": "./images/superCLEVR_new_025054.png", "question": "How many different items are there in the image?", "ground_truth": 7}
56
+ {"image_path": "./images/superCLEVR_new_025055.png", "question": "How many different items are there in the image?", "ground_truth": 6}
57
+ {"image_path": "./images/superCLEVR_new_025056.png", "question": "How many different items are there in the image?", "ground_truth": 9}
58
+ {"image_path": "./images/superCLEVR_new_025057.png", "question": "How many different items are there in the image?", "ground_truth": 8}
59
+ {"image_path": "./images/superCLEVR_new_025058.png", "question": "How many different items are there in the image?", "ground_truth": 10}
60
+ {"image_path": "./images/superCLEVR_new_025059.png", "question": "How many different items are there in the image?", "ground_truth": 10}
61
+ {"image_path": "./images/superCLEVR_new_025060.png", "question": "How many different items are there in the image?", "ground_truth": 8}
62
+ {"image_path": "./images/superCLEVR_new_025061.png", "question": "How many different items are there in the image?", "ground_truth": 8}
63
+ {"image_path": "./images/superCLEVR_new_025062.png", "question": "How many different items are there in the image?", "ground_truth": 8}
64
+ {"image_path": "./images/superCLEVR_new_025063.png", "question": "How many different items are there in the image?", "ground_truth": 10}
65
+ {"image_path": "./images/superCLEVR_new_025064.png", "question": "How many different items are there in the image?", "ground_truth": 3}
66
+ {"image_path": "./images/superCLEVR_new_025065.png", "question": "How many different items are there in the image?", "ground_truth": 4}
67
+ {"image_path": "./images/superCLEVR_new_025066.png", "question": "How many different items are there in the image?", "ground_truth": 6}
68
+ {"image_path": "./images/superCLEVR_new_025067.png", "question": "How many different items are there in the image?", "ground_truth": 7}
69
+ {"image_path": "./images/superCLEVR_new_025068.png", "question": "How many different items are there in the image?", "ground_truth": 3}
70
+ {"image_path": "./images/superCLEVR_new_025069.png", "question": "How many different items are there in the image?", "ground_truth": 10}
71
+ {"image_path": "./images/superCLEVR_new_025070.png", "question": "How many different items are there in the image?", "ground_truth": 9}
72
+ {"image_path": "./images/superCLEVR_new_025071.png", "question": "How many different items are there in the image?", "ground_truth": 6}
73
+ {"image_path": "./images/superCLEVR_new_025072.png", "question": "How many different items are there in the image?", "ground_truth": 10}
74
+ {"image_path": "./images/superCLEVR_new_025073.png", "question": "How many different items are there in the image?", "ground_truth": 5}
75
+ {"image_path": "./images/superCLEVR_new_025074.png", "question": "How many different items are there in the image?", "ground_truth": 9}
76
+ {"image_path": "./images/superCLEVR_new_025075.png", "question": "How many different items are there in the image?", "ground_truth": 3}
77
+ {"image_path": "./images/superCLEVR_new_025076.png", "question": "How many different items are there in the image?", "ground_truth": 5}
78
+ {"image_path": "./images/superCLEVR_new_025077.png", "question": "How many different items are there in the image?", "ground_truth": 5}
79
+ {"image_path": "./images/superCLEVR_new_025078.png", "question": "How many different items are there in the image?", "ground_truth": 5}
80
+ {"image_path": "./images/superCLEVR_new_025079.png", "question": "How many different items are there in the image?", "ground_truth": 9}
81
+ {"image_path": "./images/superCLEVR_new_025080.png", "question": "How many different items are there in the image?", "ground_truth": 5}
82
+ {"image_path": "./images/superCLEVR_new_025081.png", "question": "How many different items are there in the image?", "ground_truth": 5}
83
+ {"image_path": "./images/superCLEVR_new_025082.png", "question": "How many different items are there in the image?", "ground_truth": 10}
84
+ {"image_path": "./images/superCLEVR_new_025083.png", "question": "How many different items are there in the image?", "ground_truth": 4}
85
+ {"image_path": "./images/superCLEVR_new_025084.png", "question": "How many different items are there in the image?", "ground_truth": 8}
86
+ {"image_path": "./images/superCLEVR_new_025085.png", "question": "How many different items are there in the image?", "ground_truth": 8}
87
+ {"image_path": "./images/superCLEVR_new_025086.png", "question": "How many different items are there in the image?", "ground_truth": 10}
88
+ {"image_path": "./images/superCLEVR_new_025087.png", "question": "How many different items are there in the image?", "ground_truth": 9}
89
+ {"image_path": "./images/superCLEVR_new_025088.png", "question": "How many different items are there in the image?", "ground_truth": 3}
90
+ {"image_path": "./images/superCLEVR_new_025089.png", "question": "How many different items are there in the image?", "ground_truth": 4}
91
+ {"image_path": "./images/superCLEVR_new_025090.png", "question": "How many different items are there in the image?", "ground_truth": 9}
92
+ {"image_path": "./images/superCLEVR_new_025091.png", "question": "How many different items are there in the image?", "ground_truth": 7}
93
+ {"image_path": "./images/superCLEVR_new_025092.png", "question": "How many different items are there in the image?", "ground_truth": 6}
94
+ {"image_path": "./images/superCLEVR_new_025093.png", "question": "How many different items are there in the image?", "ground_truth": 10}
95
+ {"image_path": "./images/superCLEVR_new_025094.png", "question": "How many different items are there in the image?", "ground_truth": 6}
96
+ {"image_path": "./images/superCLEVR_new_025095.png", "question": "How many different items are there in the image?", "ground_truth": 6}
97
+ {"image_path": "./images/superCLEVR_new_025096.png", "question": "How many different items are there in the image?", "ground_truth": 8}
98
+ {"image_path": "./images/superCLEVR_new_025097.png", "question": "How many different items are there in the image?", "ground_truth": 7}
99
+ {"image_path": "./images/superCLEVR_new_025098.png", "question": "How many different items are there in the image?", "ground_truth": 10}
100
+ {"image_path": "./images/superCLEVR_new_025099.png", "question": "How many different items are there in the image?", "ground_truth": 10}
101
+ {"image_path": "./images/superCLEVR_new_025100.png", "question": "How many different items are there in the image?", "ground_truth": 5}
102
+ {"image_path": "./images/superCLEVR_new_025101.png", "question": "How many different items are there in the image?", "ground_truth": 7}
103
+ {"image_path": "./images/superCLEVR_new_025102.png", "question": "How many different items are there in the image?", "ground_truth": 3}
104
+ {"image_path": "./images/superCLEVR_new_025103.png", "question": "How many different items are there in the image?", "ground_truth": 6}
105
+ {"image_path": "./images/superCLEVR_new_025104.png", "question": "How many different items are there in the image?", "ground_truth": 9}
106
+ {"image_path": "./images/superCLEVR_new_025105.png", "question": "How many different items are there in the image?", "ground_truth": 7}
107
+ {"image_path": "./images/superCLEVR_new_025106.png", "question": "How many different items are there in the image?", "ground_truth": 8}
108
+ {"image_path": "./images/superCLEVR_new_025107.png", "question": "How many different items are there in the image?", "ground_truth": 8}
109
+ {"image_path": "./images/superCLEVR_new_025108.png", "question": "How many different items are there in the image?", "ground_truth": 3}
110
+ {"image_path": "./images/superCLEVR_new_025109.png", "question": "How many different items are there in the image?", "ground_truth": 7}
111
+ {"image_path": "./images/superCLEVR_new_025110.png", "question": "How many different items are there in the image?", "ground_truth": 8}
112
+ {"image_path": "./images/superCLEVR_new_025111.png", "question": "How many different items are there in the image?", "ground_truth": 9}
113
+ {"image_path": "./images/superCLEVR_new_025112.png", "question": "How many different items are there in the image?", "ground_truth": 9}
114
+ {"image_path": "./images/superCLEVR_new_025113.png", "question": "How many different items are there in the image?", "ground_truth": 6}
115
+ {"image_path": "./images/superCLEVR_new_025114.png", "question": "How many different items are there in the image?", "ground_truth": 6}
116
+ {"image_path": "./images/superCLEVR_new_025115.png", "question": "How many different items are there in the image?", "ground_truth": 9}
117
+ {"image_path": "./images/superCLEVR_new_025116.png", "question": "How many different items are there in the image?", "ground_truth": 7}
118
+ {"image_path": "./images/superCLEVR_new_025117.png", "question": "How many different items are there in the image?", "ground_truth": 9}
119
+ {"image_path": "./images/superCLEVR_new_025118.png", "question": "How many different items are there in the image?", "ground_truth": 5}
120
+ {"image_path": "./images/superCLEVR_new_025119.png", "question": "How many different items are there in the image?", "ground_truth": 9}
121
+ {"image_path": "./images/superCLEVR_new_025120.png", "question": "How many different items are there in the image?", "ground_truth": 6}
122
+ {"image_path": "./images/superCLEVR_new_025121.png", "question": "How many different items are there in the image?", "ground_truth": 10}
123
+ {"image_path": "./images/superCLEVR_new_025122.png", "question": "How many different items are there in the image?", "ground_truth": 10}
124
+ {"image_path": "./images/superCLEVR_new_025123.png", "question": "How many different items are there in the image?", "ground_truth": 6}
125
+ {"image_path": "./images/superCLEVR_new_025124.png", "question": "How many different items are there in the image?", "ground_truth": 8}
126
+ {"image_path": "./images/superCLEVR_new_025125.png", "question": "How many different items are there in the image?", "ground_truth": 8}
127
+ {"image_path": "./images/superCLEVR_new_025126.png", "question": "How many different items are there in the image?", "ground_truth": 3}
128
+ {"image_path": "./images/superCLEVR_new_025127.png", "question": "How many different items are there in the image?", "ground_truth": 7}
129
+ {"image_path": "./images/superCLEVR_new_025128.png", "question": "How many different items are there in the image?", "ground_truth": 6}
130
+ {"image_path": "./images/superCLEVR_new_025129.png", "question": "How many different items are there in the image?", "ground_truth": 4}
131
+ {"image_path": "./images/superCLEVR_new_025130.png", "question": "How many different items are there in the image?", "ground_truth": 5}
132
+ {"image_path": "./images/superCLEVR_new_025131.png", "question": "How many different items are there in the image?", "ground_truth": 8}
133
+ {"image_path": "./images/superCLEVR_new_025132.png", "question": "How many different items are there in the image?", "ground_truth": 3}
134
+ {"image_path": "./images/superCLEVR_new_025133.png", "question": "How many different items are there in the image?", "ground_truth": 5}
135
+ {"image_path": "./images/superCLEVR_new_025134.png", "question": "How many different items are there in the image?", "ground_truth": 8}
136
+ {"image_path": "./images/superCLEVR_new_025135.png", "question": "How many different items are there in the image?", "ground_truth": 8}
137
+ {"image_path": "./images/superCLEVR_new_025136.png", "question": "How many different items are there in the image?", "ground_truth": 6}
138
+ {"image_path": "./images/superCLEVR_new_025137.png", "question": "How many different items are there in the image?", "ground_truth": 5}
139
+ {"image_path": "./images/superCLEVR_new_025138.png", "question": "How many different items are there in the image?", "ground_truth": 3}
140
+ {"image_path": "./images/superCLEVR_new_025139.png", "question": "How many different items are there in the image?", "ground_truth": 4}
141
+ {"image_path": "./images/superCLEVR_new_025140.png", "question": "How many different items are there in the image?", "ground_truth": 3}
142
+ {"image_path": "./images/superCLEVR_new_025141.png", "question": "How many different items are there in the image?", "ground_truth": 9}
143
+ {"image_path": "./images/superCLEVR_new_025142.png", "question": "How many different items are there in the image?", "ground_truth": 10}
144
+ {"image_path": "./images/superCLEVR_new_025143.png", "question": "How many different items are there in the image?", "ground_truth": 5}
145
+ {"image_path": "./images/superCLEVR_new_025144.png", "question": "How many different items are there in the image?", "ground_truth": 6}
146
+ {"image_path": "./images/superCLEVR_new_025145.png", "question": "How many different items are there in the image?", "ground_truth": 10}
147
+ {"image_path": "./images/superCLEVR_new_025146.png", "question": "How many different items are there in the image?", "ground_truth": 5}
148
+ {"image_path": "./images/superCLEVR_new_025147.png", "question": "How many different items are there in the image?", "ground_truth": 6}
149
+ {"image_path": "./images/superCLEVR_new_025148.png", "question": "How many different items are there in the image?", "ground_truth": 8}
150
+ {"image_path": "./images/superCLEVR_new_025149.png", "question": "How many different items are there in the image?", "ground_truth": 8}
151
+ {"image_path": "./images/superCLEVR_new_025150.png", "question": "How many different items are there in the image?", "ground_truth": 9}
152
+ {"image_path": "./images/superCLEVR_new_025151.png", "question": "How many different items are there in the image?", "ground_truth": 8}
153
+ {"image_path": "./images/superCLEVR_new_025152.png", "question": "How many different items are there in the image?", "ground_truth": 10}
154
+ {"image_path": "./images/superCLEVR_new_025153.png", "question": "How many different items are there in the image?", "ground_truth": 3}
155
+ {"image_path": "./images/superCLEVR_new_025154.png", "question": "How many different items are there in the image?", "ground_truth": 5}
156
+ {"image_path": "./images/superCLEVR_new_025155.png", "question": "How many different items are there in the image?", "ground_truth": 10}
157
+ {"image_path": "./images/superCLEVR_new_025156.png", "question": "How many different items are there in the image?", "ground_truth": 3}
158
+ {"image_path": "./images/superCLEVR_new_025157.png", "question": "How many different items are there in the image?", "ground_truth": 6}
159
+ {"image_path": "./images/superCLEVR_new_025158.png", "question": "How many different items are there in the image?", "ground_truth": 4}
160
+ {"image_path": "./images/superCLEVR_new_025159.png", "question": "How many different items are there in the image?", "ground_truth": 5}
161
+ {"image_path": "./images/superCLEVR_new_025160.png", "question": "How many different items are there in the image?", "ground_truth": 9}
162
+ {"image_path": "./images/superCLEVR_new_025161.png", "question": "How many different items are there in the image?", "ground_truth": 3}
163
+ {"image_path": "./images/superCLEVR_new_025162.png", "question": "How many different items are there in the image?", "ground_truth": 5}
164
+ {"image_path": "./images/superCLEVR_new_025163.png", "question": "How many different items are there in the image?", "ground_truth": 10}
165
+ {"image_path": "./images/superCLEVR_new_025164.png", "question": "How many different items are there in the image?", "ground_truth": 9}
166
+ {"image_path": "./images/superCLEVR_new_025165.png", "question": "How many different items are there in the image?", "ground_truth": 7}
167
+ {"image_path": "./images/superCLEVR_new_025166.png", "question": "How many different items are there in the image?", "ground_truth": 8}
168
+ {"image_path": "./images/superCLEVR_new_025167.png", "question": "How many different items are there in the image?", "ground_truth": 7}
169
+ {"image_path": "./images/superCLEVR_new_025168.png", "question": "How many different items are there in the image?", "ground_truth": 3}
170
+ {"image_path": "./images/superCLEVR_new_025169.png", "question": "How many different items are there in the image?", "ground_truth": 10}
171
+ {"image_path": "./images/superCLEVR_new_025170.png", "question": "How many different items are there in the image?", "ground_truth": 8}
172
+ {"image_path": "./images/superCLEVR_new_025171.png", "question": "How many different items are there in the image?", "ground_truth": 7}
173
+ {"image_path": "./images/superCLEVR_new_025172.png", "question": "How many different items are there in the image?", "ground_truth": 4}
174
+ {"image_path": "./images/superCLEVR_new_025173.png", "question": "How many different items are there in the image?", "ground_truth": 10}
175
+ {"image_path": "./images/superCLEVR_new_025174.png", "question": "How many different items are there in the image?", "ground_truth": 9}
176
+ {"image_path": "./images/superCLEVR_new_025175.png", "question": "How many different items are there in the image?", "ground_truth": 4}
177
+ {"image_path": "./images/superCLEVR_new_025176.png", "question": "How many different items are there in the image?", "ground_truth": 9}
178
+ {"image_path": "./images/superCLEVR_new_025177.png", "question": "How many different items are there in the image?", "ground_truth": 6}
179
+ {"image_path": "./images/superCLEVR_new_025178.png", "question": "How many different items are there in the image?", "ground_truth": 10}
180
+ {"image_path": "./images/superCLEVR_new_025179.png", "question": "How many different items are there in the image?", "ground_truth": 6}
181
+ {"image_path": "./images/superCLEVR_new_025180.png", "question": "How many different items are there in the image?", "ground_truth": 3}
182
+ {"image_path": "./images/superCLEVR_new_025181.png", "question": "How many different items are there in the image?", "ground_truth": 3}
183
+ {"image_path": "./images/superCLEVR_new_025182.png", "question": "How many different items are there in the image?", "ground_truth": 8}
184
+ {"image_path": "./images/superCLEVR_new_025183.png", "question": "How many different items are there in the image?", "ground_truth": 5}
185
+ {"image_path": "./images/superCLEVR_new_025184.png", "question": "How many different items are there in the image?", "ground_truth": 5}
186
+ {"image_path": "./images/superCLEVR_new_025185.png", "question": "How many different items are there in the image?", "ground_truth": 3}
187
+ {"image_path": "./images/superCLEVR_new_025186.png", "question": "How many different items are there in the image?", "ground_truth": 4}
188
+ {"image_path": "./images/superCLEVR_new_025187.png", "question": "How many different items are there in the image?", "ground_truth": 5}
189
+ {"image_path": "./images/superCLEVR_new_025188.png", "question": "How many different items are there in the image?", "ground_truth": 5}
190
+ {"image_path": "./images/superCLEVR_new_025189.png", "question": "How many different items are there in the image?", "ground_truth": 3}
191
+ {"image_path": "./images/superCLEVR_new_025190.png", "question": "How many different items are there in the image?", "ground_truth": 5}
192
+ {"image_path": "./images/superCLEVR_new_025191.png", "question": "How many different items are there in the image?", "ground_truth": 8}
193
+ {"image_path": "./images/superCLEVR_new_025192.png", "question": "How many different items are there in the image?", "ground_truth": 3}
194
+ {"image_path": "./images/superCLEVR_new_025193.png", "question": "How many different items are there in the image?", "ground_truth": 9}
195
+ {"image_path": "./images/superCLEVR_new_025194.png", "question": "How many different items are there in the image?", "ground_truth": 10}
196
+ {"image_path": "./images/superCLEVR_new_025195.png", "question": "How many different items are there in the image?", "ground_truth": 5}
197
+ {"image_path": "./images/superCLEVR_new_025196.png", "question": "How many different items are there in the image?", "ground_truth": 6}
198
+ {"image_path": "./images/superCLEVR_new_025197.png", "question": "How many different items are there in the image?", "ground_truth": 3}
199
+ {"image_path": "./images/superCLEVR_new_025198.png", "question": "How many different items are there in the image?", "ground_truth": 4}
200
+ {"image_path": "./images/superCLEVR_new_025199.png", "question": "How many different items are there in the image?", "ground_truth": 3}
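Each line of the prompts file above is an independent JSON record with an `image_path`, a `question`, and an integer `ground_truth`. A minimal sketch of loading it, matching the PROMPT_PATH used by the evaluation script below:

```python
import json

# Minimal sketch: each line of the prompts file is one self-contained JSON record.
with open("./prompts/superclevr_test200_counting_problems.jsonl", "r") as f:
    records = [json.loads(line) for line in f]

print(len(records))               # 200 counting problems
print(sorted(records[0].keys()))  # ['ground_truth', 'image_path', 'question']
```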
previous_version/Video-R1-main-previous/src/eval/test_qwen2vl_counting_superclevr.py ADDED
@@ -0,0 +1,136 @@
1
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
+ from qwen_vl_utils import process_vision_info
3
+ import torch
4
+ import json
5
+ from tqdm import tqdm
6
+ import re
7
+
8
+
9
+
10
+ MODEL_PATH="Qwen2-VL-2B-GRPO-CLEVR-70k/checkpoint-100" # Qwen2vl-2b-Instruct for original scores
11
+ BSZ=64 # reduce it if GPU OOM
12
+ OUTPUT_PATH="./logs/counting_results_superclevr_200_qwen2vl_2b_instruct_grpo_100.json"
13
+ PROMPT_PATH="./prompts/superclevr_test200_counting_problems.jsonl"
14
+
15
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
16
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
17
+ MODEL_PATH,
18
+ torch_dtype=torch.bfloat16,
19
+ attn_implementation="flash_attention_2",
20
+ device_map="auto",
21
+ )
22
+
23
+ # default processor
24
+ processor = AutoProcessor.from_pretrained(MODEL_PATH)
25
+
26
+ data = []
27
+ with open(PROMPT_PATH, "r") as f:
28
+ for line in f:
29
+ data.append(json.loads(line))
30
+
31
+
32
+ QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> and final answer (number) in <answer> </answer> tags."
33
+
34
+ messages = []
35
+
36
+ for i in data:
37
+ message = [{
38
+ "role": "user",
39
+ "content": [
40
+ {
41
+ "type": "image",
42
+ "image": f"file://{i['image_path']}"
43
+ },
44
+ {
45
+ "type": "text",
46
+ "text": QUESTION_TEMPLATE.format(Question=i['question'])
47
+ }
48
+ ]
49
+ }]
50
+ messages.append(message)
51
+
52
+
53
+
54
+
55
+ all_outputs = [] # List to store all answers
56
+
57
+ # Process data in batches
58
+ for i in tqdm(range(0, len(messages), BSZ)):
59
+ batch_messages = messages[i:i + BSZ]
60
+
61
+ # Preparation for inference
62
+ text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
63
+
64
+ image_inputs, video_inputs = process_vision_info(batch_messages)
65
+ inputs = processor(
66
+ text=text,
67
+ images=image_inputs,
68
+ videos=video_inputs,
69
+ padding=True,
70
+ return_tensors="pt",
71
+ )
72
+ inputs = inputs.to("cuda")
73
+
74
+ # Inference: Generation of the output
75
+ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
76
+
77
+ generated_ids_trimmed = [
78
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
79
+ ]
80
+ batch_output_text = processor.batch_decode(
81
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
82
+ )
83
+
84
+ all_outputs.extend(batch_output_text)
85
+ print(f"Processed batch {i//BSZ + 1}/{(len(messages) + BSZ - 1)//BSZ}")
86
+
87
+
88
+ def extract_number_answer(output_str):
89
+ # Try to find the number within <answer> tags; if it cannot be found, return None
90
+ answer_pattern = r'<answer>\s*(\d+)\s*</answer>'
91
+ match = re.search(answer_pattern, output_str)
92
+
93
+ if match:
94
+ return int(match.group(1))
95
+ return None
96
+
97
+
98
+ final_output = []
99
+ correct_number = 0
100
+
101
+ for input_example, model_output in zip(data,all_outputs):
102
+ original_output = model_output
103
+ ground_truth = input_example['ground_truth']
104
+ model_answer = extract_number_answer(original_output)
105
+
106
+ # Create a result dictionary for this example
107
+ result = {
108
+ 'question': input_example,
109
+ 'ground_truth': ground_truth,
110
+ 'model_output': original_output,
111
+ 'extracted_answer': model_answer
112
+ }
113
+ final_output.append(result)
114
+
115
+ # Count correct answers
116
+ if model_answer is not None and model_answer == ground_truth:
117
+ correct_number += 1
118
+
119
+ # Calculate and print accuracy
120
+ accuracy = correct_number / len(data) * 100
121
+ print(f"\nAccuracy: {accuracy:.2f}%")
122
+
123
+ # Save results to a JSON file
124
+ output_path = OUTPUT_PATH
125
+ with open(output_path, "w") as f:
126
+ json.dump({
127
+ 'accuracy': accuracy,
128
+ 'results': final_output
129
+ }, f, indent=2)
130
+
131
+ print(f"Results saved to {output_path}")
132
+
133
+
134
+
135
+
136
+
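The counting script above only credits a prediction when a bare integer appears inside `<answer>...</answer>` tags; anything else is scored as incorrect. A standalone sketch of that extraction step:

```python
import re

def extract_number_answer(output_str):
    # Same pattern as the evaluation script above: an integer inside <answer> tags.
    match = re.search(r"<answer>\s*(\d+)\s*</answer>", output_str)
    return int(match.group(1)) if match else None

print(extract_number_answer("<think>Count the objects.</think> <answer> 7 </answer>"))  # 7
print(extract_number_answer("There are seven items."))                                  # None
```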
previous_version/Video-R1-main-previous/src/eval/test_qwen2vl_geoqa.py ADDED
@@ -0,0 +1,149 @@
1
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
+ from qwen_vl_utils import process_vision_info
3
+ import torch
4
+ import json
5
+ from tqdm import tqdm
6
+ import re
7
+ from math_verify import parse, verify
8
+
9
+
10
+ MODEL_PATH="<MODEL_PATH>" # qwen2vl model or grpoed model on geoqa train
11
+ BSZ=50 # reduce it if GPU OOM
12
+ OUTPUT_PATH="<OUTPUT_LOG>"
13
+ PROMPT_PATH="./prompts/geoqa_test_prompts.jsonl"
14
+
15
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
16
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
17
+ MODEL_PATH,
18
+ torch_dtype=torch.bfloat16,
19
+ attn_implementation="flash_attention_2",
20
+ device_map="auto",
21
+ )
22
+
23
+ # default processor
24
+ processor = AutoProcessor.from_pretrained(MODEL_PATH)
25
+
26
+ data = []
27
+ with open(PROMPT_PATH, "r") as f:
28
+ for line in f:
29
+ data.append(json.loads(line))
30
+
31
+
32
+ QUESTION_TEMPLATE = "{Question} Output the thinking process in <think> </think> and final answer (number) in <answer> </answer> tags."
33
+
34
+ messages = []
35
+
36
+ data = data
37
+
38
+ for i in data:
39
+ message = [{
40
+ "role": "user",
41
+ "content": [
42
+ {
43
+ "type": "image",
44
+ "image": f"file://{i['image_path']}"
45
+ },
46
+ {
47
+ "type": "text",
48
+ "text": QUESTION_TEMPLATE.format(Question=i['question'])
49
+ }
50
+ ]
51
+ }]
52
+ messages.append(message)
53
+
54
+
55
+
56
+
57
+ all_outputs = [] # List to store all answers
58
+
59
+ # Process data in batches
60
+ for i in tqdm(range(0, len(messages), BSZ)):
61
+ batch_messages = messages[i:i + BSZ]
62
+
63
+ # Preparation for inference
64
+ text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
65
+
66
+ image_inputs, video_inputs = process_vision_info(batch_messages)
67
+ inputs = processor(
68
+ text=text,
69
+ images=image_inputs,
70
+ videos=video_inputs,
71
+ padding=True,
72
+ return_tensors="pt",
73
+ )
74
+ inputs = inputs.to("cuda")
75
+
76
+ # Inference: Generation of the output
77
+ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=1024, do_sample=False)
78
+
79
+ generated_ids_trimmed = [
80
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
81
+ ]
82
+ batch_output_text = processor.batch_decode(
83
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
84
+ )
85
+
86
+ all_outputs.extend(batch_output_text)
87
+ print(f"Processed batch {i//BSZ + 1}/{(len(messages) + BSZ - 1)//BSZ}")
88
+
89
+
90
+
91
+
92
+
93
+ final_output = []
94
+ correct_number = 0
95
+
96
+ for input_example, model_output in zip(data,all_outputs):
97
+ original_output = model_output
98
+ ground_truth = input_example['ground_truth']
99
+ model_answer = parse(original_output)
100
+
101
+ # Count correct answers
102
+ if model_answer is not None and float(verify(model_answer,parse(ground_truth)))>0:
103
+ correct_number += 1
104
+ is_correct = True
105
+ else:
106
+ is_correct = False
107
+
108
+ try:
109
+ result = {
110
+ 'question': input_example,
111
+ 'ground_truth': ground_truth,
112
+ 'model_output': original_output,
113
+ 'extracted_answer':str(model_answer[0]) if model_answer is not None else None,
114
+ 'is_correct':is_correct
115
+ }
116
+
117
+ except Exception as e:
118
+ print("no answer parsed",e,model_answer)
119
+ result = {
120
+ 'question': input_example,
121
+ 'ground_truth': ground_truth,
122
+ 'model_output': original_output,
123
+ 'extracted_answer':None,
124
+ 'is_correct':is_correct
125
+ }
126
+
127
+
128
+
129
+ final_output.append(result)
130
+
131
+
132
+ # Calculate and print accuracy
133
+ accuracy = correct_number / len(data) * 100
134
+ print(f"\nAccuracy: {accuracy:.2f}%")
135
+
136
+ # Save results to a JSON file
137
+ output_path = OUTPUT_PATH
138
+ with open(output_path, "w") as f:
139
+ json.dump({
140
+ 'accuracy': accuracy,
141
+ 'results': final_output
142
+ }, f, indent=2, ensure_ascii=False)
143
+
144
+ print(f"Results saved to {output_path}")
145
+
146
+
147
+
148
+
149
+
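Unlike the counting evaluation, GeoQA answers are scored with the `math_verify` package instead of a regex: `parse` pulls candidate expressions out of the free-form output and `verify` checks them against the parsed ground truth. A rough sketch of that call pattern as used above (exact parsing behaviour depends on the installed `math_verify` version, so treat the example values as illustrative):

```python
from math_verify import parse, verify  # same helpers imported by the script above

ground_truth = "12"
model_output = "<think>Adding the two angles gives 12.</think> <answer> 12 </answer>"

model_answer = parse(model_output)  # list of candidate expressions parsed from the text
is_correct = model_answer is not None and float(verify(model_answer, parse(ground_truth))) > 0
print(model_answer, is_correct)     # expected: a parsed form of 12, and True
```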
previous_version/Video-R1-main-previous/src/eval/test_qwen2vl_geoqa_multigpu.py ADDED
@@ -0,0 +1,205 @@
1
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
+ from qwen_vl_utils import process_vision_info
3
+ import torch
4
+ import json
5
+ import tqdm
6
+ from math_verify import parse, verify
7
+ import argparse
8
+ import pandas as pd
9
+ from torch.multiprocessing import Process, set_start_method, Manager
10
+ from transformers.utils.logging import disable_progress_bar
11
+ disable_progress_bar()
12
+
13
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
14
+ # >>>>> 1. get evaluation configuration <<<<<
15
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
16
+ def get_eval_config():
17
+ parser = argparse.ArgumentParser(description="Inference script for GeoQA evaluation.")
18
+ parser.add_argument("--model_path", required=True, type=str, help="Path to the model checkpoint (e.g., qwen2vl model or a fine-tuned model).")
19
+ parser.add_argument("--batch_size", default=4, type=int, help="Batch size for inference. Reduce if GPU OOM (default: 4).")
20
+ parser.add_argument("--output_path", required=True, type=str, help="Path to save inference result (e.g., JSON file).")
21
+ parser.add_argument("--prompt_path", required=True, type=str, help="Path to the prompts JSONL file for GeoQA evaluation.")
22
+ all_gpu = ",".join(map(str, range(torch.cuda.device_count())))
23
+ parser.add_argument("--gpu_ids", default=all_gpu, help="comma-separated list of GPU IDs to use")
24
+ args = parser.parse_args()
25
+ return args
26
+
27
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
28
+ # >>>>>>>>>> 2. load testset <<<<<<<<<<<<<
29
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
30
+ def prepare_test_messages(testset_path):
31
+ testset_data = pd.read_json(testset_path, lines=True).to_dict(orient="records")
32
+ QUESTION_TEMPLATE = "{Question} Output the thinking process in <think> </think> and final answer (number) in <answer> </answer> tags."
33
+ tested_messages = []
34
+ for i in testset_data:
35
+ message = [{
36
+ "role": "user",
37
+ "content": [
38
+ {
39
+ "type": "image",
40
+ "image": f"file://{i['image_path']}"
41
+ },
42
+ {
43
+ "type": "text",
44
+ "text": QUESTION_TEMPLATE.format(Question=i['question'])
45
+ }
46
+ ]
47
+ }]
48
+ tested_messages.append(message)
49
+ return testset_data, tested_messages
50
+
51
+
52
+
53
+
54
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
55
+ # >>>>> 3. use several GPUs to accelerate inference on the test set <<<<<
56
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
57
+
58
+ def init_model(model_path, gpu_id):
59
+ """Initialize a model (model_path) on a specific GPU."""
60
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
61
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
62
+ model_path,
63
+ torch_dtype=torch.bfloat16,
64
+ attn_implementation="flash_attention_2",
65
+ device_map=f"cuda:{gpu_id}",
66
+ )
67
+
68
+ # default processor
69
+ processor = AutoProcessor.from_pretrained(model_path, use_fast=True)
70
+ return model, processor
71
+
72
+ def answer_a_batch_question_qwen(batch_messages, model, processor):
73
+ """ let qwen answer a batch of questions """
74
+ text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
75
+ image_inputs, video_inputs = process_vision_info(batch_messages)
76
+ inputs = processor(
77
+ text=text,
78
+ images=image_inputs,
79
+ videos=video_inputs,
80
+ padding=True,
81
+ return_tensors="pt",
82
+ )
83
+ inputs = inputs.to(model.device)
84
+
85
+ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=1024) # do_sample=False
86
+ generated_ids_trimmed = [
87
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
88
+ ]
89
+ batch_output_text = processor.batch_decode(
90
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
91
+ )
92
+ return batch_output_text
93
+
94
+ def infer_on_single_gpu(model_path, device_id, chunk_of_tested_messages, batch_size, results=None):
95
+ """Initialize the model on this single GPU and let it answer its assigned chunk of questions."""
96
+ model, processor = init_model(model_path, device_id)
97
+
98
+ ### split batch
99
+ responses = []
100
+ batch_messages_list = [chunk_of_tested_messages[start: start + batch_size]
101
+ for start in range(0, len(chunk_of_tested_messages), batch_size)]
102
+
103
+ for batch_messages in tqdm.auto.tqdm(batch_messages_list, desc=f"GPU {device_id} progress", position=device_id, leave=False):
104
+ batch_output_text = answer_a_batch_question_qwen(batch_messages, model, processor)
105
+
106
+ responses.extend(batch_output_text)
107
+
108
+ results[device_id] = responses
109
+ return
110
+
111
+
112
+ def multi_gpu_inference(prompts, gpu_ids, model_path, batch_size):
113
+ """ let each gpu (along with a model) answer a chunk of questions """
114
+ set_start_method("spawn", force=True)
115
+ manager = Manager()
116
+ gpu_id2result = manager.dict()
117
+
118
+ gpu_ids = [int(gpu_id.strip()) for gpu_id in gpu_ids.split(',')]
119
+ num_gpus = len(gpu_ids)
120
+
121
+ chunk_size = len(prompts) // num_gpus
122
+ processes = []
123
+ for i, gpu_id in enumerate(gpu_ids):
124
+ start_idx = i * chunk_size
125
+ end_idx = (i + 1) * chunk_size if i != num_gpus - 1 else len(prompts)
126
+ chunk = prompts[start_idx: end_idx]
127
+ process = Process(target=infer_on_single_gpu, args=(model_path, gpu_id, chunk, batch_size, gpu_id2result))
128
+ process.start()
129
+ processes.append(process)
130
+
131
+ # for process in tqdm.auto.tqdm(processes, desc="Inference progress", position=num_gpus, leave=True):
132
+ for process in processes:
133
+ process.join()
134
+
135
+ all_predicts = []
136
+ for gpu_id in gpu_ids:
137
+ all_predicts.extend(gpu_id2result[gpu_id])
138
+
139
+ return all_predicts
140
+
141
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
142
+ # >>>>>>>>>> 4. compute metrics <<<<<<<<<<<
143
+ # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
144
+
145
+ def compute_metrics(testset_data, all_predicts):
146
+ final_output = []
147
+ correct_number = 0
148
+
149
+ for input_example, model_output in zip(testset_data, all_predicts):
150
+ original_output = model_output
151
+ ground_truth = input_example['ground_truth']
152
+ model_answer = parse(original_output)
153
+
154
+ # Count correct answers
155
+ if model_answer is not None and float(verify(model_answer,parse(ground_truth)))>0:
156
+ correct_number += 1
157
+ is_correct = True
158
+ else:
159
+ is_correct = False
160
+
161
+ try:
162
+ result = {
163
+ 'question': input_example,
164
+ 'ground_truth': ground_truth,
165
+ 'model_output': original_output,
166
+ 'extracted_answer':str(model_answer[0]) if model_answer is not None else None,
167
+ 'is_correct':is_correct
168
+ }
169
+
170
+ except Exception as e:
171
+ print("no answer parsed",e,model_answer)
172
+ result = {
173
+ 'question': input_example,
174
+ 'ground_truth': ground_truth,
175
+ 'model_output': original_output,
176
+ 'extracted_answer':None,
177
+ 'is_correct':is_correct
178
+ }
179
+
180
+
181
+
182
+ final_output.append(result)
183
+
184
+
185
+ # Calculate and print accuracy
186
+ accuracy = correct_number / len(testset_data) * 100
187
+ print(f"\nAccuracy: {accuracy:.2f}%")
188
+
189
+ # Save results to a JSON file
190
+ with open(args.output_path, "w") as f:
191
+ json.dump({
192
+ 'accuracy': accuracy,
193
+ 'results': final_output
194
+ }, f, indent=2, ensure_ascii=False)
195
+
196
+ print(f"Results saved to {args.output_path}")
197
+
198
+
199
+
200
+ if __name__ == "__main__":
201
+ args = get_eval_config()
202
+ testset_data, tested_messages = prepare_test_messages(testset_path=args.prompt_path)
203
+ all_predicts = multi_gpu_inference(tested_messages, args.gpu_ids, args.model_path, args.batch_size)
204
+ compute_metrics(testset_data, all_predicts)
205
+
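The multi-GPU runner splits the prompt list into one contiguous chunk per GPU, and the last GPU absorbs any remainder so no question is dropped. A minimal sketch of that chunking arithmetic with stand-in data (no GPUs required):

```python
# Stand-in for tested_messages; mirrors the chunking in multi_gpu_inference above.
prompts = list(range(10))
gpu_ids = [0, 1, 2]

chunk_size = len(prompts) // len(gpu_ids)
chunks = []
for i, _gpu_id in enumerate(gpu_ids):
    start = i * chunk_size
    end = (i + 1) * chunk_size if i != len(gpu_ids) - 1 else len(prompts)
    chunks.append(prompts[start:end])

print(chunks)  # [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]] -- the last chunk takes the remainder
```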
previous_version/Video-R1-main-previous/src/eval/test_qwen2vl_video_counting.py ADDED
@@ -0,0 +1,141 @@
1
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
+ from qwen_vl_utils import process_vision_info
3
+ import torch
4
+ import json
5
+ from tqdm import tqdm
6
+ import re
7
+ import os
8
+
9
+
10
+
11
+ MODEL_PATH="YOUR_PATH" # Qwen2vl-2b-Instruct for original scores
12
+ BSZ=64 # reduce it if GPU OOM
13
+ OUTPUT_PATH="YOUR_PATH/test.json"
14
+ PROMPT_PATH="YOUR_PATH/test_dvd.jsonl"
15
+
16
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
17
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
18
+ MODEL_PATH,
19
+ torch_dtype=torch.bfloat16,
20
+ attn_implementation="flash_attention_2",
21
+ device_map="auto",
22
+ )
23
+
24
+ # default processor
25
+ processor = AutoProcessor.from_pretrained(MODEL_PATH)
26
+
27
+ data = []
28
+ with open(PROMPT_PATH, "r") as f:
29
+ for line in f:
30
+ data.append(json.loads(line))
31
+
32
+ # detailed step-by-step
33
+ QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> and final answer (number) in <answer> </answer> tags."
34
+
35
+ messages = []
36
+
37
+
38
+ for x in data:
39
+ message = [{
40
+ "role": "user",
41
+ "content": [
42
+ {
43
+ "type": "video",
44
+ "video": os.getcwd() + "/src/r1-v/data" + x['video_filename'][1:]
45
+ },
46
+ {
47
+ "type": "text",
48
+ "text": QUESTION_TEMPLATE.format(Question=x['problem'])
49
+ }
50
+ ]
51
+ }]
52
+ messages.append(message)
53
+
54
+
55
+
56
+
57
+ all_outputs = [] # List to store all answers
58
+
59
+ # Process data in batches
60
+ for i in tqdm(range(0, len(messages), BSZ)):
61
+ batch_messages = messages[i:i + BSZ]
62
+
63
+ # Preparation for inference
64
+ text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
65
+
66
+
67
+ image_inputs, video_inputs = process_vision_info(batch_messages)
68
+ inputs = processor(
69
+ text=text,
70
+ images=image_inputs,
71
+ videos=video_inputs,
72
+ padding=True,
73
+ return_tensors="pt",
74
+ )
75
+ inputs = inputs.to("cuda")
76
+
77
+ # Inference: Generation of the output
78
+ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
79
+
80
+ generated_ids_trimmed = [
81
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
82
+ ]
83
+ batch_output_text = processor.batch_decode(
84
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
85
+ )
86
+
87
+
88
+ all_outputs.extend(batch_output_text)
89
+ print(f"Processed batch {i//BSZ + 1}/{(len(messages) + BSZ - 1)//BSZ}")
90
+
91
+
92
+ def extract_number_answer(output_str):
93
+ # Try to find the number within <answer> tags; if it cannot be found, return None
94
+ answer_pattern = r'<answer>\s*(\d+)\s*</answer>'
95
+ match = re.search(answer_pattern, output_str)
96
+
97
+ if match:
98
+ return int(match.group(1))
99
+ return None
100
+
101
+
102
+ final_output = []
103
+ correct_number = 0
104
+
105
+ for input_example, model_output in zip(data,all_outputs):
106
+ original_output = model_output
107
+ ground_truth = extract_number_answer(input_example['solution'])
108
+ model_answer = extract_number_answer(original_output)
109
+
110
+
111
+ # Create a result dictionary for this example
112
+ result = {
113
+ 'question': input_example,
114
+ 'ground_truth': ground_truth,
115
+ 'model_output': original_output,
116
+ 'extracted_answer': model_answer
117
+ }
118
+ final_output.append(result)
119
+
120
+ # Count correct answers
121
+ if model_answer is not None and model_answer == ground_truth:
122
+ correct_number += 1
123
+
124
+ # Calculate and print accuracy
125
+ accuracy = correct_number / len(data) * 100
126
+ print(f"\nAccuracy: {accuracy:.2f}%")
127
+
128
+ # Save results to a JSON file
129
+ output_path = OUTPUT_PATH
130
+ with open(output_path, "w") as f:
131
+ json.dump({
132
+ 'accuracy': accuracy,
133
+ 'results': final_output
134
+ }, f, indent=2)
135
+
136
+ print(f"Results saved to {output_path}")
137
+
138
+
139
+
140
+
141
+
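One detail worth noting in the video script above: the video path is rebuilt from the current working directory, so the script assumes it is launched from the repository root. A small sketch of that path handling with a hypothetical record (the `video_filename` value here is made up for illustration):

```python
import os

# Hypothetical record; real entries come from the test_dvd.jsonl prompt file.
x = {"video_filename": "./videos/example_clip.mp4"}

# Mirrors the construction in the script above: drop the leading "." and prepend
# <cwd>/src/r1-v/data, which is why the script must run from the repo root.
video_path = os.getcwd() + "/src/r1-v/data" + x["video_filename"][1:]
print(video_path)  # e.g. /path/to/repo/src/r1-v/data/videos/example_clip.mp4
```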
previous_version/Video-R1-main-previous/src/qwen-vl-utils/.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.8.19
previous_version/Video-R1-main-previous/src/qwen-vl-utils/README.md ADDED
@@ -0,0 +1,94 @@
1
+ # qwen-vl-utils
2
+
3
+ Qwen-VL Utils contains a set of helper functions for processing and integrating visual language information with the Qwen-VL series models.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install qwen-vl-utils
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ### Qwen2VL
14
+
15
+ ```python
16
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
17
+ from qwen_vl_utils import process_vision_info
18
+
19
+
20
+ # You can directly insert a local file path, a URL, or a base64-encoded image into the position where you want in the text.
21
+ messages = [
22
+ # Image
23
+ ## Local file path
24
+ [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg"}, {"type": "text", "text": "Describe this image."}]}],
25
+ ## Image URL
26
+ [{"role": "user", "content": [{"type": "image", "image": "http://path/to/your/image.jpg"}, {"type": "text", "text": "Describe this image."}]}],
27
+ ## Base64 encoded image
28
+ [{"role": "user", "content": [{"type": "image", "image": "data:image;base64,/9j/..."}, {"type": "text", "text": "Describe this image."}]}],
29
+ ## PIL.Image.Image
30
+ [{"role": "user", "content": [{"type": "image", "image": pil_image}, {"type": "text", "text": "Describe this image."}]}],
31
+ ## Model dynamically adjusts image size, specify dimensions if required.
32
+ [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg", "resized_height": 280, "resized_width": 420}, {"type": "text", "text": "Describe this image."}]}],
33
+ # Video
34
+ ## Local video path
35
+ [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4"}, {"type": "text", "text": "Describe this video."}]}],
36
+ ## Local video frames
37
+ [{"role": "user", "content": [{"type": "video", "video": ["file:///path/to/extracted_frame1.jpg", "file:///path/to/extracted_frame2.jpg", "file:///path/to/extracted_frame3.jpg"],}, {"type": "text", "text": "Describe this video."},],}],
38
+ ## Model dynamically adjusts video nframes, video height and width. specify args if required.
39
+ [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4", "fps": 2.0, "resized_height": 280, "resized_width": 280}, {"type": "text", "text": "Describe this video."}]}],
40
+ ]
41
+
42
+ processor = AutoProcessor.from_pretrained(model_path)
43
+ model = Qwen2VLForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto", device_map="auto")
44
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
45
+ images, videos = process_vision_info(messages)
46
+ inputs = processor(text=text, images=images, videos=videos, padding=True, return_tensors="pt")
47
+ print(inputs)
48
+ generated_ids = model.generate(**inputs)
49
+ print(generated_ids)
50
+ ```
51
+
52
+ ### Qwen2.5VL
53
+
54
+ ```python
55
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
56
+ from qwen_vl_utils import process_vision_info
57
+
58
+
59
+ # You can set the maximum tokens for a video through the environment variable VIDEO_MAX_PIXELS
60
+ # based on the maximum tokens that the model can accept.
61
+ # export VIDEO_MAX_PIXELS = 32000 * 28 * 28 * 0.9
62
+
63
+
64
+ # You can directly insert a local file path, a URL, or a base64-encoded image into the position where you want in the text.
65
+ messages = [
66
+ # Image
67
+ ## Local file path
68
+ [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg"}, {"type": "text", "text": "Describe this image."}]}],
69
+ ## Image URL
70
+ [{"role": "user", "content": [{"type": "image", "image": "http://path/to/your/image.jpg"}, {"type": "text", "text": "Describe this image."}]}],
71
+ ## Base64 encoded image
72
+ [{"role": "user", "content": [{"type": "image", "image": "data:image;base64,/9j/..."}, {"type": "text", "text": "Describe this image."}]}],
73
+ ## PIL.Image.Image
74
+ [{"role": "user", "content": [{"type": "image", "image": pil_image}, {"type": "text", "text": "Describe this image."}]}],
75
+ ## Model dynamically adjusts image size, specify dimensions if required.
76
+ [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg", "resized_height": 280, "resized_width": 420}, {"type": "text", "text": "Describe this image."}]}],
77
+ # Video
78
+ ## Local video path
79
+ [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4"}, {"type": "text", "text": "Describe this video."}]}],
80
+ ## Local video frames
81
+ [{"role": "user", "content": [{"type": "video", "video": ["file:///path/to/extracted_frame1.jpg", "file:///path/to/extracted_frame2.jpg", "file:///path/to/extracted_frame3.jpg"],}, {"type": "text", "text": "Describe this video."},],}],
82
+ ## Model dynamically adjusts video nframes, video height and width. specify args if required.
83
+ [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4", "fps": 2.0, "resized_height": 280, "resized_width": 280}, {"type": "text", "text": "Describe this video."}]}],
84
+ ]
85
+
86
+ processor = AutoProcessor.from_pretrained(model_path)
87
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto", device_map="auto")
88
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
89
+ images, videos, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
90
+ inputs = processor(text=text, images=images, videos=videos, padding=True, return_tensors="pt", **video_kwargs)
91
+ print(inputs)
92
+ generated_ids = model.generate(**inputs)
93
+ print(generated_ids)
94
+ ```
previous_version/Video-R1-main-previous/src/qwen-vl-utils/pyproject.toml ADDED
@@ -0,0 +1,75 @@
1
+ [project]
2
+ name = "qwen-vl-utils"
3
+ version = "0.0.10"
4
+ description = "Qwen Vision Language Model Utils - PyTorch"
5
+ authors = [
6
+ { name = "Qwen Team", email = "[email protected]" },
7
+ ]
8
+ dependencies = [
9
+ "requests",
10
+ "pillow",
11
+ "av",
12
+ "packaging",
13
+ ]
14
+ readme = "README.md"
15
+ requires-python = ">= 3.8"
16
+ license = {text = "Apache-2.0"}
17
+ keywords = [
18
+ 'large language model',
19
+ 'vision language model',
20
+ 'qwen-vl',
21
+ 'pytorch',
22
+ ]
23
+ classifiers = [
24
+ 'Development Status :: 4 - Beta',
25
+ 'Topic :: Scientific/Engineering :: Artificial Intelligence',
26
+ 'Programming Language :: Python :: 3',
27
+ 'License :: OSI Approved :: Apache Software License',
28
+ ]
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/QwenLM/Qwen2-VL/tree/main/qwen-vl-utils"
32
+ Repository = "https://github.com/QwenLM/Qwen2-VL.git"
33
+ Issues = "https://github.com/QwenLM/Qwen2-VL/issues"
34
+
35
+ [project.optional-dependencies]
36
+ decord = [
37
+ "decord",
38
+ ]
39
+
40
+ [build-system]
41
+ requires = ["hatchling"]
42
+ build-backend = "hatchling.build"
43
+
44
+ [tool.rye]
45
+ managed = true
46
+ dev-dependencies = [
47
+ "torch",
48
+ "torchvision",
49
+ ]
50
+
51
+ [tool.hatch.metadata]
52
+ allow-direct-references = true
53
+
54
+ [tool.hatch.build.targets.wheel]
55
+ packages = ["src/qwen_vl_utils"]
56
+
57
+ [tool.ruff]
58
+ line-length = 119
59
+
60
+ [tool.ruff.lint]
61
+ ignore = ["C408", "C901", "E501", "E731", "E741", "W605"]
62
+ select = ["C", "E", "F", "I", "W"]
63
+
64
+ [tool.ruff.lint.per-file-ignores]
65
+ "__init__.py" = ["E402", "F401", "F403", "F811"]
66
+
67
+ [tool.ruff.lint.isort]
68
+ lines-after-imports = 2
69
+ known-first-party = ["qwen_vl_utils"]
70
+
71
+ [tool.ruff.format]
72
+ quote-style = "double"
73
+ indent-style = "space"
74
+ skip-magic-trailing-comma = false
75
+ line-ending = "auto"
previous_version/Video-R1-main-previous/src/qwen-vl-utils/requirements-dev.lock ADDED
@@ -0,0 +1,84 @@
1
+ # generated by rye
2
+ # use `rye lock` or `rye sync` to update this lockfile
3
+ #
4
+ # last locked with the following flags:
5
+ # pre: false
6
+ # features: ["decord"]
7
+ # all-features: false
8
+ # with-sources: false
9
+ # generate-hashes: false
10
+ # universal: false
11
+
12
+ -e file:.
13
+ av==12.3.0
14
+ # via qwen-vl-utils
15
+ certifi==2022.12.7
16
+ # via requests
17
+ charset-normalizer==2.1.1
18
+ # via requests
19
+ decord==0.6.0
20
+ # via qwen-vl-utils
21
+ filelock==3.13.1
22
+ # via torch
23
+ # via triton
24
+ fsspec==2024.2.0
25
+ # via torch
26
+ idna==3.4
27
+ # via requests
28
+ jinja2==3.1.3
29
+ # via torch
30
+ markupsafe==2.1.5
31
+ # via jinja2
32
+ mpmath==1.3.0
33
+ # via sympy
34
+ networkx==3.1
35
+ # via torch
36
+ numpy==1.24.1
37
+ # via decord
38
+ # via torchvision
39
+ nvidia-cublas-cu12==12.1.3.1
40
+ # via nvidia-cudnn-cu12
41
+ # via nvidia-cusolver-cu12
42
+ # via torch
43
+ nvidia-cuda-cupti-cu12==12.1.105
44
+ # via torch
45
+ nvidia-cuda-nvrtc-cu12==12.1.105
46
+ # via torch
47
+ nvidia-cuda-runtime-cu12==12.1.105
48
+ # via torch
49
+ nvidia-cudnn-cu12==9.1.0.70
50
+ # via torch
51
+ nvidia-cufft-cu12==11.0.2.54
52
+ # via torch
53
+ nvidia-curand-cu12==10.3.2.106
54
+ # via torch
55
+ nvidia-cusolver-cu12==11.4.5.107
56
+ # via torch
57
+ nvidia-cusparse-cu12==12.1.0.106
58
+ # via nvidia-cusolver-cu12
59
+ # via torch
60
+ nvidia-nccl-cu12==2.20.5
61
+ # via torch
62
+ nvidia-nvjitlink-cu12==12.6.68
63
+ # via nvidia-cusolver-cu12
64
+ # via nvidia-cusparse-cu12
65
+ nvidia-nvtx-cu12==12.1.105
66
+ # via torch
67
+ packaging==24.1
68
+ # via qwen-vl-utils
69
+ pillow==10.2.0
70
+ # via qwen-vl-utils
71
+ # via torchvision
72
+ requests==2.28.1
73
+ # via qwen-vl-utils
74
+ sympy==1.12
75
+ # via torch
76
+ torch==2.4.0
77
+ # via torchvision
78
+ torchvision==0.19.0
79
+ triton==3.0.0
80
+ # via torch
81
+ typing-extensions==4.9.0
82
+ # via torch
83
+ urllib3==1.26.13
84
+ # via requests
previous_version/Video-R1-main-previous/src/qwen-vl-utils/requirements.lock ADDED
@@ -0,0 +1,32 @@
1
+ # generated by rye
2
+ # use `rye lock` or `rye sync` to update this lockfile
3
+ #
4
+ # last locked with the following flags:
5
+ # pre: false
6
+ # features: ["decord"]
7
+ # all-features: false
8
+ # with-sources: false
9
+ # generate-hashes: false
10
+ # universal: false
11
+
12
+ -e file:.
13
+ av==12.3.0
14
+ # via qwen-vl-utils
15
+ certifi==2022.12.7
16
+ # via requests
17
+ charset-normalizer==2.1.1
18
+ # via requests
19
+ decord==0.6.0
20
+ # via qwen-vl-utils
21
+ idna==3.4
22
+ # via requests
23
+ numpy==1.24.4
24
+ # via decord
25
+ packaging==24.1
26
+ # via qwen-vl-utils
27
+ pillow==10.2.0
28
+ # via qwen-vl-utils
29
+ requests==2.28.1
30
+ # via qwen-vl-utils
31
+ urllib3==1.26.13
32
+ # via requests
previous_version/Video-R1-main-previous/src/qwen-vl-utils/src/qwen_vl_utils/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ from .vision_process import (
2
+ extract_vision_info,
3
+ fetch_image,
4
+ fetch_video,
5
+ process_vision_info,
6
+ smart_resize,
7
+ )
previous_version/Video-R1-main-previous/src/qwen-vl-utils/src/qwen_vl_utils/vision_process.py ADDED
@@ -0,0 +1,379 @@
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import logging
5
+ import math
6
+ import os
7
+ import sys
8
+ import time
9
+ import warnings
10
+ from functools import lru_cache
11
+ from io import BytesIO
12
+
13
+ import requests
14
+ import torch
15
+ import torchvision
16
+ from packaging import version
17
+ from PIL import Image
18
+ from torchvision import io, transforms
19
+ from torchvision.transforms import InterpolationMode
20
+ from typing import Optional
21
+
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ IMAGE_FACTOR = 28
26
+ MIN_PIXELS = 4 * 28 * 28
27
+ MAX_PIXELS = 16384 * 28 * 28
28
+ MAX_RATIO = 200
29
+
30
+ # VIDEO_MIN_PIXELS = 128 * 28 * 28
31
+ # VIDEO_MAX_PIXELS = 768 * 28 * 28
32
+ VIDEO_MIN_PIXELS = 128 * 28 * 28
33
+ VIDEO_MAX_PIXELS = 128 * 28 * 28
34
+ FRAME_FACTOR = 2
35
+ FPS = 2.0
36
+ FPS_MIN_FRAMES = 4
37
+ FPS_MAX_FRAMES = 16
38
+
39
+ # Set the maximum number of video token inputs.
40
+ # Here, 128K represents the maximum number of input tokens for the VLLM model.
41
+ # Remember to adjust it according to your own configuration.
42
+ VIDEO_TOTAL_PIXELS = int(float(os.environ.get('VIDEO_MAX_PIXELS', 128000 * 28 * 28 * 0.9)))
43
+ logger.info(f"set VIDEO_TOTAL_PIXELS: {VIDEO_TOTAL_PIXELS}")
44
+
45
+
46
+ def round_by_factor(number: int, factor: int) -> int:
47
+ """Returns the closest integer to 'number' that is divisible by 'factor'."""
48
+ return round(number / factor) * factor
49
+
50
+
51
+ def ceil_by_factor(number: int, factor: int) -> int:
52
+ """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
53
+ return math.ceil(number / factor) * factor
54
+
55
+
56
+ def floor_by_factor(number: int, factor: int) -> int:
57
+ """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
58
+ return math.floor(number / factor) * factor
59
+
60
+
61
+ def smart_resize(
62
+ height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
63
+ ) -> tuple[int, int]:
64
+ """
65
+ Rescales the image so that the following conditions are met:
66
+
67
+ 1. Both dimensions (height and width) are divisible by 'factor'.
68
+
69
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
70
+
71
+ 3. The aspect ratio of the image is maintained as closely as possible.
72
+ """
73
+ if max(height, width) / min(height, width) > MAX_RATIO:
74
+ raise ValueError(
75
+ f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
76
+ )
77
+ h_bar = max(factor, round_by_factor(height, factor))
78
+ w_bar = max(factor, round_by_factor(width, factor))
79
+ if h_bar * w_bar > max_pixels:
80
+ beta = math.sqrt((height * width) / max_pixels)
81
+ h_bar = floor_by_factor(height / beta, factor)
82
+ w_bar = floor_by_factor(width / beta, factor)
83
+ elif h_bar * w_bar < min_pixels:
84
+ beta = math.sqrt(min_pixels / (height * width))
85
+ h_bar = ceil_by_factor(height * beta, factor)
86
+ w_bar = ceil_by_factor(width * beta, factor)
87
+ return h_bar, w_bar
88
+
89
+
90
+ def to_rgb(pil_image: Image.Image) -> Image.Image:
91
+ if pil_image.mode == 'RGBA':
92
+ white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
93
+ white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask
94
+ return white_background
95
+ else:
96
+ return pil_image.convert("RGB")
97
+
98
+
99
+ def fetch_image(ele: dict[str, str | Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image:
100
+ if "image" in ele:
101
+ image = ele["image"]
102
+ else:
103
+ image = ele["image_url"]
104
+ image_obj = None
105
+ if isinstance(image, Image.Image):
106
+ image_obj = image
107
+ elif image.startswith("http://") or image.startswith("https://"):
108
+ response = requests.get(image, stream=True)
109
+ image_obj = Image.open(BytesIO(response.content))
110
+ elif image.startswith("file://"):
111
+ image_obj = Image.open(image[7:])
112
+ elif image.startswith("data:image"):
113
+ if "base64," in image:
114
+ _, base64_data = image.split("base64,", 1)
115
+ data = base64.b64decode(base64_data)
116
+ image_obj = Image.open(BytesIO(data))
117
+ else:
118
+ image_obj = Image.open(image)
119
+ if image_obj is None:
120
+ raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
121
+ image = to_rgb(image_obj)
122
+ ## resize
123
+ if "resized_height" in ele and "resized_width" in ele:
124
+ resized_height, resized_width = smart_resize(
125
+ ele["resized_height"],
126
+ ele["resized_width"],
127
+ factor=size_factor,
128
+ )
129
+ else:
130
+ width, height = image.size
131
+ min_pixels = ele.get("min_pixels", MIN_PIXELS)
132
+ max_pixels = ele.get("max_pixels", MAX_PIXELS)
133
+ resized_height, resized_width = smart_resize(
134
+ height,
135
+ width,
136
+ factor=size_factor,
137
+ min_pixels=min_pixels,
138
+ max_pixels=max_pixels,
139
+ )
140
+ image = image.resize((resized_width, resized_height))
141
+
142
+ return image
143
+
144
+
145
+ def smart_nframes(
146
+ ele: dict,
147
+ total_frames: int,
148
+ video_fps: int | float,
149
+ ) -> int:
150
+ """calculate the number of frames for video used for model inputs.
151
+
152
+ Args:
153
+ ele (dict): a dict contains the configuration of video.
154
+ support either `fps` or `nframes`:
155
+ - nframes: the number of frames to extract for model inputs.
156
+ - fps: the fps to extract frames for model inputs.
157
+ - min_frames: the minimum number of frames of the video, only used when fps is provided.
158
+ - max_frames: the maximum number of frames of the video, only used when fps is provided.
159
+ total_frames (int): the original total number of frames of the video.
160
+ video_fps (int | float): the original fps of the video.
161
+
162
+ Raises:
163
+ ValueError: nframes should be in the interval [FRAME_FACTOR, total_frames].
164
+
165
+ Returns:
166
+ int: the number of frames for video used for model inputs.
167
+ """
168
+ assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"
169
+ if "nframes" in ele:
170
+ nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
171
+ else:
172
+ fps = ele.get("fps", FPS)
173
+ min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
174
+ max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR)
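+ # Target frame count = clip duration (total_frames / video_fps) * requested fps, then clamped to [min_frames, max_frames] and to the clip length.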
175
+ nframes = total_frames / video_fps * fps
176
+ if nframes > total_frames:
177
+ logger.warning(f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]")
178
+ nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
179
+ nframes = floor_by_factor(nframes, FRAME_FACTOR)
180
+ if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
181
+ raise ValueError(f"nframes should be in the interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
182
+ return nframes
183
+
184
+
185
+ def _read_video_torchvision(
186
+ ele: dict,
187
+ ) -> tuple[torch.Tensor, float]:
188
+ """read video using torchvision.io.read_video
189
+
190
+ Args:
191
+ ele (dict): a dict contains the configuration of video.
192
+ support keys:
193
+ - video: the path of video. support "file://", "http://", "https://" and local path.
194
+ - video_start: the start time of video.
195
+ - video_end: the end time of video.
196
+ Returns:
197
+ torch.Tensor: the video tensor with shape (T, C, H, W).
198
+ """
199
+ video_path = ele["video"]
200
+ if version.parse(torchvision.__version__) < version.parse("0.19.0"):
201
+ if "http://" in video_path or "https://" in video_path:
202
+ warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.")
203
+ if "file://" in video_path:
204
+ video_path = video_path[7:]
205
+ st = time.time()
206
+ video, audio, info = io.read_video(
207
+ video_path,
208
+ start_pts=ele.get("video_start", 0.0),
209
+ end_pts=ele.get("video_end", None),
210
+ pts_unit="sec",
211
+ output_format="TCHW",
212
+ )
213
+ total_frames, video_fps = video.size(0), info["video_fps"]
214
+ logger.info(f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
215
+ nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
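+ # Pick nframes evenly spaced frame indices; sample_fps is the effective frame rate after this subsampling.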
216
+ idx = torch.linspace(0, total_frames - 1, nframes).round().long()
217
+ sample_fps = nframes / max(total_frames, 1e-6) * video_fps
218
+ video = video[idx]
219
+ return video, sample_fps
220
+
221
+
222
+ def is_decord_available() -> bool:
223
+ import importlib.util
224
+
225
+ return importlib.util.find_spec("decord") is not None
226
+
227
+
228
+ def _read_video_decord(
229
+ ele: dict,
230
+ ) -> tuple[torch.Tensor, float]:
231
+ """read video using decord.VideoReader
232
+
233
+ Args:
234
+ ele (dict): a dict contains the configuration of video.
235
+ support keys:
236
+ - video: the path of video. support "file://", "http://", "https://" and local path.
237
+ - video_start: the start time of video.
238
+ - video_end: the end time of video.
239
+ Returns:
240
+ torch.Tensor: the video tensor with shape (T, C, H, W).
241
+ """
242
+ import decord
243
+ video_path = ele["video"]
244
+ st = time.time()
245
+ vr = decord.VideoReader(video_path)
246
+ # TODO: support start_pts and end_pts
247
+ if 'video_start' in ele or 'video_end' in ele:
248
+ raise NotImplementedError("video_start and video_end are not supported by the decord backend yet.")
249
+ total_frames, video_fps = len(vr), vr.get_avg_fps()
250
+ logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
251
+ nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
252
+ idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
253
+ video = vr.get_batch(idx).asnumpy()
254
+ video = torch.tensor(video).permute(0, 3, 1, 2) # Convert to TCHW format
255
+ sample_fps = nframes / max(total_frames, 1e-6) * video_fps
256
+ return video, sample_fps
257
+
258
+
259
+ VIDEO_READER_BACKENDS = {
260
+ "decord": _read_video_decord,
261
+ "torchvision": _read_video_torchvision,
262
+ }
263
+
264
+ FORCE_QWENVL_VIDEO_READER = os.getenv("FORCE_QWENVL_VIDEO_READER", None)
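+ # Optional override: set FORCE_QWENVL_VIDEO_READER to "decord" or "torchvision" to pin the video backend.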
265
+
266
+
267
+ @lru_cache(maxsize=1)
268
+ def get_video_reader_backend() -> str:
269
+ if FORCE_QWENVL_VIDEO_READER is not None:
270
+ video_reader_backend = FORCE_QWENVL_VIDEO_READER
271
+ elif is_decord_available():
272
+ video_reader_backend = "decord"
273
+ else:
274
+ video_reader_backend = "torchvision"
275
+ print(f"qwen-vl-utils using {video_reader_backend} to read video.", file=sys.stderr)
276
+ return video_reader_backend
277
+
278
+
279
+ def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR, return_video_sample_fps: bool = False) -> torch.Tensor | list[Image.Image]:
280
+ if isinstance(ele["video"], str):
281
+ video_reader_backend = get_video_reader_backend()
282
+ try:
283
+ video, sample_fps = VIDEO_READER_BACKENDS[video_reader_backend](ele)
284
+ except Exception as e:
285
+ logger.warning(f"video_reader_backend {video_reader_backend} error, use torchvision as default, msg: {e}")
286
+ video, sample_fps = VIDEO_READER_BACKENDS["torchvision"](ele)
287
+
288
+ nframes, _, height, width = video.shape
289
+ min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
290
+ total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
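+ # Per-frame pixel cap: split the total video pixel budget across the sampled frames (scaled by FRAME_FACTOR), but never below ~1.05x min_pixels.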
291
+ max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))
292
+ max_pixels_supposed = ele.get("max_pixels", max_pixels)
293
+ if max_pixels_supposed > max_pixels:
294
+ logger.warning(f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}].")
295
+ max_pixels = min(max_pixels_supposed, max_pixels)
296
+ if "resized_height" in ele and "resized_width" in ele:
297
+ resized_height, resized_width = smart_resize(
298
+ ele["resized_height"],
299
+ ele["resized_width"],
300
+ factor=image_factor,
301
+ )
302
+ else:
303
+ resized_height, resized_width = smart_resize(
304
+ height,
305
+ width,
306
+ factor=image_factor,
307
+ min_pixels=min_pixels,
308
+ max_pixels=max_pixels,
309
+ )
310
+ video = transforms.functional.resize(
311
+ video,
312
+ [resized_height, resized_width],
313
+ interpolation=InterpolationMode.BICUBIC,
314
+ antialias=True,
315
+ ).float()
316
+ if return_video_sample_fps:
317
+ return video, sample_fps
318
+ return video
319
+ else:
320
+ assert isinstance(ele["video"], (list, tuple))
321
+ process_info = ele.copy()
322
+ process_info.pop("type", None)
323
+ process_info.pop("video", None)
324
+ images = [
325
+ fetch_image({"image": video_element, **process_info}, size_factor=image_factor)
326
+ for video_element in ele["video"]
327
+ ]
328
+ nframes = ceil_by_factor(len(images), FRAME_FACTOR)
329
+ if len(images) < nframes:
330
+ images.extend([images[-1]] * (nframes - len(images)))
331
+ if return_video_sample_fps:
332
+ return images, process_info.pop("fps", 2.0)
333
+ return images
334
+
335
+
336
+ def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
337
+ vision_infos = []
338
+ if isinstance(conversations[0], dict):
339
+ conversations = [conversations]
340
+ for conversation in conversations:
341
+ for message in conversation:
342
+ if isinstance(message["content"], list):
343
+ for ele in message["content"]:
344
+ if (
345
+ "image" in ele
346
+ or "image_url" in ele
347
+ or "video" in ele
348
+ or ele["type"] in ("image", "image_url", "video")
349
+ ):
350
+ vision_infos.append(ele)
351
+ return vision_infos
352
+
353
+
354
+ def process_vision_info(
355
+ conversations: list[dict] | list[list[dict]],
356
+ return_video_kwargs: bool = False,
357
+ ) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None, Optional[dict]]:
358
+
359
+ vision_infos = extract_vision_info(conversations)
360
+ ## Read images or videos
361
+ image_inputs = []
362
+ video_inputs = []
363
+ video_sample_fps_list = []
364
+ for vision_info in vision_infos:
365
+ if "image" in vision_info or "image_url" in vision_info:
366
+ image_inputs.append(fetch_image(vision_info))
367
+ elif "video" in vision_info:
368
+ video_input, video_sample_fps = fetch_video(vision_info, return_video_sample_fps=True)
369
+ video_sample_fps_list.append(video_sample_fps)
370
+ video_inputs.append(video_input)
371
+ else:
372
+ raise ValueError("image, image_url or video should be in content.")
373
+ if len(image_inputs) == 0:
374
+ image_inputs = None
375
+ if len(video_inputs) == 0:
376
+ video_inputs = None
377
+ if return_video_kwargs:
378
+ return image_inputs, video_inputs, {'fps': video_sample_fps_list}
379
+ return image_inputs, video_inputs
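+ # Illustrative usage (assumption: a Qwen2-VL-style processor consumes the outputs; the path below is hypothetical):
+ # messages = [{"role": "user", "content": [{"type": "video", "video": "file:///path/clip.mp4", "fps": 1.0},
+ # {"type": "text", "text": "Describe the clip."}]}]
+ # image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)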
previous_version/Video-R1-main-previous/src/r1-v/temp_image.png ADDED

Git LFS Details

  • SHA256: 6d32d2be631fcae3fcf15b31fb57096fdba3c4c6e5417f8cab84f5c16e7ce18f
  • Pointer size: 131 Bytes
  • Size of remote file: 147 kB
src/r1-v/.gitignore ADDED
@@ -0,0 +1,178 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # PyPI configuration file
171
+ .pypirc
172
+
173
+ # Temp folders
174
+ data/
175
+ wandb/
176
+ scripts/
177
+ checkpoints/
178
+ .vscode/
src/r1-v/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
src/r1-v/Makefile ADDED
@@ -0,0 +1,20 @@
1
+ .PHONY: style quality
2
+
3
+ # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
4
+ export PYTHONPATH = src
5
+
6
+ check_dirs := src
7
+
8
+ style:
9
+ black --line-length 119 --target-version py310 $(check_dirs) setup.py
10
+ isort $(check_dirs) setup.py
11
+
12
+ quality:
13
+ black --check --line-length 119 --target-version py310 $(check_dirs) setup.py
14
+ isort --check-only $(check_dirs) setup.py
15
+ flake8 --max-line-length 119 $(check_dirs) setup.py
16
+
17
+
18
+ # Evaluation
19
+
20
+ evaluate:
src/r1-v/setup.cfg ADDED
@@ -0,0 +1,41 @@
1
+ [isort]
2
+ default_section = FIRSTPARTY
3
+ ensure_newline_before_comments = True
4
+ force_grid_wrap = 0
5
+ include_trailing_comma = True
6
+ known_first_party = open_r1
7
+ known_third_party =
8
+ transformers
9
+ datasets
10
+ fugashi
11
+ git
12
+ h5py
13
+ matplotlib
14
+ nltk
15
+ numpy
16
+ packaging
17
+ pandas
18
+ psutil
19
+ pytest
20
+ rouge_score
21
+ sacrebleu
22
+ seqeval
23
+ sklearn
24
+ streamlit
25
+ torch
26
+ tqdm
27
+
28
+ line_length = 119
29
+ lines_after_imports = 2
30
+ multi_line_output = 3
31
+ use_parentheses = True
32
+
33
+ [flake8]
34
+ ignore = E203, E501, E741, W503, W605
35
+ max-line-length = 119
36
+ per-file-ignores =
37
+ # imported but unused
38
+ __init__.py: F401
39
+
40
+ [tool:pytest]
41
+ doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS
src/r1-v/setup.py ADDED
@@ -0,0 +1,132 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # Adapted from huggingface/transformers: https://github.com/huggingface/transformers/blob/21a2d900eceeded7be9edc445b56877b95eda4ca/setup.py
16
+
17
+
18
+ import re
19
+ import shutil
20
+ from pathlib import Path
21
+
22
+ from setuptools import find_packages, setup
23
+
24
+
25
+ # Remove stale open_r1.egg-info directory to avoid https://github.com/pypa/pip/issues/5466
26
+ stale_egg_info = Path(__file__).parent / "open_r1.egg-info"
27
+ if stale_egg_info.exists():
28
+ print(
29
+ (
30
+ "Warning: {} exists.\n\n"
31
+ "If you recently updated open_r1, this is expected,\n"
32
+ "but it may prevent open_r1 from installing in editable mode.\n\n"
33
+ "This directory is automatically generated by Python's packaging tools.\n"
34
+ "I will remove it now.\n\n"
35
+ "See https://github.com/pypa/pip/issues/5466 for details.\n"
36
+ ).format(stale_egg_info)
37
+ )
38
+ shutil.rmtree(stale_egg_info)
39
+
40
+
41
+ # IMPORTANT: all dependencies should be listed here with their version requirements, if any.
42
+ # * If a dependency is fast-moving (e.g. transformers), pin to the exact version
43
+ _deps = [
44
+ "accelerate>=1.2.1",
45
+ "bitsandbytes>=0.43.0",
46
+ "black>=24.4.2",
47
+ "datasets>=3.2.0",
48
+ "deepspeed==0.15.4",
49
+ "distilabel[vllm,ray,openai]>=1.5.2",
50
+ "einops>=0.8.0",
51
+ "flake8>=6.0.0",
52
+ "hf_transfer>=0.1.4",
53
+ "huggingface-hub[cli]>=0.19.2,<1.0",
54
+ "isort>=5.12.0",
55
+ "liger_kernel==0.5.2",
56
+ "lighteval @ git+https://github.com/huggingface/lighteval.git@4f381b352c0e467b5870a97d41cb66b487a2c503#egg=lighteval[math]",
57
+ "math-verify", # Used for math verification in grpo
58
+ "packaging>=23.0",
59
+ "parameterized>=0.9.0",
60
+ "pytest",
61
+ "safetensors>=0.3.3",
62
+ "sentencepiece>=0.1.99",
63
+ "torch>=2.5.1",
64
+ # "transformers @ git+https://github.com/huggingface/transformers.git@336dc69d63d56f232a183a3e7f52790429b871ef",
65
+ "trl==0.16.0",
66
+ "vllm==0.7.2",
67
+ "wandb>=0.19.1",
68
+ "pillow",
69
+ ]
70
+
71
+ # this is a lookup table with items like:
72
+ #
73
+ # tokenizers: "tokenizers==0.9.4"
74
+ # packaging: "packaging"
75
+ #
76
+ # some of the values are versioned whereas others aren't.
77
+ deps = {b: a for a, b in (re.findall(r"^(([^!=<>~ \[\]]+)(?:\[[^\]]+\])?(?:[!=<>~ ].*)?$)", x)[0] for x in _deps)}
78
+
79
+
80
+ def deps_list(*pkgs):
81
+ return [deps[pkg] for pkg in pkgs]
82
+
83
+
84
+ extras = {}
85
+ extras["tests"] = deps_list("pytest", "parameterized")
86
+ extras["torch"] = deps_list("torch")
87
+ extras["quality"] = deps_list("black", "isort", "flake8")
88
+ extras["eval"] = deps_list("lighteval", "math-verify")
89
+ extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"]
90
+
91
+ # core dependencies shared across the whole project - keep this to a bare minimum :)
92
+ install_requires = [
93
+ deps["accelerate"],
94
+ deps["bitsandbytes"],
95
+ deps["einops"],
96
+ deps["datasets"],
97
+ deps["deepspeed"],
98
+ deps["hf_transfer"],
99
+ deps["huggingface-hub"],
100
+ deps["liger_kernel"],
101
+ deps["packaging"], # utilities from PyPA to e.g., compare versions
102
+ deps["safetensors"],
103
+ deps["sentencepiece"],
104
+ # deps["transformers"],
105
+ deps["trl"],
106
+ ]
107
+
108
+ setup(
109
+ name="r1-v",
110
+ version="0.1.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
111
+ author="The r1-v team and the Hugging Face team (past and future)",
112
+ description="R1-V",
113
+ license="Apache",
114
+ url="https://github.com/Deep-Agent/R1-V",
115
+ package_dir={"": "src"},
116
+ packages=find_packages("src"),
117
+ zip_safe=False,
118
+ extras_require=extras,
119
+ python_requires=">=3.10.9",
120
+ install_requires=install_requires,
121
+ classifiers=[
122
+ "Development Status :: 3 - Alpha",
123
+ "Intended Audience :: Developers",
124
+ "Intended Audience :: Education",
125
+ "Intended Audience :: Science/Research",
126
+ "License :: OSI Approved :: Apache Software License",
127
+ "Operating System :: OS Independent",
128
+ "Programming Language :: Python :: 3",
129
+ "Programming Language :: Python :: 3.10",
130
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
131
+ ],
132
+ )
src/r1-v/src/open_r1/__init__.py ADDED
File without changes
src/r1-v/src/open_r1/evaluate.py ADDED
@@ -0,0 +1,85 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Custom evaluation tasks for LightEval."""
16
+
17
+ from lighteval.metrics.dynamic_metrics import (
18
+ ExprExtractionConfig,
19
+ LatexExtractionConfig,
20
+ multilingual_extractive_match_metric,
21
+ )
22
+ from lighteval.tasks.lighteval_task import LightevalTaskConfig
23
+ from lighteval.tasks.requests import Doc
24
+ from lighteval.utils.language import Language
25
+
26
+
27
+ metric = multilingual_extractive_match_metric(
28
+ language=Language.ENGLISH,
29
+ fallback_mode="first_match",
30
+ precision=5,
31
+ gold_extraction_target=(LatexExtractionConfig(),),
32
+ pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig()),
33
+ aggregation_function=max,
34
+ )
35
+
36
+
37
+ def prompt_fn(line, task_name: str = None):
38
+ """Assumes the model is either prompted to emit \\boxed{answer} or does so automatically"""
39
+ return Doc(
40
+ task_name=task_name,
41
+ query=line["problem"],
42
+ choices=[line["solution"]],
43
+ gold_index=0,
44
+ )
45
+
46
+
47
+ # Define tasks
48
+ aime24 = LightevalTaskConfig(
49
+ name="aime24",
50
+ suite=["custom"],
51
+ prompt_function=prompt_fn,
52
+ hf_repo="HuggingFaceH4/aime_2024",
53
+ hf_subset="default",
54
+ hf_avail_splits=["train"],
55
+ evaluation_splits=["train"],
56
+ few_shots_split=None,
57
+ few_shots_select=None,
58
+ generation_size=32768,
59
+ metric=[metric],
60
+ version=1,
61
+ )
62
+ math_500 = LightevalTaskConfig(
63
+ name="math_500",
64
+ suite=["custom"],
65
+ prompt_function=prompt_fn,
66
+ hf_repo="HuggingFaceH4/MATH-500",
67
+ hf_subset="default",
68
+ hf_avail_splits=["test"],
69
+ evaluation_splits=["test"],
70
+ few_shots_split=None,
71
+ few_shots_select=None,
72
+ generation_size=32768,
73
+ metric=[metric],
74
+ version=1,
75
+ )
76
+
77
+ # Add tasks to the table
78
+ TASKS_TABLE = []
79
+ TASKS_TABLE.append(aime24)
80
+ TASKS_TABLE.append(math_500)
81
+
82
+ # MODULE LOGIC
83
+ if __name__ == "__main__":
84
+ print([t["name"] for t in TASKS_TABLE])
85
+ print(len(TASKS_TABLE))
src/r1-v/src/open_r1/generate.py ADDED
@@ -0,0 +1,156 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Optional
16
+
17
+ from distilabel.llms import OpenAILLM
18
+ from distilabel.pipeline import Pipeline
19
+ from distilabel.steps.tasks import TextGeneration
20
+
21
+
22
+ def build_distilabel_pipeline(
23
+ model: str,
24
+ base_url: str = "http://localhost:8000/v1",
25
+ prompt_column: Optional[str] = None,
26
+ temperature: Optional[float] = None,
27
+ top_p: Optional[float] = None,
28
+ max_new_tokens: int = 8192,
29
+ num_generations: int = 1,
30
+ ) -> Pipeline:
31
+ generation_kwargs = {"max_new_tokens": max_new_tokens}
32
+
33
+ if temperature is not None:
34
+ generation_kwargs["temperature"] = temperature
35
+
36
+ if top_p is not None:
37
+ generation_kwargs["top_p"] = top_p
38
+
39
+ with Pipeline().ray() as pipeline:
40
+ TextGeneration(
41
+ llm=OpenAILLM(
42
+ base_url=base_url,
43
+ api_key="something",
44
+ model=model,
45
+ # thinking can take some time...
46
+ timeout=10 * 60,
47
+ generation_kwargs=generation_kwargs,
48
+ ),
49
+ input_mappings={"instruction": prompt_column} if prompt_column is not None else {},
50
+ input_batch_size=64, # on 4 nodes bs ~60+ leads to preemption due to KV cache exhaustion
51
+ num_generations=num_generations,
52
+ )
53
+
54
+ return pipeline
55
+
56
+
57
+ if __name__ == "__main__":
58
+ import argparse
59
+
60
+ from datasets import load_dataset
61
+
62
+ parser = argparse.ArgumentParser(description="Run distilabel pipeline for generating responses with DeepSeek R1")
63
+ parser.add_argument(
64
+ "--hf-dataset",
65
+ type=str,
66
+ required=True,
67
+ help="HuggingFace dataset to load",
68
+ )
69
+ parser.add_argument(
70
+ "--hf-dataset-config",
71
+ type=str,
72
+ required=False,
73
+ help="Dataset config to use",
74
+ )
75
+ parser.add_argument(
76
+ "--hf-dataset-split",
77
+ type=str,
78
+ default="train",
79
+ help="Dataset split to use",
80
+ )
81
+ parser.add_argument("--prompt-column", type=str, default="prompt")
82
+ parser.add_argument(
83
+ "--model",
84
+ type=str,
85
+ required=True,
86
+ help="Model name to use for generation",
87
+ )
88
+ parser.add_argument(
89
+ "--vllm-server-url",
90
+ type=str,
91
+ default="http://localhost:8000/v1",
92
+ help="URL of the vLLM server",
93
+ )
94
+ parser.add_argument(
95
+ "--temperature",
96
+ type=float,
97
+ help="Temperature for generation",
98
+ )
99
+ parser.add_argument(
100
+ "--top-p",
101
+ type=float,
102
+ help="Top-p value for generation",
103
+ )
104
+ parser.add_argument(
105
+ "--max-new-tokens",
106
+ type=int,
107
+ default=8192,
108
+ help="Maximum number of new tokens to generate",
109
+ )
110
+ parser.add_argument(
111
+ "--num-generations",
112
+ type=int,
113
+ default=1,
114
+ help="Number of generations per problem",
115
+ )
116
+ parser.add_argument(
117
+ "--hf-output-dataset",
118
+ type=str,
119
+ required=False,
120
+ help="HuggingFace repo to push results to",
121
+ )
122
+ parser.add_argument(
123
+ "--private",
124
+ action="store_true",
125
+ help="Whether to make the output dataset private when pushing to HF Hub",
126
+ )
127
+
128
+ args = parser.parse_args()
129
+
130
+ print("\nRunning with arguments:")
131
+ for arg, value in vars(args).items():
132
+ print(f" {arg}: {value}")
133
+ print()
134
+
135
+ print(f"Loading '{args.hf_dataset}' (config: {args.hf_dataset_config}, split: {args.hf_dataset_split}) dataset...")
136
+ dataset = load_dataset(args.hf_dataset, split=args.hf_dataset_split)
137
+ print("Dataset loaded!")
138
+
139
+ pipeline = build_distilabel_pipeline(
140
+ model=args.model,
141
+ base_url=args.vllm_server_url,
142
+ prompt_column=args.prompt_column,
143
+ temperature=args.temperature,
144
+ top_p=args.top_p,
145
+ max_new_tokens=args.max_new_tokens,
146
+ num_generations=args.num_generations,
147
+ )
148
+
149
+ print("Running generation pipeline...")
150
+ distiset = pipeline.run(dataset=dataset, use_cache=False)
151
+ print("Generation pipeline finished!")
152
+
153
+ if args.hf_output_dataset:
154
+ print(f"Pushing resulting dataset to '{args.hf_output_dataset}'...")
155
+ distiset.push_to_hub(args.hf_output_dataset, private=args.private)
156
+ print("Dataset pushed!")
src/r1-v/src/open_r1/grpo-cot-72BEval.py ADDED
@@ -0,0 +1,489 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+ from datetime import datetime
18
+ from dataclasses import dataclass, field
19
+
20
+ from datasets import load_dataset, load_from_disk
21
+ from transformers import Qwen2VLForConditionalGeneration
22
+
23
+ from trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerModifiedOrig
24
+ from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
25
+
26
+ from datasets import Dataset, DatasetDict
27
+
28
+ from typing import Dict, List, Optional
29
+ from mathruler.grader import extract_boxed_content, grade_answer
30
+
31
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
32
+ from rouge_score import rouge_scorer
33
+ from openai import OpenAI
34
+ from concurrent.futures import ThreadPoolExecutor, as_completed
35
+ import time
36
+ # from utils.math_cot import *
37
+ # from qa_metrics.pedant import PEDANT
38
+ # from qa_metrics.answerBERT import AnswerBertActor
39
+
40
+ # pedant = PEDANT()
41
+ # answerBERT = AnswerBertActor(device='cuda:7')
42
+ client = OpenAI(
43
+ base_url="http://29.81.228.243:8081/v1", # your vLLM server
44
+ api_key="ANYKEY", # if you set --api-key when launching
45
+ )
46
+
47
+ def validate_description(description, question):
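+ """Ask the remote judge model to answer the question using only the generated image description; returns the raw completion text."""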
48
+ input_message = "You are provided a text description of a problem and a question. Determine the answer to the question based on the text description. First provide a step-by-step reasoning within <think> </think> tags, then provide your answer as a single final answer, single letter choice, or a short phrase ENCLOSED with <answer> </answer> tags. \nText description: {Description}\nQuestion: {Question}\nPlease only return the final single letter choice within the <answer> </answer> tags for multiple choice questions; Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags for numerical questions.".format(Description=description, Question=question)
49
+ response = client.chat.completions.create(
50
+ model="Qwen2.5-72B-Instruct", # **must match** the returned id
51
+ messages=[
52
+ {"role": "system", "content": "You are a helpful assistant."},
53
+ {"role": "user", "content": input_message}
54
+ ]
55
+ )
56
+
57
+ # print('*'*10)
58
+ # print('Input Prompt: ', input_message)
59
+ # print('-'*10)
60
+ # print('Output Message: ', response.choices[0].message.content)
61
+ # print('-'*10)
62
+ # time.sleep(40)
63
+
64
+ return response.choices[0].message.content
65
+
66
+
67
+ @dataclass
68
+ class GRPOScriptArguments(ScriptArguments):
69
+ """
70
+ Script arguments for the GRPO training script.
71
+
72
+ Args:
73
+ reward_funcs (`list[str]`):
74
+ List of reward functions. Possible values: 'accuracy', 'format'.
75
+ """
76
+
77
+ reward_funcs: list[str] = field(
78
+ default_factory=lambda: ["accuracy", "format"],
79
+ metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
80
+ )
81
+
82
+ # reward_funcs: list[str] = field(
83
+ # default_factory=lambda: ["accuracy"],
84
+ # metadata={"help": "List of reward functions. Possible values: 'accuracy'"},
85
+ # )
86
+ max_pixels: Optional[int] = field(
87
+ default=12845056,
88
+ metadata={"help": "Maximum number of pixels for the image"},
89
+ )
90
+ min_pixels: Optional[int] = field(
91
+ default=3136,
92
+ metadata={"help": "Minimum number of pixels for the image"},
93
+ )
94
+ temporal: Optional[bool] = field(
95
+ default=True,
96
+ metadata={"help": "whether using temporal GRPO"},
97
+ )
98
+ len_control: Optional[bool] = field(
99
+ default=True,
100
+ metadata={"help": "whether using length reward"},
101
+ )
102
+
103
+
104
+ def accuracy_reward(completions, solution, **kwargs):
105
+ def extract_answer(text: str) -> str:
106
+ """
107
+ 1) Try the full <answer> … </answer> block.
108
+ 2) If that is missing, grab whatever follows the opening <answer> tag.
109
+ 3) Otherwise return the original text.
110
+ """
111
+ # ① normal case <answer> … </answer>
112
+ m = re.search(r'<answer>\s*(.*?)\s*</answer>', text, flags=re.DOTALL | re.IGNORECASE)
113
+ if m:
114
+ return m.group(1).strip()
115
+
116
+ # ② fallback <answer> … <end-of-string>
117
+ m = re.search(r'<answer>\s*(.*)$', text, flags=re.DOTALL | re.IGNORECASE)
118
+ if m:
119
+ return m.group(1).strip()
120
+
121
+ # ③ nothing found
122
+ return text.strip()
123
+
124
+ def extract_description(predict: str) -> Optional[str]:
125
+ """
126
+ Extracts the content of the <answer>…</answer> block from `predict`.
127
+ Returns the inner text (with leading/trailing whitespace stripped),
128
+ or None if no <answer> tag is found.
129
+ """
130
+ match = re.search(r"<des>([\s\S]*?)</des>", predict, re.DOTALL)
131
+ if not match:
132
+ return predict
133
+ return match.group(1).strip()
134
+
135
+ def single_accuracy_reward(predict: str, ground_truth: str) -> float:
136
+ answer = predict
137
+ return 1.0 if grade_answer(answer, ground_truth) else 0.0
138
+
139
+ def compute_math_score_single(predict: str, ground_truth: str, format_weight: float = 0.0) -> Dict[str, float]:
140
+ predict = re.sub(r"\s*(<|>|/)\s*", r"\1", predict)
141
+ # format_score = format_reward(predict)
142
+ accuracy_score = single_accuracy_reward(predict, ground_truth)
143
+
144
+ # return (1 - format_weight) * accuracy_score + format_weight * format_score
145
+ return accuracy_score
146
+
147
+ def normalize_number(num_str):
148
+ try:
149
+ num_str = num_str.replace(',', '')
150
+ return float(num_str)
151
+ except Exception as e:
152
+ print(f"Error converting '{num_str}' to float: {e}")
153
+ return None
154
+
155
+ def wer(reference, hypothesis):
156
+ ref_words = reference.split()
157
+ hyp_words = hypothesis.split()
158
+ m = len(ref_words)
159
+ n = len(hyp_words)
160
+ d = [[0]*(n+1) for _ in range(m+1)]
161
+ for i in range(m+1):
162
+ d[i][0] = i
163
+ for j in range(n+1):
164
+ d[0][j] = j
165
+ for i in range(1, m+1):
166
+ for j in range(1, n+1):
167
+ if ref_words[i-1] == hyp_words[j-1]:
168
+ d[i][j] = d[i-1][j-1]
169
+ else:
170
+ d[i][j] = 1 + min(d[i-1][j], d[i][j-1], d[i-1][j-1])
171
+ return d[m][n] / max(1, m)
172
+
173
+
174
+ def compute_rouge_score(reference, hypothesis, use_stemmer=True):
175
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stemmer)
176
+ scores = scorer.score(reference, hypothesis)
177
+ average_fmeasure = (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3
178
+ return average_fmeasure
179
+
180
+
181
+ question_type = kwargs['problem_type'][0]
182
+ questions = kwargs['problem']
183
+ # questions = kwargs['prompt']
184
+
185
+ contents = [completion[0]["content"] for completion in completions]
186
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
187
+ rewards = []
188
+
189
+
190
+ extracted_content_descriptions = [extract_description(str(ele)) for ele in contents]
191
+ description_answer_outputs = []
192
+
193
+
194
+ with ThreadPoolExecutor(max_workers=8) as executor:
195
+ futures = [
196
+ executor.submit(validate_description, desc, q)
197
+ for desc, q in zip(extracted_content_descriptions, questions)
198
+ ]
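+ # Collect judge answers in submission order so they stay aligned with the questions and ground-truth answers used below.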
199
+ for future in futures:
200
+ try:
201
+ description_answer_outputs.append(future.result())
202
+ except Exception as e:
203
+ # handle/log e
204
+ # description_answer_outputs.append(None)
205
+ print('Description output error: ', e)
206
+ description_answer_outputs.append(0)
207
+
208
+
209
+ contents = [str(ele) for ele in contents]
210
+ description_answer_outputs = [str(ele) for ele in description_answer_outputs]
211
+
212
+ gt_answers = [extract_answer(str(sol)) for sol in solution]
213
+ extracted_description_outputs = [extract_answer(str(description_answer_outputs[index_description])) for index_description in range(len(description_answer_outputs))]
214
+
215
+
216
+ # print('GT answers: ', gt_answers)
217
+ # print('Description answers: ', description_answer_outputs[0])
218
+ # print('-'*10)
219
+ # import time
220
+ # time.sleep(10)
221
+
222
+ description_rewards = [compute_math_score_single(extracted_description_outputs[count_idx], gt_answers[count_idx]) for count_idx in range(len(description_answer_outputs))]
223
+
224
+ # print('()'*10)
225
+ # print("Question: ", questions[0])
226
+ # print(gt_answers)
227
+ # print('Description outputs', description_answer_outputs[0])
228
+ # print(description_rewards)
229
+ # print('-'*10)
230
+ # time.sleep(30)
231
+
232
+
233
+ # for content, sol, description_reward in zip(contents, solution, description_rewards):
234
+ for content, gt_ans, description_reward in zip(contents, gt_answers, description_rewards):
235
+ try:
236
+ output_ans = extract_answer(str(content))
237
+ # gt_ans = extract_answer(sol)
238
+
239
+ if question_type == "OCR":
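+ # NOTE: this branch still expects the commented-out PEDANT scorer and per-sample description_extraction/question variables; as written it raises NameError, which the outer except turns into reward = 0.0.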
240
+ # description_extraction = extract_answer(str(second_content))
241
+ # description_error_rate = wer(gt_ans, description_extraction)
242
+ description_pendat_reward = pedant.get_score(gt_ans, description_extraction, question)
243
+ # error_rate = wer(gt_ans, output_ans)
244
+ answer_pedant_reward = pedant.get_score(gt_ans, output_ans, question)
245
+ # reward = (1 - error_rate) + (1- description_error_rate)
246
+ # reward = max(0.0, min(2.0, reward))
247
+ # print('Extracted description: ', description_extraction)
248
+ # print('Generated answer: ', output_ans)
249
+ # print('Sol: ', gt_ans)
250
+ # print(f'Description reward: {description_reward}; answer reward: {answer_reward}')
251
+ # print('-' * 10)
252
+ reward = description_pendat_reward + answer_pedant_reward
253
+ # elif question_type == "free-form":
254
+ # score = compute_rouge_score(gt_ans, output_ans)
255
+ # reward = max(0.0, min(1.0, score))
256
+ elif question_type == "regression":
257
+ gt_number = normalize_number(gt_ans)
258
+ out_number = normalize_number(output_ans)
259
+ if gt_number is None or out_number is None:
260
+ reward = 0.0
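+ # Note: with no else branch here, a None value makes the rel_diff computation below raise a TypeError, which the outer except maps to reward = 0.0.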
261
+ rel_diff = (abs(out_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
262
+ rel_diff = min(1.0, max(0.0, rel_diff))
263
+ reward = 1 - rel_diff
264
+ elif question_type == 'math' or question_type == 'unify' or question_type == "multiple choice" or question_type == "numerical":
265
+ # description_reward = compute_math_score_single(description_extraction, gt_ans)
266
+ answer_reward = compute_math_score_single(output_ans, gt_ans)
267
+ # print(f'Description reward: {description_reward}; answer reward: {answer_reward}')
268
+ # print('-' * 10)
269
+ reward = description_reward + answer_reward
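+ # Combined reward: correctness of the judge's answer derived from the description plus correctness of the model's own final answer (each 0 or 1).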
270
+ # reward = answer_reward
271
+ else:
272
+ print('Falling back to none rewards')
273
+ reward = 0.0
274
+ except Exception as e:
275
+ print(f"Error in reward_fn for question_type '{question_type}': {e}")
276
+ reward = 0.0
277
+
278
+ rewards.append(reward)
279
+ if os.getenv("DEBUG_MODE") == "true":
280
+ log_path = os.getenv("LOG_PATH")
281
+ # local_rank = int(os.getenv("LOCAL_RANK", 0))
282
+ with open(log_path, "a", encoding="utf-8") as f:
283
+ f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
284
+ f.write(f"Content: {content}\n")
285
+ f.write(f"Solution: {gt_ans}\n")
286
+
287
+ # print("rewards: ", rewards)
288
+ return rewards
289
+
290
+
291
+ def simple_format_reward(completions, **kwargs):
292
+ """Reward function that checks if the completion has a specific format."""
293
+ # pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
294
+ pattern = r"<des>.*?</des>\s*<think>.*?</think>\s*<answer>.*?</answer>"
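+ # The completion must be exactly <des>...</des> <think>...</think> <answer>...</answer> (re.fullmatch); matches earn a 0.1 format reward.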
295
+ completion_contents = [completion[0]["content"] for completion in completions]
296
+ matches = [re.fullmatch(pattern, content, re.DOTALL) for content in completion_contents]
297
+ return [0.1 if match else 0.0 for match in matches]
298
+
299
+
300
+ reward_funcs_registry = {
301
+ "accuracy": accuracy_reward,
302
+ "format": simple_format_reward,
303
+ }
304
+
305
+ # SYSTEM_PROMPT = (
306
+ # "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
307
+ # "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
308
+ # "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
309
+ # "<think> reasoning process here </think><answer> answer here </answer>"
310
+ # )
311
+
312
+ SYSTEM_PROMPT = (
313
+ "A conversation between User and Assistant. After the user asks a question about an image, write a rich, self-contained description of that image—detailed enough that someone could answer the question from the description alone, without ever seeing the image. Enclose the entire description in <des> </des> tags. "
314
+ "Next, the assistant should think deeply about the reasoning process, engaging in an internal dialogue and self-reflection, "
315
+ "and provide this step-by-step reasoning within <think> </think> tags. "
316
+ "Finally, the assistant provides a single word, single letter choice, or phrase answer within <answer> </answer> tags. "
317
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>. Please only return the final single letter choice within the <answer> </answer> tags for multiple choice questions; Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags for numerical questions."
318
+ )
319
+
320
+
321
+ def main(script_args, training_args, model_args):
322
+ # Get reward functions
323
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
324
+
325
+ if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
326
+ dataset = DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
327
+ else:
328
+ # Load the dataset
329
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
330
+
331
+
332
+ # Format into conversation
333
+ def make_conversation(example):
334
+ return {
335
+ "prompt": [
336
+ {"role": "system", "content": SYSTEM_PROMPT},
337
+ {"role": "user", "content": example["problem"]},
338
+ ],
339
+ }
340
+
341
+
342
+ # QUESTION_TEMPLATE = (
343
+ # "{Question}\n"
344
+ # "Please think about this question as if you were a human pondering deeply. "
345
+ # "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc, or other natural language thought expressions "
346
+ # "It's encouraged to include self-reflection or verification in the reasoning process. "
347
+ # "Provide your detailed reasoning between the <think> </think> tags, and then give your final answer between the <answer> </answer> tags."
348
+ # )
349
+
350
+ QUESTION_TEMPLATE = (
351
+ "{Question}\n"
352
+ "You are tasked with analyzing an image to generate an exhaustive and detailed description to answer a question. "
353
+ "Analyze the image and produce a thorough, self-contained description—detailed enough for someone to answer the question using the description alone. Wrap the entire description in <des> </des> tags.\n"
354
+ "Next, engage in an internal dialogue as if you were a human pondering deeply—use expressions such as 'let me think', 'wait', 'hmm', 'oh, I see', 'let's break it down', etc., and include self-reflection or verification in your reasoning process. "
355
+ "Provide your detailed, step-by-step reasoning based on the image description, and enclose this part within <think> </think> tags.\n"
356
+ "Finally, provide a single word or phrase answer to the question, enclosed within <answer> </answer> tags.\n"
357
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>. Please only return the final single letter choice within the <answer> </answer> tags for multiple choice questions; Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags for numerical questions."
358
+ )
359
+
360
+
361
+ TYPE_TEMPLATE = {
362
+ "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
363
+ "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
364
+ "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
365
+ "free-form": " Please provide your text answer within the <answer> </answer> tags.",
366
+ "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
367
+ "math": " Please provide the final exact answer (single option letter for multiple choice) within the <answer> </answer> tags.",
368
+ }
369
+
370
+ def make_conversation_image(example):
371
+
372
+ return {
373
+ "prompt": [
374
+ {
375
+ "role": "user",
376
+ "content": [
377
+ {"type": "image"},
378
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
379
+ ],
380
+ },
381
+ ],
382
+ }
383
+
384
+
385
+ def make_conversation_video(example):
386
+ return {
387
+ "prompt": [
388
+ {
389
+ "role": "user",
390
+ "content": [
391
+ {"type": "video"},
392
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
393
+ ],
394
+ },
395
+ ],
396
+ }
397
+
398
+ def make_conversation_image_and_video(example):
399
+ if example["problem_type"] == 'multiple choice':
400
+ question = example['problem'] + "Options:\n"
401
+ for op in example["options"]:
402
+ question += op + "\n"
403
+ else:
404
+ question = example['problem']
405
+
406
+
407
+ # msg ={
408
+ # "prompt":
409
+ # [{
410
+ # "role": "user",
411
+ # "content": [
412
+ # {
413
+ # "type": example['data_type'],
414
+ # # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
415
+ # },
416
+ # {
417
+ # "type": "text",
418
+ # "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[example['problem_type']]
419
+ # }
420
+ # ]
421
+ # }]
422
+ # }
423
+
424
+ msg ={
425
+ "prompt":
426
+ [{
427
+ "role": "user",
428
+ "content": [
429
+ {
430
+ "type": example['data_type'],
431
+ # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
432
+ },
433
+ {
434
+ "type": "text",
435
+ "text": QUESTION_TEMPLATE.format(Question=question)
436
+ }
437
+ ]
438
+ }]
439
+ }
440
+
441
+ # return msg
442
+ return {
443
+ "prompt": msg["prompt"],
444
+ "problem": question,
445
+ }
446
+
447
+
448
+ dataset = dataset.map(make_conversation_image_and_video)
449
+
450
+
451
+ # print('Example problem')
452
+ # print(dataset['train']['problem'][10])
453
+ # time.sleep(30)
454
+
455
+
456
+ trainer_cls = Qwen2VLGRPOTrainer if not training_args.use_vllm else Qwen2VLGRPOVLLMTrainerModifiedOrig
457
+ print("using: ", trainer_cls)
458
+
459
+
460
+ # Initialize the GRPO trainer
461
+ trainer = trainer_cls(
462
+ model=model_args.model_name_or_path,
463
+ reward_funcs=reward_funcs,
464
+ args=training_args,
465
+ script_args=script_args,
466
+ train_dataset=dataset[script_args.dataset_train_split],
467
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
468
+ peft_config=get_peft_config(model_args),
469
+ attn_implementation=model_args.attn_implementation,
470
+ max_pixels=script_args.max_pixels,
471
+ min_pixels=script_args.min_pixels,
472
+ )
473
+
474
+ if training_args.resume_from_checkpoint is not None:
475
+ checkpoint = training_args.resume_from_checkpoint
476
+ trainer.train(resume_from_checkpoint=checkpoint)
477
+ else:
478
+ trainer.train()
479
+
480
+ # Save and push to hub
481
+ trainer.save_model(training_args.output_dir)
482
+ if training_args.push_to_hub:
483
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
484
+
485
+
486
+ if __name__ == "__main__":
487
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
488
+ script_args, training_args, model_args = parser.parse_args_and_config()
489
+ main(script_args, training_args, model_args)
src/r1-v/src/open_r1/grpo-cot-LLMEval.py ADDED
@@ -0,0 +1,552 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+ from datetime import datetime
18
+ from dataclasses import dataclass, field
19
+
20
+ from datasets import load_dataset, load_from_disk
21
+ from transformers import Qwen2VLForConditionalGeneration
22
+
23
+ from trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerModifiedOrig
24
+ from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
25
+
26
+ from datasets import Dataset, DatasetDict
27
+
28
+ from typing import Dict, List, Optional
29
+ from mathruler.grader import extract_boxed_content, grade_answer
30
+
31
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
32
+ from rouge_score import rouge_scorer
33
+ # from utils.gpt_eval import infer
34
+ # from utils.math_cot import *
35
+ # from qa_metrics.pedant import PEDANT
36
+ # from qa_metrics.answerBERT import AnswerBertActor
37
+
38
+ # pedant = PEDANT()
39
+ # answerBERT = AnswerBertActor(device='cuda:7')
40
+
41
+ alpha = 1.0
42
+
43
+ TYPE_TEMPLATE = {
44
+ "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
45
+ "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
46
+ "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
47
+ "free-form": " Please provide your text answer within the <answer> </answer> tags.",
48
+ "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
49
+ "math": " Please provide the final exact answer (single option letter for multiple choice) within the <answer> </answer> tags.",
50
+ }
51
+
52
+ '''
53
+ gpt infer
54
+ '''
55
+ import os
56
+ from openai import AzureOpenAI
57
+ import time
58
+
59
+ import base64
60
+ from mimetypes import guess_type
61
+
62
+
63
+ def azure_gpt4(messages, model):
64
+ outputs = []
65
+ for message in messages:
66
+ input_prompt = [
67
+ { "role": "system", "content": "You are a helpful assistant." },
68
+ { "role": "user", "content": [
69
+ {
70
+ "type": "text",
71
+ "text": message["instruction"]
72
+ },
73
+ # {
74
+ # "type": "image_url",
75
+ # "image_url": {
76
+ # "url": message["image"]
77
+ # }
78
+ # }
79
+ ]}
80
+ ]
81
+ ## try N times if API exceed limit ...
82
+ for i in range(10):
83
+ try:
84
+ output = client.chat.completions.create(
85
+ model=model, messages=input_prompt, max_tokens=2000
86
+ )
87
+
88
+ output_text = output.choices[0].message.content
89
+ break ## exit if successful
90
+
91
+ except Exception as e:
92
+ print(f'Index {i} got error message: {e}')
93
+ output_text = ''
94
+ time.sleep(3)
95
+
96
+ outputs.append(output_text)
97
+
98
+ return outputs
99
+
100
+
101
+ client = AzureOpenAI(
102
+ api_key = "83f30a2a22324395b854bd343db38d85",
103
+ api_version = "2024-08-01-preview",
104
+ azure_endpoint = "https://francecentral.api.cognitive.microsoft.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview"
105
+ )
106
+
107
+ model = "gpt-4o"
108
+ prompt_template = '''You are provided a text description of a problem and a question. Determine the answer to the question based on the text description. Provide your answer as a single final answer or a short phrase enclosed with <answer></answer>. \nText description: {text}\nQuestion: {question}'''
109
+
110
+
111
+ def infer(prompt):
112
+ # prompt_question = prompt_question.replace('<image>', '')
113
+ # prompt = prompt_template.replace('{text}', text).replace('{question}', prompt_question)
114
+
115
+ messages = [
116
+ {"instruction": prompt},
117
+ ]
118
+ prompt_success = False
119
+ prompt_time = 0
120
+ outputs = ['<answer> None </answer>']
121
+ while not prompt_success and prompt_time <= 2:
122
+ try:
123
+ outputs = azure_gpt4(messages, model)
124
+ prompt_success = True
125
+ except:
126
+ prompt_time += 1
127
+ time.sleep(5)
128
+
129
+ return outputs[0]
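+ # Hypothetical usage sketch of the GPT judge above (example values are invented):
+ # filled = prompt_template.replace('{text}', 'A red cube sits left of a blue sphere.') \
+ #                         .replace('{question}', 'What color is the cube?')
+ # infer(filled) # expected to return something like '<answer> red </answer>'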
130
+
131
+ '''
132
+ end of gpt infer
133
+ '''
134
+
135
+
136
+ from concurrent.futures import ThreadPoolExecutor, as_completed
137
+
138
+ def _call_infer(desc):
139
+ return infer(desc)
140
+
141
+ @dataclass
142
+ class GRPOScriptArguments(ScriptArguments):
143
+ """
144
+ Script arguments for the GRPO training script.
145
+
146
+ Args:
147
+ reward_funcs (`list[str]`):
148
+ List of reward functions. Possible values: 'accuracy', 'format'.
149
+ """
150
+
151
+ reward_funcs: list[str] = field(
152
+ default_factory=lambda: ["accuracy", "format"],
153
+ metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
154
+ )
155
+
156
+ # reward_funcs: list[str] = field(
157
+ # default_factory=lambda: ["accuracy"],
158
+ # metadata={"help": "List of reward functions. Possible values: 'accuracy'"},
159
+ # )
160
+ max_pixels: Optional[int] = field(
161
+ default=12845056,
162
+ metadata={"help": "Maximum number of pixels for the image"},
163
+ )
164
+ min_pixels: Optional[int] = field(
165
+ default=3136,
166
+ metadata={"help": "Minimum number of pixels for the image"},
167
+ )
168
+ temporal: Optional[bool] = field(
169
+ default=True,
170
+ metadata={"help": "whether using temporal GRPO"},
171
+ )
172
+ len_control: Optional[bool] = field(
173
+ default=True,
174
+ metadata={"help": "whether using length reward"},
175
+ )
176
+
177
+
178
+
179
+ def accuracy_reward(completions, solution, **kwargs):
180
+ def extract_answer(text: str) -> str:
181
+ """
182
+ 1) Try the full <answer> … </answer> block.
183
+ 2) If that is missing, grab whatever follows the opening <answer> tag.
184
+ 3) Otherwise return the original text.
185
+ """
186
+ # ① normal case <answer> … </answer>
187
+ m = re.search(r'<answer>\s*(.*?)\s*</answer>', text, flags=re.DOTALL | re.IGNORECASE)
188
+ if m:
189
+ return m.group(1).strip()
190
+
191
+ # ② fallback <answer> … <end-of-string>
192
+ m = re.search(r'<answer>\s*(.*)$', text, flags=re.DOTALL | re.IGNORECASE)
193
+ if m:
194
+ return m.group(1).strip()
195
+
196
+ # ③ nothing found
197
+ return text.strip()
198
+
199
+ def extract_description(predict: str) -> Optional[str]:
200
+ """
201
+ Extracts the content of the <des>…</des> block from `predict`.
202
+ Returns the inner text (with leading/trailing whitespace stripped),
203
+ or the original string if no <des> tag is found.
204
+ """
205
+ match = re.search(r"<des>([\s\S]*?)</des>", predict, re.DOTALL)
206
+ if not match:
207
+ return predict
208
+ return match.group(1).strip()
209
+
210
+ def single_accuracy_reward(predict: str, ground_truth: str) -> float:
211
+ answer = predict
212
+ return 1.0 if grade_answer(answer, ground_truth) else 0.0
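+ # Assumption (for illustration): mathruler's grade_answer checks answer equivalence, so
+ # single_accuracy_reward("0.5", "1/2") would typically return 1.0 while
+ # single_accuracy_reward("0.4", "1/2") returns 0.0.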
213
+
214
+ def compute_math_score_single(predict: str, ground_truth: str, format_weight: float = 0.0) -> float:
215
+ predict = re.sub(r"\s*(<|>|/)\s*", r"\1", predict)
216
+ # format_score = format_reward(predict)
217
+ accuracy_score = single_accuracy_reward(predict, ground_truth)
218
+
219
+ # return (1 - format_weight) * accuracy_score + format_weight * format_score
220
+ return accuracy_score
221
+
222
+ def normalize_number(num_str):
223
+ try:
224
+ num_str = num_str.replace(',', '')
225
+ return float(num_str)
226
+ except Exception as e:
227
+ print(f"Error converting '{num_str}' to float: {e}")
228
+ return None
229
+
230
+ def wer(reference, hypothesis):
231
+ ref_words = reference.split()
232
+ hyp_words = hypothesis.split()
233
+ m = len(ref_words)
234
+ n = len(hyp_words)
235
+ d = [[0]*(n+1) for _ in range(m+1)]
236
+ for i in range(m+1):
237
+ d[i][0] = i
238
+ for j in range(n+1):
239
+ d[0][j] = j
240
+ for i in range(1, m+1):
241
+ for j in range(1, n+1):
242
+ if ref_words[i-1] == hyp_words[j-1]:
243
+ d[i][j] = d[i-1][j-1]
244
+ else:
245
+ d[i][j] = 1 + min(d[i-1][j], d[i][j-1], d[i-1][j-1])
246
+ return d[m][n] / max(1, m)
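+ # Worked example (added for clarity): wer("the cat sat", "the cat") gives edit distance 1 over
+ # a 3-word reference = 0.33; identical strings give 0.0 and an empty hypothesis gives 1.0.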
247
+
248
+
249
+ def compute_rouge_score(reference, hypothesis, use_stemmer=True):
250
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stemmer)
251
+ scores = scorer.score(reference, hypothesis)
252
+ average_fmeasure = (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3
253
+ return average_fmeasure
254
+
255
+ # print('Computing rewards now...')
256
+ # second_prompts = kwargs.get("second_prompts") # ← list[str] or None
257
+ # second_completions = kwargs.get("second_completions")
258
+ # second_contents = [comp[0]["content"] for comp in second_completions]
259
+ # print('second prompts', second_prompts)
260
+ # print('-'*10)
261
+ # print('second completions', second_completions)
262
+ # print('-'*10)
263
+
264
+ # import time
265
+ # time.sleep(30)
266
+ question_type = kwargs['problem_type'][0]
267
+ questions = kwargs['problem']
268
+
269
+ contents = [completion[0]["content"] for completion in completions]
270
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
271
+ rewards = []
272
+
273
+ extracted_content_descriptions = [extract_description(ele) for ele in contents]
274
+
275
+ description_query_inputs = []
276
+
277
+ for index in range(len(extracted_content_descriptions)):
278
+ prompt_question = questions[index]
279
+ des_text = extracted_content_descriptions[index]
280
+ prompt_question = prompt_question.replace('<image>', '')
281
+ prompt_input = prompt_template.replace('{text}', des_text).replace('{question}', prompt_question) + TYPE_TEMPLATE[question_type]
282
+ description_query_inputs.append(prompt_input)
283
+
284
+
285
+ description_score_outputs = []
286
+ with ThreadPoolExecutor(max_workers=8) as executor:
287
+ # kick off all the futures
288
+ # futures = [
289
+ # executor.submit(_call_infer, desc, ques)
290
+ # for desc, ques in zip(extracted_content_descriptions, questions)
291
+ # ]
292
+ futures = [
293
+ executor.submit(_call_infer, desc)
294
+ for desc in description_query_inputs
295
+ ]
296
+ # collect in submission order so each output stays aligned with its question/ground truth
297
+ for fut in futures:
298
+ description_score_outputs.append(extract_answer(fut.result()))
299
+
300
+ gt_answers = [extract_answer(sol) for sol in solution]
301
+ description_rewards = [compute_math_score_single(description_score_outputs[count_idx], gt_answers[count_idx]) for count_idx in range(len(description_score_outputs))]
302
+
303
+ # print(gt_answers)
304
+ # print(description_score_outputs)
305
+ # print(description_rewards)
306
+ # print('-'*10)
307
+
308
+
309
+ for content, gt_ans, description_reward in zip(contents, gt_answers, description_rewards):
310
+ # for content, sol, question in zip(contents, solution, questions):
311
+ # for content, sol, second_content in zip(contents, solution, second_completions):
312
+ try:
313
+ output_ans = extract_answer(content)
314
+ # gt_ans = extract_answer(sol)
315
+ # description_extraction = extract_answer(second_content)
316
+ # if question_type == "multiple choice":
317
+ # reward = 1.0 if output_ans.strip() == gt_ans.strip() else 0.0
318
+ # elif question_type == "numerical":
319
+ # gt_has_decimal = ("." in gt_ans) or ("," in gt_ans)
320
+ # out_has_decimal = ("." in output_ans) or ("," in output_ans)
321
+ # if gt_has_decimal != out_has_decimal:
322
+ # reward = 0.0
323
+ # else:
324
+ # gt_number = normalize_number(gt_ans)
325
+ # out_number = normalize_number(output_ans)
326
+ # if gt_number is None or out_number is None:
327
+ # reward = 0.0
328
+ # else:
329
+ # reward = 1.0 if round(gt_number, 2) == round(out_number, 2) else 0.0
330
+ if question_type == "OCR":
331
+ # description_extraction = extract_answer(second_content)
332
+ # description_error_rate = wer(gt_ans, description_extraction)
333
+ # description_pendat_reward = pedant.get_score(gt_ans, description_extraction, question)
334
+ # error_rate = wer(gt_ans, output_ans)
335
+ answer_pedant_reward = pedant.get_score(gt_ans, output_ans, questions[0])
336
+ # reward = (1 - error_rate) + (1- description_error_rate)
337
+ # reward = max(0.0, min(2.0, reward))
338
+ # print('Extracted description: ', description_extraction)
339
+ # print('Generated answer: ', output_ans)
340
+ # print('Sol: ', gt_ans)
341
+ # print(f'Description reward: {description_reward}; answer reward: {answer_reward}')
342
+ # print('-' * 10)
343
+ # reward = description_pendat_reward + answer_pedant_reward
344
+ reward = answer_pedant_reward
345
+ # elif question_type == "free-form":
346
+ # score = compute_rouge_score(gt_ans, output_ans)
347
+ # reward = max(0.0, min(1.0, score))
348
+ elif question_type == "regression":
349
+ gt_number = normalize_number(gt_ans)
350
+ out_number = normalize_number(output_ans)
351
+ if gt_number is None or out_number is None:
352
+ reward = 0.0
+ else:
353
+ rel_diff = (abs(out_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
354
+ rel_diff = min(1.0, max(0.0, rel_diff))
355
+ reward = 1 - rel_diff
356
+ elif question_type == 'math' or question_type == 'unify' or question_type == "multiple choice" or question_type == "numerical":
357
+ answer_reward = compute_math_score_single(output_ans, gt_ans)
358
+
359
+
360
+ # print(f"Extracted description: {description_extraction} | Generated answer: {output_ans} | Sol: {gt_ans}")
361
+ # print(f'Description reward: {description_reward} | answer reward: {answer_reward} | final reward: {reward}')
362
+ # print('-' * 10)
363
+
364
+ if description_reward == 0 and answer_reward == 1:
365
+ reward = alpha
366
+ else:
367
+ reward = description_reward + answer_reward
368
+ # reward = answer_reward
369
+ else:
370
+ print('Falling back to none rewards')
371
+ reward = 0.0
372
+ except Exception as e:
373
+ print(f"Error in reward_fn for question_type '{question_type}': {e}")
374
+ reward = 0.0
375
+
376
+ rewards.append(reward)
377
+
378
+ if os.getenv("DEBUG_MODE") == "true":
379
+ log_path = os.getenv("LOG_PATH")
380
+ # local_rank = int(os.getenv("LOCAL_RANK", 0))
381
+ with open(log_path, "a", encoding="utf-8") as f:
382
+ f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
383
+ f.write(f"Content: {content}\n")
384
+ f.write(f"Solution: {gt_ans}\n")
385
+
386
+ return rewards
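+ # Summary example (added for clarity): if the GPT judge recovers the correct answer from the
+ # <des> description and the model's own <answer> also matches, reward = 2.0; description only
+ # gives 1.0; answer only gives alpha (1.0 as configured above).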
387
+
388
+
389
+ def simple_format_reward(completions, **kwargs):
390
+ """Reward function that checks if the completion has a specific format."""
391
+ # pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
392
+ pattern = r"<des>.*?</des>\s*<think>.*?</think>\s*<answer>.*?</answer>"
393
+ completion_contents = [completion[0]["content"] for completion in completions]
394
+ matches = [re.fullmatch(pattern, content, re.DOTALL) for content in completion_contents]
395
+ return [0.1 if match else 0.0 for match in matches]
396
+
397
+
398
+ reward_funcs_registry = {
399
+ "accuracy": accuracy_reward,
400
+ "format": simple_format_reward,
401
+ }
402
+
403
+ # SYSTEM_PROMPT = (
404
+ # "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
405
+ # "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
406
+ # "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
407
+ # "<think> reasoning process here </think><answer> answer here </answer>"
408
+ # )
409
+
410
+ SYSTEM_PROMPT = (
411
+ "A conversation between User and Assistant. After the user asks a question about an image, write a rich, self-contained description of that image—detailed enough that someone could answer the question from the description alone, without ever seeing the image. Enclose the entire description in <des> </des> tags."
412
+ "Next, the assistant should think deeply about the reasoning process, engaging in an internal dialogue and self-reflection, "
413
+ "and provide this step-by-step reasoning within <think> </think> tags. "
414
+ "Finally, the assistant provides a single word, single letter choice, or phrase answer within <answer> </answer> tags."
415
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>."
416
+ )
417
+
418
+
419
+ def main(script_args, training_args, model_args):
420
+ # Get reward functions
421
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
422
+
423
+ if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
424
+ dataset = DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
425
+ else:
426
+ # Load the dataset
427
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
428
+
429
+
430
+ # Format into conversation
431
+ def make_conversation(example):
432
+ return {
433
+ "prompt": [
434
+ {"role": "system", "content": SYSTEM_PROMPT},
435
+ {"role": "user", "content": example["problem"]},
436
+ ],
437
+ }
438
+
439
+
440
+ # QUESTION_TEMPLATE = (
441
+ # "{Question}\n"
442
+ # "Please think about this question as if you were a human pondering deeply. "
443
+ # "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc, or other natural language thought expressions "
444
+ # "It's encouraged to include self-reflection or verification in the reasoning process. "
445
+ # "Provide your detailed reasoning between the <think> </think> tags, and then give your final answer between the <answer> </answer> tags."
446
+ # )
447
+
448
+ QUESTION_TEMPLATE = (
449
+ "{Question}\n"
450
+ "You are tasked with analyzing an image to generate an exhaustive and detailed description to answer a question. "
451
+ "Analyze the image and produce a thorough, self-contained description—detailed enough for someone to answer the question using the description alone. Wrap the entire description in <des> </des> tags.\n"
452
+ "Next, engage in an internal dialogue as if you were a human pondering deeply—use expressions such as 'let me think', 'wait', 'hmm', 'oh, I see', 'let's break it down', etc., and include self-reflection or verification in your reasoning process. "
453
+ "Provide your detailed, step-by-step reasoning based on the image description, and enclose this part within <think> </think> tags.\n"
454
+ "Finally, provide a single word or phrase answer to the question, enclosed within <answer> </answer> tags.\n"
455
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>"
456
+ )
457
+
458
+
459
+
460
+ def make_conversation_image(example):
461
+
462
+ return {
463
+ "prompt": [
464
+ {
465
+ "role": "user",
466
+ "content": [
467
+ {"type": "image"},
468
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
469
+ ],
470
+ },
471
+ ],
472
+ }
473
+
474
+
475
+ def make_conversation_video(example):
476
+ return {
477
+ "prompt": [
478
+ {
479
+ "role": "user",
480
+ "content": [
481
+ {"type": "video"},
482
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
483
+ ],
484
+ },
485
+ ],
486
+ }
487
+
488
+ def make_conversation_image_and_video(example):
489
+ if example["problem_type"] == 'multiple choice':
490
+ question = example['problem'] + "Options:\n"
491
+ for op in example["options"]:
492
+ question += op + "\n"
493
+ else:
494
+ question = example['problem']
495
+
496
+
497
+ msg ={
498
+ "prompt":
499
+ [{
500
+ "role": "user",
501
+ "content": [
502
+ {
503
+ "type": example['data_type'],
504
+ # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
505
+ },
506
+ {
507
+ "type": "text",
508
+ "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[example['problem_type']]
509
+ }
510
+ ]
511
+ }]
512
+ }
513
+
514
+ return msg
515
+
516
+
517
+ dataset = dataset.map(make_conversation_image_and_video)
518
+
519
+
520
+ trainer_cls = Qwen2VLGRPOTrainer if not training_args.use_vllm else Qwen2VLGRPOVLLMTrainerModifiedOrig
521
+ print("using: ", trainer_cls)
522
+
523
+ # Initialize the GRPO trainer
524
+ trainer = trainer_cls(
525
+ model=model_args.model_name_or_path,
526
+ reward_funcs=reward_funcs,
527
+ args=training_args,
528
+ script_args=script_args,
529
+ train_dataset=dataset[script_args.dataset_train_split],
530
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
531
+ peft_config=get_peft_config(model_args),
532
+ attn_implementation=model_args.attn_implementation,
533
+ max_pixels=script_args.max_pixels,
534
+ min_pixels=script_args.min_pixels,
535
+ )
536
+
537
+ if training_args.resume_from_checkpoint is not None:
538
+ checkpoint = training_args.resume_from_checkpoint
539
+ trainer.train(resume_from_checkpoint=checkpoint)
540
+ else:
541
+ trainer.train()
542
+
543
+ # Save and push to hub
544
+ trainer.save_model(training_args.output_dir)
545
+ if training_args.push_to_hub:
546
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
547
+
548
+
549
+ if __name__ == "__main__":
550
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
551
+ script_args, training_args, model_args = parser.parse_args_and_config()
552
+ main(script_args, training_args, model_args)
src/r1-v/src/open_r1/grpo-cot-answerBERT-eval.py ADDED
@@ -0,0 +1,429 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+ from datetime import datetime
18
+ from dataclasses import dataclass, field
19
+
20
+ from datasets import load_dataset, load_from_disk
21
+ from transformers import Qwen2VLForConditionalGeneration
22
+
23
+ from trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerModifiedOrig
24
+ from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
25
+
26
+ from datasets import Dataset, DatasetDict
27
+
28
+ from typing import Dict, List, Optional
29
+ from mathruler.grader import extract_boxed_content, grade_answer
30
+
31
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
32
+ from rouge_score import rouge_scorer
33
+ # from utils.math_cot import *
34
+ # from qa_metrics.pedant import PEDANT
35
+ from qa_metrics.answerBERT import AnswerBertActor
36
+
37
+ # pedant = PEDANT()
38
+ answerBERT = AnswerBertActor(device='cuda:0')
39
+
40
+ @dataclass
41
+ class GRPOScriptArguments(ScriptArguments):
42
+ """
43
+ Script arguments for the GRPO training script.
44
+
45
+ Args:
46
+ reward_funcs (`list[str]`):
47
+ List of reward functions. Possible values: 'accuracy', 'format'.
48
+ """
49
+
50
+ reward_funcs: list[str] = field(
51
+ default_factory=lambda: ["accuracy", "format"],
52
+ metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
53
+ )
54
+
55
+ # reward_funcs: list[str] = field(
56
+ # default_factory=lambda: ["accuracy"],
57
+ # metadata={"help": "List of reward functions. Possible values: 'accuracy'"},
58
+ # )
59
+ max_pixels: Optional[int] = field(
60
+ default=12845056,
61
+ metadata={"help": "Maximum number of pixels for the image"},
62
+ )
63
+ min_pixels: Optional[int] = field(
64
+ default=3136,
65
+ metadata={"help": "Minimum number of pixels for the image"},
66
+ )
67
+ temporal: Optional[bool] = field(
68
+ default=True,
69
+ metadata={"help": "whether using temporal GRPO"},
70
+ )
71
+ len_control: Optional[bool] = field(
72
+ default=True,
73
+ metadata={"help": "whether using length reward"},
74
+ )
75
+
76
+
77
+
78
+ def accuracy_reward(completions, solution, **kwargs):
79
+ def extract_answer(text: str) -> str:
80
+ """
81
+ 1) Try the full <answer> … </answer> block.
82
+ 2) If that is missing, grab whatever follows the opening <answer> tag.
83
+ 3) Otherwise return the original text.
84
+ """
85
+ # ① normal case <answer> … </answer>
86
+ m = re.search(r'<answer>\s*(.*?)\s*</answer>', text, flags=re.DOTALL | re.IGNORECASE)
87
+ if m:
88
+ return m.group(1).strip()
89
+
90
+ # ② fallback <answer> … <end-of-string>
91
+ m = re.search(r'<answer>\s*(.*)$', text, flags=re.DOTALL | re.IGNORECASE)
92
+ if m:
93
+ return m.group(1).strip()
94
+
95
+ # ③ nothing found
96
+ return text.strip()
97
+
98
+ def extract_description(predict: str) -> Optional[str]:
99
+ """
100
+ Extracts the content of the <des>…</des> block from `predict`.
101
+ Returns the inner text (with leading/trailing whitespace stripped),
102
+ or the original string if no <des> tag is found.
103
+ """
104
+ match = re.search(r"<des>([\s\S]*?)</des>", predict, re.DOTALL)
105
+ if not match:
106
+ return predict
107
+ return match.group(1).strip()
108
+
109
+ def single_accuracy_reward(predict: str, ground_truth: str) -> float:
110
+ answer = predict
111
+ return 1.0 if grade_answer(answer, ground_truth) else 0.0
112
+
113
+ def compute_math_score_single(predict: str, ground_truth: str, format_weight: float = 0.0) -> float:
114
+ predict = re.sub(r"\s*(<|>|/)\s*", r"\1", predict)
115
+ # format_score = format_reward(predict)
116
+ accuracy_score = single_accuracy_reward(predict, ground_truth)
117
+
118
+ # return (1 - format_weight) * accuracy_score + format_weight * format_score
119
+ return accuracy_score
120
+
121
+ def normalize_number(num_str):
122
+ try:
123
+ num_str = num_str.replace(',', '')
124
+ return float(num_str)
125
+ except Exception as e:
126
+ print(f"Error converting '{num_str}' to float: {e}")
127
+ return None
128
+
129
+ def wer(reference, hypothesis):
130
+ ref_words = reference.split()
131
+ hyp_words = hypothesis.split()
132
+ m = len(ref_words)
133
+ n = len(hyp_words)
134
+ d = [[0]*(n+1) for _ in range(m+1)]
135
+ for i in range(m+1):
136
+ d[i][0] = i
137
+ for j in range(n+1):
138
+ d[0][j] = j
139
+ for i in range(1, m+1):
140
+ for j in range(1, n+1):
141
+ if ref_words[i-1] == hyp_words[j-1]:
142
+ d[i][j] = d[i-1][j-1]
143
+ else:
144
+ d[i][j] = 1 + min(d[i-1][j], d[i][j-1], d[i-1][j-1])
145
+ return d[m][n] / max(1, m)
146
+
147
+
148
+ def compute_rouge_score(reference, hypothesis, use_stemmer=True):
149
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stemmer)
150
+ scores = scorer.score(reference, hypothesis)
151
+ average_fmeasure = (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3
152
+ return average_fmeasure
153
+
154
+ # print('Computing rewards now...')
155
+ # second_prompts = kwargs.get("second_prompts") # ← list[str] or None
156
+ # second_completions = kwargs.get("second_completions")
157
+ # second_contents = [comp[0]["content"] for comp in second_completions]
158
+ # print('second prompts', second_prompts)
159
+ # print('-'*10)
160
+ # print('second completions', second_completions)
161
+ # print('-'*10)
162
+
163
+ # import time
164
+ # time.sleep(30)
165
+ question_type = kwargs['problem_type'][0]
166
+ questions = kwargs['problem']
167
+
168
+ contents = [completion[0]["content"] for completion in completions]
169
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
170
+ rewards = []
171
+
172
+ extracted_content_descriptions = [extract_description(ele) for ele in contents]
173
+ # extracted_content_answers = [extract_answer(ele) for ele in contents]
174
+ # model = kwargs.get("model") # may be None if called elsewhere
175
+ # tokenizer = kwargs.get("tokenizer")
176
+ # # (optional) example use: let the model score the generated answer
177
+ # if model is not None and tokenizer is not None:
178
+ # model.eval()
179
+ description_inputs = [questions[index_count] + ' [SEP] ' + extracted_content_descriptions[index_count] for index_count in range(len(extracted_content_descriptions))]
180
+ description_rewards = answerBERT.batch_predict(description_inputs, batch_size = 32)
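+ # Assumption: AnswerBertActor.batch_predict returns one score per "question [SEP] description"
+ # pair, used directly as description_reward below; the exact score range is model-dependent.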
181
+
182
+ for content, sol, description_reward in zip(contents, solution, description_rewards):
183
+ # for content, sol, question in zip(contents, solution, questions):
184
+ # for content, sol, second_content in zip(contents, solution, second_completions):
185
+ try:
186
+ output_ans = extract_answer(content)
187
+ gt_ans = extract_answer(sol)
188
+ # description_extraction = extract_answer(second_content)
189
+ # if question_type == "multiple choice":
190
+ # reward = 1.0 if output_ans.strip() == gt_ans.strip() else 0.0
191
+ # elif question_type == "numerical":
192
+ # gt_has_decimal = ("." in gt_ans) or ("," in gt_ans)
193
+ # out_has_decimal = ("." in output_ans) or ("," in output_ans)
194
+ # if gt_has_decimal != out_has_decimal:
195
+ # reward = 0.0
196
+ # else:
197
+ # gt_number = normalize_number(gt_ans)
198
+ # out_number = normalize_number(output_ans)
199
+ # if gt_number is None or out_number is None:
200
+ # reward = 0.0
201
+ # else:
202
+ # reward = 1.0 if round(gt_number, 2) == round(out_number, 2) else 0.0
203
+ if question_type == "OCR":
204
+ # description_extraction = extract_answer(second_content)
205
+ # description_error_rate = wer(gt_ans, description_extraction)
206
+ description_pendat_reward = pedant.get_score(gt_ans, description_extraction, question)
207
+ # error_rate = wer(gt_ans, output_ans)
208
+ answer_pedant_reward = pedant.get_score(gt_ans, output_ans, question)
209
+ # reward = (1 - error_rate) + (1- description_error_rate)
210
+ # reward = max(0.0, min(2.0, reward))
211
+ # print('Extracted description: ', description_extraction)
212
+ # print('Generated answer: ', output_ans)
213
+ # print('Sol: ', gt_ans)
214
+ # print(f'Description reward: {description_reward}; answer reward: {answer_reward}')
215
+ # print('-' * 10)
216
+ reward = description_pendat_reward + answer_pedant_reward
217
+ # elif question_type == "free-form":
218
+ # score = compute_rouge_score(gt_ans, output_ans)
219
+ # reward = max(0.0, min(1.0, score))
220
+ # elif question_type == "regression":
221
+ # gt_number = normalize_number(gt_ans)
222
+ # out_number = normalize_number(output_ans)
223
+ # if gt_number is None or out_number is None:
224
+ # reward = 0.0
225
+ # rel_diff = (abs(out_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
226
+ # rel_diff = min(1.0, max(0.0, rel_diff))
227
+ # reward = 1 - rel_diff
228
+ elif question_type == 'math' or question_type == 'unify' or question_type == "multiple choice" or question_type == "numerical" or question_type == "regression":
229
+ # print('Extracted description: ', description_extraction)
230
+ # print('Generated answer: ', output_ans)
231
+ # print('Sol: ', gt_ans)
232
+
233
+ # description_reward = compute_math_score_single(description_extraction, gt_ans)
234
+ answer_reward = compute_math_score_single(output_ans, gt_ans)
235
+ # print(f'Description reward: {description_reward}; answer reward: {answer_reward}')
236
+ # print('-' * 10)
237
+ reward = description_reward + answer_reward
238
+ else:
239
+ print('Falling back to none rewards')
240
+ reward = 0.0
241
+ except Exception as e:
242
+ print(f"Error in reward_fn for question_type '{question_type}': {e}")
243
+ reward = 0.0
244
+
245
+ rewards.append(reward)
246
+
247
+ if os.getenv("DEBUG_MODE") == "true":
248
+ log_path = os.getenv("LOG_PATH")
249
+ # local_rank = int(os.getenv("LOCAL_RANK", 0))
250
+ with open(log_path, "a", encoding="utf-8") as f:
251
+ f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
252
+ f.write(f"Content: {content}\n")
253
+ f.write(f"Solution: {sol}\n")
254
+
255
+ return rewards
256
+
257
+
258
+ def simple_format_reward(completions, **kwargs):
259
+ """Reward function that checks if the completion has a specific format."""
260
+ # pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
261
+ pattern = r"<des>.*?</des>\s*<think>.*?</think>\s*<answer>.*?</answer>"
262
+ completion_contents = [completion[0]["content"] for completion in completions]
263
+ matches = [re.fullmatch(pattern, content, re.DOTALL) for content in completion_contents]
264
+ return [0.1 if match else 0.0 for match in matches]
265
+
266
+
267
+ reward_funcs_registry = {
268
+ "accuracy": accuracy_reward,
269
+ "format": simple_format_reward,
270
+ }
271
+
272
+ # SYSTEM_PROMPT = (
273
+ # "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
274
+ # "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
275
+ # "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
276
+ # "<think> reasoning process here </think><answer> answer here </answer>"
277
+ # )
278
+
279
+ SYSTEM_PROMPT = (
280
+ "A conversation between User and Assistant. After the user asks a question about an image, write a rich, self-contained description of that image—detailed enough that someone could answer the question from the description alone, without ever seeing the image. Enclose the entire description in <des> </des> tags."
281
+ "Next, the assistant should think deeply about the reasoning process, engaging in an internal dialogue and self-reflection, "
282
+ "and provide this step-by-step reasoning within <think> </think> tags. "
283
+ "Finally, the assistant provides a single word, single letter choice, or phrase answer within <answer> </answer> tags."
284
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>."
285
+ )
286
+
287
+
288
+ def main(script_args, training_args, model_args):
289
+ # Get reward functions
290
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
291
+
292
+ if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
293
+ dataset = DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
294
+ else:
295
+ # Load the dataset
296
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
297
+
298
+
299
+ # Format into conversation
300
+ def make_conversation(example):
301
+ return {
302
+ "prompt": [
303
+ {"role": "system", "content": SYSTEM_PROMPT},
304
+ {"role": "user", "content": example["problem"]},
305
+ ],
306
+ }
307
+
308
+
309
+ # QUESTION_TEMPLATE = (
310
+ # "{Question}\n"
311
+ # "Please think about this question as if you were a human pondering deeply. "
312
+ # "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc, or other natural language thought expressions "
313
+ # "It's encouraged to include self-reflection or verification in the reasoning process. "
314
+ # "Provide your detailed reasoning between the <think> </think> tags, and then give your final answer between the <answer> </answer> tags."
315
+ # )
316
+
317
+ QUESTION_TEMPLATE = (
318
+ "{Question}\n"
319
+ "You are tasked with analyzing an image to generate an exhaustive and detailed description to answer a question. "
320
+ "Analyze the image and produce a thorough, self-contained description—detailed enough for someone to answer the question using the description alone. Wrap the entire description in <des> </des> tags.\n"
321
+ "Next, engage in an internal dialogue as if you were a human pondering deeply—use expressions such as 'let me think', 'wait', 'hmm', 'oh, I see', 'let's break it down', etc., and include self-reflection or verification in your reasoning process. "
322
+ "Provide your detailed, step-by-step reasoning based on the image description, and enclose this part within <think> </think> tags.\n"
323
+ "Finally, provide a single word or phrase answer to the question, enclosed within <answer> </answer> tags.\n"
324
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>"
325
+ )
326
+
327
+
328
+ TYPE_TEMPLATE = {
329
+ "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
330
+ "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
331
+ "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
332
+ "free-form": " Please provide your text answer within the <answer> </answer> tags.",
333
+ "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
334
+ "math": " Please provide the final exact answer (single option letter for multiple choice) within the <answer> </answer> tags.",
335
+ }
336
+
337
+ def make_conversation_image(example):
338
+
339
+ return {
340
+ "prompt": [
341
+ {
342
+ "role": "user",
343
+ "content": [
344
+ {"type": "image"},
345
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
346
+ ],
347
+ },
348
+ ],
349
+ }
350
+
351
+
352
+ def make_conversation_video(example):
353
+ return {
354
+ "prompt": [
355
+ {
356
+ "role": "user",
357
+ "content": [
358
+ {"type": "video"},
359
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
360
+ ],
361
+ },
362
+ ],
363
+ }
364
+
365
+ def make_conversation_image_and_video(example):
366
+ if example["problem_type"] == 'multiple choice':
367
+ question = example['problem'] + "Options:\n"
368
+ for op in example["options"]:
369
+ question += op + "\n"
370
+ else:
371
+ question = example['problem']
372
+
373
+
374
+ msg ={
375
+ "prompt":
376
+ [{
377
+ "role": "user",
378
+ "content": [
379
+ {
380
+ "type": example['data_type'],
381
+ # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
382
+ },
383
+ {
384
+ "type": "text",
385
+ "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[example['problem_type']]
386
+ }
387
+ ]
388
+ }]
389
+ }
390
+
391
+ return msg
392
+
393
+
394
+ dataset = dataset.map(make_conversation_image_and_video)
395
+
396
+
397
+ trainer_cls = Qwen2VLGRPOTrainer if not training_args.use_vllm else Qwen2VLGRPOVLLMTrainerModifiedOrig
398
+ print("using: ", trainer_cls)
399
+
400
+ # Initialize the GRPO trainer
401
+ trainer = trainer_cls(
402
+ model=model_args.model_name_or_path,
403
+ reward_funcs=reward_funcs,
404
+ args=training_args,
405
+ script_args=script_args,
406
+ train_dataset=dataset[script_args.dataset_train_split],
407
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
408
+ peft_config=get_peft_config(model_args),
409
+ attn_implementation=model_args.attn_implementation,
410
+ max_pixels=script_args.max_pixels,
411
+ min_pixels=script_args.min_pixels,
412
+ )
413
+
414
+ if training_args.resume_from_checkpoint is not None:
415
+ checkpoint = training_args.resume_from_checkpoint
416
+ trainer.train(resume_from_checkpoint=checkpoint)
417
+ else:
418
+ trainer.train()
419
+
420
+ # Save and push to hub
421
+ trainer.save_model(training_args.output_dir)
422
+ if training_args.push_to_hub:
423
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
424
+
425
+
426
+ if __name__ == "__main__":
427
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
428
+ script_args, training_args, model_args = parser.parse_args_and_config()
429
+ main(script_args, training_args, model_args)
src/r1-v/src/open_r1/grpo-cot-noDesEval.py ADDED
@@ -0,0 +1,446 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+ from datetime import datetime
18
+ from dataclasses import dataclass, field
19
+
20
+ from datasets import load_dataset, load_from_disk
21
+ from transformers import Qwen2VLForConditionalGeneration
22
+
23
+ from trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerModifiedOrig
24
+ from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
25
+
26
+ from datasets import Dataset, DatasetDict
27
+
28
+ from typing import Dict, List, Optional
29
+ from mathruler.grader import extract_boxed_content, grade_answer
30
+
31
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
32
+ from rouge_score import rouge_scorer
33
+ # from utils.math_cot import *
34
+ # from qa_metrics.pedant import PEDANT
35
+ # from qa_metrics.answerBERT import AnswerBertActor
36
+
37
+ # pedant = PEDANT()
38
+ # answerBERT = AnswerBertActor(device='cuda:7')
39
+
40
+ @dataclass
41
+ class GRPOScriptArguments(ScriptArguments):
42
+ """
43
+ Script arguments for the GRPO training script.
44
+
45
+ Args:
46
+ reward_funcs (`list[str]`):
47
+ List of reward functions. Possible values: 'accuracy', 'format'.
48
+ """
49
+
50
+ reward_funcs: list[str] = field(
51
+ default_factory=lambda: ["accuracy", "format"],
52
+ metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
53
+ )
54
+
55
+ # reward_funcs: list[str] = field(
56
+ # default_factory=lambda: ["accuracy"],
57
+ # metadata={"help": "List of reward functions. Possible values: 'accuracy'"},
58
+ # )
59
+ max_pixels: Optional[int] = field(
60
+ default=12845056,
61
+ metadata={"help": "Maximum number of pixels for the image"},
62
+ )
63
+ min_pixels: Optional[int] = field(
64
+ default=3136,
65
+ metadata={"help": "Minimum number of pixels for the image"},
66
+ )
67
+ temporal: Optional[bool] = field(
68
+ default=True,
69
+ metadata={"help": "whether using temporal GRPO"},
70
+ )
71
+ len_control: Optional[bool] = field(
72
+ default=True,
73
+ metadata={"help": "whether using length reward"},
74
+ )
75
+
76
+
77
+ def accuracy_reward(completions, solution, **kwargs):
78
+ def extract_answer(text: str) -> str:
79
+ """
80
+ 1) Try the full <answer> … </answer> block.
81
+ 2) If that is missing, grab whatever follows the opening <answer> tag.
82
+ 3) Otherwise return the original text.
83
+ """
84
+ # ① normal case <answer> … </answer>
85
+ m = re.search(r'<answer>\s*(.*?)\s*</answer>', text, flags=re.DOTALL | re.IGNORECASE)
86
+ if m:
87
+ return m.group(1).strip()
88
+
89
+ # ② fallback <answer> … <end-of-string>
90
+ m = re.search(r'<answer>\s*(.*)$', text, flags=re.DOTALL | re.IGNORECASE)
91
+ if m:
92
+ return m.group(1).strip()
93
+
94
+ # ③ nothing found
95
+ return text.strip()
96
+
97
+ def extract_description(predict: str) -> Optional[str]:
98
+ """
99
+ Extracts the content of the <answer>…</answer> block from `predict`.
100
+ Returns the inner text (with leading/trailing whitespace stripped),
101
+ or None if no <answer> tag is found.
102
+ """
103
+ match = re.search(r"<des>([\s\S]*?)</des>", predict, re.DOTALL)
104
+ if not match:
105
+ return predict
106
+ return match.group(1).strip()
107
+
108
+ def single_accuracy_reward(predict: str, ground_truth: str) -> float:
109
+ answer = predict
110
+ return 1.0 if grade_answer(answer, ground_truth) else 0.0
111
+
112
+ def compute_math_score_single(predict: str, ground_truth: str, format_weight: float = 0.0) -> Dict[str, float]:
113
+ predict = re.sub(r"\s*(<|>|/)\s*", r"\1", predict)
114
+ # format_score = format_reward(predict)
115
+ accuracy_score = single_accuracy_reward(predict, ground_truth)
116
+
117
+ # return (1 - format_weight) * accuracy_score + format_weight * format_score
118
+ return accuracy_score
119
+
120
+ def normalize_number(num_str):
121
+ try:
122
+ num_str = num_str.replace(',', '')
123
+ return float(num_str)
124
+ except Exception as e:
125
+ print(f"Error converting '{num_str}' to float: {e}")
126
+ return None
127
+
128
+ def wer(reference, hypothesis):
129
+ ref_words = reference.split()
130
+ hyp_words = hypothesis.split()
131
+ m = len(ref_words)
132
+ n = len(hyp_words)
133
+ d = [[0]*(n+1) for _ in range(m+1)]
134
+ for i in range(m+1):
135
+ d[i][0] = i
136
+ for j in range(n+1):
137
+ d[0][j] = j
138
+ for i in range(1, m+1):
139
+ for j in range(1, n+1):
140
+ if ref_words[i-1] == hyp_words[j-1]:
141
+ d[i][j] = d[i-1][j-1]
142
+ else:
143
+ d[i][j] = 1 + min(d[i-1][j], d[i][j-1], d[i-1][j-1])
144
+ return d[m][n] / max(1, m)
145
+
146
+
147
+ def compute_rouge_score(reference, hypothesis, use_stemmer=True):
148
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stemmer)
149
+ scores = scorer.score(reference, hypothesis)
150
+ average_fmeasure = (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3
151
+ return average_fmeasure
152
+
153
+ # print('Computing rewards now...')
154
+ # second_prompts = kwargs.get("second_prompts") # ← list[str] or None
155
+ # second_completions = kwargs.get("second_completions")
156
+ # second_contents = [comp[0]["content"] for comp in second_completions]
157
+ # print('second prompts', second_prompts)
158
+ # print('-'*10)
159
+ # print('second completions', second_completions)
160
+ # print('-'*10)
161
+
162
+ # import time
163
+ # time.sleep(30)
164
+ question_type = kwargs['problem_type'][0]
165
+ questions = kwargs['problem']
166
+
167
+ contents = [completion[0]["content"] for completion in completions]
168
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
169
+ rewards = []
170
+
171
+ # extracted_content_descriptions = [extract_description(ele) for ele in contents]
172
+ # extracted_content_answers = [extract_answer(ele) for ele in contents]
173
+ # model = kwargs.get("model") # may be None if called elsewhere
174
+ # tokenizer = kwargs.get("tokenizer")
175
+ # # (optional) example use: let the model score the generated answer
176
+ # if model is not None and tokenizer is not None:
177
+ # model.eval()
178
+ # description_inputs = [questions[index_count] + ' [SEP] ' + extracted_content_descriptions[index_count] for index_count in range(len(extracted_content_descriptions))]
179
+ # description_rewards = answerBERT.batch_predict(description_inputs, batch_size = 64)
180
+
181
+ for idx, (content, sol) in enumerate(zip(contents, solution)):
182
+ # for content, sol, question in zip(contents, solution, questions):
183
+ # for content, sol, second_content in zip(contents, solution, second_completions):
184
+ try:
185
+ output_ans = extract_answer(content)
186
+ gt_ans = extract_answer(sol)
187
+ # description_extraction = extract_answer(second_content)
188
+ # if question_type == "multiple choice":
189
+ # reward = 1.0 if output_ans.strip() == gt_ans.strip() else 0.0
190
+ # elif question_type == "numerical":
191
+ # gt_has_decimal = ("." in gt_ans) or ("," in gt_ans)
192
+ # out_has_decimal = ("." in output_ans) or ("," in output_ans)
193
+ # if gt_has_decimal != out_has_decimal:
194
+ # reward = 0.0
195
+ # else:
196
+ # gt_number = normalize_number(gt_ans)
197
+ # out_number = normalize_number(output_ans)
198
+ # if gt_number is None or out_number is None:
199
+ # reward = 0.0
200
+ # else:
201
+ # reward = 1.0 if round(gt_number, 2) == round(out_number, 2) else 0.0
202
+ if question_type == "OCR":
203
+ # description_extraction = extract_answer(second_content)
204
+ # description_error_rate = wer(gt_ans, description_extraction)
205
+ description_pedant_reward = pedant.get_score(gt_ans, extract_description(content), questions[idx])
206
+ # error_rate = wer(gt_ans, output_ans)
207
+ answer_pedant_reward = pedant.get_score(gt_ans, output_ans, questions[idx])
208
+ # reward = (1 - error_rate) + (1- description_error_rate)
209
+ # reward = max(0.0, min(2.0, reward))
210
+ # print('Extracted description: ', description_extraction)
211
+ # print('Generated answer: ', output_ans)
212
+ # print('Sol: ', gt_ans)
213
+ # print(f'Description reward: {description_reward}; answer reward: {answer_reward}')
214
+ # print('-' * 10)
215
+ reward = description_pedant_reward + answer_pedant_reward
216
+ # elif question_type == "free-form":
217
+ # score = compute_rouge_score(gt_ans, output_ans)
218
+ # reward = max(0.0, min(1.0, score))
219
+ elif question_type == "regression":
220
+ gt_number = normalize_number(gt_ans)
221
+ out_number = normalize_number(output_ans)
222
+ if gt_number is None or out_number is None:
+ reward = 0.0
+ else:
+ rel_diff = (abs(out_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
+ rel_diff = min(1.0, max(0.0, rel_diff))
+ reward = 1 - rel_diff
227
+ elif question_type == 'math' or question_type == 'unify' or question_type == "multiple choice" or question_type == "numerical":
228
+ # print('Extracted description: ', description_extraction)
229
+ # print('Generated answer: ', output_ans)
230
+ # print('Sol: ', gt_ans)
231
+
232
+ # description_reward = compute_math_score_single(description_extraction, gt_ans)
233
+ answer_reward = compute_math_score_single(output_ans, gt_ans)
234
+ # print(f'Description reward: {description_reward}; answer reward: {answer_reward}')
235
+ # print('-' * 10)
236
+ # reward = description_reward + answer_reward
237
+ reward = answer_reward
238
+ else:
239
+ print(f"Unhandled question type '{question_type}'; falling back to zero reward")
240
+ reward = 0.0
241
+ except Exception as e:
242
+ print(f"Error in reward_fn for question_type '{question_type}': {e}")
243
+ reward = 0.0
244
+
245
+ rewards.append(reward)
246
+
247
+ if os.getenv("DEBUG_MODE") == "true":
248
+ log_path = os.getenv("LOG_PATH")
249
+ # local_rank = int(os.getenv("LOCAL_RANK", 0))
250
+ with open(log_path, "a", encoding="utf-8") as f:
251
+ f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
252
+ f.write(f"Content: {content}\n")
253
+ f.write(f"Solution: {sol}\n")
254
+
255
+ return rewards
256
+
257
+
258
+ def simple_format_reward(completions, **kwargs):
259
+ """Reward function that checks if the completion has a specific format."""
260
+ # pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
261
+ pattern = r"<des>.*?</des>\s*<think>.*?</think>\s*<answer>.*?</answer>"
262
+ completion_contents = [completion[0]["content"] for completion in completions]
263
+ matches = [re.fullmatch(pattern, content, re.DOTALL) for content in completion_contents]
264
+ return [0.1 if match else 0.0 for match in matches]
265
+
266
+
267
+ reward_funcs_registry = {
268
+ "accuracy": accuracy_reward,
269
+ "format": simple_format_reward,
270
+ }
271
+
272
+ # SYSTEM_PROMPT = (
273
+ # "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
274
+ # "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
275
+ # "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
276
+ # "<think> reasoning process here </think><answer> answer here </answer>"
277
+ # )
278
+
279
+ SYSTEM_PROMPT = (
280
+ "A conversation between User and Assistant. After the user asks a question about an image, write a rich, self-contained description of that image—detailed enough that someone could answer the question from the description alone, without ever seeing the image. Enclose the entire description in <des> </des> tags."
281
+ "Next, the assistant should think deeply about the reasoning process, engaging in an internal dialogue and self-reflection, "
282
+ "and provide this step-by-step reasoning within <think> </think> tags. "
283
+ "Finally, the assistant provides a single word, single letter choice, or phrase answer within <answer> </answer> tags."
284
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>. Please only return the final single letter choice within the <answer> </answer> tags for multiple choice questions; Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags for numerical questions."
285
+ )
286
+
287
+
288
+ def main(script_args, training_args, model_args):
289
+ # Get reward functions
290
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
291
+
292
+ if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
293
+ dataset = DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
294
+ else:
295
+ # Load the dataset
296
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
297
+
298
+
299
+ # Format into conversation
300
+ def make_conversation(example):
301
+ return {
302
+ "prompt": [
303
+ {"role": "system", "content": SYSTEM_PROMPT},
304
+ {"role": "user", "content": example["problem"]},
305
+ ],
306
+ }
307
+
308
+
309
+ # QUESTION_TEMPLATE = (
310
+ # "{Question}\n"
311
+ # "Please think about this question as if you were a human pondering deeply. "
312
+ # "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc, or other natural language thought expressions "
313
+ # "It's encouraged to include self-reflection or verification in the reasoning process. "
314
+ # "Provide your detailed reasoning between the <think> </think> tags, and then give your final answer between the <answer> </answer> tags."
315
+ # )
316
+
317
+ QUESTION_TEMPLATE = (
318
+ "{Question}\n"
319
+ "You are tasked with analyzing an image to generate an exhaustive and detailed description to answer a question. "
320
+ "Analyze the image and produce a thorough, self-contained description—detailed enough for someone to answer the question using the description alone. Wrap the entire description in <des> </des> tags.\n"
321
+ "Next, engage in an internal dialogue as if you were a human pondering deeply—use expressions such as 'let me think', 'wait', 'hmm', 'oh, I see', 'let's break it down', etc., and include self-reflection or verification in your reasoning process. "
322
+ "Provide your detailed, step-by-step reasoning based on the image description, and enclose this part within <think> </think> tags.\n"
323
+ "Finally, provide a single word or phrase answer to the question, enclosed within <answer> </answer> tags.\n"
324
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>. Please only return the final single letter choice within the <answer> </answer> tags for multiple choice questions; Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags for numerical questions."
325
+ )
326
+
327
+
328
+ TYPE_TEMPLATE = {
329
+ "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
330
+ "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
331
+ "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
332
+ "free-form": " Please provide your text answer within the <answer> </answer> tags.",
333
+ "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
334
+ "math": " Please provide the final exact answer (single option letter for multiple choice) within the <answer> </answer> tags.",
335
+ }
336
+
337
+ def make_conversation_image(example):
338
+
339
+ return {
340
+ "prompt": [
341
+ {
342
+ "role": "user",
343
+ "content": [
344
+ {"type": "image"},
345
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
346
+ ],
347
+ },
348
+ ],
349
+ }
350
+
351
+
352
+ def make_conversation_video(example):
353
+ return {
354
+ "prompt": [
355
+ {
356
+ "role": "user",
357
+ "content": [
358
+ {"type": "video"},
359
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
360
+ ],
361
+ },
362
+ ],
363
+ }
364
+
365
+ def make_conversation_image_and_video(example):
366
+ if example["problem_type"] == 'multiple choice':
367
+ question = example['problem'] + "Options:\n"
368
+ for op in example["options"]:
369
+ question += op + "\n"
370
+ else:
371
+ question = example['problem']
372
+
373
+
374
+ # msg ={
375
+ # "prompt":
376
+ # [{
377
+ # "role": "user",
378
+ # "content": [
379
+ # {
380
+ # "type": example['data_type'],
381
+ # # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
382
+ # },
383
+ # {
384
+ # "type": "text",
385
+ # "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[example['problem_type']]
386
+ # }
387
+ # ]
388
+ # }]
389
+ # }
390
+
391
+ msg ={
392
+ "prompt":
393
+ [{
394
+ "role": "user",
395
+ "content": [
396
+ {
397
+ "type": example['data_type'],
398
+ # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
399
+ },
400
+ {
401
+ "type": "text",
402
+ "text": QUESTION_TEMPLATE.format(Question=question)
403
+ }
404
+ ]
405
+ }]
406
+ }
407
+
408
+ return msg
409
+
410
+
411
+ dataset = dataset.map(make_conversation_image_and_video)
412
+
413
+
414
+ trainer_cls = Qwen2VLGRPOTrainer if not training_args.use_vllm else Qwen2VLGRPOVLLMTrainerModifiedOrig
415
+ print("using: ", trainer_cls)
416
+
417
+ # Initialize the GRPO trainer
418
+ trainer = trainer_cls(
419
+ model=model_args.model_name_or_path,
420
+ reward_funcs=reward_funcs,
421
+ args=training_args,
422
+ script_args=script_args,
423
+ train_dataset=dataset[script_args.dataset_train_split],
424
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
425
+ peft_config=get_peft_config(model_args),
426
+ attn_implementation=model_args.attn_implementation,
427
+ max_pixels=script_args.max_pixels,
428
+ min_pixels=script_args.min_pixels,
429
+ )
430
+
431
+ if training_args.resume_from_checkpoint is not None:
432
+ checkpoint = training_args.resume_from_checkpoint
433
+ trainer.train(resume_from_checkpoint=checkpoint)
434
+ else:
435
+ trainer.train()
436
+
437
+ # Save and push to hub
438
+ trainer.save_model(training_args.output_dir)
439
+ if training_args.push_to_hub:
440
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
441
+
442
+
443
+ if __name__ == "__main__":
444
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
445
+ script_args, training_args, model_args = parser.parse_args_and_config()
446
+ main(script_args, training_args, model_args)
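For orientation, the reward functions in this family of scripts all share the same calling convention: `completions` is a list of single-message lists and `solution` wraps the ground truth in `<answer>` tags. Below is a minimal, self-contained sketch of that interface using toy data and simplified exact-match scoring only, not the PEDANT/ROUGE/mathruler scoring used above:

```python
import re

def extract_answer(text: str) -> str:
    # Same fallback behaviour as the helpers above: return the raw text if no tag is found.
    m = re.search(r"<answer>\s*(.*?)\s*</answer>", text, flags=re.DOTALL | re.IGNORECASE)
    return m.group(1).strip() if m else text.strip()

def toy_accuracy_reward(completions, solution):
    contents = [completion[0]["content"] for completion in completions]
    return [1.0 if extract_answer(c) == extract_answer(s) else 0.0
            for c, s in zip(contents, solution)]

completions = [[{"content": "<des>A bar chart with four bars.</des> <think>...</think> <answer>B</answer>"}]]
solution = ["<answer>B</answer>"]
print(toy_accuracy_reward(completions, solution))  # [1.0]
```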
src/r1-v/src/open_r1/grpo-cot-noInfo.py ADDED
@@ -0,0 +1,346 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+ from datetime import datetime
18
+ from dataclasses import dataclass, field
19
+ from typing import Optional
20
+
21
+ from datasets import load_dataset, load_from_disk
22
+ from transformers import Qwen2VLForConditionalGeneration
23
+
24
+ from trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerModified
25
+ from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
26
+
27
+ from datasets import Dataset, DatasetDict
28
+
29
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
30
+ from rouge_score import rouge_scorer
31
+ from utils.math_cot_noInfo import *
32
+
33
+
34
+ @dataclass
35
+ class GRPOScriptArguments(ScriptArguments):
36
+ """
37
+ Script arguments for the GRPO training script.
38
+
39
+ Args:
40
+ reward_funcs (`list[str]`):
41
+ List of reward functions. Possible values: 'accuracy', 'format'.
42
+ """
43
+
44
+ reward_funcs: list[str] = field(
45
+ default_factory=lambda: ["accuracy"],
46
+ metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
47
+ )
48
+ max_pixels: Optional[int] = field(
49
+ default=12845056,
50
+ metadata={"help": "Maximum number of pixels for the image"},
51
+ )
52
+ min_pixels: Optional[int] = field(
53
+ default=3136,
54
+ metadata={"help": "Minimum number of pixels for the image"},
55
+ )
56
+ temporal: Optional[bool] = field(
57
+ default=True,
58
+ metadata={"help": "whether using temporal GRPO"},
59
+ )
60
+ len_control: Optional[bool] = field(
61
+ default=True,
62
+ metadata={"help": "whether using length reward"},
63
+ )
64
+
65
+
66
+
67
+ def accuracy_reward(completions, solution, **kwargs):
68
+
69
+ def extract_answer(text):
70
+ pattern = r'<answer>\s*(.*?)\s*</answer>'
71
+ match = re.search(pattern, text, re.DOTALL)
72
+ if match:
73
+ return match.group(1).strip()
74
+ return ""
75
+
76
+ def normalize_number(num_str):
77
+ try:
78
+ num_str = num_str.replace(',', '')
79
+ return float(num_str)
80
+ except Exception as e:
81
+ print(f"Error converting '{num_str}' to float: {e}")
82
+ return None
83
+
84
+ def wer(reference, hypothesis):
85
+ ref_words = reference.split()
86
+ hyp_words = hypothesis.split()
87
+ m = len(ref_words)
88
+ n = len(hyp_words)
89
+ d = [[0]*(n+1) for _ in range(m+1)]
90
+ for i in range(m+1):
91
+ d[i][0] = i
92
+ for j in range(n+1):
93
+ d[0][j] = j
94
+ for i in range(1, m+1):
95
+ for j in range(1, n+1):
96
+ if ref_words[i-1] == hyp_words[j-1]:
97
+ d[i][j] = d[i-1][j-1]
98
+ else:
99
+ d[i][j] = 1 + min(d[i-1][j], d[i][j-1], d[i-1][j-1])
100
+ return d[m][n] / max(1, m)
101
+
102
+
103
+ def compute_rouge_score(reference, hypothesis, use_stemmer=True):
104
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stemmer)
105
+ scores = scorer.score(reference, hypothesis)
106
+ average_fmeasure = (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3
107
+ return average_fmeasure
108
+
109
+
110
+ question_type = kwargs['problem_type'][0]
111
+
112
+ contents = [completion[0]["content"] for completion in completions]
113
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
114
+ rewards = []
115
+
116
+ for content, sol in zip(contents, solution):
117
+
118
+ try:
119
+ output_ans = extract_answer(content)
120
+ gt_ans = extract_answer(sol)
121
+ if question_type == "multiple choice":
122
+ reward = 1.0 if output_ans.strip() == gt_ans.strip() else 0.0
123
+ elif question_type == "numerical":
124
+ gt_has_decimal = ("." in gt_ans) or ("," in gt_ans)
125
+ out_has_decimal = ("." in output_ans) or ("," in output_ans)
126
+ if gt_has_decimal != out_has_decimal:
127
+ reward = 0.0
128
+ else:
129
+ gt_number = normalize_number(gt_ans)
130
+ out_number = normalize_number(output_ans)
131
+ if gt_number is None or out_number is None:
132
+ reward = 0.0
133
+ else:
134
+ reward = 1.0 if round(gt_number, 2) == round(out_number, 2) else 0.0
135
+ elif question_type == "OCR":
136
+ error_rate = wer(gt_ans, output_ans)
137
+ reward = 1 - error_rate
138
+ reward = max(0.0, min(1.0, reward))
139
+ elif question_type == "free-form":
140
+ score = compute_rouge_score(gt_ans, output_ans)
141
+ reward = max(0.0, min(1.0, score))
142
+ elif question_type == "regression":
143
+ gt_number = normalize_number(gt_ans)
144
+ out_number = normalize_number(output_ans)
145
+ if gt_number is None or out_number is None:
146
+ reward = 0.0
147
+ rel_diff = (abs(out_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
148
+ rel_diff = min(1.0, max(0.0, rel_diff))
149
+ reward = 1 - rel_diff
150
+ elif question_type == 'math':
151
+ reward = compute_math_score_single(content, gt_ans)
152
+ else:
153
+ print(f"Unhandled question type '{question_type}'; falling back to zero reward")
154
+ reward = 0.0
155
+ except Exception as e:
156
+ print(f"Error in reward_fn for question_type '{question_type}': {e}")
157
+ reward = 0.0
158
+
159
+ rewards.append(reward)
160
+
161
+ if os.getenv("DEBUG_MODE") == "true":
162
+ log_path = os.getenv("LOG_PATH")
163
+ # local_rank = int(os.getenv("LOCAL_RANK", 0))
164
+ with open(log_path, "a", encoding="utf-8") as f:
165
+ f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
166
+ f.write(f"Content: {content}\n")
167
+ f.write(f"Solution: {sol}\n")
168
+
169
+ return rewards
170
+
171
+
172
+ def format_reward(completions, **kwargs):
173
+ """Reward function that checks if the completion has a specific format."""
174
+ pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
175
+ completion_contents = [completion[0]["content"] for completion in completions]
176
+ matches = [re.fullmatch(pattern, content, re.DOTALL) for content in completion_contents]
177
+ return [1.0 if match else 0.0 for match in matches]
178
+
179
+
180
+ reward_funcs_registry = {
181
+ "accuracy": accuracy_reward,
182
+ # "format": format_reward,
183
+ }
184
+
185
+ SYSTEM_PROMPT = (
186
+ "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
187
+ "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
188
+ "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
189
+ "<think> reasoning process here </think><answer> answer here </answer>"
190
+ )
191
+
192
+ # SYSTEM_PROMPT = (
193
+ # "A conversation between User and Assistant. The user provides a question about an image, "
194
+ # "and the Assistant is tasked with generating an exhaustive and detailed description of the image. "
195
+ # "The assistant should extract and describe all possible information from the image—including objects, numbers, text, and their relationships—"
196
+ # "and enclose this description within <info> </info> tags. "
197
+ # "Next, the assistant should think deeply about the reasoning process, engaging in an internal dialogue and self-reflection, "
198
+ # "and provide this step-by-step reasoning within <think> </think> tags. "
199
+ # "Finally, the assistant provides a single word or phrase answer within <answer> </answer> tags. "
200
+ # "The output format should be: <info> image description here </info> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>."
201
+ # )
202
+
203
+
204
+ def main(script_args, training_args, model_args):
205
+ # Get reward functions
206
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
207
+
208
+ if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
209
+ dataset = DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
210
+ else:
211
+ # Load the dataset
212
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
213
+
214
+
215
+ # Format into conversation
216
+ def make_conversation(example):
217
+ return {
218
+ "prompt": [
219
+ {"role": "system", "content": SYSTEM_PROMPT},
220
+ {"role": "user", "content": example["problem"]},
221
+ ],
222
+ }
223
+
224
+
225
+ QUESTION_TEMPLATE = (
226
+ "{Question}\n"
227
+ "Please think about this question as if you were a human pondering deeply. "
228
+ "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc, or other natural language thought expressions "
229
+ "It's encouraged to include self-reflection or verification in the reasoning process. "
230
+ "Provide your detailed reasoning between the <think> </think> tags, and then give your final answer between the <answer> </answer> tags."
231
+ )
232
+
233
+ # QUESTION_TEMPLATE = (
234
+ # "{Question}\n"
235
+ # "You are tasked with analyzing an image to generate an exhaustive and detailed description. "
236
+ # "Your goal is to extract and describe all possible information from the image, including but not limited to objects, numbers, text, and the relationships between these elements. "
237
+ # "The description should be as fine and detailed as possible, capturing every nuance, and should be enclosed within <info> </info> tags.\n"
238
+ # "Next, engage in an internal dialogue as if you were a human pondering deeply—use expressions such as 'let me think', 'wait', 'hmm', 'oh, I see', 'let's break it down', etc., and include self-reflection or verification in your reasoning process. "
239
+ # "Provide your detailed, step-by-step reasoning based on the image description, and enclose this part within <think> </think> tags.\n"
240
+ # "Finally, provide a single word or phrase answer to the question, enclosed within <answer> </answer> tags.\n"
241
+ # "The output format should be: <info> image description here </info> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>"
242
+ # )
243
+
244
+
245
+ TYPE_TEMPLATE = {
246
+ "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
247
+ "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
248
+ "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
249
+ "free-form": " Please provide your text answer within the <answer> </answer> tags.",
250
+ "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
251
+ "math": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
252
+ }
253
+
254
+ def make_conversation_image(example):
255
+
256
+ return {
257
+ "prompt": [
258
+ {
259
+ "role": "user",
260
+ "content": [
261
+ {"type": "image"},
262
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
263
+ ],
264
+ },
265
+ ],
266
+ }
267
+
268
+
269
+ def make_conversation_video(example):
270
+ return {
271
+ "prompt": [
272
+ {
273
+ "role": "user",
274
+ "content": [
275
+ {"type": "video"},
276
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
277
+ ],
278
+ },
279
+ ],
280
+ }
281
+
282
+ def make_conversation_image_and_video(example):
283
+ if example["problem_type"] == 'multiple choice':
284
+ question = example['problem'] + "Options:\n"
285
+ for op in example["options"]:
286
+ question += op + "\n"
287
+ else:
288
+ question = example['problem']
289
+
290
+
291
+ msg ={
292
+ "prompt":
293
+ [{
294
+ "role": "user",
295
+ "content": [
296
+ {
297
+ "type": example['data_type'],
298
+ # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
299
+ },
300
+ {
301
+ "type": "text",
302
+ "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[example['problem_type']]
303
+ }
304
+ ]
305
+ }]
306
+ }
307
+
308
+ return msg
309
+
310
+
311
+ dataset = dataset.map(make_conversation_image_and_video)
312
+
313
+
314
+ trainer_cls = Qwen2VLGRPOTrainer if not training_args.use_vllm else Qwen2VLGRPOVLLMTrainerModified
315
+ print("using: ", trainer_cls)
316
+
317
+ # Initialize the GRPO trainer
318
+ trainer = trainer_cls(
319
+ model=model_args.model_name_or_path,
320
+ reward_funcs=reward_funcs,
321
+ args=training_args,
322
+ script_args=script_args,
323
+ train_dataset=dataset[script_args.dataset_train_split],
324
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
325
+ peft_config=get_peft_config(model_args),
326
+ attn_implementation=model_args.attn_implementation,
327
+ max_pixels=script_args.max_pixels,
328
+ min_pixels=script_args.min_pixels,
329
+ )
330
+
331
+ if training_args.resume_from_checkpoint is not None:
332
+ checkpoint = training_args.resume_from_checkpoint
333
+ trainer.train(resume_from_checkpoint=checkpoint)
334
+ else:
335
+ trainer.train()
336
+
337
+ # Save and push to hub
338
+ trainer.save_model(training_args.output_dir)
339
+ if training_args.push_to_hub:
340
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
341
+
342
+
343
+ if __name__ == "__main__":
344
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
345
+ script_args, training_args, model_args = parser.parse_args_and_config()
346
+ main(script_args, training_args, model_args)
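The regression reward used in these scripts is one minus the relative error, clipped to [0, 1]. A small, self-contained numeric check of that formula (the function name is illustrative, not from the repository):

```python
def regression_reward(pred: float, gt: float) -> float:
    # 1 - |pred - gt| / |gt|, with small epsilons to handle zero ground truths, clipped to [0, 1].
    rel_diff = (abs(pred - gt) + 1e-9) / (abs(gt) + 1e-9)
    return 1.0 - min(1.0, max(0.0, rel_diff))

print(regression_reward(3.8, 4.0))  # ~0.95 (5% relative error)
print(regression_reward(8.0, 4.0))  # 0.0 (relative error clipped at 100%)
```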
src/r1-v/src/open_r1/grpo-cot-qwenEval.py ADDED
@@ -0,0 +1,523 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+ import ray
18
+ from datetime import datetime
19
+ from dataclasses import dataclass, field
20
+
21
+ from datasets import load_dataset, load_from_disk
22
+ from transformers import Qwen2VLForConditionalGeneration
23
+
24
+ from trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerModifiedOrig
25
+ from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
26
+
27
+ from datasets import Dataset, DatasetDict
28
+
29
+ from typing import Dict, List, Optional
30
+ from mathruler.grader import extract_boxed_content, grade_answer
31
+
32
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
33
+ from rouge_score import rouge_scorer
34
+ import torch
35
+ # from utils.gpt_eval import infer
36
+ # from utils.math_cot import *
37
+ from qa_metrics.pedant import PEDANT
38
+ from concurrent.futures import ProcessPoolExecutor
39
+ import os, subprocess, sys
40
+ # from qa_metrics.answerBERT import AnswerBertActor
41
+ # from utils.self_eval import *
42
+ from vllm import LLM, SamplingParams
43
+
44
+ pedant = PEDANT()
45
+ # answerBERT = AnswerBertActor(device='cuda:7')
46
+
47
+ # curr_actor = VllmActor.options(num_gpus=1).remote("Qwen/Qwen2.5-3B-Instruct")
48
+
49
+ from typing import List
50
+ import os
51
+ import ray, os, subprocess, torch.distributed as dist
52
+
53
+ MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
54
+ MAX_LEN = 32_768
55
+ RAY_NS = "grpo_qwen_vllm"
56
+ RAY_TMP = "/tmp/ray"
57
+
58
+ # ------------------------------------------------------------
59
+ # 1. Define the Ray actor class *before* we ever create it
60
+ # (Ray just needs to see the decorator; it doesn’t need an
61
+ # active cluster at definition time)
62
+ # ------------------------------------------------------------
63
+ @ray.remote(num_gpus=1, resources={"gpu_7": 1})
64
+ class VllmActor:
65
+ def __init__(self, model_id):
66
+ self.engine = LLM(
67
+ model_id,
68
+ tensor_parallel_size=1,
69
+ gpu_memory_utilization=0.80,
70
+ max_model_len=MAX_LEN,
71
+ trust_remote_code=True,
72
+ dtype="bfloat16",
73
+ )
74
+ self.default = SamplingParams(top_p=0.9, temperature=0.7, max_tokens=128)
75
+
76
+ def generate_batch(self, prompts, sampling=None):
77
+ outs = self.engine.generate(prompts, sampling_params=sampling or self.default)
78
+ return [o.outputs[0].text for o in outs]
79
+
80
+ # ------------------------------------------------------------
81
+ # 2. Torch-DDP initialisation
82
+ # ------------------------------------------------------------
83
+ dist.init_process_group("nccl")
84
+ rank = dist.get_rank()
85
+
86
+ # ------------------------------------------------------------
87
+ # 3. Rank-0 starts the Ray head, others wait
88
+ # ------------------------------------------------------------
89
+ if rank == 1:
90
+ ray.init(
91
+ _temp_dir=RAY_TMP,
92
+ object_store_memory=1 * 1024**3,
93
+ namespace=RAY_NS,
94
+ include_dashboard=False,
95
+ resources={"gpu_7": 1}
96
+ )
97
+ # optional: confirm the head is up
98
+ # from ray._private.internal_api import wait_for_gcs
99
+ # wait_for_gcs()
100
+ dist.barrier() # ---- head definitely running here ----
101
+
102
+ # ------------------------------------------------------------
103
+ # 4. Non-zero ranks attach to the head
104
+ # ------------------------------------------------------------
105
+ if rank != 0:
106
+ ray.init(address="auto", _temp_dir=RAY_TMP, namespace=RAY_NS)
107
+
108
+ dist.barrier() # ---- every rank now in the cluster ----
109
+
110
+ # ------------------------------------------------------------
111
+ # 5. Create / look-up the VllmActor
112
+ # ------------------------------------------------------------
113
+ if rank == 1:
114
+ vllm_actor = (
115
+ VllmActor.options(name="vllm", namespace=RAY_NS, lifetime="detached")
116
+ .remote(MODEL_ID)
117
+ )
118
+ # block until the model finishes loading so other ranks don’t race
119
+ ray.get(vllm_actor.generate_batch.remote(["ping"]))
120
+ dist.barrier() # ---- actor fully alive everywhere ----
121
+
122
+ if rank != 0:
123
+ vllm_actor = ray.get_actor("vllm", namespace=RAY_NS)
124
+
125
+
126
+ eval_prompt_template = '''You are provided a text description of a problem and a question. Determine the answer to the question based on the text description. Provide your answer as a single final answer or a short phrase enclosed with <answer></answer>. If the question is a multiple choice, the final answer should be a single letter choice. \nText description: {}\nQuestion: {}'''
127
+
128
+ @dataclass
129
+ class GRPOScriptArguments(ScriptArguments):
130
+ """
131
+ Script arguments for the GRPO training script.
132
+
133
+ Args:
134
+ reward_funcs (`list[str]`):
135
+ List of reward functions. Possible values: 'accuracy', 'format'.
136
+ """
137
+
138
+ reward_funcs: list[str] = field(
139
+ default_factory=lambda: ["accuracy", "format"],
140
+ metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
141
+ )
142
+
143
+ # reward_funcs: list[str] = field(
144
+ # default_factory=lambda: ["accuracy"],
145
+ # metadata={"help": "List of reward functions. Possible values: 'accuracy'"},
146
+ # )
147
+ max_pixels: Optional[int] = field(
148
+ default=12845056,
149
+ metadata={"help": "Maximum number of pixels for the image"},
150
+ )
151
+ min_pixels: Optional[int] = field(
152
+ default=3136,
153
+ metadata={"help": "Minimum number of pixels for the image"},
154
+ )
155
+ temporal: Optional[bool] = field(
156
+ default=True,
157
+ metadata={"help": "whether using temporal GRPO"},
158
+ )
159
+ len_control: Optional[bool] = field(
160
+ default=True,
161
+ metadata={"help": "whether using length reward"},
162
+ )
163
+
164
+
165
+ def accuracy_reward(completions, solution, **kwargs):
166
+ def extract_answer(text: str) -> str:
167
+ """
168
+ 1) Try the full <answer> … </answer> block.
169
+ 2) If that is missing, grab whatever follows the opening <answer> tag.
170
+ 3) Otherwise return the original text.
171
+ """
172
+ # ① normal case <answer> … </answer>
173
+ m = re.search(r'<answer>\s*(.*?)\s*</answer>', text, flags=re.DOTALL | re.IGNORECASE)
174
+ if m:
175
+ return m.group(1).strip()
176
+
177
+ # ② fallback <answer> … <end-of-string>
178
+ m = re.search(r'<answer>\s*(.*)$', text, flags=re.DOTALL | re.IGNORECASE)
179
+ if m:
180
+ return m.group(1).strip()
181
+
182
+ # ③ nothing found
183
+ return text.strip()
184
+
185
+ def extract_description(predict: str) -> Optional[str]:
186
+ """
187
+ Extracts the content of the <answer>…</answer> block from `predict`.
188
+ Returns the inner text (with leading/trailing whitespace stripped),
189
+ or None if no <answer> tag is found.
190
+ """
191
+ match = re.search(r"<des>([\s\S]*?)</des>", predict, re.DOTALL)
192
+ if not match:
193
+ return predict
194
+ return match.group(1).strip()
195
+
196
+ def single_accuracy_reward(predict: str, ground_truth: str) -> float:
197
+ answer = predict
198
+ return 1.0 if grade_answer(answer, ground_truth) else 0.0
199
+
200
+ def compute_math_score_single(predict: str, ground_truth: str, format_weight: float = 0.0) -> Dict[str, float]:
201
+ predict = re.sub(r"\s*(<|>|/)\s*", r"\1", predict)
202
+ # format_score = format_reward(predict)
203
+ accuracy_score = single_accuracy_reward(predict, ground_truth)
204
+
205
+ # return (1 - format_weight) * accuracy_score + format_weight * format_score
206
+ return accuracy_score
207
+
208
+ def normalize_number(num_str):
209
+ try:
210
+ num_str = num_str.replace(',', '')
211
+ return float(num_str)
212
+ except Exception as e:
213
+ print(f"Error converting '{num_str}' to float: {e}")
214
+ return None
215
+
216
+ def wer(reference, hypothesis):
217
+ ref_words = reference.split()
218
+ hyp_words = hypothesis.split()
219
+ m = len(ref_words)
220
+ n = len(hyp_words)
221
+ d = [[0]*(n+1) for _ in range(m+1)]
222
+ for i in range(m+1):
223
+ d[i][0] = i
224
+ for j in range(n+1):
225
+ d[0][j] = j
226
+ for i in range(1, m+1):
227
+ for j in range(1, n+1):
228
+ if ref_words[i-1] == hyp_words[j-1]:
229
+ d[i][j] = d[i-1][j-1]
230
+ else:
231
+ d[i][j] = 1 + min(d[i-1][j], d[i][j-1], d[i-1][j-1])
232
+ return d[m][n] / max(1, m)
233
+
234
+
235
+ def compute_rouge_score(reference, hypothesis, use_stemmer=True):
236
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stemmer)
237
+ scores = scorer.score(reference, hypothesis)
238
+ average_fmeasure = (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3
239
+ return average_fmeasure
240
+
241
+ # print('Computing rewards now...')
242
+ # second_prompts = kwargs.get("second_prompts") # ← list[str] or None
243
+ # second_completions = kwargs.get("second_completions")
244
+ # second_contents = [comp[0]["content"] for comp in second_completions]
245
+ # print('second prompts', second_prompts)
246
+ # print('-'*10)
247
+ # print('second completions', second_completions)
248
+ # print('-'*10)
249
+
250
+ # import time
251
+ # time.sleep(30)
252
+ question_type = kwargs['problem_type'][0]
253
+ questions = kwargs['problem']
254
+
255
+ contents = [completion[0]["content"] for completion in completions]
256
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
257
+ rewards = []
258
+
259
+ extracted_content_descriptions = [extract_description(ele) for ele in contents]
260
+ description_eval_inputs = [eval_prompt_template.format(extracted_content_descriptions[count_index], questions[count_index]) for count_index in range(len(extracted_content_descriptions))]
261
+ # extracted_content_answers = [extract_answer(ele) for ele in contents]
262
+ # model = kwargs.get("model") # may be None if called elsewhere
263
+ # tokenizer = kwargs.get("tokenizer")
264
+ # # (optional) example use: let the model score the generated answer
265
+ # if model is not None and tokenizer is not None:
266
+ # model.eval()
267
+ # description_inputs = [questions[index_count] + ' [SEP] ' + extracted_content_descriptions[index_count] for index_count in range(len(extracted_content_descriptions))]
268
+ # description_rewards = answerBERT.batch_predict(description_inputs, batch_size = 64)
269
+ # description_rewards = [infer(extracted_content_descriptions[index_count], questions[index_count]) for index_count in range(len(extracted_content_descriptions))]
270
+ # description_outputs = generate_batch(description_eval_inputs)
271
+ print(len(description_eval_inputs))
272
+ print('Computing rewards...')
273
+ print('-'*10)
274
+ # description_outputs = ray.get(vllm_actor.generate.remote(description_eval_inputs))
275
+ description_outputs = ray.get(
276
+ vllm_actor.generate_batch_sequential.remote(description_eval_inputs,
277
+ batch_size=32) # tune to taste
278
+ )
279
+ print('Finish computing generating batch')
280
+ output_answers = [extract_answer(content) for content in contents]
281
+ gt_answers = [extract_answer(sol) for sol in solution]
282
+ description_rewards = [compute_math_score_single(description_outputs[curr_idx], gt_answers[curr_idx]) for curr_idx in range(len(description_outputs))]
283
+
284
+
285
+
286
+
287
+ # for content, sol, description_reward in zip(contents, solution, description_rewards):
288
+ # for content, sol, question in zip(contents, solution, questions):
289
+ # for content, sol, second_content in zip(contents, solution, second_completions):
290
+ for output_ans, gt_ans, description_reward in zip(output_answers, gt_answers, description_rewards):
291
+ try:
292
+ # output_ans = extract_answer(content)
293
+ # gt_ans = extract_answer(sol)
294
+ # description_extraction = extract_answer(second_content)
295
+ # if question_type == "multiple choice":
296
+ # reward = 1.0 if output_ans.strip() == gt_ans.strip() else 0.0
297
+ # elif question_type == "numerical":
298
+ # gt_has_decimal = ("." in gt_ans) or ("," in gt_ans)
299
+ # out_has_decimal = ("." in output_ans) or ("," in output_ans)
300
+ # if gt_has_decimal != out_has_decimal:
301
+ # reward = 0.0
302
+ # else:
303
+ # gt_number = normalize_number(gt_ans)
304
+ # out_number = normalize_number(output_ans)
305
+ # if gt_number is None or out_number is None:
306
+ # reward = 0.0
307
+ # else:
308
+ # reward = 1.0 if round(gt_number, 2) == round(out_number, 2) else 0.0
309
+ if question_type == "OCR":
310
+ # description_extraction = extract_answer(second_content)
311
+ # description_error_rate = wer(gt_ans, description_extraction)
312
+ # description_pendat_reward = pedant.get_score(gt_ans, description_extraction, question)
313
+ # error_rate = wer(gt_ans, output_ans)
314
+ answer_pedant_reward = pedant.get_score(gt_ans, output_ans, questions[0])
315
+ # reward = (1 - error_rate) + (1- description_error_rate)
316
+ # reward = max(0.0, min(2.0, reward))
317
+ # print('Extracted description: ', description_extraction)
318
+ print('Generated answer: ', output_ans)
319
+ print('Sol: ', gt_ans)
320
+ # print(f'Description reward: {description_reward}; answer reward: {answer_reward}')
321
+ print('-' * 10)
322
+ # reward = description_pendat_reward + answer_pedant_reward
323
+ reward = answer_pedant_reward
324
+ # elif question_type == "free-form":
325
+ # score = compute_rouge_score(gt_ans, output_ans)
326
+ # reward = max(0.0, min(1.0, score))
327
+ elif question_type == "regression":
328
+ gt_number = normalize_number(gt_ans)
329
+ out_number = normalize_number(output_ans)
330
+ if gt_number is None or out_number is None:
331
+ reward = 0.0
332
+ rel_diff = (abs(out_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
333
+ rel_diff = min(1.0, max(0.0, rel_diff))
334
+ reward = 1 - rel_diff
335
+ elif question_type == 'math' or question_type == 'unify' or question_type == "multiple choice" or question_type == "numerical":
336
+ # print('Extracted description: ', description_extraction)
337
+ print('Generated answer: ', output_ans)
338
+ print('Sol: ', gt_ans)
339
+
340
+ # description_reward = compute_math_score_single(description_extraction, gt_ans)
341
+ answer_reward = compute_math_score_single(output_ans, gt_ans)
342
+ print(f'Description reward: {description_reward}; answer reward: {answer_reward}')
343
+ print('-' * 10)
344
+ reward = description_reward + answer_reward
345
+ else:
346
+ print(f"Unhandled question type '{question_type}'; falling back to zero reward")
347
+ reward = 0.0
348
+ except Exception as e:
349
+ print(f"Error in reward_fn for question_type '{question_type}': {e}")
350
+ reward = 0.0
351
+
352
+ rewards.append(reward)
353
+
354
+ if os.getenv("DEBUG_MODE") == "true":
355
+ log_path = os.getenv("LOG_PATH")
356
+ # local_rank = int(os.getenv("LOCAL_RANK", 0))
357
+ with open(log_path, "a", encoding="utf-8") as f:
358
+ f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
359
+ f.write(f"Content: {output_ans}\n")
360
+ f.write(f"Solution: {gt_ans}\n")
361
+
362
+ return rewards
363
+
364
+
365
+ def simple_format_reward(completions, **kwargs):
366
+ """Reward function that checks if the completion has a specific format."""
367
+ # pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
368
+ pattern = r"<des>.*?</des>\s*<think>.*?</think>\s*<answer>.*?</answer>"
369
+ completion_contents = [completion[0]["content"] for completion in completions]
370
+ matches = [re.fullmatch(pattern, content, re.DOTALL) for content in completion_contents]
371
+ return [0.1 if match else 0.0 for match in matches]
372
+
373
+
374
+ reward_funcs_registry = {
375
+ "accuracy": accuracy_reward,
376
+ "format": simple_format_reward,
377
+ }
378
+
379
+
380
+ SYSTEM_PROMPT = (
381
+ "A conversation between User and Assistant. After the user asks a question about an image, write a rich, self-contained description of that image—detailed enough that someone could answer the question from the description alone, without ever seeing the image. Enclose the entire description in <des> </des> tags."
382
+ "Next, the assistant should think deeply about the reasoning process, engaging in an internal dialogue and self-reflection, "
383
+ "and provide this step-by-step reasoning within <think> </think> tags. "
384
+ "Finally, the assistant provides a single word, single letter choice, or phrase answer within <answer> </answer> tags."
385
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>."
386
+ )
387
+
388
+
389
+ def main(script_args, training_args, model_args):
390
+ # Get reward functions
391
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
392
+
393
+ if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
394
+ dataset = DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
395
+ else:
396
+ # Load the dataset
397
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
398
+
399
+
400
+ # Format into conversation
401
+ def make_conversation(example):
402
+ return {
403
+ "prompt": [
404
+ {"role": "system", "content": SYSTEM_PROMPT},
405
+ {"role": "user", "content": example["problem"]},
406
+ ],
407
+ }
408
+
409
+
410
+
411
+ QUESTION_TEMPLATE = (
412
+ "{Question}\n"
413
+ "You are tasked with analyzing an image to generate an exhaustive and detailed description to answer a question. "
414
+ "Analyze the image and produce a thorough, self-contained description—detailed enough for someone to answer the question using the description alone. Wrap the entire description in <des> </des> tags.\n"
415
+ "Next, engage in an internal dialogue as if you were a human pondering deeply—use expressions such as 'let me think', 'wait', 'hmm', 'oh, I see', 'let's break it down', etc., and include self-reflection or verification in your reasoning process. "
416
+ "Provide your detailed, step-by-step reasoning based on the image description, and enclose this part within <think> </think> tags.\n"
417
+ "Finally, provide a single word or phrase answer to the question, enclosed within <answer> </answer> tags.\n"
418
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>"
419
+ )
420
+
421
+
422
+ TYPE_TEMPLATE = {
423
+ "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
424
+ "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
425
+ "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
426
+ "free-form": " Please provide your text answer within the <answer> </answer> tags.",
427
+ "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
428
+ "math": " Please provide the final exact answer (single option letter for multiple choice) within the <answer> </answer> tags.",
429
+ }
430
+
431
+ def make_conversation_image(example):
432
+
433
+ return {
434
+ "prompt": [
435
+ {
436
+ "role": "user",
437
+ "content": [
438
+ {"type": "image"},
439
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
440
+ ],
441
+ },
442
+ ],
443
+ }
444
+
445
+
446
+ def make_conversation_video(example):
447
+ return {
448
+ "prompt": [
449
+ {
450
+ "role": "user",
451
+ "content": [
452
+ {"type": "video"},
453
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
454
+ ],
455
+ },
456
+ ],
457
+ }
458
+
459
+ def make_conversation_image_and_video(example):
460
+ if example["problem_type"] == 'multiple choice':
461
+ question = example['problem'] + "Options:\n"
462
+ for op in example["options"]:
463
+ question += op + "\n"
464
+ else:
465
+ question = example['problem']
466
+
467
+
468
+ msg ={
469
+ "prompt":
470
+ [{
471
+ "role": "user",
472
+ "content": [
473
+ {
474
+ "type": example['data_type'],
475
+ # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
476
+ },
477
+ {
478
+ "type": "text",
479
+ "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[example['problem_type']]
480
+ }
481
+ ]
482
+ }]
483
+ }
484
+
485
+ return msg
486
+
487
+
488
+ dataset = dataset.map(make_conversation_image_and_video)
489
+
490
+
491
+ trainer_cls = Qwen2VLGRPOTrainer if not training_args.use_vllm else Qwen2VLGRPOVLLMTrainerModifiedOrig
492
+ print("using: ", trainer_cls)
493
+
494
+ # Initialize the GRPO trainer
495
+ trainer = trainer_cls(
496
+ model=model_args.model_name_or_path,
497
+ reward_funcs=reward_funcs,
498
+ args=training_args,
499
+ script_args=script_args,
500
+ train_dataset=dataset[script_args.dataset_train_split],
501
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
502
+ peft_config=get_peft_config(model_args),
503
+ attn_implementation=model_args.attn_implementation,
504
+ max_pixels=script_args.max_pixels,
505
+ min_pixels=script_args.min_pixels,
506
+ )
507
+
508
+ if training_args.resume_from_checkpoint is not None:
509
+ checkpoint = training_args.resume_from_checkpoint
510
+ trainer.train(resume_from_checkpoint=checkpoint)
511
+ else:
512
+ trainer.train()
513
+
514
+ # Save and push to hub
515
+ trainer.save_model(training_args.output_dir)
516
+ if training_args.push_to_hub:
517
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
518
+
519
+
520
+ if __name__ == "__main__":
521
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
522
+ script_args, training_args, model_args = parser.parse_args_and_config()
523
+ main(script_args, training_args, model_args)
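For reference, the script above hands description grading to a detached Ray actor that wraps a vLLM engine. A minimal sketch of how such a judge is looked up and queried from another process; the actor and namespace names follow the script, while the helper function itself is illustrative and not part of the repository:

```python
import ray

def query_judge(description_prompts, namespace="grpo_qwen_vllm"):
    # Attach to the running Ray cluster and fetch the detached "vllm" actor created at startup.
    if not ray.is_initialized():
        ray.init(address="auto", namespace=namespace)
    judge = ray.get_actor("vllm", namespace=namespace)
    # Each prompt asks the judge to answer the question from the description alone.
    return ray.get(judge.generate_batch.remote(description_prompts))
```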
src/r1-v/src/open_r1/grpo-cot-selfEval.py ADDED
@@ -0,0 +1,457 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+ from datetime import datetime
18
+ from dataclasses import dataclass, field
19
+
20
+ from datasets import load_dataset, load_from_disk
21
+ from transformers import Qwen2VLForConditionalGeneration
22
+
23
+ from trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerModified
24
+ from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
25
+
26
+ from datasets import Dataset, DatasetDict
27
+
28
+ from typing import Dict, List, Optional
29
+ from mathruler.grader import extract_boxed_content, grade_answer
30
+
31
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
32
+ from rouge_score import rouge_scorer
33
+ # from utils.math_cot import *
34
+ # from qa_metrics.pedant import PEDANT
35
+
36
+ # pedant = PEDANT()
37
+
38
+ '''
39
+ Alpha constant: When the description is wrong, but the final answer is right, the model is doing reward hacking,
40
+ so we give it a partial reward
41
+ '''
42
+ alpha = 1.0
43
+
44
+ @dataclass
45
+ class GRPOScriptArguments(ScriptArguments):
46
+ """
47
+ Script arguments for the GRPO training script.
48
+
49
+ Args:
50
+ reward_funcs (`list[str]`):
51
+ List of reward functions. Possible values: 'accuracy', 'format'.
52
+ """
53
+
54
+ reward_funcs: list[str] = field(
55
+ default_factory=lambda: ["accuracy", "format"],
56
+ metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
57
+ )
58
+
59
+ # reward_funcs: list[str] = field(
60
+ # default_factory=lambda: ["accuracy"],
61
+ # metadata={"help": "List of reward functions. Possible values: 'accuracy'"},
62
+ # )
63
+ max_pixels: Optional[int] = field(
64
+ default=12845056,
65
+ metadata={"help": "Maximum number of pixels for the image"},
66
+ )
67
+ min_pixels: Optional[int] = field(
68
+ default=3136,
69
+ metadata={"help": "Minimum number of pixels for the image"},
70
+ )
71
+ temporal: Optional[bool] = field(
72
+ default=True,
73
+ metadata={"help": "whether using temporal GRPO"},
74
+ )
75
+ len_control: Optional[bool] = field(
76
+ default=True,
77
+ metadata={"help": "whether using length reward"},
78
+ )
79
+
80
+
81
+
82
+ def accuracy_reward(completions, solution, **kwargs):
83
+ def extract_answer(text: str) -> str:
84
+ """
85
+ 1) Try the full <answer> … </answer> block.
86
+ 2) If that is missing, grab whatever follows the opening <answer> tag.
87
+ 3) Otherwise return the original text.
88
+ """
89
+ # ① normal case <answer> … </answer>
90
+ m = re.search(r'<answer>\s*(.*?)\s*</answer>', text, flags=re.DOTALL | re.IGNORECASE)
91
+ if m:
92
+ return m.group(1).strip()
93
+
94
+ # ② fallback <answer> … <end-of-string>
95
+ m = re.search(r'<answer>\s*(.*)$', text, flags=re.DOTALL | re.IGNORECASE)
96
+ if m:
97
+ return m.group(1).strip()
98
+
99
+ # ③ nothing found
100
+ return text.strip()
101
+
102
+ def single_accuracy_reward(predict: str, ground_truth: str) -> float:
103
+ answer = predict
104
+ return 1.0 if grade_answer(answer, ground_truth) else 0.0
105
+
106
+ def compute_math_score_single(predict: str, ground_truth: str, format_weight: float = 0.0) -> Dict[str, float]:
107
+ predict = re.sub(r"\s*(<|>|/)\s*", r"\1", predict)
108
+ # format_score = format_reward(predict)
109
+ accuracy_score = single_accuracy_reward(predict, ground_truth)
110
+
111
+ # return (1 - format_weight) * accuracy_score + format_weight * format_score
112
+ return accuracy_score
113
+
114
+ def normalize_number(num_str):
115
+ try:
116
+ num_str = num_str.replace(',', '')
117
+ return float(num_str)
118
+ except Exception as e:
119
+ print(f"Error converting '{num_str}' to float: {e}")
120
+ return None
121
+
122
+ def wer(reference, hypothesis):
123
+ ref_words = reference.split()
124
+ hyp_words = hypothesis.split()
125
+ m = len(ref_words)
126
+ n = len(hyp_words)
127
+ d = [[0]*(n+1) for _ in range(m+1)]
128
+ for i in range(m+1):
129
+ d[i][0] = i
130
+ for j in range(n+1):
131
+ d[0][j] = j
132
+ for i in range(1, m+1):
133
+ for j in range(1, n+1):
134
+ if ref_words[i-1] == hyp_words[j-1]:
135
+ d[i][j] = d[i-1][j-1]
136
+ else:
137
+ d[i][j] = 1 + min(d[i-1][j], d[i][j-1], d[i-1][j-1])
138
+ return d[m][n] / max(1, m)
139
+
140
+
141
+ def compute_rouge_score(reference, hypothesis, use_stemmer=True):
142
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stemmer)
143
+ scores = scorer.score(reference, hypothesis)
144
+ average_fmeasure = (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3
145
+ return average_fmeasure
146
+
147
+ # print('Computing rewards now...')
148
+ # second_prompts = kwargs.get("second_prompts") # ← list[str] or None
149
+ second_completions = kwargs.get("second_completions")
150
+ # second_contents = [comp[0]["content"] for comp in second_completions]
151
+ # print('second prompts', second_prompts)
152
+ # print('-'*10)
153
+ # print('second completions', second_completions)
154
+ # print('-'*10)
155
+
156
+ # import time
157
+ # time.sleep(30)
158
+ question_type = kwargs['problem_type'][0]
159
+ question = kwargs['problem'][0]
160
+
161
+ contents = [completion[0]["content"] for completion in completions]
162
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
163
+ rewards = []
164
+
165
+
166
+ # model = kwargs.get("model") # may be None if called elsewhere
167
+ # tokenizer = kwargs.get("tokenizer")
168
+
169
+ # # (optional) example use: let the model score the generated answer
170
+ # if model is not None and tokenizer is not None:
171
+ # model.eval()
172
+
173
+ # for content, sol in zip(contents, solution):
174
+ for content, sol, second_content in zip(contents, solution, second_completions):
175
+ try:
176
+ output_ans = extract_answer(content)
177
+ gt_ans = extract_answer(sol)
178
+ description_extraction = extract_answer(second_content)
179
+ # if question_type == "multiple choice":
180
+ # reward = 1.0 if output_ans.strip() == gt_ans.strip() else 0.0
181
+ # elif question_type == "numerical":
182
+ # gt_has_decimal = ("." in gt_ans) or ("," in gt_ans)
183
+ # out_has_decimal = ("." in output_ans) or ("," in output_ans)
184
+ # if gt_has_decimal != out_has_decimal:
185
+ # reward = 0.0
186
+ # else:
187
+ # gt_number = normalize_number(gt_ans)
188
+ # out_number = normalize_number(output_ans)
189
+ # if gt_number is None or out_number is None:
190
+ # reward = 0.0
191
+ # else:
192
+ # reward = 1.0 if round(gt_number, 2) == round(out_number, 2) else 0.0
193
+ # if question_type == "OCR":
194
+ # # description_extraction = extract_answer(second_content)
195
+ # # description_error_rate = wer(gt_ans, description_extraction)
196
+ # description_pendat_reward = pedant.get_score(gt_ans, description_extraction, question)
197
+ # # error_rate = wer(gt_ans, output_ans)
198
+ # answer_pedant_reward = pedant.get_score(gt_ans, output_ans, question)
199
+ # # reward = (1 - error_rate) + (1- description_error_rate)
200
+ # # reward = max(0.0, min(2.0, reward))
201
+ # print('Extracted description: ', description_extraction)
202
+ # print('Generated answer: ', output_ans)
203
+ # print('Sol: ', gt_ans)
204
+ # print(f'Description reward: {description_reward}; answer reward: {answer_reward}')
205
+ # print('-' * 10)
206
+ # reward = description_pendat_reward + answer_pedant_reward
207
+ if question_type == "free-form":
208
+ score = compute_rouge_score(gt_ans, output_ans)
209
+ description_score = compute_rouge_score(gt_ans, description_extraction)
210
+ reward = max(0.0, min(1.0, score)) + max(0.0, min(1.0, description_score))
211
+ elif question_type == "regression":
212
+ gt_number = normalize_number(gt_ans)
213
+ out_number = normalize_number(output_ans)
214
+ description_number = normalize_number(description_extraction)
215
+ if gt_number is None or out_number is None:
+ reward = 0.0
+ else:
+ rel_diff = (abs(out_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
+ rel_diff = min(1.0, max(0.0, rel_diff))
+ # a description without a usable number earns no credit for the description term
+ if description_number is None:
+ description_diff = 1.0
+ else:
+ description_diff = (abs(description_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
+ description_diff = min(1.0, max(0.0, description_diff))
+ reward = (1 - rel_diff) + (1 - description_diff)
229
+ elif question_type == 'math' or question_type == 'unify' or question_type == 'multiple choice' or question_type == 'numerical':
230
+ description_reward = compute_math_score_single(description_extraction, gt_ans)
231
+ answer_reward = compute_math_score_single(output_ans, gt_ans)
232
+
233
+ if description_reward == 0 and answer_reward == 1:
234
+ # Right answer but unsupported description: likely reward hacking, so give only partial credit
235
+ reward = alpha
236
+ else:
237
+ reward = description_reward + answer_reward
238
+
239
+ # print(f"Extracted description: {description_extraction} | Generated answer: {output_ans} | Sol: {gt_ans}")
240
+ # print(f'Description reward: {description_reward} | answer reward: {answer_reward} | final reward: {reward}')
241
+ # print('-' * 10)
242
+ else:
243
+ print('Unknown question type; falling back to zero reward')
244
+ reward = 0.0
245
+ except Exception as e:
246
+ print(f"Error in reward_fn for question_type '{question_type}': {e}")
247
+ reward = 0.0
248
+
249
+ rewards.append(reward)
250
+
251
+ if os.getenv("DEBUG_MODE") == "true":
252
+ log_path = os.getenv("LOG_PATH")
253
+ # local_rank = int(os.getenv("LOCAL_RANK", 0))
254
+ with open(log_path, "a", encoding="utf-8") as f:
255
+ f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
256
+ f.write(f"Content: {content}\n")
257
+ f.write(f"Solution: {sol}\n")
258
+
259
+ return rewards
260
+
261
+
262
+ def simple_format_reward(completions, **kwargs):
263
+ """Reward function that checks if the completion has a specific format."""
264
+ # pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
265
+ pattern = r"<des>.*?</des>\s*<think>.*?</think>\s*<answer>.*?</answer>"
266
+ completion_contents = [completion[0]["content"] for completion in completions]
267
+ matches = [re.fullmatch(pattern, content, re.DOTALL) for content in completion_contents]
268
+ return [0.1 if match else 0.0 for match in matches]
269
+
270
+
271
+ reward_funcs_registry = {
272
+ "accuracy": accuracy_reward,
273
+ "format": simple_format_reward,
274
+ }
275
+
276
+
277
+ SYSTEM_PROMPT = (
278
+ "A conversation between User and Assistant. After the user asks a question about an image, write a rich, self-contained description of that image—detailed enough that someone could answer the question from the description alone, without ever seeing the image. Enclose the entire description in <des> </des> tags."
279
+ "Next, the assistant should think deeply about the reasoning process, engaging in an internal dialogue and self-reflection, "
280
+ "and provide this step-by-step reasoning within <think> </think> tags. "
281
+ "Finally, the assistant provides a single word, single letter choice, or phrase answer within <answer> </answer> tags."
282
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>. Please only return the final single letter choice within the <answer> </answer> tags for multiple choice questions; Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags for numerical questions."
283
+ )
284
+
285
+
286
+ def main(script_args, training_args, model_args):
287
+ print('Start program..')
288
+ # Get reward functions
289
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
290
+
291
+
292
+ print('Loading dataset')
293
+ if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
294
+ dataset = DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
295
+ else:
296
+ # Load the dataset
297
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
298
+
299
+
300
+ # Format into conversation
301
+ def make_conversation(example):
302
+ return {
303
+ "prompt": [
304
+ {"role": "system", "content": SYSTEM_PROMPT},
305
+ {"role": "user", "content": example["problem"]},
306
+ ],
307
+ }
308
+
309
+
310
+ QUESTION_TEMPLATE = (
311
+ "{Question}\n"
312
+ "You are tasked with analyzing an image to generate an exhaustive and detailed description to answer a question. "
313
+ "Analyze the image and produce a thorough, self-contained description—detailed enough for someone to answer the question using the description alone. Wrap the entire description in <des> </des> tags.\n"
314
+ "Next, engage in an internal dialogue as if you were a human pondering deeply—use expressions such as 'let me think', 'wait', 'hmm', 'oh, I see', 'let's break it down', etc., and include self-reflection or verification in your reasoning process. "
315
+ "Provide your detailed, step-by-step reasoning based on the image and image description, and enclose this part within <think> </think> tags.\n"
316
+ "Finally, provide a single word or phrase answer to the question, enclosed within <answer> </answer> tags.\n"
317
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>. Please keep your final answer short and precise."
318
+ )
319
+
320
+
321
+ TYPE_TEMPLATE = {
322
+ "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
323
+ "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
324
+ "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
325
+ "free-form": " Please provide your text answer within the <answer> </answer> tags.",
326
+ "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
327
+ "math": " Please provide the final exact answer (single option letter for multiple choice) within the <answer> </answer> tags.",
328
+ }
329
+
330
+ ABS_Verify_Prompt = '''You are provided a text description of a problem and a question. Determine the answer to the question based on the text description. First provide a step-by-step reasoning within <think> </think> tags, then provide your answer as a single final answer, single letter choice, or a short phrase ENCLOSED with <answer> </answer> tags. \nText description: {{Description}}\nQuestion: {Question}\nPlease only return the final single letter choice within the <answer> </answer> tags for multiple choice questions; Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags for numerical questions.'''
331
+
332
+ def make_conversation_image(example):
333
+
334
+ return {
335
+ "prompt": [
336
+ {
337
+ "role": "user",
338
+ "content": [
339
+ {"type": "image"},
340
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
341
+ ],
342
+ },
343
+ ],
344
+ }
345
+
346
+
347
+ def make_conversation_video(example):
348
+ return {
349
+ "prompt": [
350
+ {
351
+ "role": "user",
352
+ "content": [
353
+ {"type": "video"},
354
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
355
+ ],
356
+ },
357
+ ],
358
+ }
359
+
360
+ def make_conversation_image_and_video(example):
361
+ if example["problem_type"] == 'multiple choice':
362
+ question = example['problem'] + "Options:\n"
363
+ for op in example["options"]:
364
+ question += op + "\n"
365
+ else:
366
+ question = example['problem']
367
+
368
+
369
+ msg ={
370
+ "prompt":
371
+ [{
372
+ "role": "user",
373
+ "content": [
374
+ {
375
+ "type": example['data_type'],
376
+ # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
377
+ },
378
+ {
379
+ "type": "text",
380
+ # "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[example['problem_type']]
381
+ "text": QUESTION_TEMPLATE.format(Question=question)
382
+ }
383
+ ]
384
+ }]
385
+ }
386
+
387
+ return msg
388
+
389
+ def make_verify_conversation(example):
390
+ # ➊ build the question text
391
+ question = example["problem"]
392
+ if example["problem_type"] == "multiple choice":
393
+ question += "Options:\n" + "\n".join(example["options"])
394
+
395
+ # ➋ verification template + suffix (no if/else)
396
+ verify_text = (
397
+ ABS_Verify_Prompt.format(Question=question.replace("<image>", ""))
398
+ # + TYPE_TEMPLATE[example["problem_type"]] # ← one-liner, no branching
399
+ )
400
+
401
+ # ➌ conversation dict
402
+ conv_dict = {
403
+ "prompt": [
404
+ {
405
+ "role": "user",
406
+ "content": [{"type": "text", "text": verify_text}],
407
+ }
408
+ ]
409
+ }
410
+
411
+ # templated = maybe_apply_chat_template(conv_dict, processing_class)["prompt"]
412
+ # return {"verify_prompt": templated}
413
+ return {"verify_prompt": conv_dict}
414
+
415
+
416
+
417
+
418
+ print('Start mapping dataset')
419
+ dataset = dataset.map(make_conversation_image_and_video)
420
+ dataset = dataset.map(
421
+ make_verify_conversation,
422
+ desc="add description verify prompt",
423
+ )
424
+
425
+ trainer_cls = Qwen2VLGRPOTrainer if not training_args.use_vllm else Qwen2VLGRPOVLLMTrainerModified
426
+ print("using: ", trainer_cls)
427
+
428
+ # Initialize the GRPO trainer
429
+ trainer = trainer_cls(
430
+ model=model_args.model_name_or_path,
431
+ reward_funcs=reward_funcs,
432
+ args=training_args,
433
+ script_args=script_args,
434
+ train_dataset=dataset[script_args.dataset_train_split],
435
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
436
+ peft_config=get_peft_config(model_args),
437
+ attn_implementation=model_args.attn_implementation,
438
+ max_pixels=script_args.max_pixels,
439
+ min_pixels=script_args.min_pixels,
440
+ )
441
+
442
+ if training_args.resume_from_checkpoint is not None:
443
+ checkpoint = training_args.resume_from_checkpoint
444
+ trainer.train(resume_from_checkpoint=checkpoint)
445
+ else:
446
+ trainer.train()
447
+
448
+ # Save and push to hub
449
+ trainer.save_model(training_args.output_dir)
450
+ if training_args.push_to_hub:
451
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
452
+
453
+
454
+ if __name__ == "__main__":
455
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
456
+ script_args, training_args, model_args = parser.parse_args_and_config()
457
+ main(script_args, training_args, model_args)
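For reference, the accuracy reward above sums the score of the model's own <answer> with the score of the verifier's answer recovered from the <des> description, and collapses to a partial reward when the final answer is right but the description cannot support it. A minimal, self-contained sketch of that combination (assumptions: `grade` is a stand-in for `mathruler.grader.grade_answer`, and the `alpha` default mirrors the constant used in these scripts):

```python
# Hedged sketch of the description + answer reward combination (not the exact training code).
def combined_reward(answer: str, description_answer: str, ground_truth: str,
                    alpha: float = 0.85) -> float:
    def grade(pred: str, gt: str) -> float:
        # placeholder exact-match grader; the script itself uses mathruler's grade_answer
        return 1.0 if pred.strip().lower() == gt.strip().lower() else 0.0

    answer_reward = grade(answer, ground_truth)
    description_reward = grade(description_answer, ground_truth)
    if description_reward == 0 and answer_reward == 1:
        # right answer, unsupported description: likely reward hacking, give partial credit only
        return alpha
    return description_reward + answer_reward

if __name__ == "__main__":
    print(combined_reward("B", "B", "B"))  # 2.0: answer and description both check out
    print(combined_reward("B", "C", "B"))  # 0.85: correct answer that the description does not support
```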
src/r1-v/src/open_r1/grpo-cot-selfEvalConst.py ADDED
@@ -0,0 +1,456 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+ from datetime import datetime
18
+ from dataclasses import dataclass, field
19
+
20
+ from datasets import load_dataset, load_from_disk
21
+ from transformers import Qwen2VLForConditionalGeneration
22
+
23
+ from trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerSelfConst
24
+ from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
25
+
26
+ from datasets import Dataset, DatasetDict
27
+
28
+ from typing import Dict, List, Optional
29
+ from mathruler.grader import extract_boxed_content, grade_answer
30
+
31
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
32
+ from rouge_score import rouge_scorer
33
+ # from utils.math_cot import *
34
+ # from qa_metrics.pedant import PEDANT
35
+
36
+ # pedant = PEDANT()
37
+
38
+ '''
39
+ Alpha constant: when the description is wrong but the final answer is right, the model is likely reward hacking,
40
+ so we only give it a partial reward.
41
+ '''
42
+ alpha = 0.85
43
+
44
+ @dataclass
45
+ class GRPOScriptArguments(ScriptArguments):
46
+ """
47
+ Script arguments for the GRPO training script.
48
+
49
+ Args:
50
+ reward_funcs (`list[str]`):
51
+ List of reward functions. Possible values: 'accuracy', 'format'.
52
+ """
53
+
54
+ reward_funcs: list[str] = field(
55
+ default_factory=lambda: ["accuracy", "format"],
56
+ metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
57
+ )
58
+
59
+ # reward_funcs: list[str] = field(
60
+ # default_factory=lambda: ["accuracy"],
61
+ # metadata={"help": "List of reward functions. Possible values: 'accuracy'"},
62
+ # )
63
+ max_pixels: Optional[int] = field(
64
+ default=12845056,
65
+ metadata={"help": "Maximum number of pixels for the image"},
66
+ )
67
+ min_pixels: Optional[int] = field(
68
+ default=3136,
69
+ metadata={"help": "Minimum number of pixels for the image"},
70
+ )
71
+ temporal: Optional[bool] = field(
72
+ default=True,
73
+ metadata={"help": "whether using temporal GRPO"},
74
+ )
75
+ len_control: Optional[bool] = field(
76
+ default=True,
77
+ metadata={"help": "whether using length reward"},
78
+ )
79
+
80
+
81
+
82
+ def accuracy_reward(completions, solution, **kwargs):
83
+ def extract_answer(text: str) -> str:
84
+ """
85
+ 1) Try the full <answer> … </answer> block.
86
+ 2) If that is missing, grab whatever follows the opening <answer> tag.
87
+ 3) Otherwise return the original text.
88
+ """
89
+ # ① normal case <answer> … </answer>
90
+ m = re.search(r'<answer>\s*(.*?)\s*</answer>', text, flags=re.DOTALL | re.IGNORECASE)
91
+ if m:
92
+ return m.group(1).strip()
93
+
94
+ # ② fallback <answer> … <end-of-string>
95
+ m = re.search(r'<answer>\s*(.*)$', text, flags=re.DOTALL | re.IGNORECASE)
96
+ if m:
97
+ return m.group(1).strip()
98
+
99
+ # ③ nothing found
100
+ return text.strip()
101
+
102
+ def single_accuracy_reward(predict: str, ground_truth: str) -> float:
103
+ answer = predict
104
+ return 1.0 if grade_answer(answer, ground_truth) else 0.0
105
+
106
+ def compute_math_score_single(predict: str, ground_truth: str, format_weight: float = 0.0) -> float:
107
+ predict = re.sub(r"\s*(<|>|/)\s*", r"\1", predict)
108
+ # format_score = format_reward(predict)
109
+ accuracy_score = single_accuracy_reward(predict, ground_truth)
110
+
111
+ # return (1 - format_weight) * accuracy_score + format_weight * format_score
112
+ return accuracy_score
113
+
114
+ def normalize_number(num_str):
115
+ try:
116
+ num_str = num_str.replace(',', '')
117
+ return float(num_str)
118
+ except Exception as e:
119
+ print(f"Error converting '{num_str}' to float: {e}")
120
+ return None
121
+
122
+ def wer(reference, hypothesis):
123
+ ref_words = reference.split()
124
+ hyp_words = hypothesis.split()
125
+ m = len(ref_words)
126
+ n = len(hyp_words)
127
+ d = [[0]*(n+1) for _ in range(m+1)]
128
+ for i in range(m+1):
129
+ d[i][0] = i
130
+ for j in range(n+1):
131
+ d[0][j] = j
132
+ for i in range(1, m+1):
133
+ for j in range(1, n+1):
134
+ if ref_words[i-1] == hyp_words[j-1]:
135
+ d[i][j] = d[i-1][j-1]
136
+ else:
137
+ d[i][j] = 1 + min(d[i-1][j], d[i][j-1], d[i-1][j-1])
138
+ return d[m][n] / max(1, m)
139
+
140
+
141
+ def compute_rouge_score(reference, hypothesis, use_stemmer=True):
142
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stemmer)
143
+ scores = scorer.score(reference, hypothesis)
144
+ average_fmeasure = (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3
145
+ return average_fmeasure
146
+
147
+ # print('Computing rewards now...')
148
+ # second_prompts = kwargs.get("second_prompts") # ← list[str] or None
149
+ second_completions = kwargs.get("second_completions")
150
+ # second_contents = [comp[0]["content"] for comp in second_completions]
151
+ # print('second prompts', second_prompts)
152
+ # print('-'*10)
153
+ # print('second completions', second_completions)
154
+ # print('-'*10)
155
+
156
+ # import time
157
+ # time.sleep(30)
158
+ question_type = kwargs['problem_type'][0]
159
+ question = kwargs['problem'][0]
160
+
161
+ contents = [completion[0]["content"] for completion in completions]
162
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
163
+ rewards = []
164
+
165
+
166
+ # model = kwargs.get("model") # may be None if called elsewhere
167
+ # tokenizer = kwargs.get("tokenizer")
168
+
169
+ # # (optional) example use: let the model score the generated answer
170
+ # if model is not None and tokenizer is not None:
171
+ # model.eval()
172
+
173
+ # for content, sol in zip(contents, solution):
174
+ for content, sol, second_content in zip(contents, solution, second_completions):
175
+ try:
176
+ output_ans = extract_answer(content)
177
+ gt_ans = extract_answer(sol)
178
+ description_extraction = extract_answer(second_content)
179
+ # if question_type == "multiple choice":
180
+ # reward = 1.0 if output_ans.strip() == gt_ans.strip() else 0.0
181
+ # elif question_type == "numerical":
182
+ # gt_has_decimal = ("." in gt_ans) or ("," in gt_ans)
183
+ # out_has_decimal = ("." in output_ans) or ("," in output_ans)
184
+ # if gt_has_decimal != out_has_decimal:
185
+ # reward = 0.0
186
+ # else:
187
+ # gt_number = normalize_number(gt_ans)
188
+ # out_number = normalize_number(output_ans)
189
+ # if gt_number is None or out_number is None:
190
+ # reward = 0.0
191
+ # else:
192
+ # reward = 1.0 if round(gt_number, 2) == round(out_number, 2) else 0.0
193
+ # if question_type == "OCR":
194
+ # # description_extraction = extract_answer(second_content)
195
+ # # description_error_rate = wer(gt_ans, description_extraction)
196
+ # description_pendat_reward = pedant.get_score(gt_ans, description_extraction, question)
197
+ # # error_rate = wer(gt_ans, output_ans)
198
+ # answer_pedant_reward = pedant.get_score(gt_ans, output_ans, question)
199
+ # # reward = (1 - error_rate) + (1- description_error_rate)
200
+ # # reward = max(0.0, min(2.0, reward))
201
+ # print('Extracted description: ', description_extraction)
202
+ # print('Generated answer: ', output_ans)
203
+ # print('Sol: ', gt_ans)
204
+ # print(f'Description reward: {description_reward}; answer reward: {answer_reward}')
205
+ # print('-' * 10)
206
+ # reward = description_pendat_reward + answer_pedant_reward
207
+ if question_type == "free-form":
208
+ score = compute_rouge_score(gt_ans, output_ans)
209
+ description_score = compute_rouge_score(gt_ans, description_extraction)
210
+ reward = max(0.0, min(1.0, score)) + max(0.0, min(1.0, description_score))
211
+ elif question_type == "regression":
212
+ gt_number = normalize_number(gt_ans)
213
+ out_number = normalize_number(output_ans)
214
+ description_number = normalize_number(description_extraction)
215
+ if gt_number is None or out_number is None:
216
+ reward = 0.0
217
+
218
+ if description_number is None:
219
+ description_reward = 0.0
220
+
221
+
222
+ rel_diff = (abs(out_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
223
+ rel_diff = min(1.0, max(0.0, rel_diff))
224
+
225
+ description_diff = (abs(description_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
226
+ description_diff = min(1.0, max(0.0, description_diff))
227
+
228
+ reward = (1 - rel_diff) + (1 - description_diff)
229
+ elif question_type == 'math' or question_type == 'unify' or question_type == 'multiple choice' or question_type == 'numerical':
230
+ description_reward = compute_math_score_single(description_extraction, gt_ans)
231
+ answer_reward = compute_math_score_single(output_ans, gt_ans)
232
+
233
+ if description_reward == 0 and answer_reward == 1:
234
+ # Right answer but unsupported description: likely reward hacking, so give only partial credit
235
+ reward = alpha
236
+ else:
237
+ reward = description_reward + answer_reward
238
+
239
+ # print(f"Extracted description: {description_extraction} | Generated answer: {output_ans} | Sol: {gt_ans}")
240
+ # print(f'Description reward: {description_reward} | answer reward: {answer_reward} | final reward: {reward}')
241
+ # print('-' * 10)
242
+ else:
243
+ print('Unknown question type; falling back to zero reward')
244
+ reward = 0.0
245
+ except Exception as e:
246
+ print(f"Error in reward_fn for question_type '{question_type}': {e}")
247
+ reward = 0.0
248
+
249
+ rewards.append(reward)
250
+
251
+ if os.getenv("DEBUG_MODE") == "true":
252
+ log_path = os.getenv("LOG_PATH")
253
+ # local_rank = int(os.getenv("LOCAL_RANK", 0))
254
+ with open(log_path, "a", encoding="utf-8") as f:
255
+ f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
256
+ f.write(f"Content: {content}\n")
257
+ f.write(f"Solution: {sol}\n")
258
+
259
+ return rewards
260
+
261
+
262
+ def simple_format_reward(completions, **kwargs):
263
+ """Reward function that checks if the completion has a specific format."""
264
+ # pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
265
+ pattern = r"<des>.*?</des>\s*<think>.*?</think>\s*<answer>.*?</answer>"
266
+ completion_contents = [completion[0]["content"] for completion in completions]
267
+ matches = [re.fullmatch(pattern, content, re.DOTALL) for content in completion_contents]
268
+ return [0.1 if match else 0.0 for match in matches]
269
+
270
+
271
+ reward_funcs_registry = {
272
+ "accuracy": accuracy_reward,
273
+ "format": simple_format_reward,
274
+ }
275
+
276
+
277
+ SYSTEM_PROMPT = (
278
+ "A conversation between User and Assistant. After the user asks a question about an image, write a rich, self-contained description of that image—detailed enough that someone could answer the question from the description alone, without ever seeing the image. Enclose the entire description in <des> </des> tags."
279
+ "Next, the assistant should think deeply about the reasoning process, engaging in an internal dialogue and self-reflection, "
280
+ "and provide this step-by-step reasoning within <think> </think> tags. "
281
+ "Finally, the assistant provides a single word, single letter choice, or phrase answer within <answer> </answer> tags."
282
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>."
283
+ )
284
+
285
+
286
+ def main(script_args, training_args, model_args):
287
+ print('Start program..')
288
+ # Get reward functions
289
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
290
+
291
+
292
+ print('Loading dataset')
293
+ if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
294
+ dataset = DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
295
+ else:
296
+ # Load the dataset
297
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
298
+
299
+
300
+ # Format into conversation
301
+ def make_conversation(example):
302
+ return {
303
+ "prompt": [
304
+ {"role": "system", "content": SYSTEM_PROMPT},
305
+ {"role": "user", "content": example["problem"]},
306
+ ],
307
+ }
308
+
309
+
310
+ QUESTION_TEMPLATE = (
311
+ "{Question}\n"
312
+ "You are tasked with analyzing an image to generate an exhaustive and detailed description to answer a question. "
313
+ "Analyze the image and produce a thorough, self-contained description—detailed enough for someone to answer the question using the description alone. Wrap the entire description in <des> </des> tags.\n"
314
+ "Next, engage in an internal dialogue as if you were a human pondering deeply—use expressions such as 'let me think', 'wait', 'hmm', 'oh, I see', 'let's break it down', etc., and include self-reflection or verification in your reasoning process. "
315
+ "Provide your detailed, step-by-step reasoning based on the image description, and enclose this part within <think> </think> tags.\n"
316
+ "Finally, provide a single word or phrase answer to the question, enclosed within <answer> </answer> tags.\n"
317
+ "The output format should be: <des> image description here </des> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>"
318
+ )
319
+
320
+
321
+ TYPE_TEMPLATE = {
322
+ "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
323
+ "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
324
+ "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
325
+ "free-form": " Please provide your text answer within the <answer> </answer> tags.",
326
+ "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
327
+ "math": " Please provide the final exact answer (single option letter for multiple choice) within the <answer> </answer> tags.",
328
+ }
329
+
330
+ ABS_Verify_Prompt = '''You are provided a text description of a problem and a question. Determine the answer to the question based on the text description. First provide a step-by-step reasoning within <think> </think> tags, then provide your answer as a single final answer, single letter choice, or a short phrase ENCLOSED with <answer> </answer> tags. \nText description: {{Description}}\nQuestion: {Question}'''
331
+
332
+ def make_conversation_image(example):
333
+
334
+ return {
335
+ "prompt": [
336
+ {
337
+ "role": "user",
338
+ "content": [
339
+ {"type": "image"},
340
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
341
+ ],
342
+ },
343
+ ],
344
+ }
345
+
346
+
347
+ def make_conversation_video(example):
348
+ return {
349
+ "prompt": [
350
+ {
351
+ "role": "user",
352
+ "content": [
353
+ {"type": "video"},
354
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
355
+ ],
356
+ },
357
+ ],
358
+ }
359
+
360
+ def make_conversation_image_and_video(example):
361
+ if example["problem_type"] == 'multiple choice':
362
+ question = example['problem'] + "Options:\n"
363
+ for op in example["options"]:
364
+ question += op + "\n"
365
+ else:
366
+ question = example['problem']
367
+
368
+
369
+ msg ={
370
+ "prompt":
371
+ [{
372
+ "role": "user",
373
+ "content": [
374
+ {
375
+ "type": example['data_type'],
376
+ # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
377
+ },
378
+ {
379
+ "type": "text",
380
+ "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[example['problem_type']]
381
+ }
382
+ ]
383
+ }]
384
+ }
385
+
386
+ return msg
387
+
388
+ def make_verify_conversation(example):
389
+ # ➊ build the question text
390
+ question = example["problem"]
391
+ if example["problem_type"] == "multiple choice":
392
+ question += "Options:\n" + "\n".join(example["options"])
393
+
394
+ # ➋ verification template + suffix (no if/else)
395
+ verify_text = (
396
+ ABS_Verify_Prompt.format(Question=question.replace("<image>", ""))
397
+ + TYPE_TEMPLATE[example["problem_type"]] # ← one-liner, no branching
398
+ )
399
+
400
+ # ➌ conversation dict
401
+ conv_dict = {
402
+ "prompt": [
403
+ {
404
+ "role": "user",
405
+ "content": [{"type": "text", "text": verify_text}],
406
+ }
407
+ ]
408
+ }
409
+
410
+ # templated = maybe_apply_chat_template(conv_dict, processing_class)["prompt"]
411
+ # return {"verify_prompt": templated}
412
+ return {"verify_prompt": conv_dict}
413
+
414
+
415
+
416
+
417
+ print('Start mapping dataset')
418
+ dataset = dataset.map(make_conversation_image_and_video)
419
+ dataset = dataset.map(
420
+ make_verify_conversation,
421
+ desc="add description verify prompt",
422
+ )
423
+
424
+ trainer_cls = Qwen2VLGRPOTrainer if not training_args.use_vllm else Qwen2VLGRPOVLLMTrainerSelfConst
425
+ print("using: ", trainer_cls)
426
+
427
+ # Initialize the GRPO trainer
428
+ trainer = trainer_cls(
429
+ model=model_args.model_name_or_path,
430
+ reward_funcs=reward_funcs,
431
+ args=training_args,
432
+ script_args=script_args,
433
+ train_dataset=dataset[script_args.dataset_train_split],
434
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
435
+ peft_config=get_peft_config(model_args),
436
+ attn_implementation=model_args.attn_implementation,
437
+ max_pixels=script_args.max_pixels,
438
+ min_pixels=script_args.min_pixels,
439
+ )
440
+
441
+ if training_args.resume_from_checkpoint is not None:
442
+ checkpoint = training_args.resume_from_checkpoint
443
+ trainer.train(resume_from_checkpoint=checkpoint)
444
+ else:
445
+ trainer.train()
446
+
447
+ # Save and push to hub
448
+ trainer.save_model(training_args.output_dir)
449
+ if training_args.push_to_hub:
450
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
451
+
452
+
453
+ if __name__ == "__main__":
454
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
455
+ script_args, training_args, model_args = parser.parse_args_and_config()
456
+ main(script_args, training_args, model_args)
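This variant builds its verify prompt with doubled braces: the first `str.format(Question=...)` call bakes the question into `ABS_Verify_Prompt` while leaving a literal `{Description}` slot, which is assumed to be filled later inside the trainer with the model's generated <des> text. A short sketch of that two-stage fill (template abbreviated, function names illustrative):

```python
# Two-stage template fill: "{{Description}}" survives the first .format() call as "{Description}".
TEMPLATE = "Text description: {{Description}}\nQuestion: {Question}"

def build_verify_prompt(question: str) -> str:
    # stage 1: insert the question, keep the description placeholder for later
    return TEMPLATE.format(Question=question)

def fill_description(prompt: str, description: str) -> str:
    # stage 2 (assumed to happen inside the GRPO trainer after generation)
    return prompt.format(Description=description)

stage1 = build_verify_prompt("How many red cubes are visible?")
print(stage1)  # still contains the literal "{Description}" slot
print(fill_description(stage1, "Three red cubes sit on a grey table."))
```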
src/r1-v/src/open_r1/grpo-cot.py ADDED
@@ -0,0 +1,351 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+ from datetime import datetime
18
+ from dataclasses import dataclass, field
19
+ from typing import Optional
20
+
21
+ from datasets import load_dataset, load_from_disk
22
+ from transformers import Qwen2VLForConditionalGeneration
23
+
24
+ from trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerModified
25
+ from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
26
+
27
+ from datasets import Dataset, DatasetDict
28
+
29
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
30
+ from rouge_score import rouge_scorer
31
+ from utils.math_cot import *
32
+
33
+
34
+ @dataclass
35
+ class GRPOScriptArguments(ScriptArguments):
36
+ """
37
+ Script arguments for the GRPO training script.
38
+
39
+ Args:
40
+ reward_funcs (`list[str]`):
41
+ List of reward functions. Possible values: 'accuracy', 'format'.
42
+ """
43
+
44
+ # reward_funcs: list[str] = field(
45
+ # default_factory=lambda: ["accuracy", "format"],
46
+ # metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
47
+ # )
48
+
49
+ reward_funcs: list[str] = field(
50
+ default_factory=lambda: ["accuracy"],
51
+ metadata={"help": "List of reward functions. Possible values: 'accuracy'"},
52
+ )
53
+ max_pixels: Optional[int] = field(
54
+ default=12845056,
55
+ metadata={"help": "Maximum number of pixels for the image"},
56
+ )
57
+ min_pixels: Optional[int] = field(
58
+ default=3136,
59
+ metadata={"help": "Minimum number of pixels for the image"},
60
+ )
61
+ temporal: Optional[bool] = field(
62
+ default=True,
63
+ metadata={"help": "whether using temporal GRPO"},
64
+ )
65
+ len_control: Optional[bool] = field(
66
+ default=True,
67
+ metadata={"help": "whether using length reward"},
68
+ )
69
+
70
+
71
+
72
+ def accuracy_reward(completions, solution, **kwargs):
73
+
74
+ def extract_answer(text):
75
+ pattern = r'<answer>\s*(.*?)\s*</answer>'
76
+ match = re.search(pattern, text, re.DOTALL)
77
+ if match:
78
+ return match.group(1).strip()
79
+ return ""
80
+
81
+ def normalize_number(num_str):
82
+ try:
83
+ num_str = num_str.replace(',', '')
84
+ return float(num_str)
85
+ except Exception as e:
86
+ print(f"Error converting '{num_str}' to float: {e}")
87
+ return None
88
+
89
+ def wer(reference, hypothesis):
90
+ ref_words = reference.split()
91
+ hyp_words = hypothesis.split()
92
+ m = len(ref_words)
93
+ n = len(hyp_words)
94
+ d = [[0]*(n+1) for _ in range(m+1)]
95
+ for i in range(m+1):
96
+ d[i][0] = i
97
+ for j in range(n+1):
98
+ d[0][j] = j
99
+ for i in range(1, m+1):
100
+ for j in range(1, n+1):
101
+ if ref_words[i-1] == hyp_words[j-1]:
102
+ d[i][j] = d[i-1][j-1]
103
+ else:
104
+ d[i][j] = 1 + min(d[i-1][j], d[i][j-1], d[i-1][j-1])
105
+ return d[m][n] / max(1, m)
106
+
107
+
108
+ def compute_rouge_score(reference, hypothesis, use_stemmer=True):
109
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stemmer)
110
+ scores = scorer.score(reference, hypothesis)
111
+ average_fmeasure = (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3
112
+ return average_fmeasure
113
+
114
+
115
+ question_type = kwargs['problem_type'][0]
116
+
117
+ contents = [completion[0]["content"] for completion in completions]
118
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
119
+ rewards = []
120
+
121
+ for content, sol in zip(contents, solution):
122
+
123
+ try:
124
+ output_ans = extract_answer(content)
125
+ gt_ans = extract_answer(sol)
126
+ if question_type == "multiple choice":
127
+ reward = 1.0 if output_ans.strip() == gt_ans.strip() else 0.0
128
+ elif question_type == "numerical":
129
+ gt_has_decimal = ("." in gt_ans) or ("," in gt_ans)
130
+ out_has_decimal = ("." in output_ans) or ("," in output_ans)
131
+ if gt_has_decimal != out_has_decimal:
132
+ reward = 0.0
133
+ else:
134
+ gt_number = normalize_number(gt_ans)
135
+ out_number = normalize_number(output_ans)
136
+ if gt_number is None or out_number is None:
137
+ reward = 0.0
138
+ else:
139
+ reward = 1.0 if round(gt_number, 2) == round(out_number, 2) else 0.0
140
+ elif question_type == "OCR":
141
+ error_rate = wer(gt_ans, output_ans)
142
+ reward = 1 - error_rate
143
+ reward = max(0.0, min(1.0, reward))
144
+ elif question_type == "free-form":
145
+ score = compute_rouge_score(gt_ans, output_ans)
146
+ reward = max(0.0, min(1.0, score))
147
+ elif question_type == "regression":
148
+ gt_number = normalize_number(gt_ans)
149
+ out_number = normalize_number(output_ans)
150
+ if gt_number is None or out_number is None:
151
+ reward = 0.0
152
+ rel_diff = (abs(out_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
153
+ rel_diff = min(1.0, max(0.0, rel_diff))
154
+ reward = 1 - rel_diff
155
+ elif question_type == 'math':
156
+ reward = compute_math_score_single(content, gt_ans)
157
+ else:
158
+ print('Unknown question type; falling back to zero reward')
159
+ reward = 0.0
160
+ except Exception as e:
161
+ print(f"Error in reward_fn for question_type '{question_type}': {e}")
162
+ reward = 0.0
163
+
164
+ rewards.append(reward)
165
+
166
+ if os.getenv("DEBUG_MODE") == "true":
167
+ log_path = os.getenv("LOG_PATH")
168
+ # local_rank = int(os.getenv("LOCAL_RANK", 0))
169
+ with open(log_path, "a", encoding="utf-8") as f:
170
+ f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
171
+ f.write(f"Content: {content}\n")
172
+ f.write(f"Solution: {sol}\n")
173
+
174
+ return rewards
175
+
176
+
177
+ def format_reward(completions, **kwargs):
178
+ """Reward function that checks if the completion has a specific format."""
179
+ pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
180
+ completion_contents = [completion[0]["content"] for completion in completions]
181
+ matches = [re.fullmatch(pattern, content, re.DOTALL) for content in completion_contents]
182
+ return [1.0 if match else 0.0 for match in matches]
183
+
184
+
185
+ reward_funcs_registry = {
186
+ "accuracy": accuracy_reward,
187
+ # "format": 0,
188
+ }
189
+
190
+ # SYSTEM_PROMPT = (
191
+ # "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
192
+ # "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
193
+ # "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
194
+ # "<think> reasoning process here </think><answer> answer here </answer>"
195
+ # )
196
+
197
+ SYSTEM_PROMPT = (
198
+ "A conversation between User and Assistant. The user provides a question about an image, "
199
+ "and the Assistant is tasked with generating an exhaustive and detailed description of the image. "
200
+ "The assistant should extract and describe all possible information from the image—including objects, numbers, text, and their relationships—"
201
+ "and enclose this description within <info> </info> tags. "
202
+ "Next, the assistant should think deeply about the reasoning process, engaging in an internal dialogue and self-reflection, "
203
+ "and provide this step-by-step reasoning within <think> </think> tags. "
204
+ "Finally, the assistant provides a single word or phrase answer within <answer> </answer> tags. "
205
+ "The output format should be: <info> image description here </info> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>."
206
+ )
207
+
208
+
209
+ def main(script_args, training_args, model_args):
210
+ # Get reward functions
211
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
212
+
213
+ if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
214
+ dataset = DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
215
+ else:
216
+ # Load the dataset
217
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
218
+
219
+
220
+ # Format into conversation
221
+ def make_conversation(example):
222
+ return {
223
+ "prompt": [
224
+ {"role": "system", "content": SYSTEM_PROMPT},
225
+ {"role": "user", "content": example["problem"]},
226
+ ],
227
+ }
228
+
229
+
230
+ # QUESTION_TEMPLATE = (
231
+ # "{Question}\n"
232
+ # "Please think about this question as if you were a human pondering deeply. "
233
+ # "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc, or other natural language thought expressions "
234
+ # "It's encouraged to include self-reflection or verification in the reasoning process. "
235
+ # "Provide your detailed reasoning between the <think> </think> tags, and then give your final answer between the <answer> </answer> tags."
236
+ # )
237
+
238
+ QUESTION_TEMPLATE = (
239
+ "{Question}\n"
240
+ "You are tasked with analyzing an image to generate an exhaustive and detailed description. "
241
+ "Your goal is to extract and describe all possible information from the image, including but not limited to objects, numbers, text, and the relationships between these elements. "
242
+ "The description should be as fine and detailed as possible, capturing every nuance, and should be enclosed within <info> </info> tags.\n"
243
+ "Next, engage in an internal dialogue as if you were a human pondering deeply—use expressions such as 'let me think', 'wait', 'hmm', 'oh, I see', 'let's break it down', etc., and include self-reflection or verification in your reasoning process. "
244
+ "Provide your detailed, step-by-step reasoning based on the image description, and enclose this part within <think> </think> tags.\n"
245
+ "Finally, provide a single word or phrase answer to the question, enclosed within <answer> </answer> tags.\n"
246
+ "The output format should be: <info> image description here </info> <think> reasoning process here </think> <answer> FINAL ANSWER here </answer>"
247
+ )
248
+
249
+
250
+ TYPE_TEMPLATE = {
251
+ "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
252
+ "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
253
+ "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
254
+ "free-form": " Please provide your text answer within the <answer> </answer> tags.",
255
+ "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
256
+ "math": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
257
+ }
258
+
259
+ def make_conversation_image(example):
260
+
261
+ return {
262
+ "prompt": [
263
+ {
264
+ "role": "user",
265
+ "content": [
266
+ {"type": "image"},
267
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
268
+ ],
269
+ },
270
+ ],
271
+ }
272
+
273
+
274
+ def make_conversation_video(example):
275
+ return {
276
+ "prompt": [
277
+ {
278
+ "role": "user",
279
+ "content": [
280
+ {"type": "video"},
281
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
282
+ ],
283
+ },
284
+ ],
285
+ }
286
+
287
+ def make_conversation_image_and_video(example):
288
+ if example["problem_type"] == 'multiple choice':
289
+ question = example['problem'] + "Options:\n"
290
+ for op in example["options"]:
291
+ question += op + "\n"
292
+ else:
293
+ question = example['problem']
294
+
295
+
296
+ msg ={
297
+ "prompt":
298
+ [{
299
+ "role": "user",
300
+ "content": [
301
+ {
302
+ "type": example['data_type'],
303
+ # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
304
+ },
305
+ {
306
+ "type": "text",
307
+ "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[example['problem_type']]
308
+ }
309
+ ]
310
+ }]
311
+ }
312
+
313
+ return msg
314
+
315
+
316
+ dataset = dataset.map(make_conversation_image_and_video)
317
+
318
+
319
+ trainer_cls = Qwen2VLGRPOTrainer if not training_args.use_vllm else Qwen2VLGRPOVLLMTrainerModified
320
+ print("using: ", trainer_cls)
321
+
322
+ # Initialize the GRPO trainer
323
+ trainer = trainer_cls(
324
+ model=model_args.model_name_or_path,
325
+ reward_funcs=reward_funcs,
326
+ args=training_args,
327
+ script_args=script_args,
328
+ train_dataset=dataset[script_args.dataset_train_split],
329
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
330
+ peft_config=get_peft_config(model_args),
331
+ attn_implementation=model_args.attn_implementation,
332
+ max_pixels=script_args.max_pixels,
333
+ min_pixels=script_args.min_pixels,
334
+ )
335
+
336
+ if training_args.resume_from_checkpoint is not None:
337
+ checkpoint = training_args.resume_from_checkpoint
338
+ trainer.train(resume_from_checkpoint=checkpoint)
339
+ else:
340
+ trainer.train()
341
+
342
+ # Save and push to hub
343
+ trainer.save_model(training_args.output_dir)
344
+ if training_args.push_to_hub:
345
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
346
+
347
+
348
+ if __name__ == "__main__":
349
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
350
+ script_args, training_args, model_args = parser.parse_args_and_config()
351
+ main(script_args, training_args, model_args)
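The OCR branch in `grpo-cot.py` rewards transcriptions with `1 - WER`, where the word error rate comes from the Levenshtein-style dynamic program in the `wer` helper, clipped to the [0, 1] range. A standalone sketch of that reward (the `ocr_reward` name is illustrative; the helper mirrors the logic above):

```python
# Self-contained sketch of the OCR reward: 1 - word error rate, clipped to [0, 1].
def wer(reference: str, hypothesis: str) -> float:
    ref, hyp = reference.split(), hypothesis.split()
    # edit-distance DP over words (insertions, deletions, substitutions all cost 1)
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost)
    return d[len(ref)][len(hyp)] / max(1, len(ref))

def ocr_reward(ground_truth: str, prediction: str) -> float:
    return max(0.0, min(1.0, 1.0 - wer(ground_truth, prediction)))

print(ocr_reward("stop sign ahead", "stop sign ahead"))  # 1.0
print(ocr_reward("stop sign ahead", "stop sign"))        # ~0.67 (one deleted word out of three)
```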
src/r1-v/src/open_r1/grpo-description-LLMEval.py ADDED
@@ -0,0 +1,579 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+ from datetime import datetime
18
+ from dataclasses import dataclass, field
19
+
20
+ from datasets import load_dataset, load_from_disk
21
+ from transformers import Qwen2VLForConditionalGeneration
22
+ from openai import OpenAI
23
+ from trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerModifiedOrig
24
+ from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
25
+
26
+ from datasets import Dataset, DatasetDict
27
+
28
+ from typing import Dict, List, Optional
29
+ from mathruler.grader import extract_boxed_content, grade_answer
30
+
31
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
32
+ from rouge_score import rouge_scorer
33
+ # from utils.gpt_eval import infer
34
+ # from utils.math_cot import *
35
+ # from qa_metrics.pedant import PEDANT
36
+ # from qa_metrics.answerBERT import AnswerBertActor
37
+
38
+ # pedant = PEDANT()
39
+ # answerBERT = AnswerBertActor(device='cuda:7')
40
+
41
+ alpha = 1.0
42
+
43
+ TYPE_TEMPLATE = {
44
+ "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) in \\boxed{}.",
45
+ "numerical": " Please provide the numerical value (e.g., 42 or 3.14) in \\boxed{}.",
46
+ "OCR": " Please transcribe text from the image/video clearly and provide your text answer in \\boxed{}.",
47
+ "free-form": " Please provide your text answer in \\boxed{}.",
48
+ "regression": " Please provide the numerical value (e.g., 42 or 3.14) in \\boxed{}.",
49
+ "math": " Please provide the final exact answer (single option letter for multiple choice) in \\boxed{}.",
50
+ }
51
+
52
+ '''
53
+ gpt infer
54
+ '''
55
+ import os
56
+ from openai import AzureOpenAI
57
+ import time
58
+
59
+ import base64
60
+ from mimetypes import guess_type
61
+
62
+
63
+ def azure_gpt4(messages, model):
64
+ outputs = []
65
+ for message in messages:
66
+ input_prompt = [
67
+ { "role": "system", "content": "You are a helpful assistant." },
68
+ { "role": "user", "content": [
69
+ {
70
+ "type": "text",
71
+ "text": message["instruction"]
72
+ },
73
+ # {
74
+ # "type": "image_url",
75
+ # "image_url": {
76
+ # "url": message["image"]
77
+ # }
78
+ # }
79
+ ]}
80
+ ]
81
+ ## try N times if API exceed limit ...
82
+ for i in range(10):
83
+ try:
84
+ output = client.chat.completions.create(
85
+ model=model, messages=input_prompt, max_tokens=2000
86
+ )
87
+
88
+ output_text = output.choices[0].message.content
89
+ break ## exit if successful
90
+
91
+ except Exception as e:
92
+ print(f'Index {i} got error message: {e}')
93
+ output_text = ''
94
+ time.sleep(3)
95
+
96
+ outputs.append(output_text)
97
+
98
+ return outputs
99
+
100
+
101
+ client = AzureOpenAI(
102
+ api_key = "83f30a2a22324395b854bd343db38d85",
103
+ api_version = "2024-08-01-preview",
104
+ azure_endpoint = "https://francecentral.api.cognitive.microsoft.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview"
105
+ )
106
+
107
+ model = "gpt-4o"
108
+ prompt_template = '''Text description: {text}\nQuestion: {question}\nYou are provided a text description of a problem and a question. Determine the answer to the question based on the text description. First provide an internal step-by-step reasoning within <think> </think> tags, then provide a single word or phrase answer in \\boxed{}.'''
109
+
110
+
111
+ # client = OpenAI(
112
+ # base_url="http://29.81.244.54:8080/v1", # your vLLM server
113
+ # api_key="ANYKEY", # if you set --api-key when launching
114
+ # )
115
+
116
+ client = OpenAI(
117
+ base_url="http://29.81.224.188:8080/v1", # your vLLM server
118
+ api_key="ANYKEY", # if you set --api-key when launching
119
+ )
120
+
121
+ def chat_batch(
122
+ client,
123
+ all_message_batches: List[List[Dict[str, str]]],
124
+ *,
125
+ # model: str = "Qwen2.5-32B-Instruct",
126
+ model: str = "Qwen2.5-32B-finetune",
127
+ max_workers: int = 8,
128
+ retries: int = 2,
129
+ backoff: float = 0.5,
130
+ timeout: Optional[float] = None,
131
+ ) -> List[str]:
132
+ """
133
+ Send many chat requests in parallel and return replies as a list of strings,
134
+ preserving the order of `all_message_batches`.
135
+ """
136
+
137
+ def _chat_once_with_retry(messages: List[Dict[str, str]]) -> str:
138
+ last_err: Optional[BaseException] = None
139
+ for attempt in range(retries + 1):
140
+ try:
141
+ resp = client.chat.completions.create(
142
+ model=model,
143
+ messages=messages,
144
+ timeout=timeout,
145
+ )
146
+ # Different SDKs expose content slightly differently; handle common cases.
147
+ choice = resp.choices[0]
148
+ if hasattr(choice, "message") and getattr(choice.message, "content", None) is not None:
149
+ return choice.message.content
150
+ if hasattr(choice, "text") and choice.text is not None:
151
+ return choice.text
152
+ # Fallback to stringifying the choice if structure is unexpected.
153
+ return str(choice)
154
+ except Exception as e:
155
+ last_err = e
156
+ if attempt < retries:
157
+ sleep(backoff * (2 ** attempt))
158
+ return f"Error: {last_err!r}"
159
+
160
+ results: List[Optional[str]] = [None] * len(all_message_batches)
161
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
162
+ future_to_idx = {
163
+ executor.submit(_chat_once_with_retry, batch): i
164
+ for i, batch in enumerate(all_message_batches)
165
+ }
166
+ for fut in as_completed(future_to_idx):
167
+ i = future_to_idx[fut]
168
+ results[i] = fut.result()
169
+
170
+ # mypy-friendly cast: no Nones remain at this point
171
+ return [r if r is not None else "Error: Unknown failure" for r in results]
172
+
173
+
174
+ def infer(prompt):
175
+ # prompt_question = prompt_question.replace('<image>', '')
176
+ # prompt = prompt_template.replace('{text}', text).replace('{question}', prompt_question)
177
+
178
+ messages = [
179
+ {"instruction": prompt},
180
+ ]
181
+ prompt_success = False
182
+ prompt_time = 0
183
+ outputs = ['\\boxed{None}']
184
+ while prompt_success == False and prompt_time <= 2:
185
+ try:
186
+ outputs = azure_gpt4(messages, model)
187
+ prompt_success = True
188
+ except:
189
+ prompt_time += 1
190
+ time.sleep(5)
191
+
192
+ return outputs[0]
193
+
194
+ '''
195
+ end of gpt infer
196
+ '''
197
+
198
+
199
+ from concurrent.futures import ThreadPoolExecutor, as_completed
200
+
201
+ def _call_infer(desc):
202
+ return infer(desc)
203
+
204
+ @dataclass
205
+ class GRPOScriptArguments(ScriptArguments):
206
+ """
207
+ Script arguments for the GRPO training script.
208
+
209
+ Args:
210
+ reward_funcs (`list[str]`):
211
+ List of reward functions. Possible values: 'accuracy', 'format'.
212
+ """
213
+
214
+ reward_funcs: list[str] = field(
215
+ default_factory=lambda: ["accuracy", "format"],
216
+ metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
217
+ )
218
+
219
+ # reward_funcs: list[str] = field(
220
+ # default_factory=lambda: ["accuracy"],
221
+ # metadata={"help": "List of reward functions. Possible values: 'accuracy'"},
222
+ # )
223
+ max_pixels: Optional[int] = field(
224
+ default=12845056,
225
+ metadata={"help": "Maximum number of pixels for the image"},
226
+ )
227
+ min_pixels: Optional[int] = field(
228
+ default=3136,
229
+ metadata={"help": "Minimum number of pixels for the image"},
230
+ )
231
+ temporal: Optional[bool] = field(
232
+ default=True,
233
+ metadata={"help": "whether using temporal GRPO"},
234
+ )
235
+ len_control: Optional[bool] = field(
236
+ default=True,
237
+ metadata={"help": "whether using length reward"},
238
+ )
239
+
240
+
241
+
242
+ def accuracy_reward(completions, solution, **kwargs):
243
+ def extract_answer(text: str) -> str:
244
+ """
245
+ 1) Try the full <answer> … </answer> block.
246
+ 2) If that is missing, grab whatever follows the opening <answer> tag.
247
+ 3) Otherwise return the original text.
248
+ """
249
+ # ① normal case <answer> … </answer>
250
+ m = re.search(r'<answer>\s*(.*?)\s*</answer>', text, flags=re.DOTALL | re.IGNORECASE)
251
+ if m:
252
+ return m.group(1).strip()
253
+
254
+ # ② fallback <answer> … <end-of-string>
255
+ m = re.search(r'<answer>\s*(.*)$', text, flags=re.DOTALL | re.IGNORECASE)
256
+ if m:
257
+ return m.group(1).strip()
258
+
259
+ # ③ nothing found
260
+ return text.strip()
261
+
262
+ def extract_description(predict: str) -> Optional[str]:
263
+ """
264
+ Extracts the content of the <des>…</des> block from `predict`.
265
+ Returns the inner text (with leading/trailing whitespace stripped),
266
+ or the full input text if no <des> tag is found.
267
+ """
268
+ match = re.search(r"<des>([\s\S]*?)</des>", predict, re.DOTALL)
269
+ if not match:
270
+ return predict
271
+ return match.group(1).strip()
272
+
273
+ def single_accuracy_reward(predict: str, ground_truth: str) -> float:
274
+ answer = predict
275
+ return 1.0 if grade_answer(answer, ground_truth) else 0.0
276
+
277
+ def compute_math_score_single(predict: str, ground_truth: str, format_weight: float = 0.0) -> Dict[str, float]:
278
+ predict = re.sub(r"\s*(<|>|/)\s*", r"\1", predict)
279
+ accuracy_score = single_accuracy_reward(predict, ground_truth)
280
+ # return (1 - format_weight) * accuracy_score + format_weight * format_score
281
+ return accuracy_score
282
+
283
+ def normalize_number(num_str):
284
+ try:
285
+ num_str = num_str.replace(',', '')
286
+ return float(num_str)
287
+ except Exception as e:
288
+ print(f"Error converting '{num_str}' to float: {e}")
289
+ return None
290
+
291
+ def wer(reference, hypothesis):
292
+ ref_words = reference.split()
293
+ hyp_words = hypothesis.split()
294
+ m = len(ref_words)
295
+ n = len(hyp_words)
296
+ d = [[0]*(n+1) for _ in range(m+1)]
297
+ for i in range(m+1):
298
+ d[i][0] = i
299
+ for j in range(n+1):
300
+ d[0][j] = j
301
+ for i in range(1, m+1):
302
+ for j in range(1, n+1):
303
+ if ref_words[i-1] == hyp_words[j-1]:
304
+ d[i][j] = d[i-1][j-1]
305
+ else:
306
+ d[i][j] = 1 + min(d[i-1][j], d[i][j-1], d[i-1][j-1])
307
+ return d[m][n] / max(1, m)
308
+
309
+
310
+ def compute_rouge_score(reference, hypothesis, use_stemmer=True):
311
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stemmer)
312
+ scores = scorer.score(reference, hypothesis)
313
+ average_fmeasure = (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3
314
+ return average_fmeasure
315
+
316
+ # print('Computing rewards now...')
317
+ # second_prompts = kwargs.get("second_prompts") # ← list[str] or None
318
+ # second_completions = kwargs.get("second_completions")
319
+ # second_contents = [comp[0]["content"] for comp in second_completions]
320
+ # print('second prompts', second_prompts)
321
+ # print('-'*10)
322
+ # print('second completions', second_completions)
323
+ # print('-'*10)
324
+
325
+ # import time
326
+ # time.sleep(30)
327
+ question_type = kwargs['problem_type'][0]
328
+ questions = kwargs['problem']
329
+
330
+ contents = [completion[0]["content"] for completion in completions]
331
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
332
+ rewards = []
333
+
334
+ extracted_content_descriptions = [extract_description(ele) for ele in contents]
335
+
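+ # Flow of the judge-based description reward (the helpers `prompt_template` and
+ # `TYPE_TEMPLATE` are assumed to be defined earlier in this script): for every rollout,
+ # the extracted description (the <des>...</des> block, or the whole completion if the
+ # tag is missing) is spliced into the judge prompt together with the question, the
+ # whole batch is sent to the vLLM server via `chat_batch`, and the judge's \boxed{...}
+ # answer is graded against the ground-truth answer.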
336
+ description_query_inputs = []
337
+ batch_messages = []
338
+ vllm_batch_messages = []
339
+
340
+ for index in range(len(extracted_content_descriptions)):
341
+ prompt_question = questions[index]
342
+ des_text = extracted_content_descriptions[index]
343
+ prompt_question = prompt_question.replace('<image>', '')
344
+ prompt_input = prompt_template.replace('{text}', des_text).replace('{question}', prompt_question) + TYPE_TEMPLATE[question_type]
345
+ description_query_inputs.append(prompt_input)
346
+ curr_msg = [
347
+ {"role": "system", "content": "You are a helpful assistant."},
348
+ {"role": "user", "content": prompt_input}
349
+ ]
350
+ vllm_batch_messages.append(curr_msg)
351
+
352
+
353
+ batched_vllm_outputs = chat_batch(client, vllm_batch_messages)
354
+
355
+ description_score_outputs = [extract_boxed_content(idx_input) for idx_input in batched_vllm_outputs]
356
+ # with ThreadPoolExecutor(max_workers=8) as executor:
357
+ # futures = [
358
+ # executor.submit(_call_infer, desc)
359
+ # for desc in description_query_inputs
360
+ # ]
361
+ # # collect as they finish (optional—keeps order of completion)
362
+ # for fut in as_completed(futures):
363
+ # # description_score_outputs.append(extract_answer(fut.result()))
364
+ # # extract_boxed_content
365
+ # description_score_outputs.append(extract_boxed_content(fut.result()))
366
+
367
+
368
+ gt_answers = [extract_answer(sol) for sol in solution]
369
+ description_rewards = [compute_math_score_single(description_score_outputs[count_idx], gt_answers[count_idx]) for count_idx in range(len(description_score_outputs))]
370
+
371
+ print(gt_answers)
372
+ print(description_score_outputs)
373
+ print(description_rewards)
374
+ print('-'*10)
375
+
376
+
377
+ for content, gt_ans, description_reward in zip(contents, gt_answers, description_rewards):
378
+ # for content, sol, question in zip(contents, solution, questions):
379
+ # for content, sol, second_content in zip(contents, solution, second_completions):
380
+ try:
381
+ # output_ans = extract_answer(content)
382
+ output_ans = extract_boxed_content(content)
383
+
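+ # Combine the judge-based description reward with the policy's own \boxed{} answer
+ # reward; if the description alone fails (0) while the final answer is correct (1),
+ # fall back to `alpha` (assumed to be defined earlier in this script) instead of the sum.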
384
+ if question_type != 'None':
385
+ answer_reward = compute_math_score_single(output_ans, gt_ans)
386
+ if description_reward == 0 and answer_reward == 1:
387
+ reward = alpha
388
+ else:
389
+ reward = description_reward + answer_reward
390
+ # reward = answer_reward
391
+ else:
392
+ print('Falling back to none rewards')
393
+ reward = 0.0
394
+ except Exception as e:
395
+ print(f"Error in reward_fn for question_type '{question_type}': {e}")
396
+ reward = 0.0
397
+
398
+ rewards.append(reward)
399
+
400
+ if os.getenv("DEBUG_MODE") == "true":
401
+ log_path = os.getenv("LOG_PATH")
402
+ # local_rank = int(os.getenv("LOCAL_RANK", 0))
403
+ with open(log_path, "a", encoding="utf-8") as f:
404
+ f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
405
+ f.write(f"Content: {content}\n")
406
+ f.write(f"Solution: {gt_ans}\n")
407
+
408
+ return rewards
409
+
410
+
411
+ def simple_format_reward(completions, **kwargs):
412
+ """Reward function that checks the same format as `format_reward`:
413
+ <description>...</description><think>...</think>\boxed{...}
414
+ """
415
+ pattern = re.compile(
416
+ r"^\s*<description>.*?</description>\s*"
417
+ r"<think>.*?</think>\s*"
418
+ r"\\boxed\{.*?\}\s*$",
419
+ re.DOTALL,
420
+ )
421
+ completion_contents = [completion[0]["content"] for completion in completions]
422
+ return [0.1 if pattern.fullmatch(content or "") else 0.0
423
+ for content in completion_contents]
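+ # Example (illustrative) of a completion that earns the 0.1 format reward:
+ # "<description>A red cube on a wooden table.</description> <think>Only one cube
+ # is visible, so the count is 1.</think> \boxed{1}"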
424
+
425
+
426
+ reward_funcs_registry = {
427
+ "accuracy": accuracy_reward,
428
+ "format": simple_format_reward,
429
+ }
430
+
431
+ # SYSTEM_PROMPT = (
432
+ # "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
433
+ # "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
434
+ # "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
435
+ # "<think> reasoning process here </think><answer> answer here </answer>"
436
+ # )
437
+
438
+ SYSTEM_PROMPT = (
439
+ "You are tasked with analyzing an image/video to generate a detailed description to help you answer the question. First analyze the image/video and produce a self-contained description—detailed enough that can lead to the correct answer. Wrap the entire description in <description> </description> tags.\n Next, engage in an internal dialogue and include self-reflection or verification in your reasoning process. Provide your detailed, step-by-step reasoning based on the image/video description information and image/video, and enclose this part within <think> </think> tags.\n Finally, provide a single word or phrase answer to the question in \boxed{}.\nThe output format should be: <description> image/video description here </description> <think> reasoning process here </think> \boxed{FINAL ANSWER here}."
440
+ )
441
+
442
+
443
+ def main(script_args, training_args, model_args):
444
+ # Get reward functions
445
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
446
+
447
+ if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
448
+ dataset = DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
449
+ else:
450
+ # Load the dataset
451
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
452
+
453
+
454
+ # Format into conversation
455
+ def make_conversation(example):
456
+ return {
457
+ "prompt": [
458
+ {"role": "system", "content": SYSTEM_PROMPT},
459
+ {"role": "user", "content": example["problem"]},
460
+ ],
461
+ }
462
+
463
+
464
+ # QUESTION_TEMPLATE = (
465
+ # "{Question}\n"
466
+ # "Please think about this question as if you were a human pondering deeply. "
467
+ # "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc, or other natural language thought expressions "
468
+ # "It's encouraged to include self-reflection or verification in the reasoning process. "
469
+ # "Provide your detailed reasoning between the <think> </think> tags, and then give your final answer between the <answer> </answer> tags."
470
+ # )
471
+
472
+ QUESTION_TEMPLATE = (
473
+ "{Question}\n"
474
+ "You are tasked with analyzing an image/video to generate a detailed description to help you answer the question. "
475
+ "First analyze the image/video and produce a self-contained description—detailed enough that can lead to the correct answer. "
476
+ "Wrap the entire description in <description> </description> tags.\n"
477
+ "Next, engage in an internal dialogue and include self-reflection or verification in your reasoning process. "
478
+ "Provide your detailed, step-by-step reasoning based on the image/video description information and image/video, and enclose this part within <think> </think> tags.\n"
479
+ "Finally, provide a single word or phrase answer to the question in \\boxed{{}}.\n"
480
+ "The output format should be: <description> image/video description here </description> "
481
+ "<think> reasoning process here </think> \\boxed{{FINAL ANSWER here}}."
482
+ )
483
+
484
+
485
+
486
+ def make_conversation_image(example):
487
+
488
+ return {
489
+ "prompt": [
490
+ {
491
+ "role": "user",
492
+ "content": [
493
+ {"type": "image"},
494
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
495
+ ],
496
+ },
497
+ ],
498
+ }
499
+
500
+
501
+ def make_conversation_video(example):
502
+ return {
503
+ "prompt": [
504
+ {
505
+ "role": "user",
506
+ "content": [
507
+ {"type": "video"},
508
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
509
+ ],
510
+ },
511
+ ],
512
+ }
513
+
514
+ def make_conversation_image_and_video(example):
515
+ if example["problem_type"] == 'multiple choice':
516
+ question = example['problem'] + "Options:\n"
517
+ for op in example["options"]:
518
+ question += op + "\n"
519
+ else:
520
+ question = example['problem']
521
+
522
+
523
+ msg ={
524
+ "prompt":
525
+ [{
526
+ "role": "user",
527
+ "content": [
528
+ {
529
+ "type": example['data_type'],
530
+ # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
531
+ },
532
+ {
533
+ "type": "text",
534
+ # "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[example['problem_type']]
535
+ "text": QUESTION_TEMPLATE.format(Question=question)
536
+ }
537
+ ]
538
+ }]
539
+ }
540
+
541
+ return msg
542
+
543
+
544
+ dataset = dataset.map(make_conversation_image_and_video)
545
+
546
+
547
+ trainer_cls = Qwen2VLGRPOTrainer if not training_args.use_vllm else Qwen2VLGRPOVLLMTrainerModifiedOrig
548
+ print("using: ", trainer_cls)
549
+
550
+ # Initialize the GRPO trainer
551
+ trainer = trainer_cls(
552
+ model=model_args.model_name_or_path,
553
+ reward_funcs=reward_funcs,
554
+ args=training_args,
555
+ script_args=script_args,
556
+ train_dataset=dataset[script_args.dataset_train_split],
557
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
558
+ peft_config=get_peft_config(model_args),
559
+ attn_implementation=model_args.attn_implementation,
560
+ max_pixels=script_args.max_pixels,
561
+ min_pixels=script_args.min_pixels,
562
+ )
563
+
564
+ if training_args.resume_from_checkpoint is not None:
565
+ checkpoint = training_args.resume_from_checkpoint
566
+ trainer.train(resume_from_checkpoint=checkpoint)
567
+ else:
568
+ trainer.train()
569
+
570
+ # Save and push to hub
571
+ trainer.save_model(training_args.output_dir)
572
+ if training_args.push_to_hub:
573
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
574
+
575
+
576
+ if __name__ == "__main__":
577
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
578
+ script_args, training_args, model_args = parser.parse_args_and_config()
579
+ main(script_args, training_args, model_args)
src/r1-v/src/open_r1/grpo.py ADDED
@@ -0,0 +1,318 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+ from datetime import datetime
18
+ from dataclasses import dataclass, field
19
+ from typing import Optional
20
+
21
+ from datasets import load_dataset, load_from_disk
22
+ from transformers import Qwen2VLForConditionalGeneration
23
+
24
+ from trainer import Qwen2VLGRPOTrainer, Qwen2VLGRPOVLLMTrainerModified
25
+ from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
26
+
27
+ from datasets import Dataset, DatasetDict
28
+
29
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
30
+ from rouge_score import rouge_scorer
31
+
32
+
33
+ @dataclass
34
+ class GRPOScriptArguments(ScriptArguments):
35
+ """
36
+ Script arguments for the GRPO training script.
37
+
38
+ Args:
39
+ reward_funcs (`list[str]`):
40
+ List of reward functions. Possible values: 'accuracy', 'format'.
41
+ """
42
+
43
+ reward_funcs: list[str] = field(
44
+ default_factory=lambda: ["accuracy", "format"],
45
+ metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
46
+ )
47
+ max_pixels: Optional[int] = field(
48
+ default=12845056,
49
+ metadata={"help": "Maximum number of pixels for the image"},
50
+ )
51
+ min_pixels: Optional[int] = field(
52
+ default=3136,
53
+ metadata={"help": "Minimum number of pixels for the image"},
54
+ )
55
+ temporal: Optional[bool] = field(
56
+ default=True,
57
+ metadata={"help": "whether using temporal GRPO"},
58
+ )
59
+ len_control: Optional[bool] = field(
60
+ default=True,
61
+ metadata={"help": "whether using length reward"},
62
+ )
63
+
64
+
65
+
66
+ def accuracy_reward(completions, solution, **kwargs):
67
+
68
+ def extract_answer(text):
69
+ pattern = r'<answer>\s*(.*?)\s*</answer>'
70
+ match = re.search(pattern, text, re.DOTALL)
71
+ if match:
72
+ return match.group(1).strip()
73
+ return ""
74
+
75
+ def normalize_number(num_str):
76
+ try:
77
+ num_str = num_str.replace(',', '')
78
+ return float(num_str)
79
+ except Exception as e:
80
+ print(f"Error converting '{num_str}' to float: {e}")
81
+ return None
82
+
83
+ def wer(reference, hypothesis):
84
+ ref_words = reference.split()
85
+ hyp_words = hypothesis.split()
86
+ m = len(ref_words)
87
+ n = len(hyp_words)
88
+ d = [[0]*(n+1) for _ in range(m+1)]
89
+ for i in range(m+1):
90
+ d[i][0] = i
91
+ for j in range(n+1):
92
+ d[0][j] = j
93
+ for i in range(1, m+1):
94
+ for j in range(1, n+1):
95
+ if ref_words[i-1] == hyp_words[j-1]:
96
+ d[i][j] = d[i-1][j-1]
97
+ else:
98
+ d[i][j] = 1 + min(d[i-1][j], d[i][j-1], d[i-1][j-1])
99
+ return d[m][n] / max(1, m)
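+ # Worked example: wer("the cat sat", "the cat sat down") = 1 edit / 3 reference
+ # words, i.e. about 0.33; lower is better.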
100
+
101
+
102
+ def compute_rouge_score(reference, hypothesis, use_stemmer=True):
103
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stemmer)
104
+ scores = scorer.score(reference, hypothesis)
105
+ average_fmeasure = (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3
106
+ return average_fmeasure
107
+
108
+
109
+ question_type = kwargs['problem_type'][0]
110
+
111
+ contents = [completion[0]["content"] for completion in completions]
112
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
113
+ rewards = []
114
+
115
+ for content, sol in zip(contents, solution):
116
+
117
+ try:
118
+ output_ans = extract_answer(content)
119
+ gt_ans = extract_answer(sol)
120
+ if question_type == "multiple choice":
121
+ reward = 1.0 if output_ans.strip() == gt_ans.strip() else 0.0
122
+ elif question_type == "numerical":
123
+ gt_has_decimal = ("." in gt_ans) or ("," in gt_ans)
124
+ out_has_decimal = ("." in output_ans) or ("," in output_ans)
125
+ if gt_has_decimal != out_has_decimal:
126
+ reward = 0.0
127
+ else:
128
+ gt_number = normalize_number(gt_ans)
129
+ out_number = normalize_number(output_ans)
130
+ if gt_number is None or out_number is None:
131
+ reward = 0.0
132
+ else:
133
+ reward = 1.0 if round(gt_number, 2) == round(out_number, 2) else 0.0
134
+ elif question_type == "OCR":
135
+ error_rate = wer(gt_ans, output_ans)
136
+ reward = 1 - error_rate
137
+ reward = max(0.0, min(1.0, reward))
138
+ elif question_type == "free-form":
139
+ score = compute_rouge_score(gt_ans, output_ans)
140
+ reward = max(0.0, min(1.0, score))
141
+ elif question_type == "regression":
142
+ gt_number = normalize_number(gt_ans)
143
+ out_number = normalize_number(output_ans)
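+ # Regression answers are scored by relative error: reward = 1 - |pred - gt| / |gt|,
+ # clipped to [0, 1]; e.g. ground truth 10.0 with prediction 9.0 gives roughly 0.9.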
144
+ if gt_number is None or out_number is None:
+ reward = 0.0
+ else:
+ rel_diff = (abs(out_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
+ rel_diff = min(1.0, max(0.0, rel_diff))
+ reward = 1 - rel_diff
149
+ else:
150
+ reward = 0.0
151
+ except Exception as e:
152
+ print(f"Error in reward_fn for question_type '{question_type}': {e}")
153
+ reward = 0.0
154
+
155
+ rewards.append(reward)
156
+
157
+ if os.getenv("DEBUG_MODE") == "true":
158
+ log_path = os.getenv("LOG_PATH")
159
+ # local_rank = int(os.getenv("LOCAL_RANK", 0))
160
+ with open(log_path, "a", encoding="utf-8") as f:
161
+ f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
162
+ f.write(f"Content: {content}\n")
163
+ f.write(f"Solution: {sol}\n")
164
+
165
+ return rewards
166
+
167
+
168
+ def format_reward(completions, **kwargs):
169
+ """Reward function that checks if the completion has a specific format."""
170
+ pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
171
+ completion_contents = [completion[0]["content"] for completion in completions]
172
+ matches = [re.fullmatch(pattern, content, re.DOTALL) for content in completion_contents]
173
+ return [0.1 if match else 0.0 for match in matches]
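+ # Example (illustrative): "<think>Let me check the options...</think> <answer>A</answer>"
+ # fully matches the pattern and earns 0.1; any other shape earns 0.0.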
174
+
175
+
176
+ reward_funcs_registry = {
177
+ "accuracy": accuracy_reward,
178
+ "format": format_reward,
179
+ }
180
+
181
+ SYSTEM_PROMPT = (
182
+ "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
183
+ "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
184
+ "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
185
+ "<think> reasoning process here </think><answer> answer here </answer>"
186
+ )
187
+
188
+
189
+ def main(script_args, training_args, model_args):
190
+ # Get reward functions
191
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
192
+
193
+ if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
194
+ dataset = DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
195
+ else:
196
+ # Load the dataset
197
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
198
+
199
+
200
+ # Format into conversation
201
+ def make_conversation(example):
202
+ return {
203
+ "prompt": [
204
+ {"role": "system", "content": SYSTEM_PROMPT},
205
+ {"role": "user", "content": example["problem"]},
206
+ ],
207
+ }
208
+
209
+
210
+ QUESTION_TEMPLATE = (
211
+ "{Question}\n"
212
+ "Please think about this question as if you were a human pondering deeply. "
213
+ "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc, or other natural language thought expressions "
214
+ "It's encouraged to include self-reflection or verification in the reasoning process. "
215
+ "Provide your detailed reasoning between the <think> </think> tags, and then give your final answer between the <answer> </answer> tags."
216
+ )
217
+
218
+ TYPE_TEMPLATE = {
219
+ "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
220
+ "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
221
+ "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
222
+ "free-form": " Please provide your text answer within the <answer> </answer> tags.",
223
+ "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags."
224
+ }
225
+
226
+ def make_conversation_image(example):
227
+
228
+ return {
229
+ "prompt": [
230
+ {
231
+ "role": "user",
232
+ "content": [
233
+ {"type": "image"},
234
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
235
+ ],
236
+ },
237
+ ],
238
+ }
239
+
240
+
241
+ def make_conversation_video(example):
242
+ return {
243
+ "prompt": [
244
+ {
245
+ "role": "user",
246
+ "content": [
247
+ {"type": "video"},
248
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
249
+ ],
250
+ },
251
+ ],
252
+ }
253
+
254
+ def make_conversation_image_and_video(example):
255
+ if example["problem_type"] == 'multiple choice':
256
+ question = example['problem'] + "Options:\n"
257
+ for op in example["options"]:
258
+ question += op + "\n"
259
+ else:
260
+ question = example['problem']
261
+
262
+
263
+ msg ={
264
+ "prompt":
265
+ [{
266
+ "role": "user",
267
+ "content": [
268
+ {
269
+ "type": example['data_type'],
270
+ # example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
271
+ },
272
+ {
273
+ "type": "text",
274
+ "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[example['problem_type']]
275
+ }
276
+ ]
277
+ }]
278
+ }
279
+
280
+ return msg
281
+
282
+
283
+ dataset = dataset.map(make_conversation_image_and_video)
284
+
285
+
286
+ trainer_cls = Qwen2VLGRPOTrainer if not training_args.use_vllm else Qwen2VLGRPOVLLMTrainerModified
287
+ print("using: ", trainer_cls)
288
+
289
+ # Initialize the GRPO trainer
290
+ trainer = trainer_cls(
291
+ model=model_args.model_name_or_path,
292
+ reward_funcs=reward_funcs,
293
+ args=training_args,
294
+ script_args=script_args,
295
+ train_dataset=dataset[script_args.dataset_train_split],
296
+ eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
297
+ peft_config=get_peft_config(model_args),
298
+ attn_implementation=model_args.attn_implementation,
299
+ max_pixels=script_args.max_pixels,
300
+ min_pixels=script_args.min_pixels,
301
+ )
302
+
303
+ if training_args.resume_from_checkpoint is not None:
304
+ checkpoint = training_args.resume_from_checkpoint
305
+ trainer.train(resume_from_checkpoint=checkpoint)
306
+ else:
307
+ trainer.train()
308
+
309
+ # Save and push to hub
310
+ trainer.save_model(training_args.output_dir)
311
+ if training_args.push_to_hub:
312
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
313
+
314
+
315
+ if __name__ == "__main__":
316
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
317
+ script_args, training_args, model_args = parser.parse_args_and_config()
318
+ main(script_args, training_args, model_args)
src/r1-v/src/open_r1/grpo_vllm_caption.py ADDED
@@ -0,0 +1,266 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ import re
16
+ from datetime import datetime
17
+ import json
18
+ from io import BytesIO
19
+ import base64
20
+
21
+ import numpy as np
22
+ import torch
23
+ import torch.nn.functional as F
24
+ from datasets import load_dataset
25
+ from rouge_score import rouge_scorer
26
+ from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
27
+ import Levenshtein
28
+ import wandb
29
+
30
+ from dataclasses import dataclass, field
31
+ from typing import Optional
32
+ from math_verify import parse, verify
33
+
34
+ from trainer.grpo_trainer_vllm_caption import Qwen2VLGRPOTrainerCap
35
+
36
+ os.environ["WANDB_MODE"] = "offline"
37
+
38
+
39
+ wandb.init(project="SelfEval-R1", name="SelfEval-R1")
40
+
41
+
42
+ @dataclass
43
+ class GRPOScriptArguments(ScriptArguments):
44
+ """
45
+ Script arguments for the GRPO training script.
46
+
47
+ Args:
48
+ reward_funcs (`list[str]`):
49
+ List of reward functions. Possible values: 'accuracy', 'format'.
50
+ """
51
+
52
+ reward_funcs: list[str] = field(
53
+ default_factory=lambda: ["accuracy", "format"],
54
+ metadata={
55
+ "help": "List of reward functions. Possible values: 'accuracy', 'format'"},
56
+ )
57
+ max_pixels: Optional[int] = field(
58
+ default=12845056,
59
+ metadata={"help": "Maximum number of pixels for the image"},
60
+ )
61
+ min_pixels: Optional[int] = field(
62
+ default=3136,
63
+ metadata={"help": "Minimum number of pixels for the image"},
64
+ )
65
+ caption_reward: Optional[bool] = field(
66
+ default=True,
67
+ metadata={"help": "Whether to use caption reward or not"},
68
+ )
69
+ caption_reward_weight: Optional[float] = field(
70
+ default=0.1,
71
+ metadata={"help": "Weight for the caption reward"},
72
+ )
73
+
74
+
75
+ # This function is partially borrowed from Video-R1[https://github.com/tulerfeng/Video-R1]
76
+ def accuracy_reward(completions, solution, **kwargs):
77
+
78
+ def extract_answer(text):
79
+ pattern = r'<answer>(.*?)</answer>'
80
+ match = re.search(pattern, text, re.DOTALL)
81
+ if match:
82
+ return match.group(1).strip()
83
+ return ""
84
+
85
+ def extract_option(text):
86
+ pattern = r'<option>(.*?)</option>'
87
+ match = re.search(pattern, text, re.DOTALL)
88
+ if match:
89
+ return match.group(1).strip()
90
+ return ""
91
+
92
+ def is_number(num_str):
93
+ try:
94
+ float(num_str)
95
+ return True
96
+ except Exception as e:
97
+ return False
98
+
99
+ def extract_numbers(answer):
100
+ pattern = r"[-+]?\d*\.?\d+"
101
+ match = re.search(pattern, answer)
102
+ if match:
103
+ number_str = match.group()
104
+ if answer.strip().endswith('%'):
105
+ number = float(number_str) / 100
106
+ else:
107
+ number = float(number_str)
108
+ return number
109
+ else:
110
+ return None
111
+
112
+ def anls(reference, hypothesis):
113
+ distance = Levenshtein.distance(reference, hypothesis)
114
+ max_length = max(len(reference), len(hypothesis))
115
+ similarity = 1 - (distance / max_length)
116
+
117
+ return similarity
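+ # Worked example: anls("receipt", "reciept") -> 1 - 2/7, about 0.71
+ # (Levenshtein distance 2 over the longer length 7).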
118
+
119
+ def compute_rouge_score(reference, hypothesis, use_stemmer=True):
120
+ scorer = rouge_scorer.RougeScorer(
121
+ ['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stemmer)
122
+ scores = scorer.score(reference, hypothesis)
123
+ average_fmeasure = (
124
+ scores['rouge1'].fmeasure + scores['rouge2'].fmeasure + scores['rougeL'].fmeasure) / 3
125
+ return average_fmeasure
126
+
127
+ question_type = kwargs['problem_type'][0]
128
+
129
+ contents = [completion[0]["content"] for completion in completions]
130
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
131
+ rewards = []
132
+
133
+ for content, sol in zip(contents, solution):
134
+ try:
135
+ output_ans = extract_answer(content)
136
+ gt_ans = extract_answer(sol)
137
+ if question_type == "OCR":
138
+ if is_number(gt_ans):
139
+ output_ans = extract_numbers(output_ans)
140
+ reward = 1.0 if output_ans == float(
141
+ gt_ans) else 0.0
142
+ else:
143
+ reward = anls(gt_ans.lower(),
144
+ output_ans.lower())
145
+ reward = max(0.0, min(1.0, reward))
146
+ elif question_type == "free-form":
147
+ score = compute_rouge_score(gt_ans, output_ans)
148
+ reward = max(0.0, min(1.0, score))
149
+ else:
150
+ if is_number(gt_ans):
151
+ output_ans = extract_numbers(output_ans)
152
+ reward = 1.0 if output_ans == float(
153
+ gt_ans) else 0.0
154
+ else:
155
+ reward = 1.0 if output_ans.lower() == gt_ans.lower() else 0.0
156
+ except Exception as e:
157
+ print(
158
+ f"Error in reward_fn for question_type '{question_type}': {e}")
159
+ reward = 0.0
160
+
161
+ rewards.append(reward)
162
+
163
+ if os.getenv("DEBUG_MODE") == "true":
164
+ log_path = 'debug.log'
165
+ with open(log_path, "a") as f:
166
+ try:
167
+ f.write(
168
+ f"------------- {current_time} Accuracy reward: {reward} -------------\n")
169
+ f.write(f"Content: {content}\n")
170
+ f.write(f"Solution: {sol}\n")
171
+ f.write(f"type: {question_type}\n")
172
+ except BaseException:
173
+ f.write("writeing error")
174
+
175
+ return rewards
176
+
177
+
178
+ def format_reward(completions, **kwargs):
179
+ """Reward function that checks if the completion has a specific format."""
180
+ pattern = r"<info>.*?</info>\s<think>.*?</think>\s*<answer>.*?</answer>"
181
+ completion_contents = [completion[0]["content"]
182
+ for completion in completions]
183
+ matches = [re.fullmatch(pattern, content, re.DOTALL)
184
+ for content in completion_contents]
185
+ return [1.0 if match else 0.0 for match in matches]
186
+
187
+
188
+ reward_funcs_registry = {
189
+ "accuracy": accuracy_reward,
190
+ "format": format_reward,
191
+ }
192
+
193
+
194
+ SYSTEM_PROMPT = (
195
+ "You are tasked with analyzing an image to generate an exhaustive and detailed description. "
196
+ "Your goal is to extract and describe all possible information from the image, including but not limited to objects, "
197
+ "numbers, text, and the relationships between these elements. The description should be as fine and detailed as possible, "
198
+ "capturing every nuance. After generating the detailed description, you need to analyze it and provide step-by-step "
199
+ "detailed reasoning for the given question based on the information. Finally, provide a single word or phrase answer "
200
+ "to the question. The description, reasoning process and answer are enclosed within <info> </info>, <think> </think> "
201
+ "and <answer> </answer> tags, respectively, i.e., <info> image description here </info> <think> reasoning process here "
202
+ "</think> <answer> answer here </answer>"
203
+ )
204
+
205
+
206
+ def main(script_args, training_args, model_args):
207
+ # Get reward functions
208
+ reward_funcs = [reward_funcs_registry[func]
209
+ for func in script_args.reward_funcs]
210
+
211
+ # Load the dataset
212
+ # dataset = load_dataset(script_args.dataset_name,
213
+ # name=script_args.dataset_config)
214
+ dataset = load_dataset("json", data_files=script_args.dataset_name, split='train')
215
+
216
+
217
+ # Format into conversation
218
+ def make_conversation_image(example):
219
+ return {
220
+ "prompt": [
221
+ {"role": "system", "content": [
222
+ {"type": "text", "text": SYSTEM_PROMPT}]},
223
+ {
224
+ "role": "user",
225
+ "content": [
226
+ {"type": "image"},
227
+ {"type": "text", "text": example["problem"]},
228
+ ],
229
+ },
230
+ ]
231
+ }
232
+
233
+ dataset = dataset.map(make_conversation_image)
234
+
235
+ if "Qwen" in model_args.model_name_or_path or "Aria" in model_args.model_name_or_path:
236
+ trainer_cls = Qwen2VLGRPOTrainerCap
237
+ else:
238
+ trainer_cls = GRPOTrainer
239
+
240
+ # Initialize the GRPO trainer
241
+ trainer = trainer_cls(
242
+ model=model_args.model_name_or_path,
243
+ reward_funcs=reward_funcs,
244
+ args=training_args,
245
+ train_dataset=dataset,
246
+ eval_dataset=None,
247
+ peft_config=get_peft_config(model_args),
248
+ attn_implementation=model_args.attn_implementation,
249
+ max_pixels=script_args.max_pixels,
250
+ min_pixels=script_args.min_pixels,
251
+ caption_reward=script_args.caption_reward,
252
+ caption_reward_weight=script_args.caption_reward_weight,
253
+ )
254
+
255
+ trainer.train()
256
+ # trainer.train()
257
+
258
+
259
+ if __name__ == "__main__":
260
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
261
+ script_args, training_args, model_args = parser.parse_args_and_config()
262
+
263
+ print('training_args:\n', training_args)
264
+ print('script_args:\n', script_args)
265
+ print('model_args:\n', model_args)
266
+ main(script_args, training_args, model_args)
src/r1-v/src/open_r1/sft_video.py ADDED
@@ -0,0 +1,304 @@
1
+ # Copyright 2024. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Example usage:
16
+ accelerate launch \
17
+ --config_file=deepspeed_zero2.yaml \
18
+ train_video_llm.py \
19
+ --dataset_name mfarre/simplevideoshorts \
20
+ --model_name_or_path Qwen/Qwen2-VL-7B-Instruct \
21
+ --per_device_train_batch_size 1 \
22
+ --gradient_accumulation_steps 4 \
23
+ --output_dir video-llm-output \
24
+ --bf16 \
25
+ --torch_dtype bfloat16 \
26
+ --gradient_checkpointing
27
+ """
28
+
29
+ import os
30
+ import json
31
+ import random
32
+ import requests
33
+ import torch
34
+ from torch.optim import AdamW
35
+ from datasets import load_dataset
36
+ from transformers import (
37
+ AutoModelForVision2Seq,
38
+ AutoProcessor,
39
+ BitsAndBytesConfig,
40
+ Qwen2VLProcessor,
41
+ Qwen2VLForConditionalGeneration,
42
+ Qwen2_5_VLForConditionalGeneration
43
+ )
44
+ from transformers import get_linear_schedule_with_warmup
45
+
46
+ from trl import (
47
+ ModelConfig,
48
+ ScriptArguments,
49
+ SFTConfig,
50
+ SFTTrainer,
51
+ TrlParser,
52
+ get_kbit_device_map,
53
+ get_peft_config,
54
+ )
55
+ from accelerate import Accelerator
56
+ from qwen_vl_utils import process_vision_info
57
+
58
+ from datasets import Dataset, DatasetDict
59
+
60
+ import wandb
61
+
62
+ from typing import List, Dict, Any
63
+
64
+ os.environ["DS_BUILD_FUSED_ADAM"] = "0"
65
+
66
+ def get_current_device():
67
+ """Get the current device. For GPU we return the local process index to enable multiple GPU training."""
68
+ return Accelerator().local_process_index if torch.cuda.is_available() else "cpu"
69
+
70
+ def download_video(url: str, folder: str = '/tmp/videos/') -> str:
71
+ """Download video if not already present locally."""
72
+ filename = url.split("/")[-1]
73
+ local_path = os.path.join(folder, filename)
74
+
75
+ if os.path.exists(local_path):
76
+ return local_path
77
+
78
+ try:
79
+ with requests.get(url, stream=True) as r:
80
+ r.raise_for_status()
81
+ with open(local_path, 'wb') as f:
82
+ for chunk in r.iter_content(chunk_size=8192):
83
+ if chunk:
84
+ f.write(chunk)
85
+ return local_path
86
+ except requests.RequestException as e:
87
+ raise Exception(f"Failed to download video: {e}")
88
+
89
+ def prepare_dataset(example: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
90
+ """Prepare dataset example for training."""
91
+
92
+
93
+
94
+ system_message = "You are a helpful assistant"
95
+
96
+
97
+ QUESTION_TEMPLATE = (
98
+ "{Question}\n"
99
+ "Please think about this question as if you were a human pondering deeply. "
100
+ "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc, or other natural language thought expressions "
101
+ "It's encouraged to include self-reflection or verification in the reasoning process. "
102
+ "Provide your detailed reasoning between the <think> </think> tags, and then give your final answer between the <answer> </answer> tags."
103
+ )
104
+
105
+ TYPE_TEMPLATE = {
106
+ "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
107
+ "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
108
+ "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
109
+ "free-form": " Please provide your text answer within the <answer> </answer> tags.",
110
+ "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags."
111
+ }
112
+
113
+
114
+
115
+ if example["problem_type"] == 'multiple choice':
116
+ question = example['problem'] + "Options:\n"
117
+ for op in example["options"]:
118
+ question += op + "\n"
119
+ else:
120
+ question = example['problem']
121
+
122
+
123
+ messages = [
124
+ {
125
+ "role": "system",
126
+ "content": [{"type": "text", "text": system_message}]
127
+ },
128
+ {
129
+ "role": "user",
130
+ "content": [
131
+ {
132
+ "type": example['data_type'],
133
+ example['data_type']: os.getcwd() + "/Video-R1-data" + example['path'][1:]
134
+ # "max_pixels": 360*420,
135
+ # "fps": 1.0
136
+ },
137
+ {
138
+ "type": "text",
139
+ "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[example['problem_type']]
140
+ }
141
+ ]
142
+ },
143
+ {
144
+ "role": "assistant",
145
+ "content": [{"type": "text", "text": example['process'] + "\n" + example['solution']}]
146
+ }
147
+ ]
148
+
149
+
150
+ return {"messages": messages}
151
+
152
+ def collate_fn(examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
153
+ """Collate batch of examples for training."""
154
+ texts = []
155
+ # video_inputs = []
156
+ # image_inputs = []
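+ # Note: image_inputs / video_inputs are re-assigned on every loop iteration below,
+ # so only the last example's visual inputs reach the processor; this collator is
+ # effectively written for a per-device batch size of 1, as in the example launch
+ # command in the module docstring.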
157
+
158
+ for i, example in enumerate(examples):
159
+ try:
160
+
161
+ texts.append(processor.apply_chat_template(example["messages"], tokenize=False))
162
+ image_inputs, video_inputs, video_kwargs = process_vision_info(example["messages"], return_video_kwargs=True)
163
+
164
+ except Exception as e:
165
+ raise ValueError(f"Failed to process example {i}: {e}")
166
+
167
+ inputs = processor(
168
+ text=texts,
169
+ images=image_inputs,
170
+ videos=video_inputs,
171
+ return_tensors="pt",
172
+ padding=True
173
+ )
174
+
175
+ labels = inputs["input_ids"].clone()
176
+ labels[labels == processor.tokenizer.pad_token_id] = -100
177
+
178
+ # Handle visual tokens based on processor type
179
+ visual_tokens = [151652, 151653, 151656] if isinstance(processor, Qwen2VLProcessor) else [
180
+ processor.tokenizer.convert_tokens_to_ids(processor.image_token)
181
+ ]
182
+
183
+ for visual_token_id in visual_tokens:
184
+ labels[labels == visual_token_id] = -100
185
+
186
+ inputs["labels"] = labels
187
+ return inputs
188
+
189
+ if __name__ == "__main__":
190
+ # Parse arguments
191
+ parser = TrlParser((ScriptArguments, SFTConfig, ModelConfig))
192
+ script_args, training_args, model_config = parser.parse_args_and_config()
193
+
194
+ # Configure training args
195
+ training_args.gradient_checkpointing_kwargs = dict(use_reentrant=False)
196
+ training_args.remove_unused_columns = False
197
+ training_args.dataset_kwargs = {"skip_prepare_dataset": True}
198
+
199
+ # Load dataset
200
+ if script_args.dataset_name.endswith('.json') or script_args.dataset_name.endswith('.jsonl'):
201
+ dataset = DatasetDict({"train": Dataset.from_json(script_args.dataset_name)})
202
+ else:
203
+ # Load the dataset
204
+ dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
205
+
206
+ # Setup model
207
+ torch_dtype = (
208
+ model_config.torch_dtype
209
+ if model_config.torch_dtype in ["auto", None]
210
+ else getattr(torch, model_config.torch_dtype)
211
+ )
212
+
213
+ # # Quantization configuration for 4-bit training
214
+ # bnb_config = BitsAndBytesConfig(
215
+ # load_in_4bit=True,
216
+ # bnb_4bit_use_double_quant=True,
217
+ # bnb_4bit_quant_type="nf4",
218
+ # bnb_4bit_compute_dtype=torch.bfloat16
219
+ # )
220
+
221
+ # Model initialization
222
+ model_kwargs = dict(
223
+ revision=model_config.model_revision,
224
+ trust_remote_code=model_config.trust_remote_code,
225
+ torch_dtype=torch_dtype,
226
+ device_map=get_kbit_device_map(),
227
+ # quantization_config=bnb_config,
228
+ )
229
+
230
+
231
+ if "Qwen2-VL" in model_config.model_name_or_path:
232
+ model = Qwen2VLForConditionalGeneration.from_pretrained(model_config.model_name_or_path, **model_kwargs)
233
+ elif "Qwen2.5-VL" in model_config.model_name_or_path:
234
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_config.model_name_or_path, **model_kwargs)
235
+ else:
236
+ model = AutoModelForVision2Seq.from_pretrained(model_config.model_name_or_path, **model_kwargs)
237
+
238
+ processor = AutoProcessor.from_pretrained(
239
+ model_config.model_name_or_path,
240
+ trust_remote_code=model_config.trust_remote_code
241
+ )
242
+
243
+ # Prepare dataset
244
+ prepared_dataset = [prepare_dataset(example) for example in dataset['train']]
245
+
246
+ # Initialize wandb if specified
247
+ if training_args.report_to == "wandb":
248
+ wandb.init(project="video-llm-training")
249
+
250
+
251
+ '''
252
+ Below is added code
253
+ '''
254
+ base_lr = 2e-4
255
+ optimizer = AdamW(
256
+ params=model.parameters(),
257
+ lr=base_lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2
258
+ )
259
+
260
+ num_training_steps = len(prepared_dataset) // (
261
+ training_args.per_device_train_batch_size
262
+ * training_args.gradient_accumulation_steps
263
+ * training_args.world_size
264
+ ) * training_args.num_train_epochs
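+ # e.g. 4,000 examples with per-device batch size 1, gradient accumulation 4 and
+ # 8 processes give 125 optimizer steps per epoch before multiplying by the epoch
+ # count (illustrative numbers, not taken from any config in this repo).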
265
+
266
+ lr_scheduler = get_linear_schedule_with_warmup(
267
+ optimizer,
268
+ num_warmup_steps=int(0.05 * num_training_steps),
269
+ num_training_steps=num_training_steps,
270
+ )
271
+ '''
272
+ Above is added code
273
+ '''
274
+
275
+
276
+ # Initialize trainer
277
+ trainer = SFTTrainer(
278
+ model=model,
279
+ args=training_args,
280
+ train_dataset=prepared_dataset,
281
+ data_collator=collate_fn,
282
+ peft_config=get_peft_config(model_config),
283
+ # tokenizer=processor.tokenizer
284
+ optimizers=(optimizer, lr_scheduler),
285
+ )
286
+
287
+ # Train model
288
+ trainer.train()
289
+
290
+ # Save final model
291
+
292
+ trainer.save_model(training_args.output_dir)
293
+ processor.save_pretrained(training_args.output_dir)
294
+
295
+ if trainer.accelerator.is_main_process:
296
+ # Restore k,v cache for fast inference
297
+ trainer.model.config.use_cache = True
298
+ trainer.model.config.save_pretrained(training_args.output_dir)
299
+
300
+ # Cleanup
301
+ del model
302
+ del trainer
303
+ torch.cuda.empty_cache()
304
+ wandb.finish()
src/r1-v/src/open_r1/trainer/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ from .grpo_trainer import Qwen2VLGRPOTrainer
2
+ from .vllm_grpo_trainer_modified import Qwen2VLGRPOVLLMTrainerModified
3
+ from .vllm_grpo_trainer_modified_orig import Qwen2VLGRPOVLLMTrainerModifiedOrig
4
+ from .vllm_grpo_trainer_selfConst import Qwen2VLGRPOVLLMTrainerSelfConst
5
+
6
+
7
+ __all__ = [
8
+ "Qwen2VLGRPOTrainer",
9
+ "Qwen2VLGRPOVLLMTrainerModified",
10
+ "Qwen2VLGRPOVLLMTrainerModifiedOrig",
11
+ "Qwen2VLGRPOVLLMTrainerSelfConst"
12
+ ]
src/r1-v/src/open_r1/trainer/vllm_grpo_trainer_modified_error.py ADDED
@@ -0,0 +1,1061 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import textwrap
17
+ from collections import defaultdict
18
+ from typing import Any, Callable, Optional, Union
19
+ from accelerate.utils.other import is_compiled_module
20
+ from accelerate.utils import broadcast_object_list, gather, gather_object
21
+ import torch
22
+ import torch.utils.data
23
+ import transformers
24
+ import warnings
25
+ from unittest.mock import patch
26
+ from datasets import Dataset, IterableDataset
27
+ from packaging import version
28
+ from transformers import (
29
+ AriaForConditionalGeneration,
30
+ AriaProcessor,
31
+ AutoModelForCausalLM,
32
+ AutoModelForSequenceClassification,
33
+ AutoProcessor,
34
+ AutoTokenizer,
35
+ GenerationConfig,
36
+ PreTrainedModel,
37
+ PreTrainedTokenizerBase,
38
+ Qwen2VLForConditionalGeneration,
39
+ Qwen2_5_VLForConditionalGeneration,
40
+ Trainer,
41
+ TrainerCallback,
42
+ is_wandb_available,
43
+ )
44
+ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
45
+ from transformers.utils import is_peft_available
46
+
47
+ from trl.data_utils import (
48
+ apply_chat_template,
49
+ is_conversational,
50
+ maybe_apply_chat_template,
51
+ )
52
+ from trl.import_utils import is_vllm_available
53
+
54
+ from trl.models import (
55
+ create_reference_model,
56
+ prepare_deepspeed,
57
+ unwrap_model_for_generation,
58
+ )
59
+ from trl.trainer.grpo_config import GRPOConfig
60
+ from trl.trainer.utils import generate_model_card, get_comet_experiment_url, pad
61
+ from trl import GRPOTrainer
62
+
63
+ import copy
64
+
65
+ if is_peft_available():
66
+ from peft import PeftConfig, get_peft_model
67
+
68
+ if is_vllm_available():
69
+ from vllm import LLM, SamplingParams
70
+
71
+ if is_wandb_available():
72
+ import wandb
73
+ import torch.nn as nn
74
+ from torch.utils.data import Sampler
75
+ import gc
76
+ from qwen_vl_utils import process_vision_info
77
+
78
+ import re
79
+
80
+ def extract_answer(predict: str) -> Optional[str]:
81
+ """
82
+ Extracts the content of the <answer>…</answer> block from `predict`.
83
+ Returns the inner text (with leading/trailing whitespace stripped),
84
+ or None if no <answer> tag is found.
85
+ """
86
+ match = re.search(r"<answer>([\s\S]*?)</answer>", predict, re.DOTALL)
87
+ if not match:
88
+ return None
89
+ return match.group(1).strip()
90
+
91
+ def extract_info(predict: str) -> Optional[str]:
92
+ """
93
+ Extracts the content of the <des>…</des> block from `predict`.
94
+ Returns the inner text (with leading/trailing whitespace stripped),
95
+ or None if no <des> tag is found.
96
+ """
97
+ match = re.search(r"<des>([\s\S]*?)</des>", predict, re.DOTALL)
98
+ if not match:
99
+ return None
100
+ return match.group(1).strip()
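+ # e.g. extract_info("<des>Two dogs playing.</des><answer>2</answer>") -> "Two dogs playing."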
101
+
102
+
103
+
104
+ # What we call a reward function is a callable that takes a list of prompts and completions and returns a list of
105
+ # rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model.
106
+ RewardFunc = Union[str, PreTrainedModel, Callable[[list, list], list[float]]]
107
+
108
+
109
+ class Qwen2VLGRPOVLLMTrainerModified(Trainer):
110
+ def __init__(
111
+ self,
112
+ model: Union[str, PreTrainedModel],
113
+ reward_funcs: Union[RewardFunc, list[RewardFunc]],
114
+ args: GRPOConfig = None,
115
+ script_args = None,
116
+ train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
117
+ eval_dataset: Optional[
118
+ Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]
119
+ ] = None,
120
+ processing_class: Optional[PreTrainedTokenizerBase] = None,
121
+ reward_processing_classes: Optional[
122
+ Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]
123
+ ] = None,
124
+ callbacks: Optional[list[TrainerCallback]] = None,
125
+ optimizers: tuple[
126
+ Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]
127
+ ] = (None, None),
128
+ peft_config: Optional["PeftConfig"] = None,
129
+ # qwen2-vl related params
130
+ max_pixels: Optional[int] = 12845056,
131
+ min_pixels: Optional[int] = 3136,
132
+ attn_implementation: str = "flash_attention_2",
133
+ ):
134
+
135
+ # Args
136
+ if args is None:
137
+ model_name = model if isinstance(model, str) else model.config._name_or_path
138
+ model_name = model_name.split("/")[-1]
139
+ args = GRPOConfig(f"{model_name}-GRPO")
140
+
141
+ # Models
142
+ # Trained model
143
+ model_init_kwargs = args.model_init_kwargs or {}
144
+ model_init_kwargs["attn_implementation"] = attn_implementation
145
+ if isinstance(model, str):
146
+ model_id = model
147
+ torch_dtype = model_init_kwargs.get("torch_dtype")
148
+ if (
149
+ isinstance(torch_dtype, torch.dtype)
150
+ or torch_dtype == "auto"
151
+ or torch_dtype is None
152
+ ):
153
+ pass # torch_dtype is already a torch.dtype or "auto" or None
154
+ elif isinstance(torch_dtype, str): # it's a str, but not "auto"
155
+ torch_dtype = getattr(torch, torch_dtype)
156
+ model_init_kwargs["torch_dtype"] = torch_dtype
157
+ else:
158
+ raise ValueError(
159
+ "Invalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing "
160
+ f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}."
161
+ )
162
+ # Disable caching if gradient checkpointing is enabled (not supported)
163
+ model_init_kwargs["use_cache"] = (
164
+ False
165
+ if args.gradient_checkpointing
166
+ else model_init_kwargs.get("use_cache")
167
+ )
168
+ if "Qwen2-VL" in model_id:
169
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
170
+ model, **model_init_kwargs
171
+ )
172
+ elif "Qwen2.5-VL" in model_id:
173
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
174
+ model, **model_init_kwargs
175
+ )
176
+ elif "Aria" in model_id:
177
+ model_init_kwargs.pop("use_cache")
178
+ model = AriaForConditionalGeneration.from_pretrained(
179
+ model, **model_init_kwargs
180
+ )
181
+ else:
182
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model, **model_init_kwargs)
183
+ else:
184
+ model_id = model.config._name_or_path
185
+ if args.model_init_kwargs is not None:
186
+ raise ValueError(
187
+ "You passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. "
188
+ "This argument can only be used when the `model` argument is a string."
189
+ )
190
+
191
+ if peft_config is not None:
192
+ model = get_peft_model(model, peft_config)
193
+
194
+ # Reference model
195
+ if is_deepspeed_zero3_enabled():
196
+ if "Qwen2-VL" in model_id:
197
+ self.ref_model = Qwen2VLForConditionalGeneration.from_pretrained(
198
+ model_id, **model_init_kwargs
199
+ )
200
+ elif "Qwen2.5-VL" in model_id:
201
+ self.ref_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
202
+ model_id, **model_init_kwargs
203
+ )
204
+ elif "Aria" in model_id:
205
+ self.ref_model = AriaForConditionalGeneration.from_pretrained(
206
+ model_id, **model_init_kwargs
207
+ )
208
+ else:
209
+ self.ref_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
210
+ model_id, **model_init_kwargs
211
+ )
212
+ elif peft_config is None:
213
+ # If PEFT configuration is not provided, create a reference model based on the initial model.
214
+ self.ref_model = create_reference_model(model)
215
+ else:
216
+ # If PEFT is used, the reference model is not needed since the adapter can be disabled
217
+ # to revert to the initial model.
218
+ self.ref_model = None
219
+
220
+ # Processing class
221
+ if processing_class is None:
222
+ if "Qwen" in model_id or "Aria" in model_id:
223
+ processing_class = AutoProcessor.from_pretrained(model_id)
224
+ pad_token_id = processing_class.tokenizer.pad_token_id
225
+ processing_class.pad_token_id = pad_token_id
226
+ processing_class.eos_token_id = processing_class.tokenizer.eos_token_id
227
+ if "Qwen" in model_id:
228
+ processing_class.image_processor.max_pixels = max_pixels
229
+ processing_class.image_processor.min_pixels = min_pixels
230
+ else:
231
+ processing_class = AutoTokenizer.from_pretrained(
232
+ model.config._name_or_path, padding_side="left"
233
+ )
234
+ pad_token_id = processing_class.pad_token_id
235
+
236
+ # Reward functions
237
+ if not isinstance(reward_funcs, list):
238
+ reward_funcs = [reward_funcs]
239
+ for i, reward_func in enumerate(reward_funcs):
240
+ if isinstance(reward_func, str):
241
+ reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained(
242
+ reward_func, num_labels=1, **model_init_kwargs
243
+ )
244
+ self.reward_funcs = reward_funcs
245
+
246
+ # Reward processing class
247
+ if reward_processing_classes is None:
248
+ reward_processing_classes = [None] * len(reward_funcs)
249
+ elif not isinstance(reward_processing_classes, list):
250
+ reward_processing_classes = [reward_processing_classes]
251
+ else:
252
+ if len(reward_processing_classes) != len(reward_funcs):
253
+ raise ValueError(
254
+ "The number of reward processing classes must match the number of reward functions."
255
+ )
256
+
257
+ for i, (reward_processing_class, reward_func) in enumerate(
258
+ zip(reward_processing_classes, reward_funcs)
259
+ ):
260
+ if isinstance(reward_func, PreTrainedModel):
261
+ if reward_processing_class is None:
262
+ reward_processing_class = AutoTokenizer.from_pretrained(
263
+ reward_func.config._name_or_path
264
+ )
265
+ if reward_processing_class.pad_token_id is None:
266
+ reward_processing_class.pad_token = (
267
+ reward_processing_class.eos_token
268
+ )
269
+ # The reward model computes the reward for the latest non-padded token in the input sequence.
270
+ # So it's important to set the pad token ID to the padding token ID of the processing class.
271
+ reward_func.config.pad_token_id = reward_processing_class.pad_token_id
272
+ reward_processing_classes[i] = reward_processing_class
273
+ self.reward_processing_classes = reward_processing_classes
274
+
275
+ # Data collator
276
+ def data_collator(features): # No data collation is needed in GRPO
277
+ return features
278
+
279
+ # Training arguments
280
+ self.max_prompt_length = args.max_prompt_length
281
+ self.max_completion_length = (
282
+ args.max_completion_length
283
+ ) # = |o_i| in the GRPO paper
284
+ self.num_generations = args.num_generations # = G in the GRPO paper
285
+ self.temporal = script_args.temporal
286
+ self.generation_config = GenerationConfig(
287
+ max_new_tokens=self.max_completion_length,
288
+ do_sample=True,
289
+ temperature=1, # HACK
290
+ num_return_sequences=self.num_generations,
291
+ pad_token_id=pad_token_id,
292
+ )
293
+ self.beta = args.beta
294
+
295
+ self.shuffled_num_generations = self.num_generations // 2
296
+ self.shuffled_generation_config = GenerationConfig(
297
+ max_new_tokens=self.max_completion_length,
298
+ do_sample=True,
299
+ top_p=0.95,
300
+ temperature=1, # HACK
301
+ num_return_sequences=self.shuffled_num_generations,
302
+ pad_token_id=pad_token_id,
303
+ )
304
+
305
+ self.dummy_generation_config = GenerationConfig(
306
+ max_new_tokens=1,
307
+ do_sample=True,
308
+ top_p=0.95,
309
+ temperature=1, # HACK
310
+ num_return_sequences=1,
311
+ pad_token_id=pad_token_id,
312
+ )
313
+ self.len_control = script_args.len_control
314
+ self.beta = args.beta
315
+
316
+ # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
317
+ # input tensor associated with the key "input_ids". However, in GRPO, the sampled data does not include the
318
+ # "input_ids" key. Instead, the available keys is "prompt". As a result, the trainer issues the warning:
319
+ # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." To
320
+ # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True.
321
+ # This acts as a flag to indicate that the warning has already been issued.
322
+ model.warnings_issued["estimate_tokens"] = True
323
+
324
+ # Initialize the metrics
325
+ self._metrics = defaultdict(list)
326
+ self.use_vllm = args.use_vllm
327
+
328
+ super().__init__(
329
+ model=model,
330
+ args=args,
331
+ data_collator=data_collator,
332
+ train_dataset=train_dataset,
333
+ eval_dataset=eval_dataset,
334
+ processing_class=processing_class,
335
+ callbacks=callbacks,
336
+ optimizers=optimizers,
337
+ )
338
+ # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
339
+ # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
340
+ # self.model_accepts_loss_kwargs to False to enable scaling.
341
+ self.model_accepts_loss_kwargs = False
342
+
343
+ if self.use_vllm:
344
+ if not is_vllm_available():
345
+ raise ImportError(
346
+ "vLLM is not available and `use_vllm` is set to True. Please install vLLM with "
347
+ "`pip install vllm` to use it."
348
+ )
349
+
350
+ if self.accelerator.is_main_process:
351
+ vllm_device = self.args.vllm_device
352
+ if vllm_device == "auto":
353
+ vllm_device = f"cuda:{self.accelerator.num_processes}" # take the next GPU idx
354
+ # Check that the requested device is available
355
+ if (
356
+ vllm_device.split(":")[0] == "cuda"
357
+ and int(vllm_device.split(":")[1]) >= torch.cuda.device_count()
358
+ ):
359
+ raise ValueError(
360
+ f"The requested device for vllm ({vllm_device}) is not available. You are likely using vLLM "
361
+ "without restricting the number of GPUs for training. Set the `--num_processes` argument to a "
362
+ "value lower than the number of GPUs available on your machine—typically, reducing it by one "
363
+ f"is sufficient. In your case: `--num_processes {torch.cuda.device_count() - 1}`."
364
+ )
365
+ # Check that the requested device is not also used for training
366
+ if vllm_device in {
367
+ f"cuda:{idx}" for idx in range(self.accelerator.num_processes)
368
+ }:
369
+ warnings.warn(
370
+ f"The requested device {vllm_device} is also used for training. This may lead to unexpected "
371
+ "behavior. It is recommended to use a dedicated device for vLLM."
372
+ )
373
+ # vLLM is not compatible with accelerate. So we need to patch it to make sure we can (1) place the vLLM
374
+ # model on the desired device (world_size_patch) and (2) avoid a test that is not designed for our
375
+ # setting (profiling_patch).
376
+ world_size_patch = patch(
377
+ "torch.distributed.get_world_size", return_value=1
378
+ )
379
+ profiling_patch = patch(
380
+ "vllm.worker.worker.Worker._assert_memory_footprint_increased_during_profiling",
381
+ return_value=None,
382
+ )
383
+ with world_size_patch, profiling_patch:
384
+ print("vllm is running on: ", vllm_device)
385
+ self.llm = LLM(
386
+ model=model.name_or_path,
387
+ device=vllm_device,
388
+ gpu_memory_utilization=self.args.vllm_gpu_memory_utilization,
389
+ dtype=torch.bfloat16,
390
+ # Automatic Prefix Caching caches the KV cache of existing queries, so that a new query can
391
+ # directly reuse the KV cache if it shares the same prefix with one of the existing queries.
392
+ # This is particularly useful here because we generate completions from the same prompts.
393
+ enable_prefix_caching=True,
394
+ enforce_eager=True,
395
+ mm_processor_kwargs=(
396
+ {
397
+ "max_pixels": max_pixels,
398
+ "min_pixels": min_pixels,
399
+ }
400
+ # if "Qwen2-VL" in model_id or "Qwen2.5-VL" in model_id
401
+ if False
402
+ else None
403
+ ),
404
+ max_model_len=args.max_prompt_length + args.max_completion_length,
405
+ )
406
+ self.sampling_params = SamplingParams(
407
+ temperature=1.0,
408
+ top_p=0.95,
409
+ max_tokens=self.max_completion_length,
410
+ )
411
+
412
+ self._last_loaded_step = 0 # tag to avoid useless loading during grad accumulation
413
+
414
+ # When using vLLM, the main process is responsible for loading the model weights. This can cause process
415
+ # desynchronization and seems to lead to DeepSpeed hanging during initialization. To prevent this, we
416
+ # synchronize all processes after vLLM has been fully initialized.
417
+ self.accelerator.wait_for_everyone()
418
+ else:
419
+ raise ValueError(
420
+ "GRPOVLLMTrainerModified only supports vllm generation, please set --use_vllm True"
421
+ )
422
+
423
+ if self.ref_model is not None:
424
+ if self.is_deepspeed_enabled:
425
+ self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
426
+ else:
427
+ self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
428
+
429
+ for i, reward_func in enumerate(self.reward_funcs):
430
+ if isinstance(reward_func, PreTrainedModel):
431
+ self.reward_funcs[i] = self.accelerator.prepare_model(reward_func, evaluation_mode=True)
432
+
433
+ def _set_signature_columns_if_needed(self):
434
+ # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
435
+ # By default, this method sets `self._signature_columns` to the model's expected inputs.
436
+ # In GRPOTrainer, we preprocess data, so using the model's signature columns doesn't work.
437
+ # Instead, we set them to the columns expected by the `training_step` method, hence the override.
438
+ if self._signature_columns is None:
439
+ self._signature_columns = ["prompt"]
440
+
441
+ # Get the per-token log probabilities for the completions for the model and the reference model
442
+ def _get_per_token_logps(self, model, input_ids, **kwargs):
443
+ # logits = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, image_grid_thw=image_grid_thw).logits # (B, L, V)
444
+ # import pdb
445
+ # pdb.set_trace()
446
+ logits = model(input_ids, **kwargs).logits
447
+ logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred
448
+ input_ids = input_ids[:, 1:] # (B, L-1), exclude the first input ID since we don't have logits for it
449
+ # Compute the log probabilities for the input tokens. Use a loop to reduce memory peak.
450
+ per_token_logps = []
451
+ for logits_row, input_ids_row in zip(logits, input_ids):
452
+ log_probs = logits_row.log_softmax(dim=-1)
453
+ token_log_prob = torch.gather(log_probs, dim=1, index=input_ids_row.unsqueeze(1)).squeeze(1)
454
+ per_token_logps.append(token_log_prob)
455
+ return torch.stack(per_token_logps)
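# Shape sketch (illustrative): for B=2 sequences of length L=5, `logits` is (2, 4, V)
# after dropping the last position, the shifted `input_ids` are (2, 4), and the stacked
# result is (2, 4) with one log-probability per realized next token. A one-shot
# equivalent (higher peak memory) would be:
#     torch.gather(logits.log_softmax(-1), 2, input_ids.unsqueeze(-1)).squeeze(-1)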
456
+
457
+ # Trainer "prepares" the inputs before calling `compute_loss`. It converts to tensor and move to device.
458
+ # Since we preprocess the data in `compute_loss`, we need to override this method to skip this step.
459
+ def _prepare_inputs(
460
+ self, inputs: dict[str, Union[torch.Tensor, Any]]
461
+ ) -> dict[str, Union[torch.Tensor, Any]]:
462
+ return inputs
463
+
464
+ def remove_none_from_data(self, data):
465
+ for entry in data:
466
+ if "content" in entry and isinstance(entry["content"], list):
467
+ for sub_entry in entry["content"]:
468
+ if isinstance(sub_entry, dict):
469
+ keys_to_remove = [k for k, v in sub_entry.items() if v is None]
470
+ for k in keys_to_remove:
471
+ del sub_entry[k]
472
+ return data
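# Usage sketch (illustrative): a message such as
#     [{"content": [{"type": "video", "video": "clip.mp4", "image": None}]}]
# comes back with the None-valued "image" key dropped, so the chat template and
# vision preprocessing never see placeholder fields.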
473
+
474
+ def _vllm_generate(self, prompts_text, mm_data, n):
475
+ """
476
+ Helper that wraps the whole ‘gather-broadcast-slice-pad-decode’ dance
477
+ and returns (completion_ids, decoded_texts) *ON THIS RANK ONLY*.
478
+ `mm_data` can be None/[] for pure-text inputs.
479
+ """
480
+ device = self.accelerator.device
481
+
482
+ # --------------- gather everything to rank-0 ----------------
483
+ all_prompts = gather_object(prompts_text)
484
+ all_mm_data = gather_object(mm_data or [[]] * len(prompts_text))
485
+
486
+ # build the multimodal inputs expected by vLLM
487
+ vllm_inputs = [
488
+ {"prompt": p, "multi_modal_data": m[0] if m else {}}
489
+ for p, m in zip(all_prompts, all_mm_data)
490
+ ]
491
+
492
+ # -------------------------------------------------------------
493
+ if self.accelerator.is_main_process:
494
+ p = copy.deepcopy(self.sampling_params)
495
+ p.n = n
496
+ outs = self.llm.generate(vllm_inputs, sampling_params=p, use_tqdm=False)
497
+ comp_ids = [o.token_ids for c in outs for o in c.outputs]
498
+ else:
499
+ comp_ids = [None] * (len(vllm_inputs) * n)
500
+
501
+ # broadcast back, pick this rank’s slice
502
+ comp_ids = broadcast_object_list(comp_ids, from_process=0)
503
+ lo = self.accelerator.process_index * len(prompts_text) * n
504
+ hi = (self.accelerator.process_index + 1) * len(prompts_text) * n
505
+ comp_ids = comp_ids[lo:hi]
506
+
507
+ # pad, convert to tensor → decode
508
+ comp_ids = [torch.tensor(x, device=device) for x in comp_ids]
509
+ comp_ids = pad(comp_ids, padding_value=self.processing_class.pad_token_id)
510
+ decoded = self.processing_class.batch_decode(comp_ids, skip_special_tokens=True)
511
+ return comp_ids, decoded
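# Usage sketch (illustrative): every rank calls, e.g.,
#     ids, texts = self._vllm_generate(prompts_text, mm_data, self.num_generations)
# and receives only its own padded slice of completions, so `ids` can be concatenated
# directly onto the local prompt tensors and `texts` fed straight to the reward functions.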
512
+
513
+
514
+ def compute_loss(
515
+ self, model, inputs, return_outputs=False, num_items_in_batch=None
516
+ ):
517
+ if return_outputs:
518
+ raise ValueError("The GRPOTrainer does not support returning outputs")
519
+ # Compute the per-token log probabilities for the model
520
+
521
+
522
+ device = self.accelerator.device
523
+ prompts = [x["prompt"] for x in inputs]
524
+ # images = [x["image"] for x in inputs]
525
+ prompts_text = [
526
+ maybe_apply_chat_template(example, self.processing_class)["prompt"]
527
+ for example in inputs
528
+ ]
529
+
530
+ input_copy = copy.deepcopy(inputs[0]['prompt'])
531
+
532
+ input_copy = self.remove_none_from_data(input_copy)
533
+
534
+ data_type = inputs[0]['data_type']
535
+
536
+ if data_type == 'image':
537
+ input_copy[0]['content'][0]['image'] = os.getcwd() + "/Video-R1-data" + inputs[0]['path'][1:]
538
+ elif data_type == 'video':
539
+ input_copy[0]['content'][0]['video'] = os.getcwd() + "/Video-R1-data" + inputs[0]['path'][1:]
540
+
541
+
542
+ image_inputs, video_inputs, video_kwargs = process_vision_info(input_copy, return_video_kwargs=True)
543
+
544
+
545
+ prompt_inputs = self.processing_class(
546
+ text=copy.deepcopy(prompts_text),
547
+ images=image_inputs,
548
+ videos=video_inputs,
549
+ return_tensors="pt",
550
+ padding=True,
551
+ padding_side="left",
552
+ add_special_tokens=False,
553
+ )
554
+
555
+ mm_data = [[data_type, image_inputs if image_inputs else video_inputs]]
556
+ prompt_inputs = super()._prepare_inputs(prompt_inputs)
557
+ prompt_ids, prompt_mask = prompt_inputs["input_ids"], prompt_inputs["attention_mask"]
558
+
559
+ if self.max_prompt_length is not None:
560
+ prompt_ids = prompt_ids[:, -self.max_prompt_length :]
561
+ prompt_mask = prompt_mask[:, -self.max_prompt_length :]
562
+
563
+
564
+ if self.temporal:
565
+ if video_inputs:
566
+ indices = torch.randperm(video_inputs[0].size(0))
567
+ shuffled_video_inputs = [video_inputs[0][indices]]
568
+ shuffled_prompt_inputs = self.processing_class(
569
+ text=copy.deepcopy(prompts_text),
570
+ images=image_inputs,
571
+ videos=shuffled_video_inputs,
572
+ return_tensors="pt",
573
+ padding=True,
574
+ padding_side="left",
575
+ add_special_tokens=False,
576
+ )
577
+ shuffled_mm_data = [[self.accelerator.process_index, data_type, image_inputs if image_inputs else video_inputs]]
578
+ shuffled_prompt_inputs = super()._prepare_inputs(shuffled_prompt_inputs)
579
+ shuffled_prompt_ids, shuffled_prompt_mask = shuffled_prompt_inputs["input_ids"], shuffled_prompt_inputs["attention_mask"]
580
+ if self.max_prompt_length is not None:
581
+ shuffled_prompt_ids = shuffled_prompt_ids[:, -self.max_prompt_length :]
582
+ shuffled_prompt_mask = shuffled_prompt_mask[:, -self.max_prompt_length :]
583
+ else:
584
+ shuffled_mm_data = [None]
585
+
586
+
587
+
588
+ if self.args.use_vllm:
589
+ # First, have main process load weights if needed
590
+ if self.state.global_step != self._last_loaded_step:
591
+ with unwrap_model_for_generation(
592
+ self.model,
593
+ self.accelerator,
594
+ gather_deepspeed3_params=True, # TODO: fix this, self.args.ds3_gather_for_generation,
595
+ ) as unwrapped_model:
596
+ if is_compiled_module(unwrapped_model):
597
+ state_dict = unwrapped_model._orig_mod.state_dict()
598
+ else:
599
+ state_dict = unwrapped_model.state_dict()
600
+ if self.accelerator.is_main_process:
601
+ llm_model = (
602
+ self.llm.llm_engine.model_executor.driver_worker.model_runner.model
603
+ )
604
+ # import pdb
605
+ # pdb.set_trace()
606
+ llm_model.load_weights(state_dict.items())
607
+ self._last_loaded_step = self.state.global_step
608
+ '''
609
+ # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
610
+ all_prompts_text = gather_object(prompts_text)
611
+ all_mm_data = gather_object(mm_data)
612
+ # group into pairs
613
+ all_multimodal_inputs = []
614
+
615
+ if self.temporal:
616
+ shuffled_all_mm_data_none = gather_object(shuffled_mm_data)
617
+ shuffled_all_mm_data = [x for x in shuffled_all_mm_data_none if x]
618
+ shuffled_all_multimodal_inputs = []
619
+
620
+ # 2. Refer to TobiasLee's implementation suggestions
621
+ # this is a better implementation for vLLM sampling.
622
+ for prompt, mm_item in zip(all_prompts_text, all_mm_data):
623
+ all_multimodal_inputs.append({"prompt": prompt, "multi_modal_data": {mm_item[0]: mm_item[1]}})
624
+
625
+ if self.temporal and shuffled_all_mm_data!=[]:
626
+ for mm_item in shuffled_all_mm_data:
627
+ shuffled_all_multimodal_inputs.append({"prompt": all_prompts_text[mm_item[0]], "multi_modal_data": {mm_item[1]: mm_item[2]}})
628
+
629
+ # Create sampling params with num_generations
630
+ if self.accelerator.is_main_process:
631
+ # Clone to avoid modifying original params
632
+ sampling_params = copy.deepcopy(self.sampling_params)
633
+ sampling_params.n = self.num_generations
634
+ # Single generate call with all prompts
635
+ if self.accelerator.is_main_process:
636
+ outputs = self.llm.generate(
637
+ all_multimodal_inputs,
638
+ sampling_params=sampling_params,
639
+ use_tqdm=False,
640
+ )
641
+ # Flatten outputs: [prompt1_gen1, prompt1_gen2, ..., prompt2_gen1, prompt2_gen2, ...]
642
+ completion_ids = [out.token_ids for completion in outputs for out in completion.outputs]
643
+
644
+ if self.temporal and shuffled_all_mm_data!=[]:
645
+ # Clone to avoid modifying original params
646
+ shuffled_sampling_params = copy.deepcopy(self.sampling_params)
647
+ shuffled_sampling_params.n = self.num_generations // 2
648
+ # Single generate call with all prompts
649
+ if self.accelerator.is_main_process:
650
+ shuffled_outputs = self.llm.generate(
651
+ shuffled_all_multimodal_inputs,
652
+ sampling_params=shuffled_sampling_params,
653
+ use_tqdm=False,
654
+ )
655
+ # Flatten outputs: [prompt1_gen1, prompt1_gen2, ..., prompt2_gen1, prompt2_gen2, ...]
656
+ shuffled_completion_ids = [out.token_ids for completion in shuffled_outputs for out in completion.outputs]
657
+
658
+
659
+ else:
660
+ completion_ids = [None] * len(all_multimodal_inputs) * self.num_generations
661
+
662
+ if self.temporal and shuffled_all_mm_data!=[]:
663
+ shuffled_completion_ids = [None] * len(shuffled_all_multimodal_inputs) * (self.num_generations // 2)
664
+
665
+
666
+ # broadcast and slice
667
+ completion_ids = broadcast_object_list(completion_ids, from_process=0)
668
+ process_slice = slice(
669
+ self.accelerator.process_index * len(prompts) * self.num_generations,
670
+ (self.accelerator.process_index + 1) * len(prompts) * self.num_generations,
671
+ )
672
+ completion_ids = completion_ids[process_slice]
673
+
674
+ # Pad the completions, and concatenate them with the prompts
675
+ completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
676
+ completion_ids = pad(
677
+ completion_ids, padding_value=self.processing_class.pad_token_id
678
+ )
679
+ '''
680
+
681
+ completion_ids, completions = self._vllm_generate(
682
+ prompts_text, # original text prompts
683
+ mm_data, # vision payload (may be empty for text-only)
684
+ self.num_generations,
685
+ )
686
+
687
+ prompt_ids = prompt_ids.repeat_interleave(self.num_generations, dim=0)
688
+ prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
689
+
690
+ prompt_length = prompt_ids.size(1)
691
+
692
+ print('prompt_length:', prompt_length)
693
+
694
+ prompt_ids = prompt_completion_ids[:, :prompt_length]
695
+ completion_ids = prompt_completion_ids[:, prompt_length:]
696
+ prompt_mask = prompt_mask.repeat_interleave(self.num_generations, dim=0)
697
+
698
+
699
+ '''
700
+ This is the additional code that avoids the shuffled_all_mm_data variable undefined error.
701
+ '''
702
+ if self.temporal and video_inputs:
703
+ # ❶ make the shuffled video batch (you already computed shuffled_video_inputs)
704
+ local_shuffled_mm = [[data_type, shuffled_video_inputs]]
705
+ shuffled_prompts = copy.deepcopy(prompts_text)
706
+
707
+ # ❷ generate half as many completions for each prompt
708
+ shuffled_completion_ids, _ = self._vllm_generate(
709
+ prompts_text=shuffled_prompts,
710
+ mm_data=local_shuffled_mm,
711
+ n=self.num_generations // 2,
712
+ )
713
+
714
+ # ❸ mimic the old triple-list so later broadcast logic works unchanged
715
+ shuffled_all_mm_data = [[self.accelerator.process_index,
716
+ data_type,
717
+ shuffled_video_inputs]]
718
+ # -----------------------------------------------------------------
719
+
720
+ if self.temporal and shuffled_all_mm_data!=[]:
721
+ # broadcast and slice
722
+ shuffled_completion_ids = broadcast_object_list(shuffled_completion_ids, from_process=0)
723
+ process_id_list = []
724
+ for mm_item in shuffled_all_mm_data:
725
+ process_id_list += [mm_item[0]] * len(prompts) * (self.num_generations // 2)
726
+
727
+ if video_inputs:
728
+ cur_shuffled_completion_ids = []
729
+ for i in range(len(process_id_list)):
730
+ if self.accelerator.process_index == process_id_list[i]:
731
+ cur_shuffled_completion_ids.append(shuffled_completion_ids[i])
732
+
733
+ # Pad the completions, and concatenate them with the prompts
734
+ cur_shuffled_completion_ids = [torch.tensor(ids, device=device) for ids in cur_shuffled_completion_ids]
735
+ cur_shuffled_completion_ids = pad(
736
+ cur_shuffled_completion_ids, padding_value=self.processing_class.pad_token_id
737
+ )
738
+ shuffled_completion_ids = cur_shuffled_completion_ids
739
+
740
+
741
+ else:
742
+ raise ValueError("Only vLLM generation is supported in this version ")
743
+ '''Above is additional code'''
744
+
745
+
746
+ if self.temporal and shuffled_all_mm_data!=[]:
747
+ # broadcast and slice
748
+ shuffled_completion_ids = broadcast_object_list(shuffled_completion_ids, from_process=0)
749
+ process_id_list = []
750
+ for mm_item in shuffled_all_mm_data:
751
+ process_id_list += [mm_item[0]] * len(prompts) * (self.num_generations // 2)
752
+
753
+ if video_inputs:
754
+ cur_shuffled_completion_ids = []
755
+ for i in range(len(process_id_list)):
756
+ if self.accelerator.process_index == process_id_list[i]:
757
+ cur_shuffled_completion_ids.append(shuffled_completion_ids[i])
758
+
759
+ # Pad the completions, and concatenate them with the prompts
760
+ cur_shuffled_completion_ids = [torch.tensor(ids, device=device) for ids in cur_shuffled_completion_ids]
761
+ cur_shuffled_completion_ids = pad(
762
+ cur_shuffled_completion_ids, padding_value=self.processing_class.pad_token_id
763
+ )
764
+ shuffled_completion_ids = cur_shuffled_completion_ids
765
+
766
+
767
+ else:
768
+ raise ValueError("Only vLLM generation is supported in this version ")
769
+
770
+ # The code below is the same as yifan's original implementation
771
+ # Mask everything after the first EOS token
772
+ is_eos = completion_ids == self.processing_class.eos_token_id
773
+ device = self.accelerator.device
774
+ eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device)
775
+ eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
776
+ sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1)
777
+ completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
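# Worked example (illustrative): with eos_token_id = 2 and completion_ids = [[5, 7, 2, 9]],
# eos_idx is 2 and completion_mask becomes [[1, 1, 1, 0]]: tokens up to and including
# the first EOS count toward the loss, anything after it is masked out.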
778
+
779
+
780
+
781
+ prompt_inputs.pop("input_ids")
782
+ prompt_inputs.pop("attention_mask")
783
+
784
+ if data_type == 'image':
785
+ prompt_inputs["pixel_values"] = prompt_inputs["pixel_values"].repeat(len(prompt_completion_ids), 1)
786
+ prompt_inputs["image_grid_thw"] = prompt_inputs["image_grid_thw"].repeat(len(prompt_completion_ids), 1)
787
+ # import pdb; pdb.set_trace()
788
+
789
+
790
+ if data_type == 'video':
791
+ prompt_inputs["pixel_values_videos"] = prompt_inputs["pixel_values_videos"].repeat(len(prompt_completion_ids), 1)
792
+ prompt_inputs["video_grid_thw"] = prompt_inputs["video_grid_thw"].repeat(len(prompt_completion_ids), 1)
793
+ if 'second_per_grid_ts' in prompt_inputs:
794
+ del prompt_inputs["second_per_grid_ts"]
795
+
796
+ # import pdb
797
+ # pdb.set_trace()
798
+
799
+ # per_token_logps = self._get_per_token_logps(model, prompt_completion_ids, attention_mask, pixel_values, image_grid_thw)
800
+ per_token_logps = self._get_per_token_logps(model, prompt_completion_ids, **prompt_inputs)
801
+ # Get rid of the prompt (-1 because of the shift done in get_per_token_logps)
802
+ per_token_logps = per_token_logps[:, prompt_length - 1 :]
803
+
804
+ gc.collect()
805
+ torch.cuda.empty_cache()
806
+
807
+ with torch.inference_mode():
808
+ if self.ref_model is not None:
809
+ ref_per_token_logps = self._get_per_token_logps(self.ref_model, prompt_completion_ids, **prompt_inputs)
810
+ else:
811
+ with self.accelerator.unwrap_model(model).disable_adapter():
812
+ ref_per_token_logps = self._get_per_token_logps(model, prompt_completion_ids, **prompt_inputs)
813
+ ref_per_token_logps = ref_per_token_logps[:, prompt_length - 1 :]
814
+
815
+ x_clamped = torch.clamp(ref_per_token_logps - per_token_logps, min=-10, max=10) # clamp x to a safe range to keep exp() stable
816
+ per_token_kl = torch.exp(x_clamped) - x_clamped - 1
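# Illustrative note: with delta = ref_logp - logp, exp(delta) - delta - 1 is the
# non-negative k3 estimator of the per-token KL; e.g. delta = 0.5 gives about 0.149
# and delta = 0 gives exactly 0 (the two policies agree on that token).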
817
+
818
+ gc.collect()
819
+ torch.cuda.empty_cache()
820
+
821
+ if self.temporal and video_inputs:
822
+
823
+ shuffled_completions = self.processing_class.batch_decode(shuffled_completion_ids, skip_special_tokens=True)
824
+ if is_conversational(inputs[0]):
825
+ shuffled_completions = [[{"role": "assistant", "content": shuffled_completion}] for shuffled_completion in shuffled_completions]
826
+
827
+ # Compute the rewards
828
+ shuffled_prompts = [prompt for prompt in prompts for _ in range(self.shuffled_num_generations)]
829
+ shuffled_rewards_per_func = torch.zeros(len(shuffled_prompts), len(self.reward_funcs), device=device)
830
+ for i, (reward_func, reward_processing_class) in enumerate(
831
+ zip(self.reward_funcs, self.reward_processing_classes)
832
+ ):
833
+ # Repeat all input columns (except "prompt" and "completion") to match the number of generations
834
+ shuffled_reward_kwargs = {key: [] for key in inputs[0].keys() if key not in ["prompt", "completion"]}
835
+ for key in shuffled_reward_kwargs:
836
+ for example in inputs:
837
+ # Repeat each value in the column `self.shuffled_num_generations` times
838
+ shuffled_reward_kwargs[key].extend([example[key]] * self.shuffled_num_generations)
839
+ shuffled_output_reward_func = reward_func(prompts=shuffled_prompts, completions=shuffled_completions, **shuffled_reward_kwargs)
840
+ shuffled_rewards_per_func[:, i] = torch.tensor(shuffled_output_reward_func, dtype=torch.float32, device=device)
841
+
842
+
843
+
844
+ # Decode the generated completions
845
+ completions = self.processing_class.batch_decode(
846
+ completion_ids, skip_special_tokens=True
847
+ )
848
+ if is_conversational(inputs[0]):
849
+ completions = [
850
+ [{"role": "assistant", "content": completion}]
851
+ for completion in completions
852
+ ]
853
+
854
+
855
+ '''Below is code for second completions generation'''
856
+ if is_conversational(inputs[0]):
857
+ first_texts = [c[0]["content"] for c in completions]
858
+ else:
859
+ first_texts = completions
860
+
861
+ # ------------------------------------------------------------
862
+ # 2️⃣ Build follow-up prompts with `extract_info`
863
+ # ------------------------------------------------------------
864
+ follow_up_prompts = [extract_info(txt) for txt in first_texts]
865
+
866
+ # ------------------------------------------------------------
867
+ # 3️⃣ SECOND-hop generation ➜ `second_completions`
868
+ # ------------------------------------------------------------
869
+ _, second_texts = self._vllm_generate(
870
+ follow_up_prompts, # new prompts (pure text)
871
+ None, # no vision payload
872
+ 1 # one follow-up per prompt
873
+ )
874
+
875
+ # pack in chat format if needed
876
+ if is_conversational(inputs[0]):
877
+ second_completions = [
878
+ [{"role": "assistant", "content": t}] for t in second_texts
879
+ ]
880
+ else:
881
+ second_completions = second_texts
882
+
883
+ '''Above is code for second completions generation'''
884
+
885
+ # Compute the rewards
886
+ prompts = [prompt for prompt in prompts for _ in range(self.num_generations)]
887
+ rewards_per_func = torch.zeros(
888
+ len(prompts), len(self.reward_funcs), device=device
889
+ )
890
+ for i, (reward_func, reward_processing_class) in enumerate(
891
+ zip(self.reward_funcs, self.reward_processing_classes)
892
+ ):
893
+ reward_kwargs = {
894
+ key: []
895
+ for key in inputs[0].keys()
896
+ if key not in ["prompt", "completion"]
897
+ }
898
+
899
+ '''Below is code for taking second generations'''
900
+ # every original example contributes `self.num_generations`
901
+ for example in inputs:
902
+ for _ in range(self.num_generations): # n times
903
+ for key in reward_kwargs:
904
+ reward_kwargs[key].append(example[key])
905
+
906
+ # -------- call the reward function --------
907
+ outputs = reward_func(
908
+ prompts=follow_up_prompts, # ⬅ extracted info
909
+ completions=second_completions, # ⬅ fresh answers
910
+ **reward_kwargs,
911
+ )
912
+ rewards_per_func[:, i] = torch.tensor(outputs, dtype=torch.float32, device=device)
913
+ '''Above is code for taking second generations'''
914
+
915
+ # for key in reward_kwargs:
916
+ # for example in inputs:
917
+ # # Repeat each value in the column for `num_generations` times
918
+ # reward_kwargs[key].extend([example[key]] * self.num_generations)
919
+ # output_reward_func = reward_func(
920
+ # prompts=prompts, completions=completions, **reward_kwargs
921
+ # )
922
+ # rewards_per_func[:, i] = torch.tensor(
923
+ # output_reward_func, dtype=torch.float32, device=device
924
+ # )
925
+
926
+
927
+ # rewards_per_func = gather(rewards_per_func)
928
+ # # Sum the rewards from all reward functions
929
+ # rewards = rewards_per_func.sum(dim=1)
930
+
931
+ # process_slice = slice(
932
+ # self.accelerator.process_index * len(prompts),
933
+ # (self.accelerator.process_index + 1) * len(prompts),
934
+ # )
935
+
936
+ # rewards = rewards[process_slice]
937
+
938
+
939
+
940
+ if self.temporal and video_inputs:
941
+ temporal_rewards_per_func = rewards_per_func.clone()
942
+
943
+ acc_mean = temporal_rewards_per_func[:, 0].mean()
944
+ shuffled_acc_mean = shuffled_rewards_per_func[:, 0].mean()
945
+
946
+ if acc_mean >= 0.8 * shuffled_acc_mean:
947
+ mask = temporal_rewards_per_func[:, 0] > 0.1
948
+ temporal_rewards_per_func[mask, 0] = temporal_rewards_per_func[mask, 0] + 0.3
949
+ temporal_rewards = torch.tensor([1.0]).to('cuda')
950
+ else:
951
+ temporal_rewards = torch.tensor([0.0]).to('cuda')
952
+ else:
953
+ temporal_rewards = torch.tensor([0.5]).to('cuda')
954
+
955
+ # Sum the rewards from all reward functions
956
+ if self.temporal and video_inputs:
957
+ rewards = temporal_rewards_per_func.sum(dim=1)
958
+ else:
959
+ rewards = rewards_per_func.sum(dim=1)
960
+
961
+ if self.len_control:
962
+ mem_rewards = [0] * self.num_generations
963
+ mask = rewards_per_func[:, 0] > 0.1
964
+ lenth_list = completion_mask.sum(1)
965
+ selected_indices = torch.nonzero(mask, as_tuple=True)[0].tolist()
966
+ # if len(selected_indices) > 1 and len(selected_indices) < self.num_generations:
967
+ # if len(selected_indices) > 1:
968
+ # selected_items = [(i, lenth_list[i]) for i in selected_indices]
969
+ # sorted_items = sorted(selected_items, key=lambda x: x[1], reverse=True)
970
+ # N = len(sorted_items)
971
+ # for rank, (idx, length) in enumerate(sorted_items):
972
+ # reward = 0.2 - 0.2 * (rank / N)
973
+ # rewards[idx] += reward
974
+ # mem_rewards[idx] = reward
975
+ # for idx in range(len(lenth_list)):
976
+ # if lenth_list[idx] >= 512:
977
+ # rewards[idx] -= 0.5
978
+
979
+ if len(selected_indices) > 1:
980
+ for idx in selected_indices:
981
+ if 320 <= lenth_list[idx] <= 512:
982
+ rewards[idx] += 0.2
983
+
984
+ print(rewards)
985
+ print(completion_mask.sum(1))
986
+
987
+ # Compute grouped-wise rewards
988
+ mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
989
+ std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)
990
+
991
+ # Normalize the rewards to compute the advantages
992
+ mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
993
+ std_grouped_rewards = std_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
994
+ advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + 1e-4)
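# Worked sketch (illustrative): with num_generations = 4 and group rewards
# [1.0, 0.0, 1.0, 0.0], the group mean is 0.5 and the (unbiased) std is about 0.577,
# so the advantages are roughly [+0.87, -0.87, +0.87, -0.87]: completions that beat
# their own group get pushed up and the rest get pushed down.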
995
+
996
+ # x - x.detach() allows for preserving gradients from x
997
+ per_token_loss = torch.exp(per_token_logps - per_token_logps.detach()) * advantages.unsqueeze(1)
998
+ per_token_loss = -(per_token_loss - self.beta * per_token_kl)
999
+ # per_token_loss = -per_token_loss
1000
+ loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
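# Illustrative note: exp(per_token_logps - per_token_logps.detach()) always evaluates
# to 1.0, but its gradient with respect to the parameters is grad(per_token_logps), so
# each token's advantage weights the usual policy-gradient term while the detached copy
# keeps the forward value fixed (the "x - x.detach()" trick mentioned above).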
1001
+
1002
+
1003
+ # import pdb
1004
+ # pdb.set_trace()
1005
+
1006
+ # Log the metrics
1007
+ completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item()
1008
+ self._metrics["completion_length"].append(completion_length)
1009
+
1010
+ reward_per_func = self.accelerator.gather_for_metrics(rewards_per_func).mean(0)
1011
+ for i, reward_func in enumerate(self.reward_funcs):
1012
+ if isinstance(reward_func, PreTrainedModel):
1013
+ reward_func_name = reward_func.config._name_or_path.split("/")[-1]
1014
+ else:
1015
+ reward_func_name = reward_func.__name__
1016
+ self._metrics[f"rewards/{reward_func_name}"].append(reward_per_func[i].item())
1017
+
1018
+ gathered_rewards = self.accelerator.gather_for_metrics(rewards)
1019
+
1020
+ num_devices = gathered_rewards.size(0) // self.num_generations
1021
+ rewards_per_device = gathered_rewards.view(num_devices, self.num_generations)
1022
+ wrong_devices = (rewards_per_device <= 1).all(dim=1)
1023
+ wrong_ratio = wrong_devices.sum().item() / num_devices
1024
+
1025
+ correct_devices = (rewards_per_device >= 2).all(dim=1)
1026
+ correct_ratio = correct_devices.sum().item() / num_devices
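# Illustrative note: with 2 training processes and num_generations = 8, the gathered
# reward vector of length 16 is viewed as (2, 8); a row whose rewards are all <= 1
# counts toward `all_wrong`, and a row whose rewards are all >= 2 toward `all_correct`.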
1027
+
1028
+ self._metrics["all_wrong"].append(wrong_ratio)
1029
+ self._metrics["all_correct"].append(correct_ratio)
1030
+
1031
+ if self.temporal:
1032
+ temporal_rewards_list = self.accelerator.gather_for_metrics(temporal_rewards)
1033
+ self._metrics["temporal_rewards"].append(self.accelerator.gather_for_metrics(temporal_rewards_list).mean().item())
1034
+
1035
+ self._metrics["reward"].append(self.accelerator.gather_for_metrics(rewards).mean().item())
1036
+
1037
+ self._metrics["reward_std"].append(self.accelerator.gather_for_metrics(std_grouped_rewards).mean().item())
1038
+
1039
+ mean_kl = ((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
1040
+ self._metrics["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item())
1041
+
1042
+
1043
+ return loss
1044
+
1045
+
1046
+
1047
+
1048
+ def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
1049
+ metrics = {key: sum(val) / len(val) for key, val in self._metrics.items()} # average the metrics
1050
+
1051
+ # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
1052
+ # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format.
1053
+ if next(iter(logs.keys())).startswith("eval_"):
1054
+ metrics = {f"eval_{key}": val for key, val in metrics.items()}
1055
+
1056
+ logs = {**logs, **metrics}
1057
+ if version.parse(transformers.__version__) >= version.parse("4.47.0.dev0"):
1058
+ super().log(logs, start_time)
1059
+ else: # transformers<=4.46
1060
+ super().log(logs)
1061
+ self._metrics.clear()
src/r1-v/src/open_r1/trainer/vllm_grpo_trainer_modified_orig.py ADDED
@@ -0,0 +1,935 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import textwrap
17
+ from collections import defaultdict
18
+ from typing import Any, Callable, Optional, Union
19
+ from accelerate.utils.other import is_compiled_module
20
+ from accelerate.utils import broadcast_object_list, gather, gather_object
21
+ import torch
22
+ import torch.utils.data
23
+ import transformers
24
+ import warnings
25
+ from unittest.mock import patch
26
+ from datasets import Dataset, IterableDataset
27
+ from packaging import version
28
+ from transformers import (
29
+ AriaForConditionalGeneration,
30
+ AriaProcessor,
31
+ AutoModelForCausalLM,
32
+ AutoModelForSequenceClassification,
33
+ AutoProcessor,
34
+ AutoTokenizer,
35
+ GenerationConfig,
36
+ PreTrainedModel,
37
+ PreTrainedTokenizerBase,
38
+ Qwen2VLForConditionalGeneration,
39
+ Qwen2_5_VLForConditionalGeneration,
40
+ Trainer,
41
+ TrainerCallback,
42
+ is_wandb_available,
43
+ )
44
+ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
45
+ from transformers.utils import is_peft_available
46
+
47
+ from trl.data_utils import (
48
+ apply_chat_template,
49
+ is_conversational,
50
+ maybe_apply_chat_template,
51
+ )
52
+ from trl.import_utils import is_vllm_available
53
+
54
+ from trl.models import (
55
+ create_reference_model,
56
+ prepare_deepspeed,
57
+ unwrap_model_for_generation,
58
+ )
59
+ from trl.trainer.grpo_config import GRPOConfig
60
+ from trl.trainer.utils import generate_model_card, get_comet_experiment_url, pad
61
+ from trl import GRPOTrainer
62
+
63
+ import copy
64
+
65
+ if is_peft_available():
66
+ from peft import PeftConfig, get_peft_model
67
+
68
+ if is_vllm_available():
69
+ from vllm import LLM, SamplingParams
70
+
71
+ if is_wandb_available():
72
+ import wandb
73
+ import torch.nn as nn
74
+ from torch.utils.data import Sampler
75
+ import gc
76
+ from qwen_vl_utils import process_vision_info
77
+
78
+ # What we call a reward function is a callable that takes a list of prompts and completions and returns a list of
79
+ # rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model.
80
+ RewardFunc = Union[str, PreTrainedModel, Callable[[list, list], list[float]]]
81
+
82
+
83
+ class Qwen2VLGRPOVLLMTrainerModifiedOrig(Trainer):
84
+ def __init__(
85
+ self,
86
+ model: Union[str, PreTrainedModel],
87
+ reward_funcs: Union[RewardFunc, list[RewardFunc]],
88
+ args: GRPOConfig = None,
89
+ script_args = None,
90
+ train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
91
+ eval_dataset: Optional[
92
+ Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]
93
+ ] = None,
94
+ processing_class: Optional[PreTrainedTokenizerBase] = None,
95
+ reward_processing_classes: Optional[
96
+ Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]
97
+ ] = None,
98
+ callbacks: Optional[list[TrainerCallback]] = None,
99
+ optimizers: tuple[
100
+ Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]
101
+ ] = (None, None),
102
+ peft_config: Optional["PeftConfig"] = None,
103
+ # qwen2-vl related params
104
+ max_pixels: Optional[int] = 12845056,
105
+ min_pixels: Optional[int] = 3136,
106
+ attn_implementation: str = "flash_attention_2",
107
+ ):
108
+
109
+ # Args
110
+ if args is None:
111
+ model_name = model if isinstance(model, str) else model.config._name_or_path
112
+ model_name = model_name.split("/")[-1]
113
+ args = GRPOConfig(f"{model_name}-GRPO")
114
+
115
+ # Models
116
+ # Trained model
117
+ model_init_kwargs = args.model_init_kwargs or {}
118
+ model_init_kwargs["attn_implementation"] = attn_implementation
119
+ if isinstance(model, str):
120
+ model_id = model
121
+ torch_dtype = model_init_kwargs.get("torch_dtype")
122
+ if (
123
+ isinstance(torch_dtype, torch.dtype)
124
+ or torch_dtype == "auto"
125
+ or torch_dtype is None
126
+ ):
127
+ pass # torch_dtype is already a torch.dtype or "auto" or None
128
+ elif isinstance(torch_dtype, str): # it's a str, but not "auto"
129
+ torch_dtype = getattr(torch, torch_dtype)
130
+ model_init_kwargs["torch_dtype"] = torch_dtype
131
+ else:
132
+ raise ValueError(
133
+ "Invalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing "
134
+ f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}."
135
+ )
136
+ # Disable caching if gradient checkpointing is enabled (not supported)
137
+ model_init_kwargs["use_cache"] = (
138
+ False
139
+ if args.gradient_checkpointing
140
+ else model_init_kwargs.get("use_cache")
141
+ )
142
+ if "Qwen2-VL" in model_id:
143
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
144
+ model, **model_init_kwargs
145
+ )
146
+ elif "Qwen2.5-VL" in model_id:
147
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
148
+ model, **model_init_kwargs
149
+ )
150
+ elif "Aria" in model_id:
151
+ model_init_kwargs.pop("use_cache")
152
+ model = AriaForConditionalGeneration.from_pretrained(
153
+ model, **model_init_kwargs
154
+ )
155
+ else:
156
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model, **model_init_kwargs)
157
+ else:
158
+ model_id = model.config._name_or_path
159
+ if args.model_init_kwargs is not None:
160
+ raise ValueError(
161
+ "You passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. "
162
+ "This argument can only be used when the `model` argument is a string."
163
+ )
164
+
165
+ if peft_config is not None:
166
+ model = get_peft_model(model, peft_config)
167
+
168
+ # Reference model
169
+ if is_deepspeed_zero3_enabled():
170
+ if "Qwen2-VL" in model_id:
171
+ self.ref_model = Qwen2VLForConditionalGeneration.from_pretrained(
172
+ model_id, **model_init_kwargs
173
+ )
174
+ elif "Qwen2.5-VL" in model_id:
175
+ self.ref_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
176
+ model_id, **model_init_kwargs
177
+ )
178
+ elif "Aria" in model_id:
179
+ self.ref_model = AriaForConditionalGeneration.from_pretrained(
180
+ model_id, **model_init_kwargs
181
+ )
182
+ else:
183
+ self.ref_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
184
+ model_id, **model_init_kwargs
185
+ )
186
+ elif peft_config is None:
187
+ # If PEFT configuration is not provided, create a reference model based on the initial model.
188
+ self.ref_model = create_reference_model(model)
189
+ else:
190
+ # If PEFT is used, the reference model is not needed since the adapter can be disabled
191
+ # to revert to the initial model.
192
+ self.ref_model = None
193
+
194
+ # Processing class
195
+ # if processing_class is None:
196
+ # if "Qwen" in model_id or "Aria" in model_id:
197
+ # processing_class = AutoProcessor.from_pretrained(model_id)
198
+ # pad_token_id = processing_class.tokenizer.pad_token_id
199
+ # processing_class.pad_token_id = pad_token_id
200
+ # processing_class.eos_token_id = processing_class.tokenizer.eos_token_id
201
+ # if "Qwen" in model_id:
202
+ # processing_class.image_processor.max_pixels = max_pixels
203
+ # processing_class.image_processor.min_pixels = min_pixels
204
+ # else:
205
+ # processing_class = AutoTokenizer.from_pretrained(
206
+ # model.config._name_or_path, padding_side="left"
207
+ # )
208
+ # pad_token_id = processing_class.pad_token_id
209
+
210
+
211
+ # ────────────────────────────────────────────────────────────────
212
+ # Robust processor loading ― works for both fresh models *and* checkpoints
213
+ # ────────────────────────────────────────────────────────────────
214
+ if processing_class is None:
215
+ # 1️⃣ First try to load whatever lives in the directory we were given.
216
+ # This succeeds if you previously did `processor.save_pretrained(output_dir)`.
217
+ try:
218
+ processing_class = AutoProcessor.from_pretrained(model_id)
219
+ pad_token_id = processing_class.tokenizer.pad_token_id
220
+ except (OSError, ValueError): # no processor files found
221
+ # 2️⃣ Fall back to inspecting the *model object* instead of the path.
222
+ is_vl_model = (
223
+ hasattr(model, "vision_tower") or # Qwen-VL, InternVL, etc.
224
+ getattr(model.config, "vision_config", None) is not None or
225
+ getattr(model.config, "image_vocab_size", None) is not None
226
+ )
227
+
228
+ if is_vl_model:
229
+ # Always use the *base* model name stored in the config.
230
+ base_name = model.config._name_or_path # e.g. "Qwen/Qwen2.5-VL-7B-Instruct"
231
+ processing_class = AutoProcessor.from_pretrained(base_name)
232
+ pad_token_id = processing_class.tokenizer.pad_token_id
233
+
234
+ # Optional Qwen-specific limits
235
+ if hasattr(processing_class, "image_processor"):
236
+ processing_class.image_processor.max_pixels = max_pixels
237
+ processing_class.image_processor.min_pixels = min_pixels
238
+ else:
239
+ # Pure text model → plain tokenizer
240
+ processing_class = AutoTokenizer.from_pretrained(
241
+ model.config._name_or_path, padding_side="left"
242
+ )
243
+ pad_token_id = processing_class.pad_token_id
244
+
245
+ # 3️⃣ Harmonise attributes the rest of the trainer expects
246
+ processing_class.pad_token_id = pad_token_id
247
+ if not hasattr(processing_class, "eos_token_id"):
248
+ processing_class.eos_token_id = pad_token_id
249
+ # ────────────────────────────────────────────────────────────────
250
+
251
+ # Reward functions
252
+ if not isinstance(reward_funcs, list):
253
+ reward_funcs = [reward_funcs]
254
+ for i, reward_func in enumerate(reward_funcs):
255
+ if isinstance(reward_func, str):
256
+ reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained(
257
+ reward_func, num_labels=1, **model_init_kwargs
258
+ )
259
+ self.reward_funcs = reward_funcs
260
+
261
+ # Reward processing class
262
+ if reward_processing_classes is None:
263
+ reward_processing_classes = [None] * len(reward_funcs)
264
+ elif not isinstance(reward_processing_classes, list):
265
+ reward_processing_classes = [reward_processing_classes]
266
+ else:
267
+ if len(reward_processing_classes) != len(reward_funcs):
268
+ raise ValueError(
269
+ "The number of reward processing classes must match the number of reward functions."
270
+ )
271
+
272
+ for i, (reward_processing_class, reward_func) in enumerate(
273
+ zip(reward_processing_classes, reward_funcs)
274
+ ):
275
+ if isinstance(reward_func, PreTrainedModel):
276
+ if reward_processing_class is None:
277
+ reward_processing_class = AutoTokenizer.from_pretrained(
278
+ reward_func.config._name_or_path
279
+ )
280
+ if reward_processing_class.pad_token_id is None:
281
+ reward_processing_class.pad_token = (
282
+ reward_processing_class.eos_token
283
+ )
284
+ # The reward model computes the reward for the latest non-padded token in the input sequence.
285
+ # So it's important to set the pad token ID to the padding token ID of the processing class.
286
+ reward_func.config.pad_token_id = reward_processing_class.pad_token_id
287
+ reward_processing_classes[i] = reward_processing_class
288
+ self.reward_processing_classes = reward_processing_classes
289
+
290
+ # Data collator
291
+ def data_collator(features): # No data collation is needed in GRPO
292
+ return features
293
+
294
+ # Training arguments
295
+ self.max_prompt_length = args.max_prompt_length
296
+ self.max_completion_length = (
297
+ args.max_completion_length
298
+ ) # = |o_i| in the GRPO paper
299
+ self.num_generations = args.num_generations # = G in the GRPO paper
300
+ self.temporal = script_args.temporal
301
+ self.generation_config = GenerationConfig(
302
+ max_new_tokens=self.max_completion_length,
303
+ do_sample=True,
304
+ temperature=1, # HACK
305
+ num_return_sequences=self.num_generations,
306
+ pad_token_id=pad_token_id,
307
+ )
308
+ self.beta = args.beta
309
+
310
+ self.shuffled_num_generations = self.num_generations // 2
311
+ self.shuffled_generation_config = GenerationConfig(
312
+ max_new_tokens=self.max_completion_length,
313
+ do_sample=True,
314
+ top_p=0.95,
315
+ temperature=1, # HACK
316
+ num_return_sequences=self.shuffled_num_generations,
317
+ pad_token_id=pad_token_id,
318
+ )
319
+
320
+ self.dummy_generation_config = GenerationConfig(
321
+ max_new_tokens=1,
322
+ do_sample=True,
323
+ top_p=0.95,
324
+ temperature=1, # HACK
325
+ num_return_sequences=1,
326
+ pad_token_id=pad_token_id,
327
+ )
328
+ self.len_control = script_args.len_control
329
+ self.beta = args.beta
330
+
331
+ # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
332
+ # input tensor associated with the key "input_ids". However, in GRPO, the sampled data does not include the
333
+ # "input_ids" key. Instead, the available keys is "prompt". As a result, the trainer issues the warning:
334
+ # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." To
335
+ # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True.
336
+ # This acts as a flag to indicate that the warning has already been issued.
337
+ model.warnings_issued["estimate_tokens"] = True
338
+
339
+ # Initialize the metrics
340
+ self._metrics = defaultdict(list)
341
+ self.use_vllm = args.use_vllm
342
+
343
+ super().__init__(
344
+ model=model,
345
+ args=args,
346
+ data_collator=data_collator,
347
+ train_dataset=train_dataset,
348
+ eval_dataset=eval_dataset,
349
+ processing_class=processing_class,
350
+ callbacks=callbacks,
351
+ optimizers=optimizers,
352
+ )
353
+ # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
354
+ # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
355
+ # self.model_accepts_loss_kwargs to False to enable scaling.
356
+ self.model_accepts_loss_kwargs = False
357
+
358
+ if self.use_vllm:
359
+ if not is_vllm_available():
360
+ raise ImportError(
361
+ "vLLM is not available and `use_vllm` is set to True. Please install vLLM with "
362
+ "`pip install vllm` to use it."
363
+ )
364
+
365
+ if self.accelerator.is_main_process:
366
+ vllm_device = self.args.vllm_device
367
+ if vllm_device == "auto":
368
+ vllm_device = f"cuda:{self.accelerator.num_processes}" # take the next GPU idx
369
+ # Check that the requested device is available
370
+ if (
371
+ vllm_device.split(":")[0] == "cuda"
372
+ and int(vllm_device.split(":")[1]) >= torch.cuda.device_count()
373
+ ):
374
+ raise ValueError(
375
+ f"The requested device for vllm ({vllm_device}) is not available. You are likely using vLLM "
376
+ "without restricting the number of GPUs for training. Set the `--num_processes` argument to a "
377
+ "value lower than the number of GPUs available on your machine—typically, reducing it by one "
378
+ f"is sufficient. In your case: `--num_processes {torch.cuda.device_count() - 1}`."
379
+ )
380
+ # Check that the requested device is not also used for training
381
+ if vllm_device in {
382
+ f"cuda:{idx}" for idx in range(self.accelerator.num_processes)
383
+ }:
384
+ warnings.warn(
385
+ f"The requested device {vllm_device} is also used for training. This may lead to unexpected "
386
+ "behavior. It is recommended to use a dedicated device for vLLM."
387
+ )
388
+ # vLLM is not compatible with accelerate. So we need to patch it to make sure we can (1) place the vLLM
389
+ # model on the desired device (world_size_patch) and (2) avoid a test that is not designed for our
390
+ # setting (profiling_patch).
391
+ world_size_patch = patch(
392
+ "torch.distributed.get_world_size", return_value=1
393
+ )
394
+ profiling_patch = patch(
395
+ "vllm.worker.worker.Worker._assert_memory_footprint_increased_during_profiling",
396
+ return_value=None,
397
+ )
398
+ with world_size_patch, profiling_patch:
399
+ print("vllm is running on: ", vllm_device)
400
+ self.llm = LLM(
401
+ model=model.name_or_path,
402
+ device=vllm_device,
403
+ gpu_memory_utilization=self.args.vllm_gpu_memory_utilization,
404
+ dtype=torch.bfloat16,
405
+ # Automatic Prefix Caching caches the KV cache of existing queries, so that a new query can
406
+ # directly reuse the KV cache if it shares the same prefix with one of the existing queries.
407
+ # This is particularly useful here because we generate completions from the same prompts.
408
+ enable_prefix_caching=True,
409
+ enforce_eager=True,
410
+ mm_processor_kwargs=(
411
+ {
412
+ "max_pixels": max_pixels,
413
+ "min_pixels": min_pixels,
414
+ }
415
+ # if "Qwen2-VL" in model_id or "Qwen2.5-VL" in model_id
416
+ if False
417
+ else None
418
+ ),
419
+ max_model_len=args.max_prompt_length + args.max_completion_length,
420
+ )
421
+ self.sampling_params = SamplingParams(
422
+ temperature=1.0,
423
+ top_p=0.95,
424
+ max_tokens=self.max_completion_length,
425
+ )
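+ # These sampling settings (temperature 1.0, top-p 0.95) keep the num_generations completions
+ # per prompt diverse, which the group-relative baseline used by GRPO relies on.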
426
+
427
+ self._last_loaded_step = 0  # last global step at which weights were pushed to vLLM; avoids redundant reloads during gradient accumulation
428
+
429
+ # When using vLLM, the main process is responsible for loading the model weights. This can cause process
430
+ # desynchronization and seems to lead to DeepSpeed hanging during initialization. To prevent this, we
431
+ # synchronize all processes after vLLM has been fully initialized.
432
+ self.accelerator.wait_for_everyone()
433
+ else:
434
+ raise ValueError(
435
+ "GRPOVLLMTrainerModified only supports vllm generation, please set --use_vllm True"
436
+ )
437
+
438
+ if self.ref_model is not None:
439
+ if self.is_deepspeed_enabled:
440
+ self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
441
+ else:
442
+ self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
443
+
444
+ for i, reward_func in enumerate(self.reward_funcs):
445
+ if isinstance(reward_func, PreTrainedModel):
446
+ self.reward_funcs[i] = self.accelerator.prepare_model(reward_func, evaluation_mode=True)
447
+
448
+ def _set_signature_columns_if_needed(self):
449
+ # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
450
+ # By default, this method sets `self._signature_columns` to the model's expected inputs.
451
+ # In GRPOTrainer, we preprocess data, so using the model's signature columns doesn't work.
452
+ # Instead, we set them to the columns expected by the `training_step` method, hence the override.
453
+ if self._signature_columns is None:
454
+ self._signature_columns = ["prompt"]
455
+
456
+ # Get the per-token log probabilities for the completions for the model and the reference model
457
+ def _get_per_token_logps(self, model, input_ids, **kwargs):
458
+ # logits = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, image_grid_thw=image_grid_thw).logits # (B, L, V)
459
+ # import pdb
460
+ # pdb.set_trace()
461
+ logits = model(input_ids, **kwargs).logits
462
+ logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred
463
+ input_ids = input_ids[:, 1:] # (B, L-1), exclude the first input ID since we don't have logits for it
464
+ # Compute the log probabilities for the input tokens. Use a loop to reduce memory peak.
465
+ per_token_logps = []
466
+ for logits_row, input_ids_row in zip(logits, input_ids):
467
+ log_probs = logits_row.log_softmax(dim=-1)
468
+ token_log_prob = torch.gather(log_probs, dim=1, index=input_ids_row.unsqueeze(1)).squeeze(1)
469
+ per_token_logps.append(token_log_prob)
470
+ return torch.stack(per_token_logps)
471
+
472
+ # Trainer "prepares" the inputs before calling `compute_loss`. It converts to tensor and move to device.
473
+ # Since we preprocess the data in `compute_loss`, we need to override this method to skip this step.
474
+ def _prepare_inputs(
475
+ self, inputs: dict[str, Union[torch.Tensor, Any]]
476
+ ) -> dict[str, Union[torch.Tensor, Any]]:
477
+ return inputs
478
+
479
+ def remove_none_from_data(self, data):
480
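+ # Strip None-valued keys from each message's content entries (e.g. an unused "image" or
+ # "video" field), presumably so they do not interfere with process_vision_info below.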
+ for entry in data:
481
+ if "content" in entry and isinstance(entry["content"], list):
482
+ for sub_entry in entry["content"]:
483
+ if isinstance(sub_entry, dict):
484
+ keys_to_remove = [k for k, v in sub_entry.items() if v is None]
485
+ for k in keys_to_remove:
486
+ del sub_entry[k]
487
+ return data
488
+
489
+
490
+
491
+ def compute_loss(
492
+ self, model, inputs, return_outputs=False, num_items_in_batch=None
493
+ ):
494
+ if return_outputs:
495
+ raise ValueError("The GRPOTrainer does not support returning outputs")
496
+ # Compute the per-token log probabilities for the model
497
+
498
+
499
+ device = self.accelerator.device
500
+ prompts = [x["prompt"] for x in inputs]
501
+ # images = [x["image"] for x in inputs]
502
+ prompts_text = [
503
+ maybe_apply_chat_template(example, self.processing_class)["prompt"]
504
+ for example in inputs
505
+ ]
506
+
507
+ input_copy = copy.deepcopy(inputs[0]['prompt'])
508
+
509
+ input_copy = self.remove_none_from_data(input_copy)
510
+
511
+ data_type = inputs[0]['data_type']
512
+
513
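+ # Resolve the dataset-relative media path to an absolute one under ./Video-R1-data;
+ # the stored path is assumed to begin with a leading "." that is dropped by [1:].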
+ if data_type == 'image':
514
+ input_copy[0]['content'][0]['image'] = os.getcwd() + "/Video-R1-data" + inputs[0]['path'][1:]
515
+ elif data_type == 'video':
516
+ input_copy[0]['content'][0]['video'] = os.getcwd() + "/Video-R1-data" + inputs[0]['path'][1:]
517
+
518
+
519
+ image_inputs, video_inputs, video_kwargs = process_vision_info(input_copy, return_video_kwargs=True)
520
+
521
+
522
+ prompt_inputs = self.processing_class(
523
+ text=copy.deepcopy(prompts_text),
524
+ images=image_inputs,
525
+ videos=video_inputs,
526
+ return_tensors="pt",
527
+ padding=True,
528
+ padding_side="left",
529
+ add_special_tokens=False,
530
+ )
531
+
532
+ mm_data = [[data_type, image_inputs if image_inputs else video_inputs]]
533
+ prompt_inputs = super()._prepare_inputs(prompt_inputs)
534
+ prompt_ids, prompt_mask = prompt_inputs["input_ids"], prompt_inputs["attention_mask"]
535
+
536
+ if self.max_prompt_length is not None:
537
+ prompt_ids = prompt_ids[:, -self.max_prompt_length :]
538
+ prompt_mask = prompt_mask[:, -self.max_prompt_length :]
539
+
540
+
541
+ if self.temporal:
542
+ if video_inputs:
543
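+ # For the temporal objective, randomly permute the video frames along the time axis and
+ # build a second prompt batch from the shuffled frames; its completions are used later to
+ # measure how much the reward depends on the correct frame order.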
+ indices = torch.randperm(video_inputs[0].size(0))
544
+ shuffled_video_inputs = [video_inputs[0][indices]]
545
+ shuffled_prompt_inputs = self.processing_class(
546
+ text=copy.deepcopy(prompts_text),
547
+ images=image_inputs,
548
+ videos=shuffled_video_inputs,
549
+ return_tensors="pt",
550
+ padding=True,
551
+ padding_side="left",
552
+ add_special_tokens=False,
553
+ )
554
+ shuffled_mm_data = [[self.accelerator.process_index, data_type, image_inputs if image_inputs else video_inputs]]
555
+ shuffled_prompt_inputs = super()._prepare_inputs(shuffled_prompt_inputs)
556
+ shuffled_prompt_ids, shuffled_prompt_mask = shuffled_prompt_inputs["input_ids"], shuffled_prompt_inputs["attention_mask"]
557
+ if self.max_prompt_length is not None:
558
+ shuffled_prompt_ids = shuffled_prompt_ids[:, -self.max_prompt_length :]
559
+ shuffled_prompt_mask = shuffled_prompt_mask[:, -self.max_prompt_length :]
560
+ else:
561
+ shuffled_mm_data = [None]
562
+
563
+
564
+
565
+ if self.args.use_vllm:
566
+ # First, have main process load weights if needed
567
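+ # Sync the updated policy weights into the vLLM engine held by the main process once per
+ # optimizer step; gradient-accumulation micro-steps share the same global_step and skip this.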
+ if self.state.global_step != self._last_loaded_step:
568
+ with unwrap_model_for_generation(
569
+ self.model,
570
+ self.accelerator,
571
+ gather_deepspeed3_params=True, # TODO: fix this, self.args.ds3_gather_for_generation,
572
+ ) as unwrapped_model:
573
+ if is_compiled_module(unwrapped_model):
574
+ state_dict = unwrapped_model._orig_mod.state_dict()
575
+ else:
576
+ state_dict = unwrapped_model.state_dict()
577
+ if self.accelerator.is_main_process:
578
+ llm_model = (
579
+ self.llm.llm_engine.model_executor.driver_worker.model_runner.model
580
+ )
581
+ # import pdb
582
+ # pdb.set_trace()
583
+ llm_model.load_weights(state_dict.items())
584
+ self._last_loaded_step = self.state.global_step
585
+
586
+ # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
587
+ all_prompts_text = gather_object(prompts_text)
588
+ all_mm_data = gather_object(mm_data)
589
+ # group into pairs
590
+ all_multimodal_inputs = []
591
+
592
+ if self.temporal:
593
+ shuffled_all_mm_data_none = gather_object(shuffled_mm_data)
594
+ shuffled_all_mm_data = [x for x in shuffled_all_mm_data_none if x]
595
+ shuffled_all_multimodal_inputs = []
596
+
597
+ # Following TobiasLee's implementation suggestion:
598
+ # build one request per prompt, paired with its multi-modal data, which is a better fit for vLLM sampling.
599
+ for prompt, mm_item in zip(all_prompts_text, all_mm_data):
600
+ all_multimodal_inputs.append({"prompt": prompt, "multi_modal_data": {mm_item[0]: mm_item[1]}})
601
+
602
+ if self.temporal and shuffled_all_mm_data!=[]:
603
+ for mm_item in shuffled_all_mm_data:
604
+ shuffled_all_multimodal_inputs.append({"prompt": all_prompts_text[mm_item[0]], "multi_modal_data": {mm_item[1]: mm_item[2]}})
605
+
606
+ # Create sampling params with num_generations
607
+ if self.accelerator.is_main_process:
608
+ # Clone to avoid modifying original params
609
+ sampling_params = copy.deepcopy(self.sampling_params)
610
+ sampling_params.n = self.num_generations
611
+ # Single generate call with all prompts
612
+ if self.accelerator.is_main_process:
613
+ outputs = self.llm.generate(
614
+ all_multimodal_inputs,
615
+ sampling_params=sampling_params,
616
+ use_tqdm=False,
617
+ )
618
+ # Flatten outputs: [prompt1_gen1, prompt1_gen2, ..., prompt2_gen1, prompt2_gen2, ...]
619
+ completion_ids = [out.token_ids for completion in outputs for out in completion.outputs]
620
+
621
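+ # The shuffled-frame prompts are sampled with only num_generations // 2 completions each;
+ # these are used solely for the temporal reward comparison and never enter the policy loss.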
+ if self.temporal and shuffled_all_mm_data!=[]:
622
+ # Clone to avoid modifying original params
623
+ shuffled_sampling_params = copy.deepcopy(self.sampling_params)
624
+ shuffled_sampling_params.n = self.num_generations // 2
625
+ # Single generate call with all prompts
626
+ if self.accelerator.is_main_process:
627
+ shuffled_outputs = self.llm.generate(
628
+ shuffled_all_multimodal_inputs,
629
+ sampling_params=shuffled_sampling_params,
630
+ use_tqdm=False,
631
+ )
632
+ # Flatten outputs: [prompt1_gen1, prompt1_gen2, ..., prompt2_gen1, prompt2_gen2, ...]
633
+ shuffled_completion_ids = [out.token_ids for completion in shuffled_outputs for out in completion.outputs]
634
+
635
+
636
+ else:
637
+ completion_ids = [None] * len(all_multimodal_inputs) * self.num_generations
638
+
639
+ if self.temporal and shuffled_all_mm_data!=[]:
640
+ shuffled_completion_ids = [None] * len(shuffled_all_multimodal_inputs) * (self.num_generations // 2)
641
+
642
+
643
+ # broadcast and slice
644
+ completion_ids = broadcast_object_list(completion_ids, from_process=0)
645
+ process_slice = slice(
646
+ self.accelerator.process_index * len(prompts) * self.num_generations,
647
+ (self.accelerator.process_index + 1) * len(prompts) * self.num_generations,
648
+ )
649
+ completion_ids = completion_ids[process_slice]
650
+
651
+ # Pad the completions, and concatenate them with the prompts
652
+ completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
653
+ completion_ids = pad(
654
+ completion_ids, padding_value=self.processing_class.pad_token_id
655
+ )
656
+ prompt_ids = prompt_ids.repeat_interleave(self.num_generations, dim=0)
657
+ prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
658
+
659
+ prompt_length = prompt_ids.size(1)
660
+
661
+ # print('prompt_length:', prompt_length)
662
+
663
+ prompt_ids = prompt_completion_ids[:, :prompt_length]
664
+ completion_ids = prompt_completion_ids[:, prompt_length:]
665
+ prompt_mask = prompt_mask.repeat_interleave(self.num_generations, dim=0)
666
+
667
+
668
+ if self.temporal and shuffled_all_mm_data!=[]:
669
+ # broadcast and slice
670
+ shuffled_completion_ids = broadcast_object_list(shuffled_completion_ids, from_process=0)
671
+ process_id_list = []
672
+ for mm_item in shuffled_all_mm_data:
673
+ process_id_list += [mm_item[0]] * len(prompts) * (self.num_generations // 2)
674
+
675
+ if video_inputs:
676
+ cur_shuffled_completion_ids = []
677
+ for i in range(len(process_id_list)):
678
+ if self.accelerator.process_index == process_id_list[i]:
679
+ cur_shuffled_completion_ids.append(shuffled_completion_ids[i])
680
+
681
+ # Pad the completions, and concatenate them with the prompts
682
+ cur_shuffled_completion_ids = [torch.tensor(ids, device=device) for ids in cur_shuffled_completion_ids]
683
+ cur_shuffled_completion_ids = pad(
684
+ cur_shuffled_completion_ids, padding_value=self.processing_class.pad_token_id
685
+ )
686
+ shuffled_completion_ids = cur_shuffled_completion_ids
687
+
688
+
689
+ else:
690
+ raise ValueError("Only vLLM generation is supported in this version ")
691
+
692
+ # The code below follows yifan's original implementation.
693
+ # Mask everything after the first EOS token
694
+ is_eos = completion_ids == self.processing_class.eos_token_id
695
+ device = self.accelerator.device
696
+ eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device)
697
+ eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
698
+ sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1)
699
+ completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
700
+
701
+
702
+
703
+ prompt_inputs.pop("input_ids")
704
+ prompt_inputs.pop("attention_mask")
705
+
706
+ if data_type == 'image':
707
+ prompt_inputs["pixel_values"] = prompt_inputs["pixel_values"].repeat(len(prompt_completion_ids), 1)
708
+ prompt_inputs["image_grid_thw"] = prompt_inputs["image_grid_thw"].repeat(len(prompt_completion_ids), 1)
709
+ # import pdb; pdb.set_trace()
710
+
711
+
712
+ if data_type == 'video':
713
+ prompt_inputs["pixel_values_videos"] = prompt_inputs["pixel_values_videos"].repeat(len(prompt_completion_ids), 1)
714
+ prompt_inputs["video_grid_thw"] = prompt_inputs["video_grid_thw"].repeat(len(prompt_completion_ids), 1)
715
+ if 'second_per_grid_ts' in prompt_inputs:
716
+ del prompt_inputs["second_per_grid_ts"]
717
+
718
+ # import pdb
719
+ # pdb.set_trace()
720
+
721
+ # per_token_logps = self._get_per_token_logps(model, prompt_completion_ids, attention_mask, pixel_values, image_grid_thw)
722
+ per_token_logps = self._get_per_token_logps(model, prompt_completion_ids, **prompt_inputs)
723
+ # Get rid of the prompt (-1 because of the shift done in get_per_token_logps)
724
+ per_token_logps = per_token_logps[:, prompt_length - 1 :]
725
+
726
+ gc.collect()
727
+ torch.cuda.empty_cache()
728
+
729
+ with torch.inference_mode():
730
+ if self.ref_model is not None:
731
+ ref_per_token_logps = self._get_per_token_logps(self.ref_model, prompt_completion_ids, **prompt_inputs)
732
+ else:
733
+ with self.accelerator.unwrap_model(model).disable_adapter():
734
+ ref_per_token_logps = self._get_per_token_logps(model, prompt_completion_ids, **prompt_inputs)
735
+ ref_per_token_logps = ref_per_token_logps[:, prompt_length - 1 :]
736
+
737
+ x_clamped = torch.clamp(ref_per_token_logps - per_token_logps, min=-10, max=10)  # clamp x to a bounded range for numerical stability
738
+ per_token_kl = torch.exp(x_clamped) - x_clamped - 1
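+ # The line above is the k3 estimator of KL(pi_theta || pi_ref): exp(x) - x - 1 with
+ # x = log pi_ref - log pi_theta, which is non-negative and unbiased for tokens sampled
+ # from the current policy; the clamp keeps exp(x) from overflowing.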
739
+
740
+ gc.collect()
741
+ torch.cuda.empty_cache()
742
+
743
+ if self.temporal and video_inputs:
744
+
745
+ shuffled_completions = self.processing_class.batch_decode(shuffled_completion_ids, skip_special_tokens=True)
746
+ if is_conversational(inputs[0]):
747
+ shuffled_completions = [[{"role": "assistant", "content": shuffled_completion}] for shuffled_completion in shuffled_completions]
748
+
749
+ # Compute the rewards
750
+ shuffled_prompts = [prompt for prompt in prompts for _ in range(self.shuffled_num_generations)]
751
+ shuffled_rewards_per_func = torch.zeros(len(shuffled_prompts), len(self.reward_funcs), device=device)
752
+ for i, (reward_func, reward_processing_class) in enumerate(
753
+ zip(self.reward_funcs, self.reward_processing_classes)
754
+ ):
755
+ # Repeat all input columns (but "prompt" and "completion") to match the number of generations
756
+ shuffled_reward_kwargs = {key: [] for key in inputs[0].keys() if key not in ["prompt", "completion"]}
757
+ for key in shuffled_reward_kwargs:
758
+ for example in inputs:
759
+ # Repeat each value in the column for `num_generations` times
760
+ shuffled_reward_kwargs[key].extend([example[key]] * self.shuffled_num_generations)
761
+ shuffled_output_reward_func = reward_func(prompts=shuffled_prompts, completions=shuffled_completions, **shuffled_reward_kwargs)
762
+ shuffled_rewards_per_func[:, i] = torch.tensor(shuffled_output_reward_func, dtype=torch.float32, device=device)
763
+
764
+
765
+
766
+ # Decode the generated completions
767
+ completions = self.processing_class.batch_decode(
768
+ completion_ids, skip_special_tokens=True
769
+ )
770
+ if is_conversational(inputs[0]):
771
+ completions = [
772
+ [{"role": "assistant", "content": completion}]
773
+ for completion in completions
774
+ ]
775
+
776
+ # Compute the rewards
777
+ prompts = [prompt for prompt in prompts for _ in range(self.num_generations)]
778
+ rewards_per_func = torch.zeros(
779
+ len(prompts), len(self.reward_funcs), device=device
780
+ )
781
+ for i, (reward_func, reward_processing_class) in enumerate(
782
+ zip(self.reward_funcs, self.reward_processing_classes)
783
+ ):
784
+ reward_kwargs = {
785
+ key: []
786
+ for key in inputs[0].keys()
787
+ if key not in ["prompt", "completion"]
788
+ }
789
+ for key in reward_kwargs:
790
+ for example in inputs:
791
+ # Repeat each value in the column for `num_generations` times
792
+ reward_kwargs[key].extend([example[key]] * self.num_generations)
793
+ output_reward_func = reward_func(
794
+ prompts=prompts, completions=completions, **reward_kwargs
795
+ )
796
+ rewards_per_func[:, i] = torch.tensor(
797
+ output_reward_func, dtype=torch.float32, device=device
798
+ )
799
+
800
+
801
+ # rewards_per_func = gather(rewards_per_func)
802
+ # # Sum the rewards from all reward functions
803
+ # rewards = rewards_per_func.sum(dim=1)
804
+
805
+ # process_slice = slice(
806
+ # self.accelerator.process_index * len(prompts),
807
+ # (self.accelerator.process_index + 1) * len(prompts),
808
+ # )
809
+
810
+ # rewards = rewards[process_slice]
811
+
812
+
813
+
814
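+ # Temporal-consistency reward: compare the accuracy reward on the original frame order with
+ # the shuffled-frame generations. If ordered-frame accuracy is at least 0.8x the shuffled one,
+ # correct completions (acc reward > 0.1) receive a +0.3 bonus and temporal_rewards is logged
+ # as 1.0; otherwise 0.0 (0.5 for batches without video).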
+ if self.temporal and video_inputs:
815
+ temporal_rewards_per_func = rewards_per_func.clone()
816
+
817
+ acc_mean = temporal_rewards_per_func[:, 0].mean()
818
+ shuffled_acc_mean = shuffled_rewards_per_func[:, 0].mean()
819
+
820
+ if acc_mean >= 0.8 * shuffled_acc_mean:
821
+ mask = temporal_rewards_per_func[:, 0] > 0.1
822
+ temporal_rewards_per_func[mask, 0] = temporal_rewards_per_func[mask, 0] + 0.3
823
+ temporal_rewards = torch.tensor([1.0]).to('cuda')
824
+ else:
825
+ temporal_rewards = torch.tensor([0.0]).to('cuda')
826
+ else:
827
+ temporal_rewards = torch.tensor([0.5]).to('cuda')
828
+
829
+ # Sum the rewards from all reward functions
830
+ if self.temporal and video_inputs:
831
+ rewards = temporal_rewards_per_func.sum(dim=1)
832
+ else:
833
+ rewards = rewards_per_func.sum(dim=1)
834
+
835
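+ # Length control: when more than one sampled completion is correct, give a +0.2 bonus to
+ # correct completions whose length lies in the [320, 1600]-token window, discouraging
+ # degenerate very short or very long chains of thought.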
+ if self.len_control:
836
+ mem_rewards = [0] * self.num_generations
837
+ mask = rewards_per_func[:, 0] > 0.1
838
+ lenth_list = completion_mask.sum(1)
839
+ selected_indices = torch.nonzero(mask, as_tuple=True)[0].tolist()
840
+ # if len(selected_indices) > 1 and len(selected_indices) < self.num_generations:
841
+ # if len(selected_indices) > 1:
842
+ # selected_items = [(i, lenth_list[i]) for i in selected_indices]
843
+ # sorted_items = sorted(selected_items, key=lambda x: x[1], reverse=True)
844
+ # N = len(sorted_items)
845
+ # for rank, (idx, length) in enumerate(sorted_items):
846
+ # reward = 0.2 - 0.2 * (rank / N)
847
+ # rewards[idx] += reward
848
+ # mem_rewards[idx] = reward
849
+ # for idx in range(len(lenth_list)):
850
+ # if lenth_list[idx] >= 512:
851
+ # rewards[idx] -= 0.5
852
+
853
+ if len(selected_indices) > 1:
854
+ for idx in selected_indices:
855
+ if 320 <= lenth_list[idx] <= 1600:
856
+ rewards[idx] += 0.2
857
+
858
+ # print(rewards)
859
+ # print(completion_mask.sum(1))
860
+
861
+ # Compute grouped-wise rewards
862
+ mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
863
+ std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)
864
+
865
+ # Normalize the rewards to compute the advantages
866
+ mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
867
+ std_grouped_rewards = std_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
868
+ advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + 1e-4)
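+ # Group-relative advantage (GRPO): each completion's reward is standardized against the mean
+ # and std of the num_generations completions sampled for the same prompt,
+ # i.e. A_i = (r_i - mean(r_group)) / (std(r_group) + 1e-4).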
869
+
870
+ # x - x.detach() allows for preserving gradients from x
871
+ per_token_loss = torch.exp(per_token_logps - per_token_logps.detach()) * advantages.unsqueeze(1)
872
+ per_token_loss = -(per_token_loss - self.beta * per_token_kl)
873
+ # per_token_loss = -per_token_loss
874
+ loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
875
+
876
+
877
+ # import pdb
878
+ # pdb.set_trace()
879
+
880
+ # Log the metrics
881
+ completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item()
882
+ self._metrics["completion_length"].append(completion_length)
883
+
884
+ reward_per_func = self.accelerator.gather_for_metrics(rewards_per_func).mean(0)
885
+ for i, reward_func in enumerate(self.reward_funcs):
886
+ if isinstance(reward_func, PreTrainedModel):
887
+ reward_func_name = reward_func.config._name_or_path.split("/")[-1]
888
+ else:
889
+ reward_func_name = reward_func.__name__
890
+ self._metrics[f"rewards/{reward_func_name}"].append(reward_per_func[i].item())
891
+
892
+ gathered_rewards = self.accelerator.gather_for_metrics(rewards)
893
+
894
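+ # Diagnostic metrics: view the gathered rewards as one row of num_generations completions per
+ # prompt and log the fraction of prompts where every completion scored <= 1 ("all_wrong") or
+ # >= 2 ("all_correct"); the thresholds presumably reflect the combined accuracy + format reward scale.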
+ num_devices = gathered_rewards.size(0) // self.num_generations
895
+ rewards_per_device = gathered_rewards.view(num_devices, self.num_generations)
896
+ wrong_devices = (rewards_per_device <= 1).all(dim=1)
897
+ wrong_ratio = wrong_devices.sum().item() / num_devices
898
+
899
+ correct_devices = (rewards_per_device >= 2).all(dim=1)
900
+ correct_ratio = correct_devices.sum().item() / num_devices
901
+
902
+ self._metrics["all_wrong"].append(wrong_ratio)
903
+ self._metrics["all_correct"].append(correct_ratio)
904
+
905
+ if self.temporal:
906
+ temporal_rewards_list = self.accelerator.gather_for_metrics(temporal_rewards)
907
+ self._metrics["temporal_rewards"].append(self.accelerator.gather_for_metrics(temporal_rewards_list).mean().item())
908
+
909
+ self._metrics["reward"].append(self.accelerator.gather_for_metrics(rewards).mean().item())
910
+
911
+ self._metrics["reward_std"].append(self.accelerator.gather_for_metrics(std_grouped_rewards).mean().item())
912
+
913
+ mean_kl = ((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
914
+ self._metrics["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item())
915
+
916
+
917
+ return loss
918
+
919
+
920
+
921
+
922
+ def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
923
+ metrics = {key: sum(val) / len(val) for key, val in self._metrics.items()} # average the metrics
924
+
925
+ # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
926
+ # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format.
927
+ if next(iter(logs.keys())).startswith("eval_"):
928
+ metrics = {f"eval_{key}": val for key, val in metrics.items()}
929
+
930
+ logs = {**logs, **metrics}
931
+ if version.parse(transformers.__version__) >= version.parse("4.47.0.dev0"):
932
+ super().log(logs, start_time)
933
+ else: # transformers<=4.46
934
+ super().log(logs)
935
+ self._metrics.clear()