from vllm import LLM, SamplingParams
from transformers import AutoProcessor, AutoTokenizer
from qwen_vl_utils import process_vision_info
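
# Model checkpoint and example video/question.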
model_path = "Video-R1/Video-R1-7B"
video_path = "./src/example_video/video1.mp4"
question = "Which motion in the video causes the system to lose energy?"
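
# Answer-format key; must be one of the TYPE_TEMPLATE keys defined below.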
problem_type = "free-form"
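
# Initialize the vLLM engine. The large max_model_len leaves room for the
# sampled video tokens plus a long reasoning trace; lower
# gpu_memory_utilization if the engine does not fit on your GPU.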
llm = LLM(
    model=model_path,
    tensor_parallel_size=1,
    max_model_len=81920,
    gpu_memory_utilization=0.8,
    limit_mm_per_prompt={"video": 1, "image": 1},
)
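
# Near-greedy decoding: a low temperature and a tiny top_p make the output
# effectively deterministic.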
sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.001,
    max_tokens=1024,
)
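
# Load the processor and tokenizer; left padding matches decoder-only
# batched generation.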
processor = AutoProcessor.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.padding_side = "left"
processor.tokenizer = tokenizer
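
# Reasoning prompt: the model is asked to think out loud inside
# <think> ... </think> and to answer inside <answer> ... </answer>.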
QUESTION_TEMPLATE = (
    "{Question}\n"
    "Please think about this question as if you were a human pondering deeply. "
    "Engage in an internal dialogue using expressions such as 'let me think', 'wait', 'Hmm', 'oh, I see', 'let's break it down', etc., or other natural language thought expressions. "
    "It's encouraged to include self-reflection or verification in the reasoning process. "
    "Provide your detailed reasoning between the <think> and </think> tags, and then give your final answer between the <answer> and </answer> tags."
)
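
# Per-type suffixes appended to the question so the model formats its final
# answer consistently.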
TYPE_TEMPLATE = {
    "multiple choice": " Please provide only the single option letter (e.g., A, B, C, D, etc.) within the <answer> </answer> tags.",
    "numerical": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
    "OCR": " Please transcribe text from the image/video clearly and provide your text answer within the <answer> </answer> tags.",
    "free-form": " Please provide your text answer within the <answer> </answer> tags.",
    "regression": " Please provide the numerical value (e.g., 42 or 3.14) within the <answer> </answer> tags.",
}
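
# Chat-format request: 32 frames are sampled from the video, each capped at
# 200704 (448 * 448) pixels before tokenization.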
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": video_path,
                "max_pixels": 200704,
                "nframes": 32,
            },
            {
                "type": "text",
                "text": QUESTION_TEMPLATE.format(Question=question) + TYPE_TEMPLATE[problem_type],
            },
        ],
    }
]
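
# Render the chat template to a plain prompt string, with the assistant
# generation prefix appended.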
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) |
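
# Decode the video into frame tensors; return_video_kwargs=True also returns
# per-video metadata (e.g., fps) that vLLM's multimodal processor needs.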
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True) |
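
# Package the prompt and decoded frames for vLLM. The [0] indexing selects
# the single video carried by this request.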
llm_inputs = [{
    "prompt": prompt,
    "multi_modal_data": {"video": video_inputs[0]},
    "mm_processor_kwargs": {key: val[0] for key, val in video_kwargs.items()},
}]
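
# Run generation and take the text of the first (and only) completion.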
outputs = llm.generate(llm_inputs, sampling_params=sampling_params)
output_text = outputs[0].outputs[0].text

print(output_text)
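
# Optional: a minimal sketch (not part of the original script) for pulling the
# final answer out of the <answer> ... </answer> tags that the prompt requests.
# The regex and the fallback to the raw completion are assumptions, not a
# Video-R1 API.
import re

match = re.search(r"<answer>(.*?)</answer>", output_text, re.DOTALL)
final_answer = match.group(1).strip() if match else output_text.strip()
print(f"Final answer: {final_answer}")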