In [None]:
from modules.vlm_inference import analyze_product_image
from modules.data_processing import load_test_data, image_to_base64
from modules.evals import run_inference_on_dataframe_async, evaluate_all_categories, extract_metrics
from dotenv import load_dotenv
import os
from PIL import Image
import matplotlib.pyplot as plt
import io
import ast
import pandas as pd
import altair as alt

# Reload imported modules automatically before each cell runs, so edits to the
# project's modules/ package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()

FIREWORKS_API_KEY = os.getenv("FIREWORKS_API_KEY")
# Read the variable name the error message refers to first; fall back to the
# older OPENAI_KEY name so existing .env files keep working.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI_KEY")

# Truthiness checks so that an empty value fails here rather than at the first API call.
assert FIREWORKS_API_KEY, "FIREWORKS_API_KEY not found in environment variables"
assert OPENAI_API_KEY, "OPENAI_API_KEY (or legacy OPENAI_KEY) not found in environment variables"

In [None]:
# Evaluation-set size and sampling seed. A fixed seed means every model in this
# notebook is scored on the same rows each time the notebook is re-run.
N_EVAL_SAMPLES = 1000
RANDOM_SEED = 42

df_test_full = load_test_data()

# Sample first so that only the retained rows are base64-encoded, and cap n at
# the frame size because DataFrame.sample raises when n exceeds the row count.
df_test = (
    df_test_full
    .sample(n=min(N_EVAL_SAMPLES, len(df_test_full)), random_state=RANDOM_SEED)
    .reset_index()  # keeps the pre-sampling index as a column, as before
    .assign(image_base64=lambda frame: frame["image"].apply(image_to_base64))
)
print(f"Shape of final eval set {df_test.shape}")

**Note: if using this notebook make sure to replace "pyroworks" with your account name**

#### Run example image through a serverless Qwen VL model to test

In [None]:
# Preview the sample at row label 1. The "image" cell is a Python-literal string
# of a dict whose "bytes" entry holds the encoded image.
img_dict = ast.literal_eval(df_test.loc[1, "image"])
img_bytes = img_dict["bytes"]
img = Image.open(io.BytesIO(img_bytes))

fig, ax = plt.subplots()
ax.imshow(img)
ax.axis('off')
plt.show()

In [None]:
# Smoke-test a single request against the serverless model, using the
# base64-encoded image at row label 1.
# NOTE(review): provider is "Fireworks" here, while the batch inference calls
# pass "FireworksAI" -- confirm both helpers accept the spelling they are given.
model_id = "accounts/fireworks/models/qwen2p5-vl-32b-instruct"
sample_image_b64 = df_test.loc[1, "image_base64"]
result = analyze_product_image(
    model=model_id,
    image_url=sample_image_b64,
    api_key=FIREWORKS_API_KEY,
    provider="Fireworks",
)

In [None]:
# Display the response returned for the sample image.
result

*Important*: If you are following along with this notebook, make sure to replace "pyroworks" with your own account name.

#### Run test set through base OSS model
1. Create a deployment for the model for faster inference
2. Check deployment status
3. Run test set through deployment for base model and save results

NOTE: Make sure to delete or scale down the deployment when you are done to avoid unnecessary costs.

##### Run inference on Qwen 2.5 VL 32B

In [None]:
# Create a single-replica deployment (min = max = 1) of qwen2p5-vl-32b-instruct
# on the NVIDIA_H100_80GB accelerator type.
# NOTE(review): no `-a <account>` flag here, unlike the other firectl calls in
# this notebook -- presumably relies on the CLI's default account; confirm.
! firectl create deployment accounts/fireworks/models/qwen2p5-vl-32b-instruct --min-replica-count 1 --max-replica-count 1 --accelerator-type NVIDIA_H100_80GB

In [None]:
# Check the deployment's status by ID; substitute your own account name and deployment ID.
! firectl -a pyroworks get deployment itmxuke2

In [None]:
df_predictions_qwen_base_32b = await run_inference_on_dataframe_async(
 df_test,
 model="accounts/pyroworks/deployedModels/qwen2p5-vl-32b-instruct-ralh0ben",
 provider="FireworksAI",
 api_key=FIREWORKS_API_KEY,
 max_concurrent_requests=20, # Adjust based on rate limits
)

results_qwen_base_32b = evaluate_all_categories(
 df_ground_truth=df_test,
 df_predictions=df_predictions_qwen_base_32b,
 categories=["masterCategory", "gender", "subCategory"]
)

##### Run inference on Qwen 2 VL 72B

In [None]:
# Create a single-replica deployment (min = max = 1) of qwen2-vl-72b-instruct
# on the NVIDIA_H100_80GB accelerator type.
# NOTE(review): no `-a <account>` flag -- presumably uses the CLI's default account; confirm.
! firectl create deployment accounts/fireworks/models/qwen2-vl-72b-instruct --min-replica-count 1 --max-replica-count 1 --accelerator-type NVIDIA_H100_80GB

In [None]:
! firectl-admin get deployment rou70025

In [None]:
# Run with concurrent requests using await directly in Jupyter
df_predictions_qwen_base = await run_inference_on_dataframe_async(
 df_test,
 model="accounts/pyroworks/deployedModels/qwen2-vl-72b-instruct-yaxztv7t",
 provider="FireworksAI",
 api_key=FIREWORKS_API_KEY,
 max_concurrent_requests=20, # Adjust based on rate limits
)

results_qwen_base = evaluate_all_categories(
 df_ground_truth=df_test,
 df_predictions=df_predictions_qwen_base,
 categories=["masterCategory", "gender", "subCategory"]
)

In [None]:
# Run with concurrent requests using await directly in Jupyter
df_predictions_qwen3_8B_base = await run_inference_on_dataframe_async(
 df_test,
 model="accounts/pyroworks/deployedModels/qwen3-vl-8b-instruct-y147m785",
 provider="FireworksAI",
 api_key=FIREWORKS_API_KEY,
 max_concurrent_requests=20, # Adjust based on rate limits
)

results_qwen3_8B_base = evaluate_all_categories(
 df_ground_truth=df_test,
 df_predictions=df_predictions_qwen3_8B_base,
 categories=["masterCategory", "gender", "subCategory"]
)

In [None]:
# Create a deployment of qwen3-vl-32b-instruct using the THROUGHPUT deployment shape.
# NOTE(review): no `-a <account>` flag -- presumably uses the CLI's default account; confirm.
! firectl create deployment accounts/fireworks/models/qwen3-vl-32b-instruct --deployment-shape THROUGHPUT

In [None]:
# Run with concurrent requests using await directly in Jupyter
df_predictions_qwen3_32B_base = await run_inference_on_dataframe_async(
 df_test,
 model="accounts/pyroworks/deployedModels/qwen3-vl-32b-instruct-jalntd80",
 provider="FireworksAI",
 api_key=FIREWORKS_API_KEY,
 max_concurrent_requests=20, # Adjust based on rate limits
)

results_qwen3_32B_base = evaluate_all_categories(
 df_ground_truth=df_test,
 df_predictions=df_predictions_qwen3_32B_base,
 categories=["masterCategory", "gender", "subCategory"]
)

#### Run test set through fine-tuned FW Qwen model
1. Create a LoRA deployment of our fine-tuned model
2. Check deployment status
3. Run test set through deployment for the fine-tuned model and save results

#### Run evals on Qwen 32B SFT

In [None]:
# Deploy the account's qwen-32b-fashion-catalog model; substitute your own account name.
!firectl -a pyroworks create deployment accounts/pyroworks/models/qwen-32b-fashion-catalog

Deployment ID: accounts/pyroworks/deployments/c09a2c4q

In [None]:
# Run with concurrent requests using await directly in Jupyter
df_predictions_qwen_32b_fine_tuned = await run_inference_on_dataframe_async(
 df_test,
 model="accounts/pyroworks/deployedModels/qwen-32b-fashion-catalog-pwb1mga2",
 provider="FireworksAI",
 api_key=FIREWORKS_API_KEY,
 max_concurrent_requests=20, # Adjust based on rate limits
)

results_qwen_fine_tuned_32b = evaluate_all_categories(
 df_ground_truth=df_test,
 df_predictions=df_predictions_qwen_32b_fine_tuned,
 categories=["masterCategory", "gender", "subCategory"]
)

#### Run evals on Qwen 72B SFT

In [None]:
# Deploy the account's qwen-72b-fashion-catalog model as a single replica
# (min = max = 1) on the NVIDIA_H100_80GB accelerator type; substitute your own account name.
! firectl -a pyroworks create deployment accounts/pyroworks/models/qwen-72b-fashion-catalog --min-replica-count 1 --max-replica-count 1 --accelerator-type NVIDIA_H100_80GB

In [None]:
# Check the deployment's status by ID; substitute your own deployment ID.
# NOTE(review): no `-a <account>` flag -- presumably uses the CLI's default account; confirm.
!firectl get deployment bedocpar

In [None]:
# Run with concurrent requests using await directly in Jupyter
df_predictions_qwen_fine_tuned = await run_inference_on_dataframe_async(
 df_test,
 model="accounts/pyroworks/deployedModels/qwen-72b-fashion-catalog-oueqouqs",
 provider="FireworksAI",
 api_key=FIREWORKS_API_KEY,
 max_concurrent_requests=20, # Adjust based on rate limits
)

results_qwen_fine_tuned = evaluate_all_categories(
 df_ground_truth=df_test,
 df_predictions=df_predictions_qwen_fine_tuned,
 categories=["masterCategory", "gender", "subCategory"]
)

#### Run evals on Qwen 3 8B SFT

In [None]:
! firectl-admin -a pyroworks create deployment accounts/pyroworks/models/qwen3-8b-fashion-catalog

In [None]:
# Run with concurrent requests using await directly in Jupyter
df_predictions_qwen_3_8b_fine_tuned = await run_inference_on_dataframe_async(
 df_test,
 model="accounts/pyroworks/deployedModels/qwen3-8b-fashion-catalog-bdo0tqxe",
 provider="FireworksAI",
 api_key=FIREWORKS_API_KEY,
 max_concurrent_requests=20,
)

results_qwen__3_8b_fine_tuned = evaluate_all_categories(
 df_ground_truth=df_test,
 df_predictions=df_predictions_qwen_3_8b_fine_tuned,
 categories=["masterCategory", "gender", "subCategory"]
)

#### Run evals on Qwen 3 32B SFT

In [None]:
# Deploy the account's qwen3-32b-fashion-catalog model as a single replica
# (min = max = 1) with world size 4 on the NVIDIA_H200_141GB accelerator type;
# substitute your own account name.
! firectl -a pyroworks create deployment accounts/pyroworks/models/qwen3-32b-fashion-catalog --world-size 4 --accelerator-type NVIDIA_H200_141GB --min-replica-count 1 --max-replica-count 1

In [None]:
# Run with concurrent requests using await directly in Jupyter
df_predictions_qwen_3_32b_fine_tuned = await run_inference_on_dataframe_async(
 df_test,
 model="accounts/pyroworks/deployedModels/qwen-32b-fashion-catalog-pwb1mga2",
 provider="FireworksAI",
 api_key=FIREWORKS_API_KEY,
 max_concurrent_requests=20,
)

results_qwen__3_32b_fine_tuned = evaluate_all_categories(
 df_ground_truth=df_test,
 df_predictions=df_predictions_qwen_3_32b_fine_tuned,
 categories=["masterCategory", "gender", "subCategory"]
)

#### Run test set through closed source model

In [None]:
# Run with concurrent requests using await directly in Jupyter
df_predictions_openai = await run_inference_on_dataframe_async(
 df_test,
 model="gpt-5-mini-2025-08-07",
 provider="OpenAI",
 api_key=OPENAI_API_KEY,
 max_concurrent_requests=5, # Lower for OpenAI to avoid rate limits
)

# Evaluate
results_openai = evaluate_all_categories(
 df_ground_truth=df_test,
 df_predictions=df_predictions_openai,
 categories=["masterCategory", "gender", "subCategory"]
)

### Compare eval metrics across models

In [None]:

# Gather the metric records of each compared model (in this order) and stack
# them into one comparison frame.
labelled_results = [
    (results_openai, 'GPT-5-Mini'),
    (results_qwen_fine_tuned, 'Qwen-72B-SFT'),
    (results_qwen_base, 'Qwen-72B-Base'),
]
all_metrics = [
    record
    for model_results, model_label in labelled_results
    for record in extract_metrics(model_results, model_label)
]

df_comparison = pd.DataFrame(all_metrics)

# Show the combined table
print("Model Comparison Dataframe:")
print(df_comparison)

In [None]:
# Reshape to long format: one row per (model, category, metric) with its score,
# which is the layout the charts below consume.
id_columns = ['model', 'category', 'num_samples']
metric_columns = ['accuracy', 'precision', 'recall']
df_melted = pd.melt(
    df_comparison,
    id_vars=id_columns,
    value_vars=metric_columns,
    var_name='metric',
    value_name='score',
)

In [None]:
# Fixed model -> color mapping: blue, light purple, dark purple.
model_order = ['GPT-5-Mini', 'Qwen-72B-Base', 'Qwen-72B-SFT']
model_colors = ['#1f77b4', '#d4a5d4', '#6a1b6a']
color_scale = alt.Scale(domain=model_order, range=model_colors)

# Fields shown when hovering over a bar.
hover_fields = [
    alt.Tooltip('model:N', title='Model'),
    alt.Tooltip('category:N', title='Category'),
    alt.Tooltip('metric:N', title='Metric'),
    alt.Tooltip('score:Q', title='Score', format='.4f'),
    alt.Tooltip('num_samples:Q', title='Samples'),
]

# Grouped bars: one facet column per metric, categories on x, models side by side.
grouped_bars = alt.Chart(df_melted).mark_bar().encode(
    x=alt.X('category:N', title='Category'),
    y=alt.Y('score:Q', title='Score', scale=alt.Scale(domain=[0, 1])),
    color=alt.Color('model:N', title='Model', scale=color_scale),
    column=alt.Column('metric:N', title='Metric'),
    xOffset='model:N',
    tooltip=hover_fields,
)

chart = grouped_bars.properties(
    width=200,
    height=300,
    title='Model Performance Comparison by Category and Metric',
).configure_axis(labelAngle=-45)

chart

In [None]:
# Inspect the long-format metrics table that feeds the charts.
df_melted

In [None]:
# Fixed model -> color mapping: blue, light purple, dark purple.
model_order = ['GPT-5-Mini', 'Qwen-72B-Base', 'Qwen-72B-SFT']
model_colors = ['#1f77b4', '#d4a5d4', '#6a1b6a']
color_scale = alt.Scale(domain=model_order, range=model_colors)

# Keep only the accuracy rows for this chart.
accuracy_rows = df_melted[df_melted["metric"] == "accuracy"]

# Fields shown when hovering over a bar.
hover_fields = [
    alt.Tooltip('model:N', title='Model'),
    alt.Tooltip('category:N', title='Category'),
    alt.Tooltip('metric:N', title='Metric'),
    alt.Tooltip('score:Q', title='Score', format='.4f'),
    alt.Tooltip('num_samples:Q', title='Samples'),
]

# Grouped bars: categories on x, models side by side within each category.
accuracy_bars = alt.Chart(accuracy_rows).mark_bar().encode(
    x=alt.X('category:N', title='Category'),
    y=alt.Y('score:Q', title='Score', scale=alt.Scale(domain=[0, 1])),
    color=alt.Color('model:N', title='Model', scale=color_scale),
    xOffset='model:N',
    tooltip=hover_fields,
)

chart = accuracy_bars.properties(
    width=400,
    height=300,
    title='Accuracy by Category and Model',
).configure_axis(labelAngle=-45)

chart