{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from modules.vlm_inference import analyze_product_image\n",
    "from modules.data_processing import load_test_data, image_to_base64\n",
    "from modules.evals import run_inference_on_dataframe_async, evaluate_all_categories, extract_metrics\n",
    "from dotenv import load_dotenv\n",
    "import os\n",
    "from PIL import Image\n",
    "import matplotlib.pyplot as plt\n",
    "import io\n",
    "import ast\n",
    "import pandas as pd\n",
    "import altair as alt\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read provider credentials from the environment after load_dotenv() has run.\n",
    "load_dotenv()\n",
    "FIREWORKS_API_KEY = os.getenv(\"FIREWORKS_API_KEY\")\n",
    "# Prefer OPENAI_API_KEY (the name the assertion below reports); fall back to the\n",
    "# legacy OPENAI_KEY variable so existing environments keep working.\n",
    "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"OPENAI_KEY\")\n",
    "\n",
    "# Truthiness checks also reject keys that are set but empty.\n",
    "assert FIREWORKS_API_KEY, \"FIREWORKS_API_KEY not found in environment variables\"\n",
    "assert OPENAI_API_KEY, \"OPENAI_API_KEY (or legacy OPENAI_KEY) not found in environment variables\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Size of the evaluation sample and the seed used to draw it. A fixed seed keeps the\n",
    "# eval set identical across kernel restarts, so results from different runs stay comparable.\n",
    "N_EVAL_SAMPLES = 1000\n",
    "SAMPLE_SEED = 42\n",
    "\n",
    "df_test = load_test_data()\n",
    "\n",
    "# Sample first so only the sampled images get base64-encoded. Cap n at the dataset\n",
    "# size because DataFrame.sample raises when n exceeds the number of rows.\n",
    "# reset_index() (without drop) keeps the original row labels in an \"index\" column.\n",
    "df_test = df_test.sample(\n",
    "    n=min(N_EVAL_SAMPLES, len(df_test)), random_state=SAMPLE_SEED\n",
    ").reset_index()\n",
    "df_test[\"image_base64\"] = df_test[\"image\"].apply(image_to_base64)\n",
    "\n",
    "print(f\"Shape of final eval set {df_test.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3",
   "metadata": {},
   "source": [
    "**Note: if you are using this notebook, make sure to replace \"pyroworks\" with your own account name.**"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4",
   "metadata": {},
   "source": [
    "#### Run example image through a serverless Qwen VL model to test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the sample row (index label 1). Its \"image\" value is parsed with\n",
    "# ast.literal_eval into a dict whose \"bytes\" entry holds the raw image data.\n",
    "img_dict = ast.literal_eval(df_test.loc[1, \"image\"])\n",
    "img_bytes = img_dict[\"bytes\"]\n",
    "img = Image.open(io.BytesIO(img_bytes))\n",
    "\n",
    "# Render the picture without axis ticks or frame.\n",
    "plt.imshow(img)\n",
    "plt.axis('off')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke test: send the sample row (index label 1) to the Qwen 2.5 VL 32B model\n",
    "# under accounts/fireworks before running the full eval set.\n",
    "# NOTE(review): this call passes provider=\"Fireworks\" while the batch inference cells\n",
    "# pass provider=\"FireworksAI\" - confirm which spelling analyze_product_image expects.\n",
    "model_id = \"accounts/fireworks/models/qwen2p5-vl-32b-instruct\"\n",
    "result = analyze_product_image(\n",
    "    provider=\"Fireworks\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    model=model_id,\n",
    "    image_url=df_test.loc[1, \"image_base64\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the value returned by analyze_product_image for the sample image.\n",
    "result"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8",
   "metadata": {},
   "source": [
    "*Important*: If you are following along with this notebook, make sure to replace \"pyroworks\" with your own account name."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9",
   "metadata": {},
   "source": [
    "#### Run test set through base OSS model\n",
    "1. Create a deployment for the model for faster inference\n",
    "2. Check deployment status\n",
    "3. Run test set through deployment for base model and save results\n",
    "\n",
    "NOTE: Make sure to delete or scale down the deployment when you are done to avoid costs."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10",
   "metadata": {},
   "source": [
    "##### Run inference on Qwen 2.5 VL 32B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a deployment of the base qwen2p5-vl-32b-instruct model: exactly one replica\n",
    "# (min = max = 1) on an NVIDIA H100 80GB accelerator.\n",
    "! firectl create deployment accounts/fireworks/models/qwen2p5-vl-32b-instruct --min-replica-count 1 --max-replica-count 1 --accelerator-type NVIDIA_H100_80GB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up deployment itmxuke2 under the pyroworks account; substitute your own\n",
    "# account name and deployment ID.\n",
    "! firectl -a pyroworks get deployment itmxuke2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the eval set through the deployed base Qwen 2.5 VL 32B model.\n",
    "df_predictions_qwen_base_32b = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    provider=\"FireworksAI\",\n",
    "    model=\"accounts/pyroworks/deployedModels/qwen2p5-vl-32b-instruct-ralh0ben\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=20,  # tune to your rate limits\n",
    ")\n",
    "\n",
    "# Evaluate the predictions against df_test for each of the three label columns.\n",
    "results_qwen_base_32b = evaluate_all_categories(\n",
    "    categories=[\"masterCategory\", \"gender\", \"subCategory\"],\n",
    "    df_predictions=df_predictions_qwen_base_32b,\n",
    "    df_ground_truth=df_test,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14",
   "metadata": {},
   "source": [
    "##### Run inference on Qwen 2 VL 72B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a deployment of the base qwen2-vl-72b-instruct model: exactly one replica\n",
    "# (min = max = 1) on an NVIDIA H100 80GB accelerator.\n",
    "! firectl create deployment accounts/fireworks/models/qwen2-vl-72b-instruct --min-replica-count 1 --max-replica-count 1 --accelerator-type NVIDIA_H100_80GB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up deployment rou70025. Uses the same `firectl -a <account>` form as the other\n",
    "# status check in this notebook instead of the separate firectl-admin binary;\n",
    "# substitute your own account name and deployment ID.\n",
    "! firectl -a pyroworks get deployment rou70025"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the eval set through the deployed base qwen2-vl-72b-instruct model.\n",
    "# Top-level await is used directly in the notebook cell.\n",
    "df_predictions_qwen_base = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    provider=\"FireworksAI\",\n",
    "    model=\"accounts/pyroworks/deployedModels/qwen2-vl-72b-instruct-yaxztv7t\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=20,  # tune to your rate limits\n",
    ")\n",
    "\n",
    "# Evaluate the predictions against df_test for each of the three label columns.\n",
    "results_qwen_base = evaluate_all_categories(\n",
    "    categories=[\"masterCategory\", \"gender\", \"subCategory\"],\n",
    "    df_predictions=df_predictions_qwen_base,\n",
    "    df_ground_truth=df_test,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the eval set through the deployed base qwen3-vl-8b-instruct model.\n",
    "# Top-level await is used directly in the notebook cell.\n",
    "df_predictions_qwen3_8B_base = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    provider=\"FireworksAI\",\n",
    "    model=\"accounts/pyroworks/deployedModels/qwen3-vl-8b-instruct-y147m785\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=20,  # tune to your rate limits\n",
    ")\n",
    "\n",
    "# Evaluate the predictions against df_test for each of the three label columns.\n",
    "results_qwen3_8B_base = evaluate_all_categories(\n",
    "    categories=[\"masterCategory\", \"gender\", \"subCategory\"],\n",
    "    df_predictions=df_predictions_qwen3_8B_base,\n",
    "    df_ground_truth=df_test,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a deployment of the base qwen3-vl-32b-instruct model using the THROUGHPUT\n",
    "# deployment shape.\n",
    "! firectl create deployment accounts/fireworks/models/qwen3-vl-32b-instruct --deployment-shape THROUGHPUT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the eval set through the deployed base qwen3-vl-32b-instruct model.\n",
    "# Top-level await is used directly in the notebook cell.\n",
    "df_predictions_qwen3_32B_base = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    provider=\"FireworksAI\",\n",
    "    model=\"accounts/pyroworks/deployedModels/qwen3-vl-32b-instruct-jalntd80\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=20,  # tune to your rate limits\n",
    ")\n",
    "\n",
    "# Evaluate the predictions against df_test for each of the three label columns.\n",
    "results_qwen3_32B_base = evaluate_all_categories(\n",
    "    categories=[\"masterCategory\", \"gender\", \"subCategory\"],\n",
    "    df_predictions=df_predictions_qwen3_32B_base,\n",
    "    df_ground_truth=df_test,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21",
   "metadata": {},
   "source": [
    "#### Run test set through fine tuned FW Qwen model\n",
    "1. Create a LoRA deployment of our fine tuned model\n",
    "2. Check deployment status\n",
    "3. Run test set through deployment for fine tuned model and save results"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22",
   "metadata": {},
   "source": [
    "#### Run evals on Qwen 32B SFT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Deploy the fine-tuned qwen-32b-fashion-catalog model under the pyroworks account;\n",
    "# no replica or accelerator flags are passed here.\n",
    "!firectl -a pyroworks create deployment accounts/pyroworks/models/qwen-32b-fashion-catalog"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24",
   "metadata": {},
   "source": [
    "Deployment ID: accounts/pyroworks/deployments/c09a2c4q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the eval set through the deployed fine-tuned qwen-32b-fashion-catalog model.\n",
    "# Top-level await is used directly in the notebook cell.\n",
    "df_predictions_qwen_32b_fine_tuned = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    provider=\"FireworksAI\",\n",
    "    model=\"accounts/pyroworks/deployedModels/qwen-32b-fashion-catalog-pwb1mga2\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=20,  # tune to your rate limits\n",
    ")\n",
    "\n",
    "# Evaluate the predictions against df_test for each of the three label columns.\n",
    "results_qwen_fine_tuned_32b = evaluate_all_categories(\n",
    "    categories=[\"masterCategory\", \"gender\", \"subCategory\"],\n",
    "    df_predictions=df_predictions_qwen_32b_fine_tuned,\n",
    "    df_ground_truth=df_test,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "26",
   "metadata": {},
   "source": [
    "#### Run evals on Qwen 72B SFT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Deploy the fine-tuned qwen-72b-fashion-catalog model under the pyroworks account:\n",
    "# exactly one replica (min = max = 1) on an NVIDIA H100 80GB accelerator.\n",
    "! firectl -a pyroworks create deployment accounts/pyroworks/models/qwen-72b-fashion-catalog --min-replica-count 1 --max-replica-count 1 --accelerator-type NVIDIA_H100_80GB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up deployment bedocpar. No -a flag is passed here, unlike the other status\n",
    "# check - presumably this relies on the CLI's configured default account (TODO confirm).\n",
    "!firectl get deployment bedocpar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the eval set through the deployed fine-tuned qwen-72b-fashion-catalog model.\n",
    "# Top-level await is used directly in the notebook cell.\n",
    "df_predictions_qwen_fine_tuned = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    provider=\"FireworksAI\",\n",
    "    model=\"accounts/pyroworks/deployedModels/qwen-72b-fashion-catalog-oueqouqs\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=20,  # tune to your rate limits\n",
    ")\n",
    "\n",
    "# Evaluate the predictions against df_test for each of the three label columns.\n",
    "results_qwen_fine_tuned = evaluate_all_categories(\n",
    "    categories=[\"masterCategory\", \"gender\", \"subCategory\"],\n",
    "    df_predictions=df_predictions_qwen_fine_tuned,\n",
    "    df_ground_truth=df_test,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "30",
   "metadata": {},
   "source": [
    "#### Run evals on Qwen 3 8B SFT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Deploy the fine-tuned qwen3-8b-fashion-catalog model under the pyroworks account.\n",
    "# Uses the same `firectl` CLI as the other create-deployment cells rather than the\n",
    "# separate firectl-admin binary.\n",
    "! firectl -a pyroworks create deployment accounts/pyroworks/models/qwen3-8b-fashion-catalog"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the eval set through the deployed fine-tuned qwen3-8b-fashion-catalog model.\n",
    "# Top-level await is used directly in the notebook cell.\n",
    "df_predictions_qwen_3_8b_fine_tuned = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    provider=\"FireworksAI\",\n",
    "    model=\"accounts/pyroworks/deployedModels/qwen3-8b-fashion-catalog-bdo0tqxe\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=20,\n",
    ")\n",
    "\n",
    "# Evaluate the predictions against df_test for each of the three label columns.\n",
    "results_qwen__3_8b_fine_tuned = evaluate_all_categories(\n",
    "    categories=[\"masterCategory\", \"gender\", \"subCategory\"],\n",
    "    df_predictions=df_predictions_qwen_3_8b_fine_tuned,\n",
    "    df_ground_truth=df_test,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "33",
   "metadata": {},
   "source": [
    "#### Run evals on Qwen 3 32B SFT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Deploy the fine-tuned qwen3-32b-fashion-catalog model under the pyroworks account:\n",
    "# world size 4 on NVIDIA H200 141GB accelerators, exactly one replica (min = max = 1).\n",
    "! firectl -a pyroworks create deployment accounts/pyroworks/models/qwen3-32b-fashion-catalog --world-size 4 --accelerator-type NVIDIA_H200_141GB --min-replica-count 1 --max-replica-count 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35",
   "metadata": {},
   "outputs": [],
   "source": [
    "# FIXME: this cell used to query \"qwen-32b-fashion-catalog-pwb1mga2\", which is the\n",
    "# deployed model of the Qwen 32B SFT section, so the \"Qwen 3 32B SFT\" results were a\n",
    "# second run of that model. Point it at the deployed model created from\n",
    "# accounts/pyroworks/models/qwen3-32b-fashion-catalog by replacing REPLACE_ME with\n",
    "# that deployed model's suffix.\n",
    "QWEN3_32B_SFT_DEPLOYED_MODEL = \"accounts/pyroworks/deployedModels/qwen3-32b-fashion-catalog-REPLACE_ME\"\n",
    "\n",
    "# Fail fast instead of sending the whole eval set to a model ID that does not exist.\n",
    "assert \"REPLACE_ME\" not in QWEN3_32B_SFT_DEPLOYED_MODEL, (\n",
    "    \"Set QWEN3_32B_SFT_DEPLOYED_MODEL to the deployed qwen3-32b-fashion-catalog model ID\"\n",
    ")\n",
    "\n",
    "# Run with concurrent requests using await directly in Jupyter\n",
    "df_predictions_qwen_3_32b_fine_tuned = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    model=QWEN3_32B_SFT_DEPLOYED_MODEL,\n",
    "    provider=\"FireworksAI\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=20,\n",
    ")\n",
    "\n",
    "# Evaluate the predictions against df_test for each of the three label columns.\n",
    "results_qwen__3_32b_fine_tuned = evaluate_all_categories(\n",
    "    df_ground_truth=df_test,\n",
    "    df_predictions=df_predictions_qwen_3_32b_fine_tuned,\n",
    "    categories=[\"masterCategory\", \"gender\", \"subCategory\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36",
   "metadata": {},
   "source": [
    "#### Run test set through closed source model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the eval set through the OpenAI model. Top-level await is used directly in\n",
    "# the notebook cell.\n",
    "df_predictions_openai = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    provider=\"OpenAI\",\n",
    "    model=\"gpt-5-mini-2025-08-07\",\n",
    "    api_key=OPENAI_API_KEY,\n",
    "    max_concurrent_requests=5,  # kept below the Fireworks runs to avoid OpenAI rate limits\n",
    ")\n",
    "\n",
    "# Evaluate the predictions against df_test for each of the three label columns.\n",
    "results_openai = evaluate_all_categories(\n",
    "    categories=[\"masterCategory\", \"gender\", \"subCategory\"],\n",
    "    df_predictions=df_predictions_openai,\n",
    "    df_ground_truth=df_test,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38",
   "metadata": {},
   "source": [
    "### Compare eval metrics across models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the extract_metrics output for the three compared models (GPT-5-Mini and\n",
    "# the Qwen 72B base / SFT runs) into one list, in that order, then build the frame.\n",
    "all_metrics = [\n",
    "    metric_row\n",
    "    for model_results, model_label in (\n",
    "        (results_openai, 'GPT-5-Mini'),\n",
    "        (results_qwen_fine_tuned, 'Qwen-72B-SFT'),\n",
    "        (results_qwen_base, 'Qwen-72B-Base'),\n",
    "    )\n",
    "    for metric_row in extract_metrics(model_results, model_label)\n",
    "]\n",
    "df_comparison = pd.DataFrame(all_metrics)\n",
    "\n",
    "# Print the combined table under a short heading.\n",
    "print(\"Model Comparison Dataframe:\")\n",
    "print(df_comparison)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reshape to long format: one row per (model, category, metric) with its score,\n",
    "# keeping num_samples alongside as an identifier column.\n",
    "df_melted = pd.melt(\n",
    "    df_comparison,\n",
    "    id_vars=['model', 'category', 'num_samples'],\n",
    "    value_vars=['accuracy', 'precision', 'recall'],\n",
    "    var_name='metric',\n",
    "    value_name='score',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define custom color scheme\n",
    "# (fixed model-to-color mapping; the domain lists the three models being compared)\n",
    "color_scale = alt.Scale(\n",
    "    domain=['GPT-5-Mini', 'Qwen-72B-Base', 'Qwen-72B-SFT'],\n",
    "    range=['#1f77b4', '#d4a5d4', '#6a1b6a']  # Blue, Light Purple, Dark Purple\n",
    ")\n",
    "\n",
    "# Grouped bar chart of df_melted: one facet column per metric, categories on the\n",
    "# x-axis, one bar per model within each category (xOffset), and the score axis\n",
    "# fixed to the [0, 1] range.\n",
    "chart = alt.Chart(df_melted).mark_bar().encode(\n",
    "    x=alt.X('category:N', title='Category'),\n",
    "    y=alt.Y('score:Q', title='Score', scale=alt.Scale(domain=[0, 1])),\n",
    "    color=alt.Color('model:N', title='Model', scale=color_scale),\n",
    "    column=alt.Column('metric:N', title='Metric'),\n",
    "    xOffset='model:N',\n",
    "    tooltip=[\n",
    "        alt.Tooltip('model:N', title='Model'),\n",
    "        alt.Tooltip('category:N', title='Category'),\n",
    "        alt.Tooltip('metric:N', title='Metric'),\n",
    "        alt.Tooltip('score:Q', title='Score', format='.4f'),\n",
    "        alt.Tooltip('num_samples:Q', title='Samples')\n",
    "    ]\n",
    ").properties(\n",
    "    width=200,\n",
    "    height=300,\n",
    "    title='Model Performance Comparison by Category and Metric'\n",
    ").configure_axis(\n",
    "    labelAngle=-45\n",
    ")\n",
    "\n",
    "# Last expression of the cell, so the chart is rendered as the cell output.\n",
    "chart"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the long-format metrics table that the charts are built from.\n",
    "df_melted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define custom color scheme\n",
    "# (fixed model-to-color mapping; the domain lists the three models being compared)\n",
    "color_scale = alt.Scale(\n",
    "    domain=['GPT-5-Mini', 'Qwen-72B-Base', 'Qwen-72B-SFT'],\n",
    "    range=['#1f77b4', '#d4a5d4', '#6a1b6a']  # Blue, Light Purple, Dark Purple\n",
    ")\n",
    "\n",
    "# Grouped bar chart restricted to the rows of df_melted whose metric is \"accuracy\":\n",
    "# categories on the x-axis, one bar per model within each category (xOffset), and\n",
    "# the score axis fixed to the [0, 1] range. There is no metric facet here.\n",
    "chart = alt.Chart(df_melted.loc[df_melted.metric == \"accuracy\", :]).mark_bar().encode(\n",
    "    x=alt.X('category:N', title='Category'),\n",
    "    y=alt.Y('score:Q', title='Score', scale=alt.Scale(domain=[0, 1])),\n",
    "    color=alt.Color('model:N', title='Model', scale=color_scale),\n",
    "    xOffset='model:N',\n",
    "    tooltip=[\n",
    "        alt.Tooltip('model:N', title='Model'),\n",
    "        alt.Tooltip('category:N', title='Category'),\n",
    "        alt.Tooltip('metric:N', title='Metric'),\n",
    "        alt.Tooltip('score:Q', title='Score', format='.4f'),\n",
    "        alt.Tooltip('num_samples:Q', title='Samples')\n",
    "    ]\n",
    ").properties(\n",
    "    width=400,\n",
    "    height=300,\n",
    "    title='Accuracy by Category and Model'\n",
    ").configure_axis(\n",
    "    labelAngle=-45\n",
    ")\n",
    "\n",
    "# Last expression of the cell, so the chart is rendered as the cell output.\n",
    "chart"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}