{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "import io\n",
    "import os\n",
    "\n",
    "import altair as alt\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from dotenv import load_dotenv\n",
    "from PIL import Image\n",
    "\n",
    "from modules.data_processing import image_to_base64, load_test_data\n",
    "from modules.evals import (\n",
    "    evaluate_all_categories,\n",
    "    extract_metrics,\n",
    "    run_inference_on_dataframe_async,\n",
    ")\n",
    "from modules.vlm_inference import analyze_product_image\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "load_dotenv()\n",
    "FIREWORKS_API_KEY = os.getenv(\"FIREWORKS_API_KEY\")\n",
    "# Prefer OPENAI_API_KEY (the name the assertion below reports); fall back to the\n",
    "# OPENAI_KEY name this notebook originally read so existing .env files keep working.\n",
    "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"OPENAI_KEY\")\n",
    "\n",
    "assert FIREWORKS_API_KEY is not None, \"FIREWORKS_API_KEY not found in environment variables\"\n",
    "assert OPENAI_API_KEY is not None, \"OPENAI_API_KEY (or OPENAI_KEY) not found in environment variables\"\n",
    "\n",
    "# --- Configuration: everything a reader is expected to tune lives here ---\n",
    "# Fireworks account that owns the deployments and fine-tuned models used below.\n",
    "FIREWORKS_ACCOUNT = \"pyroworks\"\n",
    "\n",
    "SEED = 42  # fixes the evaluation sample so a re-run scores the same images\n",
    "N_EVAL_SAMPLES = 1000  # size of the evaluation sample\n",
    "EXAMPLE_ROW = 1  # row label of df_test used for the single-image smoke test\n",
    "EVAL_CATEGORIES = (\"masterCategory\", \"gender\", \"subCategory\")\n",
    "FIREWORKS_MAX_CONCURRENCY = 20  # adjust based on rate limits\n",
    "OPENAI_MAX_CONCURRENCY = 5  # lower for OpenAI to avoid rate limits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test_full = load_test_data()\n",
    "\n",
    "# Sample first, then encode: only the sampled rows need a base64 payload.\n",
    "# The sample size is clamped so a test set smaller than N_EVAL_SAMPLES does not raise.\n",
    "df_test = df_test_full.sample(\n",
    "    n=min(N_EVAL_SAMPLES, len(df_test_full)),\n",
    "    random_state=SEED,\n",
    ").reset_index()  # reset_index() keeps the pre-sampling row labels as a column\n",
    "df_test[\"image_base64\"] = df_test[\"image\"].apply(image_to_base64)\n",
    "\n",
    "print(f\"Shape of final eval set {df_test.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3",
   "metadata": {},
   "source": [
    "**Note:** this notebook was run against the `pyroworks` Fireworks account. If you are following along, set `FIREWORKS_ACCOUNT` in the configuration cell to your own account name, and replace the deployment IDs and deployed-model names below with the ones `firectl` reports for your deployments."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4",
   "metadata": {},
   "source": [
    "#### Run example image through a serverless Qwen VL model to test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The \"image\" column holds the string repr of a dict with a \"bytes\" entry,\n",
    "# so it is parsed with literal_eval before decoding.\n",
    "example_image = ast.literal_eval(df_test.loc[EXAMPLE_ROW, \"image\"])\n",
    "img = Image.open(io.BytesIO(example_image[\"bytes\"]))\n",
    "\n",
    "fig, ax = plt.subplots()\n",
    "ax.imshow(img)\n",
    "ax.axis(\"off\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_id = \"accounts/fireworks/models/qwen2p5-vl-32b-instruct\"\n",
    "# NOTE(review): this call passes provider=\"Fireworks\" while the batch-inference cells\n",
    "# below pass \"FireworksAI\" - confirm which spelling each helper expects.\n",
    "result = analyze_product_image(\n",
    "    model=model_id,\n",
    "    image_url=df_test.loc[EXAMPLE_ROW, \"image_base64\"],\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    provider=\"Fireworks\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {},
   "outputs": [],
   "source": [
    "result"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8",
   "metadata": {},
   "source": [
    "**Important:** if you are following along with this notebook, make sure `FIREWORKS_ACCOUNT` is set to your account name (the examples use `pyroworks`)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9",
   "metadata": {},
   "source": [
    "#### Run test set through base OSS model\n",
    "1. Create a deployment for the model for faster inference\n",
    "2. Check deployment status\n",
    "3. Run test set through deployment for base model and save results\n",
    "\n",
    "**Note:** make sure to delete or scale down each deployment when you are done to avoid costs."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10",
   "metadata": {},
   "source": [
    "##### Run inference on Qwen 2.5 VL 32B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {},
   "outputs": [],
   "source": [
    "!firectl -a {FIREWORKS_ACCOUNT} create deployment accounts/fireworks/models/qwen2p5-vl-32b-instruct --min-replica-count 1 --max-replica-count 1 --accelerator-type NVIDIA_H100_80GB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {},
   "outputs": [],
   "source": [
    "!firectl -a {FIREWORKS_ACCOUNT} get deployment itmxuke2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Qwen 2.5 VL 32B base model\n",
    "df_predictions_qwen_base_32b = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    model=f\"accounts/{FIREWORKS_ACCOUNT}/deployedModels/qwen2p5-vl-32b-instruct-ralh0ben\",\n",
    "    provider=\"FireworksAI\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=FIREWORKS_MAX_CONCURRENCY,\n",
    ")\n",
    "\n",
    "results_qwen_base_32b = evaluate_all_categories(\n",
    "    df_ground_truth=df_test,\n",
    "    df_predictions=df_predictions_qwen_base_32b,\n",
    "    categories=list(EVAL_CATEGORIES),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14",
   "metadata": {},
   "source": [
    "##### Run inference on Qwen 2 VL 72B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "!firectl -a {FIREWORKS_ACCOUNT} create deployment accounts/fireworks/models/qwen2-vl-72b-instruct --min-replica-count 1 --max-replica-count 1 --accelerator-type NVIDIA_H100_80GB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16",
   "metadata": {},
   "outputs": [],
   "source": [
    "!firectl -a {FIREWORKS_ACCOUNT} get deployment rou70025"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Qwen 2 VL 72B base model (top-level await works directly in Jupyter)\n",
    "df_predictions_qwen_base = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    model=f\"accounts/{FIREWORKS_ACCOUNT}/deployedModels/qwen2-vl-72b-instruct-yaxztv7t\",\n",
    "    provider=\"FireworksAI\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=FIREWORKS_MAX_CONCURRENCY,\n",
    ")\n",
    "\n",
    "results_qwen_base = evaluate_all_categories(\n",
    "    df_ground_truth=df_test,\n",
    "    df_predictions=df_predictions_qwen_base,\n",
    "    categories=list(EVAL_CATEGORIES),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Qwen 3 VL 8B base model\n",
    "df_predictions_qwen3_8B_base = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    model=f\"accounts/{FIREWORKS_ACCOUNT}/deployedModels/qwen3-vl-8b-instruct-y147m785\",\n",
    "    provider=\"FireworksAI\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=FIREWORKS_MAX_CONCURRENCY,\n",
    ")\n",
    "\n",
    "results_qwen3_8B_base = evaluate_all_categories(\n",
    "    df_ground_truth=df_test,\n",
    "    df_predictions=df_predictions_qwen3_8B_base,\n",
    "    categories=list(EVAL_CATEGORIES),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19",
   "metadata": {},
   "outputs": [],
   "source": [
    "!firectl -a {FIREWORKS_ACCOUNT} create deployment accounts/fireworks/models/qwen3-vl-32b-instruct --deployment-shape THROUGHPUT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Qwen 3 VL 32B base model\n",
    "df_predictions_qwen3_32B_base = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    model=f\"accounts/{FIREWORKS_ACCOUNT}/deployedModels/qwen3-vl-32b-instruct-jalntd80\",\n",
    "    provider=\"FireworksAI\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=FIREWORKS_MAX_CONCURRENCY,\n",
    ")\n",
    "\n",
    "results_qwen3_32B_base = evaluate_all_categories(\n",
    "    df_ground_truth=df_test,\n",
    "    df_predictions=df_predictions_qwen3_32B_base,\n",
    "    categories=list(EVAL_CATEGORIES),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21",
   "metadata": {},
   "source": [
    "#### Run test set through fine-tuned FW Qwen model\n",
    "1. Create a LoRA deployment of our fine-tuned model\n",
    "2. Check deployment status\n",
    "3. Run test set through deployment for the fine-tuned model and save results"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22",
   "metadata": {},
   "source": [
    "#### Run evals on Qwen 32B SFT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23",
   "metadata": {},
   "outputs": [],
   "source": [
    "!firectl -a {FIREWORKS_ACCOUNT} create deployment accounts/{FIREWORKS_ACCOUNT}/models/qwen-32b-fashion-catalog"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24",
   "metadata": {},
   "source": [
    "Deployment ID: `accounts/pyroworks/deployments/c09a2c4q`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Qwen 2.5 VL 32B fine-tuned (SFT) model\n",
    "df_predictions_qwen_32b_fine_tuned = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    model=f\"accounts/{FIREWORKS_ACCOUNT}/deployedModels/qwen-32b-fashion-catalog-pwb1mga2\",\n",
    "    provider=\"FireworksAI\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=FIREWORKS_MAX_CONCURRENCY,\n",
    ")\n",
    "\n",
    "results_qwen_fine_tuned_32b = evaluate_all_categories(\n",
    "    df_ground_truth=df_test,\n",
    "    df_predictions=df_predictions_qwen_32b_fine_tuned,\n",
    "    categories=list(EVAL_CATEGORIES),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "26",
   "metadata": {},
   "source": [
    "#### Run evals on Qwen 72B SFT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27",
   "metadata": {},
   "outputs": [],
   "source": [
    "!firectl -a {FIREWORKS_ACCOUNT} create deployment accounts/{FIREWORKS_ACCOUNT}/models/qwen-72b-fashion-catalog --min-replica-count 1 --max-replica-count 1 --accelerator-type NVIDIA_H100_80GB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28",
   "metadata": {},
   "outputs": [],
   "source": [
    "!firectl -a {FIREWORKS_ACCOUNT} get deployment bedocpar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Qwen 72B fine-tuned (SFT) model\n",
    "df_predictions_qwen_fine_tuned = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    model=f\"accounts/{FIREWORKS_ACCOUNT}/deployedModels/qwen-72b-fashion-catalog-oueqouqs\",\n",
    "    provider=\"FireworksAI\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=FIREWORKS_MAX_CONCURRENCY,\n",
    ")\n",
    "\n",
    "results_qwen_fine_tuned = evaluate_all_categories(\n",
    "    df_ground_truth=df_test,\n",
    "    df_predictions=df_predictions_qwen_fine_tuned,\n",
    "    categories=list(EVAL_CATEGORIES),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "30",
   "metadata": {},
   "source": [
    "#### Run evals on Qwen 3 8B SFT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31",
   "metadata": {},
   "outputs": [],
   "source": [
    "!firectl -a {FIREWORKS_ACCOUNT} create deployment accounts/{FIREWORKS_ACCOUNT}/models/qwen3-8b-fashion-catalog"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Qwen 3 8B fine-tuned (SFT) model\n",
    "df_predictions_qwen_3_8b_fine_tuned = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    model=f\"accounts/{FIREWORKS_ACCOUNT}/deployedModels/qwen3-8b-fashion-catalog-bdo0tqxe\",\n",
    "    provider=\"FireworksAI\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=FIREWORKS_MAX_CONCURRENCY,\n",
    ")\n",
    "\n",
    "results_qwen_3_8b_fine_tuned = evaluate_all_categories(\n",
    "    df_ground_truth=df_test,\n",
    "    df_predictions=df_predictions_qwen_3_8b_fine_tuned,\n",
    "    categories=list(EVAL_CATEGORIES),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "33",
   "metadata": {},
   "source": [
    "#### Run evals on Qwen 3 32B SFT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34",
   "metadata": {},
   "outputs": [],
   "source": [
    "!firectl -a {FIREWORKS_ACCOUNT} create deployment accounts/{FIREWORKS_ACCOUNT}/models/qwen3-32b-fashion-catalog --world-size 4 --accelerator-type NVIDIA_H200_141GB --min-replica-count 1 --max-replica-count 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Qwen 3 32B fine-tuned (SFT) model.\n",
    "# This cell previously pointed at qwen-32b-fashion-catalog-pwb1mga2, i.e. the Qwen 2.5 32B\n",
    "# SFT deployment evaluated earlier, which would report that model's scores under the\n",
    "# Qwen 3 32B name. The deployed-model name for qwen3-32b-fashion-catalog is not recorded\n",
    "# in this notebook, so it must be filled in from the deployment created above, e.g.\n",
    "# f\"accounts/{FIREWORKS_ACCOUNT}/deployedModels/qwen3-32b-fashion-catalog-<suffix>\".\n",
    "QWEN3_32B_SFT_DEPLOYED_MODEL = None\n",
    "assert QWEN3_32B_SFT_DEPLOYED_MODEL, (\n",
    "    \"Set QWEN3_32B_SFT_DEPLOYED_MODEL to the deployed-model name of qwen3-32b-fashion-catalog\"\n",
    ")\n",
    "\n",
    "df_predictions_qwen_3_32b_fine_tuned = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    model=QWEN3_32B_SFT_DEPLOYED_MODEL,\n",
    "    provider=\"FireworksAI\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=FIREWORKS_MAX_CONCURRENCY,\n",
    ")\n",
    "\n",
    "results_qwen_3_32b_fine_tuned = evaluate_all_categories(\n",
    "    df_ground_truth=df_test,\n",
    "    df_predictions=df_predictions_qwen_3_32b_fine_tuned,\n",
    "    categories=list(EVAL_CATEGORIES),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36",
   "metadata": {},
   "source": [
    "#### Run test set through closed source model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Closed-source baseline, run with a lower concurrency limit than the Fireworks deployments\n",
    "df_predictions_openai = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
    "    model=\"gpt-5-mini-2025-08-07\",\n",
    "    provider=\"OpenAI\",\n",
    "    api_key=OPENAI_API_KEY,\n",
    "    max_concurrent_requests=OPENAI_MAX_CONCURRENCY,\n",
    ")\n",
    "\n",
    "results_openai = evaluate_all_categories(\n",
    "    df_ground_truth=df_test,\n",
    "    df_predictions=df_predictions_openai,\n",
    "    categories=list(EVAL_CATEGORIES),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38",
   "metadata": {},
   "source": [
    "### Compare eval metrics across models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Result sets shown in the comparison charts, keyed by display label.\n",
    "# Only these three models are charted; the other result sets computed above are not included.\n",
    "results_by_model = {\n",
    "    'GPT-5-Mini': results_openai,\n",
    "    'Qwen-72B-SFT': results_qwen_fine_tuned,\n",
    "    'Qwen-72B-Base': results_qwen_base,\n",
    "}\n",
    "\n",
    "all_metrics = []\n",
    "for model_label, model_results in results_by_model.items():\n",
    "    all_metrics.extend(extract_metrics(model_results, model_label))\n",
    "\n",
    "df_comparison = pd.DataFrame(all_metrics)\n",
    "df_comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_melted = df_comparison.melt(\n",
    "    id_vars=['model', 'category', 'num_samples'],\n",
    "    value_vars=['accuracy', 'precision', 'recall'],\n",
    "    var_name='metric',\n",
    "    value_name='score',\n",
    ")\n",
    "\n",
    "# Chart styling shared by both charts below. The keys must match the model labels\n",
    "# passed to extract_metrics in the previous cell.\n",
    "MODEL_COLORS = {\n",
    "    'GPT-5-Mini': '#1f77b4',  # blue\n",
    "    'Qwen-72B-Base': '#d4a5d4',  # light purple\n",
    "    'Qwen-72B-SFT': '#6a1b6a',  # dark purple\n",
    "}\n",
    "color_scale = alt.Scale(domain=list(MODEL_COLORS), range=list(MODEL_COLORS.values()))\n",
    "\n",
    "score_tooltip = [\n",
    "    alt.Tooltip('model:N', title='Model'),\n",
    "    alt.Tooltip('category:N', title='Category'),\n",
    "    alt.Tooltip('metric:N', title='Metric'),\n",
    "    alt.Tooltip('score:Q', title='Score', format='.4f'),\n",
    "    alt.Tooltip('num_samples:Q', title='Samples'),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One panel per metric, bars grouped by category and coloured by model\n",
    "chart_all_metrics = (\n",
    "    alt.Chart(df_melted)\n",
    "    .mark_bar()\n",
    "    .encode(\n",
    "        x=alt.X('category:N', title='Category'),\n",
    "        y=alt.Y('score:Q', title='Score', scale=alt.Scale(domain=[0, 1])),\n",
    "        color=alt.Color('model:N', title='Model', scale=color_scale),\n",
    "        column=alt.Column('metric:N', title='Metric'),\n",
    "        xOffset='model:N',\n",
    "        tooltip=score_tooltip,\n",
    "    )\n",
    "    .properties(\n",
    "        width=200,\n",
    "        height=300,\n",
    "        title='Model Performance Comparison by Category and Metric',\n",
    "    )\n",
    "    .configure_axis(labelAngle=-45)\n",
    ")\n",
    "\n",
    "chart_all_metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_melted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Accuracy only, in a single wider panel\n",
    "df_accuracy = df_melted[df_melted['metric'] == 'accuracy']\n",
    "\n",
    "chart_accuracy = (\n",
    "    alt.Chart(df_accuracy)\n",
    "    .mark_bar()\n",
    "    .encode(\n",
    "        x=alt.X('category:N', title='Category'),\n",
    "        y=alt.Y('score:Q', title='Score', scale=alt.Scale(domain=[0, 1])),\n",
    "        color=alt.Color('model:N', title='Model', scale=color_scale),\n",
    "        xOffset='model:N',\n",
    "        tooltip=score_tooltip,\n",
    "    )\n",
    "    .properties(\n",
    "        width=400,\n",
    "        height=300,\n",
    "        title='Accuracy by Category and Model',\n",
    "    )\n",
    "    .configure_axis(labelAngle=-45)\n",
    ")\n",
    "\n",
    "chart_accuracy"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}