RobertoBarrosoLuque committed
Commit 582e83e · 1 Parent(s): 69ab3a1

Add qwen 3 vl

notebooks/01-eda-and-fine-tuning.ipynb CHANGED
@@ -331,10 +331,46 @@
    "! firectl -a pyroworks get sftj bew0pztj"
   ]
  },
+ {
+  "cell_type": "markdown",
+  "id": "28",
+  "metadata": {},
+  "source": [
+   "##### Fine tune Qwen 3 vl 8B"
+  ]
+ },
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "28",
+  "id": "29",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "! firectl -a pyroworks create sftj --base-model accounts/fireworks/models/qwen3-vl-8b-instruct --dataset accounts/pyroworks/datasets/fashion-catalog-train --output-model qwen3-8b-fashion-catalog --display-name \"Qwen3-8B-fashion-catalog\" --epochs 3 --learning-rate 0.0001 --early-stop --eval-auto-carveout"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "id": "30",
+  "metadata": {},
+  "source": [
+   "##### Fine tune Qwen 3 VL 32B"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "31",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "! firectl -a pyroworks create sftj --base-model accounts/fireworks/models/qwen3-vl-32b-instruct --dataset accounts/pyroworks/datasets/fashion-catalog-train --output-model qwen3-32b-fashion-catalog --display-name \"Qwen3-32B-fashion-catalog\" --epochs 3 --learning-rate 0.0001 --early-stop --eval-auto-carveout"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "32",
   "metadata": {},
   "outputs": [],
   "source": []
notebooks/02-model-evals.ipynb CHANGED
@@ -7,9 +7,9 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "from src.modules.vlm_inference import analyze_product_image\n",
-   "from src.modules.data_processing import load_test_data, image_to_base64\n",
-   "from src.modules.evals import run_inference_on_dataframe_async, evaluate_all_categories, extract_metrics\n",
+   "from modules.vlm_inference import analyze_product_image\n",
+   "from modules.data_processing import load_test_data, image_to_base64\n",
+   "from modules.evals import run_inference_on_dataframe_async, evaluate_all_categories, extract_metrics\n",
    "from dotenv import load_dotenv\n",
    "import os\n",
    "from PIL import Image\n",
@@ -137,8 +137,7 @@
   "id": "10",
   "metadata": {},
   "source": [
-   "##### Run inference on Qwen 2.5 VL 32B\n",
-   "m"
+   "##### Run inference on Qwen 2.5 VL 32B"
   ]
  },
  {
@@ -235,9 +234,65 @@
   ]
  },
  {
-  "cell_type": "markdown",
+  "cell_type": "code",
+  "execution_count": null,
   "id": "18",
   "metadata": {},
+  "outputs": [],
+  "source": [
+   "# Run with concurrent requests using await directly in Jupyter\n",
+   "df_predictions_qwen3_8B_base = await run_inference_on_dataframe_async(\n",
+   "    df_test,\n",
+   "    model=\"accounts/pyroworks/deployedModels/qwen3-vl-8b-instruct-y147m785\",\n",
+   "    provider=\"FireworksAI\",\n",
+   "    api_key=FIREWORKS_API_KEY,\n",
+   "    max_concurrent_requests=20, # Adjust based on rate limits\n",
+   ")\n",
+   "\n",
+   "results_qwen3_8B_base = evaluate_all_categories(\n",
+   "    df_ground_truth=df_test,\n",
+   "    df_predictions=df_predictions_qwen3_8B_base,\n",
+   "    categories=[\"masterCategory\", \"gender\", \"subCategory\"]\n",
+   ")"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "19",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "! firectl create deployment accounts/fireworks/models/qwen3-vl-32b-instruct --deployment-shape THROUGHPUT"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "20",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "# Run with concurrent requests using await directly in Jupyter\n",
+   "df_predictions_qwen3_32B_base = await run_inference_on_dataframe_async(\n",
+   "    df_test,\n",
+   "    model=\"accounts/pyroworks/deployedModels/qwen3-vl-32b-instruct-jalntd80\",\n",
+   "    provider=\"FireworksAI\",\n",
+   "    api_key=FIREWORKS_API_KEY,\n",
+   "    max_concurrent_requests=20, # Adjust based on rate limits\n",
+   ")\n",
+   "\n",
+   "results_qwen3_32B_base = evaluate_all_categories(\n",
+   "    df_ground_truth=df_test,\n",
+   "    df_predictions=df_predictions_qwen3_32B_base,\n",
+   "    categories=[\"masterCategory\", \"gender\", \"subCategory\"]\n",
+   ")"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "id": "21",
+  "metadata": {},
   "source": [
    "#### Run test set through fine tuned FW Qwen model\n",
    "1. Create a Lora deployment of our fine tuned model\n",
@@ -247,33 +302,41 @@
  },
  {
   "cell_type": "markdown",
-  "id": "19",
+  "id": "22",
   "metadata": {},
   "source": [
-   "#### Run evals on Qwen 32B SFT\n"
+   "#### Run evals on Qwen 32B SFT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "20",
+  "id": "23",
   "metadata": {},
   "outputs": [],
   "source": [
-   "! firectl -a pyroworks create deployment accounts/pyroworks/models/qwen-32b-fashion-catalog --min-replica-count 1 --max-replica-count 1 --accelerator-type NVIDIA_H100_80GB"
+   "!firectl -a pyroworks create deployment accounts/pyroworks/models/qwen-32b-fashion-catalog"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "id": "24",
+  "metadata": {},
+  "source": [
+   "Deployment ID: accounts/pyroworks/deployments/c09a2c4q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "21",
+  "id": "25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run with concurrent requests using await directly in Jupyter\n",
    "df_predictions_qwen_32b_fine_tuned = await run_inference_on_dataframe_async(\n",
    "    df_test,\n",
-   "    model=\"accounts/pyroworks/deployedModels/qwen-32b-fashion-catalog-c6fhxibo\",\n",
+   "    model=\"accounts/pyroworks/deployedModels/qwen-32b-fashion-catalog-pwb1mga2\",\n",
    "    provider=\"FireworksAI\",\n",
    "    api_key=FIREWORKS_API_KEY,\n",
    "    max_concurrent_requests=20, # Adjust based on rate limits\n",
@@ -288,7 +351,7 @@
  },
  {
   "cell_type": "markdown",
-  "id": "22",
+  "id": "26",
   "metadata": {},
   "source": [
    "#### Run evals on Qwen 72B SFT"
@@ -297,7 +360,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "23",
+  "id": "27",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -307,17 +370,17 @@
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "24",
+  "id": "28",
   "metadata": {},
   "outputs": [],
   "source": [
-   "!firectl-admin get deployment bedocpar"
+   "!firectl get deployment bedocpar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "25",
+  "id": "29",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -339,7 +402,89 @@
  },
  {
   "cell_type": "markdown",
-  "id": "26",
+  "id": "30",
+  "metadata": {},
+  "source": [
+   "#### Run evals on Qwen 3 8B SFT"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "31",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "! firectl-admin -a pyroworks create deployment accounts/pyroworks/models/qwen3-8b-fashion-catalog"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "32",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "# Run with concurrent requests using await directly in Jupyter\n",
+   "df_predictions_qwen_3_8b_fine_tuned = await run_inference_on_dataframe_async(\n",
+   "    df_test,\n",
+   "    model=\"accounts/pyroworks/deployedModels/qwen3-8b-fashion-catalog-bdo0tqxe\",\n",
+   "    provider=\"FireworksAI\",\n",
+   "    api_key=FIREWORKS_API_KEY,\n",
+   "    max_concurrent_requests=20,\n",
+   ")\n",
+   "\n",
+   "results_qwen__3_8b_fine_tuned = evaluate_all_categories(\n",
+   "    df_ground_truth=df_test,\n",
+   "    df_predictions=df_predictions_qwen_3_8b_fine_tuned,\n",
+   "    categories=[\"masterCategory\", \"gender\", \"subCategory\"]\n",
+   ")"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "id": "33",
+  "metadata": {},
+  "source": [
+   "#### Run evals on Qwen 3 32B SFT"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "34",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "! firectl -a pyroworks create deployment accounts/pyroworks/models/qwen3-32b-fashion-catalog --world-size 4 --accelerator-type NVIDIA_H200_141GB --min-replica-count 1 --max-replica-count 1"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "35",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "# Run with concurrent requests using await directly in Jupyter\n",
+   "df_predictions_qwen_3_32b_fine_tuned = await run_inference_on_dataframe_async(\n",
+   "    df_test,\n",
+   "    model=\"accounts/pyroworks/deployedModels/qwen-32b-fashion-catalog-pwb1mga2\",\n",
+   "    provider=\"FireworksAI\",\n",
+   "    api_key=FIREWORKS_API_KEY,\n",
+   "    max_concurrent_requests=20,\n",
+   ")\n",
+   "\n",
+   "results_qwen__3_32b_fine_tuned = evaluate_all_categories(\n",
+   "    df_ground_truth=df_test,\n",
+   "    df_predictions=df_predictions_qwen_3_32b_fine_tuned,\n",
+   "    categories=[\"masterCategory\", \"gender\", \"subCategory\"]\n",
+   ")"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "id": "36",
   "metadata": {},
   "source": [
    "#### Run test set through closed source model"
@@ -348,7 +493,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "27",
+  "id": "37",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -371,7 +516,7 @@
  },
  {
   "cell_type": "markdown",
-  "id": "28",
+  "id": "38",
   "metadata": {},
   "source": [
    "### Compare eval metrics across models"
@@ -380,7 +525,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "29",
+  "id": "39",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -401,7 +546,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "30",
+  "id": "40",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -416,7 +561,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "31",
+  "id": "41",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -453,7 +598,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "32",
+  "id": "42",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -463,7 +608,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-  "id": "33",
+  "id": "43",
   "metadata": {},
   "outputs": [],
   "source": [
src/modules/evals.py CHANGED
@@ -8,9 +8,10 @@ from sklearn.metrics import (
 )
 from tqdm.asyncio import tqdm as async_tqdm
 import asyncio
-
-from src.modules.vlm_inference import analyze_product_image_async
-from src.modules.data_processing import image_to_base64
+import re
+from glob import glob
+from modules.vlm_inference import analyze_product_image_async
+from modules.data_processing import image_to_base64
 from pathlib import Path
 
 DATA_PATH = Path(__file__).parents[2] / "data"
@@ -149,7 +150,9 @@ def run_inference_on_dataframe(
         - pred_description: Predicted description
     """
     return asyncio.run(
-        run_inference_on_dataframe_async(df, model, api_key, provider, max_concurrent_requests)
+        run_inference_on_dataframe_async(
+            df, model, api_key, provider, max_concurrent_requests
+        )
     )
 
 
@@ -312,13 +315,128 @@ def extract_metrics(results_dict, model_name):
     metrics_list = []
 
     for category, metrics in results_dict.items():
-        metrics_list.append({
-            'model': model_name,
-            'category': category,
-            'accuracy': metrics['accuracy'],
-            'precision': metrics['precision'],
-            'recall': metrics['recall'],
-            'num_samples': metrics['num_samples']
-        })
-
-    return metrics_list
+        metrics_list.append(
+            {
+                "model": model_name,
+                "category": category,
+                "accuracy": metrics["accuracy"],
+                "precision": metrics["precision"],
+                "recall": metrics["recall"],
+                "num_samples": metrics["num_samples"],
+            }
+        )
+
+    return metrics_list
+
+
+def parse_model_name(filename: str) -> str:
+    """
+    Parse a human-readable model name from prediction CSV filename.
+
+    Examples:
+        df_pred_FireworksAI_qwen2-vl-72b-BASE-instruct-yaxztv7t.csv -> Qwen2-VL-72B-BASE
+        df_pred_OpenAI_gpt-5-mini-2025-08-07.csv -> GPT-5-Mini
+        df_pred_FireworksAI_qwen-72b-SFT-fashion-catalog-oueqouqs.csv -> Qwen2-VL-72B-SFT
+        df_pred_FireworksAI_qwen2p5-vl-32b-instruct-ralh0ben.csv -> Qwen2.5-VL-32B-BASE
+        df_pred_FireworksAI_qwen-32b-SFT-fashion-catalog-c6fhxibo.csv -> Qwen2.5-VL-32B-SFT
+        df_pred_FireworksAI_qwen3-vl-8b-instruct-*.csv -> Qwen3-VL-8B-BASE
+        df_pred_FireworksAI_qwen3-8b-fashion-catalog-*.csv -> Qwen3-VL-8B-SFT
+    """
+    basename = Path(filename).stem
+
+    # Remove prefix
+    name = basename.replace("df_pred_FireworksAI_", "").replace("df_pred_OpenAI_", "")
+
+    # GPT models
+    if "gpt" in name.lower():
+        return "GPT-5-Mini"
+
+    # Check if SFT (fine-tuned) model
+    is_sft = "SFT" in name or "fashion-catalog" in name
+
+    if "qwen3" in name.lower():
+        size_match = re.search(r"(\d+)b", name.lower())
+        size = size_match.group(1) if size_match else "?"
+        suffix = "SFT" if is_sft else "BASE"
+        return f"Qwen3-VL-{size}B-{suffix}"
+
+    if "qwen2p5" in name.lower() or (
+        "qwen-32b" in name.lower() and "qwen2-vl" not in name.lower()
+    ):
+        size_match = re.search(r"(\d+)b", name.lower())
+        size = size_match.group(1) if size_match else "?"
+        suffix = "SFT" if is_sft else "BASE"
+        return f"Qwen2.5-VL-{size}B-{suffix}"
+
+    if "qwen2-vl" in name.lower() or "qwen-72b" in name.lower():
+        size_match = re.search(r"(\d+)b", name.lower())
+        size = size_match.group(1) if size_match else "?"
+        suffix = "SFT" if is_sft else "BASE"
+        return f"Qwen2-VL-{size}B-{suffix}"
+
+    return name
+
+
+def compile_evaluation_results(data_path: str = None) -> pd.DataFrame:
+    """
+    Compile evaluation results from all prediction CSVs in the data directory.
+
+    Finds all df_pred_*.csv files, calculates metrics against ground truth,
+    and creates a consolidated evaluation_results.csv.
+
+    Args:
+        data_path: Path to data directory. Defaults to project's data/ folder.
+
+    Returns:
+        pd.DataFrame: Compiled evaluation results with columns:
+            model, category, accuracy, precision, recall, num_samples
+    """
+    if data_path is None:
+        data_path = Path(__file__).parents[2] / "data"
+    else:
+        data_path = Path(data_path)
+
+    # Load ground truth
+    test_csv = data_path / "test.csv"
+    df_test = pd.read_csv(test_csv)
+    print(f"Loaded {len(df_test)} ground truth samples from {test_csv}")
+
+    # Find all prediction CSVs
+    pred_files = sorted(glob(str(data_path / "df_pred_*.csv")))
+    print(f"Found {len(pred_files)} prediction files")
+
+    all_metrics = []
+
+    for pred_file in pred_files:
+        model_name = parse_model_name(pred_file)
+        print(f"\nProcessing: {Path(pred_file).name} -> {model_name}")
+
+        # Load predictions
+        df_pred = pd.read_csv(pred_file)
+
+        # Calculate metrics
+        results = evaluate_all_categories(
+            df_ground_truth=df_test,
+            df_predictions=df_pred,
+            id_col="id",
+        )
+
+        # Skip models with all errors
+        valid_results = {k: v for k, v in results.items() if "error" not in v}
+        if not valid_results:
+            print(f"  Skipping {model_name}: no valid predictions")
+            continue
+
+        # Extract metrics for this model (only valid categories)
+        metrics = extract_metrics(valid_results, model_name)
+        all_metrics.extend(metrics)
+
+    # Create final DataFrame
+    df_eval = pd.DataFrame(all_metrics)
+
+    # Save results
+    output_path = data_path / "evaluation_results.csv"
+    df_eval.to_csv(output_path, index=False)
+    print(f"\nSaved evaluation results to {output_path}")
+
+    return df_eval
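
The docstring examples double as a quick self-test for `parse_model_name`. A short usage sketch, assuming `src/` is on `sys.path` as in the notebooks (the trailing job hashes are taken from IDs appearing elsewhere in this commit; any `df_pred_*.csv` names will do):

```python
from modules.evals import parse_model_name, compile_evaluation_results

# Expected labels come straight from the docstring examples above.
assert parse_model_name("df_pred_OpenAI_gpt-5-mini-2025-08-07.csv") == "GPT-5-Mini"
assert parse_model_name("df_pred_FireworksAI_qwen3-vl-8b-instruct-y147m785.csv") == "Qwen3-VL-8B-BASE"
assert parse_model_name("df_pred_FireworksAI_qwen3-8b-fashion-catalog-bdo0tqxe.csv") == "Qwen3-VL-8B-SFT"

# Rebuild data/evaluation_results.csv from every df_pred_*.csv found on disk.
df_eval = compile_evaluation_results()
print(df_eval.head())
```
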
src/modules/vlm_inference.py CHANGED
@@ -2,7 +2,7 @@ import os
 from openai import OpenAI, AsyncOpenAI
 from pydantic import BaseModel, Field
 from typing import Optional, Literal
-from src.modules.constants import PROMPT_LIBRARY
+from modules.constants import PROMPT_LIBRARY
 
 SYSTEM_PROMPT = """
 You are an e-commerce fashion catalog assistant.
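
This one-line import fix is what lets `modules.vlm_inference` resolve when the notebooks run with `src/` on the path. For orientation, a rough sketch of the call shape these imports support, i.e. an OpenAI-compatible client pointed at Fireworks; the endpoint URL, model name, file path, and abridged prompt below are illustrative assumptions, not code from this commit:

```python
# Sketch: OpenAI-compatible chat call with a base64 image, as the notebooks'
# analyze_product_image wrapper presumably issues under the hood.
import base64
import os
from openai import OpenAI

SYSTEM_PROMPT = "You are an e-commerce fashion catalog assistant."  # abridged

client = OpenAI(
    api_key=os.environ["FIREWORKS_API_KEY"],
    base_url="https://api.fireworks.ai/inference/v1",  # assumed Fireworks endpoint
)

with open("product.jpg", "rb") as f:  # hypothetical local image
    image_b64 = base64.b64encode(f.read()).decode()

response = client.chat.completions.create(
    model="accounts/fireworks/models/qwen3-vl-8b-instruct",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Categorize this product image."},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
                },
            ],
        },
    ],
)
print(response.choices[0].message.content)
```
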