Antuke committed on
Commit
c69c4af
·
1 Parent(s): eea3a1c
Files changed (37)
  1. app.py +788 -0
  2. core/args.py +72 -0
  3. core/checkpoint.py +379 -0
  4. core/transformer.py +646 -0
  5. core/transforms/image_transform.py +409 -0
  6. core/utils.py +40 -0
  7. core/vision_encoder/__init__.py +0 -0
  8. core/vision_encoder/__pycache__/__init__.cpython-312.pyc +0 -0
  9. core/vision_encoder/__pycache__/__init__.cpython-313.pyc +0 -0
  10. core/vision_encoder/__pycache__/config.cpython-312.pyc +0 -0
  11. core/vision_encoder/__pycache__/config.cpython-313.pyc +0 -0
  12. core/vision_encoder/__pycache__/pe.cpython-312.pyc +0 -0
  13. core/vision_encoder/__pycache__/pe.cpython-313.pyc +0 -0
  14. core/vision_encoder/__pycache__/pe_lora.cpython-312.pyc +0 -0
  15. core/vision_encoder/__pycache__/rope.cpython-312.pyc +0 -0
  16. core/vision_encoder/__pycache__/rope.cpython-313.pyc +0 -0
  17. core/vision_encoder/__pycache__/tokenizer.cpython-312.pyc +0 -0
  18. core/vision_encoder/__pycache__/tokenizer.cpython-313.pyc +0 -0
  19. core/vision_encoder/__pycache__/transforms.cpython-312.pyc +0 -0
  20. core/vision_encoder/__pycache__/transforms.cpython-313.pyc +0 -0
  21. core/vision_encoder/config.py +260 -0
  22. core/vision_encoder/pe.py +833 -0
  23. core/vision_encoder/rope.py +347 -0
  24. core/vision_encoder/transforms.py +86 -0
  25. core/vision_projector/base.py +26 -0
  26. core/vision_projector/mlp.py +62 -0
  27. requirements.txt +10 -0
  28. setup.py +7 -0
  29. src/model.py +809 -0
  30. utils/__pycache__/commons.cpython-313.pyc +0 -0
  31. utils/__pycache__/dataset.cpython-313.pyc +0 -0
  32. utils/__pycache__/face_detector.cpython-313.pyc +0 -0
  33. utils/__pycache__/task_config.cpython-313.pyc +0 -0
  34. utils/commons.py +158 -0
  35. utils/deploy.prototxt +1789 -0
  36. utils/face_detector.py +105 -0
  37. utils/task_config.py +21 -0
app.py ADDED
@@ -0,0 +1,788 @@
1
+ """
2
+ VLM Soft Biometrics - Gradio Interface
3
+ A web application for analyzing facial soft biometrics (age, gender, emotion) using Vision-Language Models.
4
+ """
5
+ import os
6
+ import gradio as gr
7
+ import torch
8
+ import cv2
9
+ import numpy as np
10
+ from PIL import Image, ImageDraw, ImageFont
11
+ import base64
12
+ from io import BytesIO
13
+ import traceback # Import traceback at the top
14
+
15
+ from utils.face_detector import FaceDetector
16
+
17
+ # Class definitions
18
+ from src.model import MTLModel
19
+ from utils.commons import get_backbone_pe
20
+ from utils.task_config import Task
21
+
22
+
23
+ TASKS = [
24
+ Task(name='Age', class_labels=["0-2", "3-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70+"], criterion=None),
25
+ Task(name='Gender', class_labels=["Male", "Female"], criterion=None),
26
+ Task(name='Emotion', class_labels=["Surprise", "Fear", "Disgust", "Happy", "Sad", "Angry", "Neutral"], criterion=None)
27
+ ]
28
+ CLASSES = [
29
+ ["0-2", "3-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70+"],
30
+ ["M", "F"],
31
+ ["Surprise", "Fear", "Disgust", "Happy", "Sad", "Angry", "Neutral"]
32
+ ]
33
+
34
+ # Global variables for model and detector
35
+ model = None
36
+ transform = None
37
+ detector = None
38
+ device = None
39
+ current_ckpt_dir = None
40
+ CHECKPOINTS_DIR = './checkpoints/'
41
+
42
+ def scan_checkpoints(ckpt_dir):
43
+ """Scans a directory for .pt or .pth files."""
44
+ if not os.path.exists(ckpt_dir):
45
+ print(f"Warning: Checkpoint directory not found: {ckpt_dir}")
46
+ return [], None
47
+
48
+ try:
49
+ ckpt_files = [
50
+ os.path.join(ckpt_dir, f)
51
+ for f in sorted(os.listdir(ckpt_dir)) if f.endswith(('.pt', '.pth'))
52
+ ]
53
+ except Exception as e:
54
+ print(f"Error scanning checkpoint directory {ckpt_dir}: {e}")
55
+ return [], None
56
+
57
+ # Create a list of (label, value) tuples
58
+ # label = filename (e.g., "mtlora.pt"), value = full path
59
+ choices_list = [(os.path.basename(f), f) for f in ckpt_files]
60
+
61
+ default_ckpt_path = os.path.join(ckpt_dir, 'mtlora.pt')
62
+
63
+ if default_ckpt_path in ckpt_files:
64
+ return choices_list, default_ckpt_path
65
+ elif ckpt_files:
66
+ return choices_list, ckpt_files[0] # default_ckpt_value is the full path
67
+ else:
68
+ print(f"No checkpoints found in {ckpt_dir}")
69
+ return [], None
70
+
71
+ def load_model(device, ckpt_dir='./checkpoints/mtlora.pt', pe_vision_config="PE-Core-L14-336"):
72
+ """Load and configure model."""
73
+ backbone, transform, _ = get_backbone_pe(version=pe_vision_config, apply_migration_flag=True, pretrained=False)
74
+ model = MTLModel(backbone, tasks=TASKS, use_lora=True, use_deep_head=True,
75
+ use_mtl_lora=('mtlora' in ckpt_dir),
76
+ )
77
+ print(f'loading from {ckpt_dir}')
78
+ model.load_model(filepath=ckpt_dir, map_location=device)
79
+ return model, transform
80
+
81
+ def load_model_and_update_status(ckpt_dir):
82
+ """
83
+ Wrapper function to load a model and return a status message.
84
+ This is used by the dropdown's 'change' event.
85
+ """
86
+ global model, current_ckpt_dir
87
+
88
+ if ckpt_dir is None or ckpt_dir == "":
89
+ return "No checkpoint selected."
90
+
91
+ if model is not None and ckpt_dir == current_ckpt_dir:
92
+ status = f"Model already loaded: {os.path.basename(ckpt_dir)}"
93
+ print(status)
94
+ return status
95
+
96
+ gr.Info(f"Loading model: {os.path.basename(ckpt_dir)}...")
97
+ try:
98
+ init_model(ckpt_dir=ckpt_dir, detection_confidence=0.5)
99
+ current_ckpt_dir = ckpt_dir # Set global directory on successful load
100
+ status = f"Successfully loaded: {os.path.basename(ckpt_dir)}"
101
+ gr.Info("Model loaded successfully!")
102
+ print(status)
103
+ return status
104
+ except Exception as e:
105
+ status = f"Failed to load {os.path.basename(ckpt_dir)}: {str(e)}"
106
+ print(status)
107
+ traceback.print_exc()
108
+ return status
109
+
110
+ def predict(model, image):
111
+ """Make predictions for age, gender, and emotion."""
112
+ with torch.no_grad():
113
+ results = model(image)
114
+
115
+ age_logits, gender_logits, emotion_logits = results['Age'], results['Gender'], results['Emotion']
116
+ # Get probabilities using softmax
117
+ age_probs = torch.softmax(age_logits, dim=-1)
118
+ gender_probs = torch.softmax(gender_logits, dim=-1)
119
+ emotion_probs = torch.softmax(emotion_logits, dim=-1)
120
+
121
+ ages = torch.argmax(age_logits, dim=-1).cpu().tolist()
122
+ genders = torch.argmax(gender_logits, dim=-1).cpu().tolist()
123
+ emotions = torch.argmax(emotion_logits, dim=-1).cpu().tolist()
124
+
125
+ results = []
126
+ for i in range(len(ages)):
127
+ # Get all probabilities for each class
128
+ age_all_probs = {
129
+ CLASSES[0][j]: float(age_probs[i][j].cpu().detach())
130
+ for j in range(len(CLASSES[0]))
131
+ }
132
+ gender_all_probs = {
133
+ CLASSES[1][j]: float(gender_probs[i][j].cpu().detach())
134
+ for j in range(len(CLASSES[1]))
135
+ }
136
+ emotion_all_probs = {
137
+ CLASSES[2][j]: float(emotion_probs[i][j].cpu().detach())
138
+ for j in range(len(CLASSES[2]))
139
+ }
140
+
141
+ results.append({
142
+ 'age': {
143
+ 'predicted_class': CLASSES[0][ages[i]],
144
+ 'predicted_confidence': float(age_probs[i][ages[i]].cpu().detach()),
145
+ 'all_probabilities': age_all_probs
146
+ },
147
+ 'gender': {
148
+ 'predicted_class': CLASSES[1][genders[i]],
149
+ 'predicted_confidence': float(gender_probs[i][genders[i]].cpu().detach()),
150
+ 'all_probabilities': gender_all_probs
151
+ },
152
+ 'emotion': {
153
+ 'predicted_class': CLASSES[2][emotions[i]],
154
+ 'predicted_confidence': float(emotion_probs[i][emotions[i]].cpu().detach()),
155
+ 'all_probabilities': emotion_all_probs
156
+ }
157
+ })
158
+
159
+ return results
160
+
161
+ def get_centroid_weighted_age(probs):
162
+ probs = list(probs.values())
163
+ centroids = [1, 4.5, 14.5, 24.5, 34.5, 44.5, 54.5, 64.5, 80]
164
+ age = 0
165
+ # print(probs) # DEBUG
166
+ for i,p in enumerate(probs):
167
+ age += p * centroids[i]
168
+
169
+ return age
170
+
171
+
172
+ def init_model(ckpt_dir="./checkpoints/mtlora.pt", detection_confidence=0.5):
173
+ """Initialize model and detector."""
174
+ global model, transform, detector, device
175
+
176
+ print(f"\n{'='*60}")
177
+ print(f"INITIALIZING MODEL: {ckpt_dir}")
178
+ print(f"{'='*60}")
179
+
180
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
181
+ print(f"Using device: {device}")
182
+
183
+ # Verify model weights exist
184
+ if not os.path.exists(ckpt_dir):
185
+ error_msg = f"Model weights not found: {ckpt_dir}."
186
+ print(f"ERROR: {error_msg}")
187
+ raise FileNotFoundError(error_msg)
188
+
189
+ print(f"Model weights found: {ckpt_dir}")
190
+
191
+ # Load the perception encoder
192
+ model, transform = load_model(ckpt_dir=ckpt_dir, device=device)
193
+ model.eval()
194
+ model.to(device)
195
+
196
+
197
+ detector = FaceDetector(confidence_threshold=detection_confidence)
198
+
199
+ print("✓ Model and detector initialized successfully")
200
+ print(f"{'='*60}\n")
201
+
202
+ def process_image(image, selected_checkpoint_path):
203
+ """
204
+ Process an uploaded image and return predictions with annotated image.
205
+
206
+ Args:
207
+ image: PIL Image or numpy array
208
+ selected_checkpoint_path: The path from the checkpoint dropdown
209
+
210
+ Returns:
211
+ tuple: (annotated_image, results_html)
212
+ """
213
+ if image is None:
214
+ return None, "<p style='color: red;'>Please upload an image</p>"
215
+
216
+ # Ensure model is initialized
217
+ if model is None or selected_checkpoint_path != current_ckpt_dir:
218
+ status = load_model_and_update_status(selected_checkpoint_path)
219
+ if "Failed" in status or "Error" in status:
220
+ return image, f"<p style='color: red;'>Model Error: {status}</p>"
221
+
222
+
223
+ try:
224
+ # --- 1. Prepare images for detection and drawing ---
225
+
226
+ # Convert PIL to OpenCV format (BGR) for the detector
227
+ if isinstance(image, Image.Image):
228
+ img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
229
+ else:
230
+ # Assuming it's a numpy array from Gradio webcam
231
+ img_cv = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
232
+
233
+ # Create a PIL copy to draw annotations on
234
+ img_pil_annotated = image.copy()
235
+ draw = ImageDraw.Draw(img_pil_annotated)
236
+
237
+ # --- 2. Detect faces ---
238
+ faces = detector.detect(img_cv, pad_rect=True)
239
+
240
+ if faces is None or len(faces) == 0:
241
+ return image, "<p style='color: orange;'>No faces detected in the image</p>"
242
+
243
+ # --- 3. Process detected faces ---
244
+ crops_pil = []
245
+ face_data = []
246
+
247
+ for idx, (crop, confidence, bbox) in enumerate(faces):
248
+ crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
249
+ crop_pil = Image.fromarray(crop_rgb)
250
+ crops_pil.append(crop_pil)
251
+
252
+ # Resize crop to 336x336 for display
253
+ crop_resized = crop_pil.resize((336, 336), Image.Resampling.LANCZOS)
254
+
255
+ face_data.append({
256
+ 'bbox': bbox,
257
+ 'detection_confidence': float(confidence),
258
+ 'crop_image': crop_resized # Store the resized crop
259
+ })
260
+
261
+ # --- 4. Batch transform and predict ---
262
+ crop_tensors = [transform(crop_pil) for crop_pil in crops_pil]
263
+ batch_tensor = torch.stack(crop_tensors).to(device)
264
+
265
+ # Get predictions
266
+ predictions = predict(model, batch_tensor)
267
+
268
+ # Combine face data with predictions
269
+ for face, pred in zip(face_data, predictions):
270
+ face['predictions'] = pred
271
+
272
+ # --- 5. Create annotated image (using PIL) ---
273
+ for idx, face in enumerate(face_data):
274
+ bbox = face['bbox']
275
+ pred = face['predictions']
276
+ x, y, w, h = bbox
277
+
278
+ # --- Calculate Adaptive Font (from demo.py) ---
279
+ font_size_ratio = 0.08
280
+ min_font_size = 12
281
+ max_font_size = 48
282
+ adaptive_font_size = max(min_font_size, min(int(w * font_size_ratio), max_font_size))
283
+ try:
284
+ font = ImageFont.load_default(size=adaptive_font_size)
285
+ except IOError:
286
+ font = ImageFont.load_default()
287
+
288
+ # --- Draw Bounding Box ---
289
+ draw.rectangle([(x, y), (x + w, y + h)], outline="lime", width=2)
290
+
291
+ # --- Prepare Text Lines (Top-1 Only) ---
292
+ lines_to_draw = []
293
+
294
+ # Age
295
+ age_label = pred['age']['predicted_class']
296
+ age_conf = pred['age']['predicted_confidence']
297
+ lines_to_draw.append(f"Age: {age_label} ({age_conf*100:.0f}%)")
298
+
299
+ # Gender
300
+ gen_label = pred['gender']['predicted_class']
301
+ gen_conf = pred['gender']['predicted_confidence']
302
+ lines_to_draw.append(f"Gender: {gen_label} ({gen_conf*100:.0f}%)")
303
+
304
+ # Emotion
305
+ emo_label = pred['emotion']['predicted_class']
306
+ emo_conf = pred['emotion']['predicted_confidence']
307
+ lines_to_draw.append(f"Emotion: {emo_label} ({emo_conf*100:.0f}%)")
308
+
309
+
310
+ # --- Calculate total height of the text block (from demo.py) ---
311
+ line_spacing = 10
312
+ total_text_height = 0
313
+ for line in lines_to_draw:
314
+ _left, top, _right, bottom = draw.textbbox((0, 0), line, font=font)
315
+ total_text_height += (bottom - top) + line_spacing
316
+
317
+ # --- Place text ABOVE or BELOW the box (from demo.py) ---
318
+ if y - total_text_height > 0:
319
+ # PLACE TEXT ABOVE: There is enough space
320
+ text_y = y - line_spacing
321
+ for line in reversed(lines_to_draw):
322
+ left, top, right, bottom = draw.textbbox((x, text_y), line, font=font, anchor="ls") # anchor left-baseline
323
+ draw.rectangle([(left - 2, top - 2), (right + 2, bottom + 2)], fill="black")
324
+ draw.text((x, text_y), line, font=font, fill="white", anchor="ls")
325
+ text_y = top - line_spacing # Move y-position up for the next line
326
+ else:
327
+ # PLACE TEXT BELOW: Not enough space above, so draw downwards
328
+ text_y = y + h + line_spacing
329
+ for line in lines_to_draw:
330
+ left, top, right, bottom = draw.textbbox((x, text_y), line, font=font, anchor="lt")
331
+ draw.rectangle([(left - 2, top - 2), (right + 2, bottom + 2)], fill="black")
332
+ draw.text((x, text_y), line, font=font, fill="white", anchor="lt")
333
+ text_y = bottom + line_spacing
334
+
335
+ # --- 6. Create HTML results ---
336
+
337
+ # Helper function to convert PIL image to base64
338
+ def pil_to_base64(img_pil):
339
+ buffered = BytesIO()
340
+ img_pil.save(buffered, format="JPEG")
341
+ img_str = base64.b64encode(buffered.getvalue()).decode()
342
+ return f"data:image/jpeg;base64,{img_str}"
343
+
344
+ # (HTML Generation code remains the same as before)
345
+ results_html = f"""
346
+ <style>
347
+ :root {{
348
+ --primary-color: #4f46e5;
349
+ --success-color: #10b981;
350
+ --text-primary: #ffffff;
351
+ --text-secondary: #9ca3af;
352
+ --background-dark: #1f2937;
353
+ --background-darker: #111827;
354
+ --border-color: #374151;
355
+ }}
356
+ .results-container {{
357
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
358
+ background: var(--background-darker);
359
+ padding: 20px;
360
+ border-radius: 12px;
361
+ color: var(--text-primary);
362
+ }}
363
+ .results-container h2 {{
364
+ color: var(--text-primary);
365
+ margin-bottom: 20px;
366
+ }}
367
+ .face-count {{
368
+ display: inline-block;
369
+ background: var(--primary-color);
370
+ color: white;
371
+ padding: 4px 12px;
372
+ border-radius: 20px;
373
+ font-size: 0.9rem;
374
+ font-weight: 500;
375
+ margin-left: 8px;
376
+ }}
377
+ .face-card {{
378
+ background: var(--background-dark);
379
+ border-radius: 8px;
380
+ padding: 20px;
381
+ margin-top: 15px;
382
+ border: 1px solid var(--border-color);
383
+ display: flex;
384
+ gap: 20px;
385
+ align-items: flex-start;
386
+ }}
387
+ .face-header {{
388
+ font-size: 1rem;
389
+ font-weight: 600;
390
+ margin-bottom: 20px;
391
+ color: var(--text-primary);
392
+ }}
393
+ .face-image-left {{
394
+ flex-shrink: 0;
395
+ width: 336px;
396
+ height: 336px;
397
+ background: var(--background-darker);
398
+ border-radius: 8px;
399
+ overflow: hidden;
400
+ border: 1px solid var(--border-color);
401
+ }}
402
+ .face-image-left img {{
403
+ width: 100%;
404
+ height: 100%;
405
+ object-fit: cover;
406
+ }}
407
+ .face-predictions-right {{
408
+ flex: 1;
409
+ display: flex;
410
+ flex-direction: column;
411
+ gap: 10px;
412
+ }}
413
+ .predictions-horizontal {{
414
+ display: flex;
415
+ flex-direction: row;
416
+ gap: 30px;
417
+ justify-content: space-between;
418
+ }}
419
+ .prediction-section {{
420
+ flex: 1;
421
+ min-width: 0;
422
+ }}
423
+ .prediction-category-label {{
424
+ font-size: 0.8rem;
425
+ font-weight: 700;
426
+ text-transform: uppercase;
427
+ letter-spacing: 0.5px;
428
+ color: var(--primary-color);
429
+ margin-bottom: 8px;
430
+ border-bottom: 2px solid var(--primary-color);
431
+ padding-bottom: 4px;
432
+ }}
433
+ .probabilities-list {{
434
+ display: flex;
435
+ flex-direction: column;
436
+ gap: 6px;
437
+ }}
438
+ .probability-item {{
439
+ display: grid;
440
+ grid-template-columns: 70px 1fr 55px;
441
+ align-items: center;
442
+ gap: 8px;
443
+ padding: 4px 6px;
444
+ border-radius: 4px;
445
+ }}
446
+ .probability-item.predicted {{
447
+ background: rgba(79, 70, 229, 0.2);
448
+ border-left: 3px solid var(--primary-color);
449
+ padding-left: 8px;
450
+ }}
451
+ .prob-class {{
452
+ font-size: 0.8rem;
453
+ font-weight: 600;
454
+ color: var(--text-primary);
455
+ word-wrap: break-word; /* Ensure long class names wrap */
456
+ }}
457
+ .probability-item.predicted .prob-class {{
458
+ color: var(--primary-color);
459
+ font-weight: 700;
460
+ }}
461
+ .prob-bar-container {{
462
+ height: 6px;
463
+ background: var(--border-color);
464
+ border-radius: 3px;
465
+ overflow: hidden;
466
+ }}
467
+ .prob-bar {{
468
+ height: 100%;
469
+ background: linear-gradient(90deg, var(--primary-color), var(--success-color));
470
+ border-radius: 3px;
471
+ transition: width 0.6s ease;
472
+ }}
473
+ .probability-item.predicted .prob-bar {{
474
+ background: var(--primary-color);
475
+ }}
476
+ .prob-percentage {{
477
+ font-size: 0.75rem;
478
+ font-weight: 500;
479
+ color: var(--text-secondary);
480
+ text-align: right;
481
+ }}
482
+ .probability-item.predicted .prob-percentage {{
483
+ color: var(--primary-color);
484
+ font-weight: 700;
485
+ }}
486
+ @media (max-width: 1200px) {{
487
+ .predictions-horizontal {{
488
+ flex-direction: column;
489
+ gap: 15px;
490
+ }}
491
+ }}
492
+ @media (max-width: 900px) {{
493
+ .face-card {{
494
+ flex-direction: column;
495
+ }}
496
+ .face-image-left {{
497
+ width: 100%;
498
+ max-width: 336px;
499
+ margin: 0 auto;
500
+ }}
501
+ .probability-item {{
502
+ grid-template-columns: 60px 1fr 50px; /* Adjust for smaller screens */
503
+ }}
504
+ .prob-class {{
505
+ font-size: 0.75rem;
506
+ }}
507
+ }}
508
+ </style>
509
+
510
+ <div class='results-container'>
511
+ <h2 style='margin-top: 0;'>Classification Results <span class='face-count'>{len(face_data)} face(s)</span></h2>
512
+ """
513
+
514
+ for idx, face in enumerate(face_data):
515
+ pred = face['predictions']
516
+ face_img_base64 = pil_to_base64(face['crop_image'])
517
+ age = get_centroid_weighted_age(pred['age']['all_probabilities'])
518
+ results_html += f"""
519
+ <div class='face-card'>
520
+ <div class='face-image-left'>
521
+ <img src='{face_img_base64}' alt='Face {idx+1}'>
522
+ </div>
523
+ <div class='face-predictions-right'>
524
+ <div class='face-header'>Face {idx+1} - Detection Confidence: {face['detection_confidence']:.1%} - Centroid Age: {int(age)}</div>
525
+ <div class='predictions-horizontal'>
526
+ <div class='prediction-section'>
527
+ <div class='prediction-category-label'>Age</div>
528
+ <div class='probabilities-list'>
529
+ """
530
+ for age_class in CLASSES[0]:
531
+ prob = pred['age']['all_probabilities'][age_class]
532
+ is_predicted = (age_class == pred['age']['predicted_class'])
533
+ predicted_class = 'predicted' if is_predicted else ''
534
+ results_html += f"""
535
+ <div class='probability-item {predicted_class}'>
536
+ <span class='prob-class'>{age_class}</span>
537
+ <div class='prob-bar-container'>
538
+ <div class='prob-bar' style='width: {prob*100}%'></div>
539
+ </div>
540
+ <span class='prob-percentage'>{prob*100:.1f}%</span>
541
+ </div>
542
+ """
543
+ results_html += f"""
544
+ </div>
545
+ </div>
546
+ <div class='prediction-section'>
547
+ <div class='prediction-category-label'>Gender</div>
548
+ <div class='probabilities-list'>
549
+ """
550
+ for gender_class in CLASSES[1]:
551
+ prob = pred['gender']['all_probabilities'][gender_class]
552
+ is_predicted = (gender_class == pred['gender']['predicted_class'])
553
+ predicted_class = 'predicted' if is_predicted else ''
554
+ results_html += f"""
555
+ <div class='probability-item {predicted_class}'>
556
+ <span class='prob-class'>{gender_class}</span>
557
+ <div class='prob-bar-container'>
558
+ <div class='prob-bar' style='width: {prob*100}%'></div>
559
+ </div>
560
+ <span class='prob-percentage'>{prob*100:.1f}%</span>
561
+ </div>
562
+ """
563
+ results_html += """
564
+ </div>
565
+ </div>
566
+ <div class='prediction-section'>
567
+ <div class='prediction-category-label'>Emotion</div>
568
+ <div class='probabilities-list'>
569
+ """
570
+ for emotion_class in CLASSES[2]:
571
+ prob = pred['emotion']['all_probabilities'][emotion_class]
572
+ is_predicted = (emotion_class == pred['emotion']['predicted_class'])
573
+ predicted_class = 'predicted' if is_predicted else ''
574
+ results_html += f"""
575
+ <div class='probability-item {predicted_class}'>
576
+ <span class='prob-class'>{emotion_class}</span>
577
+ <div class='prob-bar-container'>
578
+ <div class='prob-bar' style='width: {prob*100}%'></div>
579
+ </div>
580
+ <span class='prob-percentage'>{prob*100:.1f}%</span>
581
+ </div>
582
+ """
583
+ results_html += """
584
+ </div>
585
+ </div>
586
+ </div>
587
+ </div>
588
+ </div>
589
+ """
590
+ results_html += "</div>"
591
+
592
+ # --- 7. Return the annotated PIL image and HTML ---
593
+ return img_pil_annotated, results_html
594
+
595
+ except Exception as e:
596
+ traceback.print_exc()
597
+ return image, f"<p style='color: red;'>Error processing image: {str(e)}</p>"
598
+
599
+ def create_interface(checkpoint_list, default_checkpoint, initial_status):
600
+ """Create and configure the Gradio interface."""
601
+
602
+ # Custom CSS for better styling
603
+ custom_css = """
604
+ .gradio-container {
605
+ font-family: 'Arial', sans-serif;
606
+ }
607
+ .output-html {
608
+ max-height: none !important;
609
+ overflow-y: auto;
610
+ }
611
+ """
612
+
613
+ # Create interface
614
+ with gr.Blocks(css=custom_css, title="Face Classification System") as demo:
615
+
616
+ with gr.Row():
617
+ gr.Markdown("# Face Classification System")
618
+
619
+ # --- Model Selection ---
620
+ with gr.Row():
621
+ with gr.Column(scale=3):
622
+ checkpoint_dropdown = gr.Dropdown(
623
+ label="Select Model Checkpoint",
624
+ choices=checkpoint_list,
625
+ value=default_checkpoint,
626
+ )
627
+ with gr.Column(scale=2):
628
+ model_status_text = gr.Textbox(
629
+ label="Model Status",
630
+ value=initial_status,
631
+ interactive=False,
632
+ )
633
+
634
+ # Features | Instructions
635
+ with gr.Row():
636
+ with gr.Column(scale=1):
637
+ gr.Markdown("""
638
+ ### Features
639
+ - **Age Classification**: 9 categories (0-2, 3-9, 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70+) + Age estimation with weighted centroid average
640
+ - **Gender Classification**: M/F
641
+ - **Emotion Recognition**: 7 categories (Surprise, Fear, Disgust, Happy, Sad, Angry, Neutral)
642
+ - **Automatic Face Detection**: Detects and analyzes multiple faces
643
+ - **Detailed Probability Distributions**: View confidence for all classes
644
+ """)
645
+
646
+ with gr.Column(scale=1):
647
+ gr.Markdown("""
648
+ ### Instructions
649
+ 1. (Optional) Select a model checkpoint from the dropdown.
650
+ 2. Upload an image or capture from webcam (or select an example below)
651
+ 3. Click "Classify Image"
652
+ 4. View detected faces with age, gender, and emotion predictions below
653
+ """)
654
+
655
+ # Upload Image | Annotated Image
656
+ with gr.Row():
657
+ with gr.Column(scale=1):
658
+ input_image = gr.Image(
659
+ label="Upload Image",
660
+ type="pil",
661
+ sources=["upload", "webcam"],
662
+ height=400
663
+ )
664
+
665
+ with gr.Column(scale=1):
666
+ output_image = gr.Image(
667
+ label="Annotated Image",
668
+ type="pil",
669
+ height=400
670
+ )
671
+
672
+ with gr.Row():
673
+ with gr.Column(scale=1):
674
+ analyze_btn = gr.Button(
675
+ "Classify Image",
676
+ variant="primary",
677
+ size="lg"
678
+ )
679
+
680
+ # Examples - after button
681
+ # Dynamically load example images from example directory
682
+ example_dir = "example"
683
+ example_images = []
684
+ if os.path.exists(example_dir):
685
+ try:
686
+ example_images = [
687
+ os.path.join(example_dir, f)
688
+ for f in sorted(os.listdir(example_dir))
689
+ if f.lower().endswith(('.jpg', '.jpeg', '.png', '.webp'))
690
+ ]
691
+ except Exception as e:
692
+ print(f"Error reading example images from {example_dir}: {e}")
693
+
694
+ if example_images:
695
+ gr.Markdown("### 📸 Try with example images")
696
+ gr.Examples(
697
+ examples=example_images,
698
+ inputs=input_image,
699
+ cache_examples=False
700
+ )
701
+
702
+ # Results section - full width below everything
703
+ with gr.Row():
704
+ with gr.Column(scale=1):
705
+ output_html = gr.HTML(
706
+ label="Classification Results",
707
+ elem_classes="output-html"
708
+ )
709
+
710
+ # Event handlers
711
+ analyze_btn.click(
712
+ fn=process_image,
713
+ inputs=[input_image, checkpoint_dropdown], # Pass dropdown value
714
+ outputs=[output_image, output_html]
715
+ )
716
+
717
+ checkpoint_dropdown.change(
718
+ fn=load_model_and_update_status,
719
+ inputs=[checkpoint_dropdown],
720
+ outputs=[model_status_text]
721
+ )
722
+
723
+
724
+ return demo
725
+
726
+ # === Main Application Startup ===
727
+
728
+ # Initialize for Hugging Face Spaces (module-level)
729
+ print("="*60)
730
+ print("VLM SOFT BIOMETRICS - GRADIO INTERFACE")
731
+ print("="*60)
732
+
733
+ # --- 1. Scan for models first ---
734
+ checkpoint_list, default_checkpoint = scan_checkpoints(CHECKPOINTS_DIR)
735
+
736
+ if not checkpoint_list:
737
+ print(f"CRITICAL: No checkpoints found in {CHECKPOINTS_DIR}. App may not function.")
738
+ else:
739
+ print(f"Found checkpoints: {len(checkpoint_list)} file(s).")
740
+ print(f"Default checkpoint: {default_checkpoint}")
741
+
742
+ # --- 2. Try to initialize default model ---
743
+ initial_status_msg = "No default model found. Please select one."
744
+ if default_checkpoint:
745
+ print(f"\nInitializing default model: {default_checkpoint}")
746
+ # This will load the model AND set current_ckpt_dir
747
+ initial_status_msg = load_model_and_update_status(default_checkpoint)
748
+ print(initial_status_msg)
749
+ else:
750
+ print("⚠ Warning: No default model to load.")
751
+
752
+
753
+ # --- 3. Create interface FIRST (so it shows even if model fails) ---
754
+ print("Creating Gradio interface...")
755
+ demo = create_interface(checkpoint_list, default_checkpoint, initial_status_msg)
756
+ print("✓ Interface created successfully!")
757
+
758
+
759
+ if __name__ == "__main__":
760
+ import argparse
761
+
762
+ parser = argparse.ArgumentParser(description="VLM Soft Biometrics - Gradio Interface")
763
+ parser.add_argument("--ckpt_dir", type=str, default="./checkpoints/",
764
+ help="Path to the checkpoint directory (overridden by UI)")
765
+ parser.add_argument("--detection_confidence", type=float, default=0.5,
766
+ help="Confidence threshold for face detection")
767
+ parser.add_argument("--port", type=int, default=7860,
768
+ help="Port to run the Gradio app")
769
+ parser.add_argument("--share", action="store_true",
770
+ help="Create a public share link")
771
+ parser.add_argument("--server_name", type=str, default="0.0.0.0",
772
+ help="Server name/IP to bind to")
773
+ args = parser.parse_args()
774
+
775
+ # Update global config if args are provided (though UI dropdown is primary)
776
+ CHECKPOINTS_DIR = args.ckpt_dir
777
+ # Note: args.detection_confidence is not forwarded here; models loaded via the UI use init_model's hard-coded default (0.5).
778
+
779
+ print(f"\nLaunching server on {args.server_name}:{args.port}")
780
+ print(f"Monitoring checkpoint directory: {CHECKPOINTS_DIR}")
781
+ print("="*60)
782
+
783
+ demo.launch(
784
+ share=args.share,
785
+ server_name=args.server_name,
786
+ server_port=args.port,
787
+ show_error=True,
788
+ )
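For reference, a minimal standalone sketch of the centroid-weighted age estimate computed by get_centroid_weighted_age in app.py above. The bucket centroids match the ones hard-coded in the app; the example probabilities are made up for illustration and would normally come from predict().

# Expected-age sketch mirroring get_centroid_weighted_age in app.py.
AGE_BUCKETS = ["0-2", "3-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70+"]
CENTROIDS = [1, 4.5, 14.5, 24.5, 34.5, 44.5, 54.5, 64.5, 80]

def centroid_weighted_age(probs_by_bucket: dict) -> float:
    """Return the expected age: sum_i p_i * centroid_i over the 9 age buckets."""
    return sum(probs_by_bucket[b] * c for b, c in zip(AGE_BUCKETS, CENTROIDS))

# Illustrative probabilities only.
example = {"0-2": 0.0, "3-9": 0.0, "10-19": 0.05, "20-29": 0.6,
           "30-39": 0.3, "40-49": 0.05, "50-59": 0.0, "60-69": 0.0, "70+": 0.0}
print(f"Estimated age: {centroid_weighted_age(example):.1f}")  # 28.0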
core/args.py ADDED
@@ -0,0 +1,72 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import logging
4
+ from typing import Type, TypeVar
5
+
6
+ from omegaconf import DictConfig, ListConfig, OmegaConf
7
+
8
+ logger = logging.getLogger()
9
+
10
+ T = TypeVar("T")
11
+
12
+
13
+ def set_struct_recursively(cfg, strict: bool = True):
14
+ # Set struct mode for the current level
15
+ OmegaConf.set_struct(cfg, strict)
16
+
17
+ # Traverse through nested dictionaries and lists
18
+ if isinstance(cfg, DictConfig):
19
+ for key, value in cfg.items():
20
+ if isinstance(value, (DictConfig, ListConfig)):
21
+ set_struct_recursively(value, strict)
22
+ elif isinstance(cfg, ListConfig):
23
+ for item in cfg:
24
+ if isinstance(item, (DictConfig, ListConfig)):
25
+ set_struct_recursively(item, strict)
26
+
27
+
28
+ def flatten_dict(d, parent_key="", sep="_"):
29
+ items = []
30
+ for k, v in d.items():
31
+ new_key = f"{parent_key}{sep}{k}" if parent_key else k
32
+ if isinstance(v, dict):
33
+ items.extend(flatten_dict(v, new_key, sep=sep).items())
34
+ else:
35
+ items.append((new_key, v))
36
+ return dict(items)
37
+
38
+
39
+ def dataclass_from_dict(cls: Type[T], data: dict, strict: bool = True) -> T:
40
+ """
41
+ Converts a dictionary to a dataclass instance, recursively for nested structures.
42
+ """
43
+ base = OmegaConf.structured(cls())
44
+ OmegaConf.set_struct(base, strict)
45
+ override = OmegaConf.create(data)
46
+ return OmegaConf.to_object(OmegaConf.merge(base, override))
47
+
48
+
49
+ def dataclass_to_dict(dataclass_instance: T) -> dict:
50
+ """
51
+ Converts a dataclass instance to a dictionary, recursively for nested structures.
52
+ """
53
+ if isinstance(dataclass_instance, dict):
54
+ return dataclass_instance
55
+
56
+ return OmegaConf.to_container(
57
+ OmegaConf.structured(dataclass_instance), resolve=True
58
+ )
59
+
60
+
61
+ def load_config_file(config_file, dataclass_cls: Type[T]) -> T:
62
+ config = OmegaConf.to_container(OmegaConf.load(config_file), resolve=True)
63
+ return dataclass_from_dict(dataclass_cls, config)
64
+
65
+
66
+ def dump_config(config, path, log_config=True):
67
+ yaml_dump = OmegaConf.to_yaml(OmegaConf.structured(config))
68
+ with open(path, "w") as f:
69
+ if log_config:
70
+ logger.info("Using the following config for this run:")
71
+ logger.info(yaml_dump)
72
+ f.write(yaml_dump)
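A small usage sketch of the config helpers in core/args.py above. The TrainConfig and OptimizerConfig dataclasses are hypothetical; only dataclass_from_dict and dump_config come from this file, and they rely on standard OmegaConf behaviour (structured configs, strict merge, to_object).

from dataclasses import dataclass, field

from core.args import dataclass_from_dict, dump_config


@dataclass
class OptimizerConfig:  # hypothetical nested config
    lr: float = 1e-4
    weight_decay: float = 0.01


@dataclass
class TrainConfig:  # hypothetical top-level config
    epochs: int = 10
    optimizer: OptimizerConfig = field(default_factory=OptimizerConfig)


# Override only the fields present in the dict; unknown keys raise in strict mode.
cfg = dataclass_from_dict(TrainConfig, {"epochs": 20, "optimizer": {"lr": 3e-4}})
assert cfg.epochs == 20 and cfg.optimizer.lr == 3e-4

dump_config(cfg, "config.yaml", log_config=False)  # writes the merged config as YAML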
core/checkpoint.py ADDED
@@ -0,0 +1,379 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ import re
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import List, Optional, Tuple
10
+
11
+ import torch
12
+ import torch.distributed as dist
13
+ import torch.distributed.checkpoint as dcp
14
+ import torch.nn as nn
15
+ import torch.optim.optimizer
16
+ from omegaconf import OmegaConf
17
+ from torch.distributed._tensor import DeviceMesh
18
+ from torch.distributed.checkpoint.format_utils import dcp_to_torch_save
19
+ from torch.distributed.checkpoint.state_dict import (get_model_state_dict,
20
+ get_state_dict,
21
+ set_state_dict)
22
+
23
+ from core.distributed import get_is_master
24
+
25
+ logger = logging.getLogger("CHECKPOINT")
26
+
27
+ FOLDER_NAME = "{:010d}"
28
+ RE_FOLDER = r"\d{10}"
29
+
30
+ RE_CKPT = r"__\d_\d\.distcp"
31
+
32
+ CONSOLIDATE_FOLDER = "consolidated"
33
+ CONSOLIDATE_NAME = "consolidated.pth"
34
+
35
+ CONFIG_NAME = "params.json"
36
+ TRAIN_STATE_NAME = "train_state_{:05d}.json"
37
+ RE_DIGITS = re.compile(r"\d+")
38
+
39
+
40
+ @dataclass
41
+ class SaveEvery:
42
+ every: int = 1000
43
+ keep: int = 0
44
+
45
+
46
+ @dataclass
47
+ class CheckpointArgs:
48
+ dump: SaveEvery = field(default_factory=SaveEvery)
49
+ eval: SaveEvery = field(default_factory=SaveEvery)
50
+ path: Optional[str] = None
51
+ init_ckpt_path: Optional[str] = None
52
+ vision_model_path: Optional[str] = None
53
+ is_consolidated_model: bool = False
54
+ continue_training_from_init: bool = False
55
+
56
+
57
+ def _get_key_step(name: str):
58
+ return int(re.findall(RE_DIGITS, name)[-1])
59
+
60
+
61
+ def consolidate_checkpoints(ckpt_dir: str):
62
+ """
63
+ Consolidates all FSDP checkpoints in a directory to a single file
64
+ Consolidate checkpoint is saved in a subdirectory of ckpt_dir
65
+
66
+ Parameters:
67
+ ckpt_dir: str - path to the directory containing the checkpoints
68
+
69
+ Returns the path to the consolidated checkpoint
70
+ """
71
+ consolidate_path = Path(ckpt_dir) / CONSOLIDATE_FOLDER
72
+ if not (consolidate_path / CONSOLIDATE_NAME).exists():
73
+ consolidate_path.mkdir(exist_ok=True)
74
+ logger.info(f"Consolidating to: {str(consolidate_path)}")
75
+ dcp_to_torch_save(ckpt_dir, str(consolidate_path / CONSOLIDATE_NAME))
76
+ (consolidate_path / CONFIG_NAME).write_text(
77
+ (Path(ckpt_dir) / CONFIG_NAME).read_text()
78
+ )
79
+ logger.info("Consolidated !")
80
+ return consolidate_path
81
+
82
+
83
+ def load_from_checkpoint(
84
+ ckpt_dir: str,
85
+ model: nn.Module,
86
+ optimizer: Optional[torch.optim.Optimizer] = None,
87
+ model_key: str = "model",
88
+ optim_key: str = "optim",
89
+ ):
90
+ if not (Path(ckpt_dir) / ".metadata").exists():
91
+ raise ValueError(
92
+ "Please convert the checkpoint distcp format using `torch.distributed.checkpoint.format_utils.torch_save_to_dcp` before loading it"
93
+ )
94
+
95
+ state_dict = {}
96
+ if optimizer is not None:
97
+ state_dict[model_key], state_dict[optim_key] = get_state_dict(model, optimizer)
98
+ else:
99
+ state_dict[model_key] = get_model_state_dict(model)
100
+ if model_key == "": # If only loading a model directly, the key should be empty
101
+ state_dict = state_dict.pop(model_key)
102
+
103
+ dcp.load(state_dict, checkpoint_id=ckpt_dir)
104
+
105
+
106
+ class CheckpointManager:
107
+ def __init__(self, args: CheckpointArgs):
108
+ self.path = args.path
109
+ self.dump_every = args.dump
110
+ self.eval_every = args.eval
111
+ self.init_ckpt_path = args.init_ckpt_path
112
+ self.continue_training_from_init = args.continue_training_from_init
113
+
114
+ assert os.path.exists(
115
+ self.path
116
+ ), f"Path {self.path} does not exist and needs to be created before using CheckpointManager (use instantiate_and_make_dir)"
117
+
118
+ self.existing_saves = self.get_existing_saves()
119
+
120
+ def get_existing_saves(self) -> List[Path]:
121
+ folders = [
122
+ p
123
+ for p in Path(self.path).iterdir()
124
+ if p.is_dir() and re.match(RE_FOLDER, p.name)
125
+ ]
126
+ folders.sort(key=lambda p: _get_key_step(p.name))
127
+ return folders
128
+
129
+ def clean_up(self):
130
+ logger.info("Cleaning up checkpoints...")
131
+ dump_folders = []
132
+ eval_folders = []
133
+ other_folders = []
134
+ for p in self.existing_saves:
135
+ is_dump = _get_key_step(p.name) % self.dump_every.every == 0
136
+ is_eval = _get_key_step(p.name) % self.eval_every.every == 0
137
+ if is_dump:
138
+ dump_folders.append(p)
139
+ if is_eval:
140
+ eval_folders.append(p)
141
+ if not (is_dump or is_eval):
142
+ other_folders.append(p)
143
+
144
+ logger.info(f"Dump folders: {dump_folders}")
145
+ logger.info(f"Eval folders: {eval_folders}")
146
+ logger.info(f"Other folders: {other_folders}")
147
+
148
+ if self.dump_every.keep > 0:
149
+ dump_folders = dump_folders[-self.dump_every.keep :]
150
+ if self.eval_every.keep > 0:
151
+ eval_folders = eval_folders[-self.eval_every.keep :]
152
+
153
+ folder_to_keep = set(other_folders + dump_folders + eval_folders)
154
+ folder_to_remove = set(self.existing_saves) - folder_to_keep
155
+
156
+ logger.info(f"Removing folders: {folder_to_remove}")
157
+
158
+ if dist.get_rank() == 0:
159
+ for folder in folder_to_remove:
160
+ for file in folder.iterdir():
161
+ if file.is_file():
162
+ file.unlink()
163
+ elif file.is_dir():
164
+ assert file.name in [CONSOLIDATE_FOLDER]
165
+ for f in file.iterdir():
166
+ f.unlink()
167
+ file.rmdir()
168
+ folder.rmdir()
169
+
170
+ dist.barrier()
171
+
172
+ self.existing_saves = list(folder_to_keep)
173
+ self.existing_saves.sort(key=lambda p: _get_key_step(p.name))
174
+
175
+ def get_last_step_path(self, dp_rank: int = 0) -> Optional[Path]:
176
+ path = None
177
+ for p in reversed(self.existing_saves):
178
+ if (p / TRAIN_STATE_NAME.format(dp_rank)).is_file():
179
+ path = p
180
+ break
181
+ return path
182
+
183
+ def _create_folder(self, base_path: Path, folder_name: str) -> Path:
184
+ folder = base_path / folder_name
185
+ if get_is_master():
186
+ folder.mkdir(parents=False, exist_ok=True)
187
+ if dist.is_initialized():
188
+ dist.barrier()
189
+ return folder
190
+
191
+ def _get_dp_tp_mesh(
192
+ self, device_mesh: Optional[DeviceMesh] = None
193
+ ) -> Tuple[int, int]:
194
+ dp_rank = 0
195
+ tp_rank = 0
196
+ if device_mesh is not None:
197
+ if "dp_replicate" in device_mesh.mesh_dim_names:
198
+ dp_rank = device_mesh.get_local_rank("dp_replicate")
199
+ if "dp_shard" in device_mesh.mesh_dim_names:
200
+ dp_rank = dp_rank * device_mesh[
201
+ "dp_replicate"
202
+ ].size() + device_mesh.get_local_rank("dp_shard")
203
+ if "tp" in device_mesh.mesh_dim_names:
204
+ tp_rank = device_mesh.get_local_rank("tp")
205
+ return dp_rank, tp_rank
206
+
207
+ @torch.no_grad()
208
+ def get_state_dict(
209
+ self,
210
+ model,
211
+ optimizer,
212
+ ):
213
+ model_sd, optim_sd = get_state_dict(model, optimizer)
214
+ return {"model": model_sd, "optim": optim_sd}
215
+
216
+ def save(
217
+ self,
218
+ model,
219
+ optimizer,
220
+ train_state,
221
+ config,
222
+ device_mesh: Optional[DeviceMesh] = None,
223
+ ) -> bool:
224
+
225
+ # When creating directory check if only rank0 or is there other solution
226
+ path = Path(self.path)
227
+ curr_save_dir = self._create_folder(path, FOLDER_NAME.format(train_state.step))
228
+ logger.info(f"Saving to: {str(curr_save_dir)}")
229
+
230
+ if dist.is_initialized():
231
+ dist.barrier()
232
+
233
+ logger.info("Saving...")
234
+ state_dict = self.get_state_dict(model, optimizer)
235
+ dcp.save(state_dict, checkpoint_id=curr_save_dir)
236
+ logger.info("State dict saved!")
237
+
238
+ if dist.is_initialized():
239
+ dist.barrier()
240
+
241
+ if get_is_master():
242
+ with open(curr_save_dir / CONFIG_NAME, "w") as f:
243
+ json.dump(
244
+ OmegaConf.to_container(OmegaConf.structured(config), resolve=True),
245
+ f,
246
+ indent=4,
247
+ )
248
+
249
+ # Add json dump here
250
+ dp_rank, tp_rank = self._get_dp_tp_mesh(device_mesh)
251
+ if tp_rank == 0:
252
+ train_state_name = TRAIN_STATE_NAME.format(dp_rank)
253
+ logger.info(
254
+ f"Saving train state to: {str(curr_save_dir / train_state_name)}"
255
+ )
256
+ # logger.info(f"train_state.state_dict()={train_state.state_dict()}")
257
+ with open(curr_save_dir / train_state_name, "w") as f:
258
+ json.dump(train_state.state_dict(), f)
259
+ logger.info("Train state saved !")
260
+
261
+ self.existing_saves.append(curr_save_dir)
262
+
263
+ self.clean_up()
264
+
265
+ if dist.is_initialized():
266
+ dist.barrier()
267
+ return True
268
+
269
+ @torch.no_grad()
270
+ def load(
271
+ self,
272
+ model: nn.Module,
273
+ optimizer,
274
+ train_state,
275
+ device_mesh: DeviceMesh,
276
+ path: Optional[Path] = None,
277
+ ):
278
+ dp_rank, tp_rank = self._get_dp_tp_mesh(device_mesh)
279
+ # Loading tries to load the provided path, if not available the last saved step and finally from the init path
280
+ path = path or self.get_last_step_path(dp_rank=dp_rank)
281
+ # If none of those are available don't do anything
282
+ if path is None:
283
+ # If no checkpoints exist do nothing
284
+ return
285
+
286
+ # Only load train state if it's provided, the files exist and we're not loading from init path
287
+ train_state_name = TRAIN_STATE_NAME.format(dp_rank)
288
+ logger.info("Reloading train state")
289
+ with open(path / train_state_name, "r") as f:
290
+ train_state_dict = json.load(f)
291
+ train_state.load_state_dict(train_state_dict)
292
+ logger.info("Train state reloaded")
293
+
294
+ logger.info(f"Loading from: {str(path)}")
295
+ state_dict = self.get_state_dict(
296
+ model=model,
297
+ optimizer=optimizer,
298
+ )
299
+ dcp.load(state_dict, checkpoint_id=path)
300
+ logger.info("State dict loaded.")
301
+
302
+ logger.info("Reloading model and optim")
303
+
304
+ set_state_dict(
305
+ model,
306
+ optimizer,
307
+ model_state_dict=state_dict["model"],
308
+ optim_state_dict=state_dict["optim"],
309
+ )
310
+ logger.info("Model and optim reloaded")
311
+
312
+ @classmethod
313
+ def instantiate_and_make_dir(cls, args: CheckpointArgs):
314
+ if get_is_master():
315
+ os.makedirs(args.path, exist_ok=True)
316
+ dist.barrier()
317
+
318
+ return cls(args)
319
+
320
+
321
+ def get_consolidated_ckpt_path(ckpt_dir: Path, mp_rank: int = 0, mp_size: int = 1):
322
+ if mp_size == 1:
323
+ assert mp_rank == 0
324
+ no_rank_path = ckpt_dir / "consolidated.pth"
325
+ if no_rank_path.exists():
326
+ return no_rank_path
327
+ return ckpt_dir / f"consolidated.{mp_rank:02d}.pth"
328
+
329
+
330
+ def load_consolidated_checkpoint(
331
+ model: nn.Module,
332
+ consolidated_path: str,
333
+ vision_model_path: Optional[str] = None,
334
+ ):
335
+ """
336
+ Loads a consolidated checkpoint into the model.
337
+ This version supports both:
338
+ - a single file named 'consolidated.pth'
339
+ - multiple parts named like 'consolidated.00.pth', 'consolidated.01.pth', etc.
340
+ """
341
+ ckpt_path = Path(consolidated_path)
342
+ cp_file = get_consolidated_ckpt_path(ckpt_path, mp_rank=0, mp_size=1)
343
+ if cp_file.exists():
344
+ # Use the single file
345
+ st_dict = torch.load(cp_file, weights_only=True)
346
+ if "model" in st_dict:
347
+ st_dict = st_dict["model"]
348
+ else:
349
+ # Fall back to multi-part consolidated files (e.g. consolidated.00.pth, consolidated.01.pth, …)
350
+ checkpoint_files = sorted(ckpt_path.glob("consolidated.*.pth"))
351
+ if not checkpoint_files:
352
+ raise FileNotFoundError(
353
+ f"No consolidated checkpoint file found in {ckpt_path}."
354
+ )
355
+ st_dict = {}
356
+ for ckpt_file in checkpoint_files:
357
+ part = torch.load(ckpt_file, weights_only=True)
358
+ # If the checkpoint part is wrapped with "model", unwrap it
359
+ if "model" in part:
360
+ part = part["model"]
361
+ # Merge the state dicts (assumes the keys are all unique or will correctly overwrite)
362
+ st_dict.update(part)
363
+
364
+ model.vision_projector.init_tensors()
365
+ model.vision_model.init_tensors()
366
+ model.rope_embeddings.reset_parameters()
367
+
368
+ if vision_model_path is not None:
369
+ model.vision_model.load_ckpt(vision_model_path)
370
+
371
+ missing_keys, unexpected_keys = model.load_state_dict(st_dict, strict=False)
372
+ missing_keys = [k for k in missing_keys if "tied_module.weight" not in k]
373
+ if vision_model_path is not None:
374
+ # vision_model is already loaded separately
375
+ missing_keys = [k for k in missing_keys if "vision_model." not in k]
376
+ if len(missing_keys) > 0:
377
+ logger.warning(f"Missing keys when reloading: {missing_keys}")
378
+ if len(unexpected_keys) > 0:
379
+ logger.warning(f"Unexpected keys when reloading: {unexpected_keys}")
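A rough single-process sketch of how the consolidation helpers above might be used to turn a DCP checkpoint folder into a plain torch state dict. The path is a placeholder, and the structure of the saved dict ({"model", "optim"}) follows CheckpointManager.get_state_dict.

import torch

from core.checkpoint import CONSOLIDATE_NAME, consolidate_checkpoints

ckpt_dir = "./dump/0000001000"  # placeholder: a step folder written by CheckpointManager.save
consolidated_dir = consolidate_checkpoints(ckpt_dir)  # writes consolidated/consolidated.pth + params.json
state = torch.load(consolidated_dir / CONSOLIDATE_NAME, map_location="cpu", weights_only=True)
model_sd = state.get("model", state)  # dcp_to_torch_save preserves the saved {"model", "optim"} layout
# model.load_state_dict(model_sd, strict=False)  # then load into your nn.Module as needed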
core/transformer.py ADDED
@@ -0,0 +1,646 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import math
4
+ from dataclasses import dataclass
5
+ from enum import Enum
6
+ from typing import Optional, Tuple, Union
7
+
8
+ import torch
9
+ from torch import nn
10
+ from torch.nn import functional as F
11
+ from torch.nn.attention.flex_attention import (BlockMask, _mask_mod_signature,
12
+ flex_attention)
13
+ from xformers.ops import AttentionBias, fmha
14
+
15
+ from core import probe
16
+
17
+
18
+ class InitStdFactor(Enum):
19
+ DISABLED = "disabled" # Init std is divided by 1.0
20
+ GLOBAL_DEPTH = "global_depth" # Init std is divided by sqrt(2*n_layers)
21
+ CURRENT_DEPTH = "current_depth" # Init std is divided by sqrt(2*depth)
22
+ DIM_RATIO = "dim_ratio" # Init std is divided by model_dim/4096
23
+
24
+
25
+ @dataclass
26
+ class BaseTransformerArgs:
27
+ dim: int = 512
28
+ n_layers: int = 8
29
+ head_dim: Optional[int] = None
30
+ n_heads: Optional[int] = None
31
+ n_kv_heads: Optional[int] = None
32
+
33
+ ffn_dim_multiplier: Optional[float] = None
34
+
35
+ multiple_of: int = 256
36
+
37
+ norm_eps: float = 1e-5
38
+
39
+ rope_theta: float = 10000.0
40
+
41
+ old_context_len: int = 8192
42
+ rope_scale_factor: int = 1
43
+ low_freq_factor: int = 1
44
+ high_freq_factor: int = 32
45
+
46
+ init_base_std: Optional[float] = None
47
+ init_std_factor: str = "disabled"
48
+
49
+ max_seqlen: int = 1024
50
+
51
+
52
+ def cross_entropy(pred, target, **kwargs):
53
+ return F.nll_loss(
54
+ F.log_softmax(pred.flatten(end_dim=-2).float(), -1),
55
+ target.flatten(end_dim=-1),
56
+ **kwargs,
57
+ )
58
+
59
+
60
+ def repeat_kv(x: torch.Tensor, n_rep: int, dim: int) -> torch.Tensor:
61
+ """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
62
+ assert dim == 2, "Only dim=2 is supported. Check the implementation for other dims."
63
+ bs, slen, n_kv_heads, head_dim = x.shape
64
+ if n_rep == 1:
65
+ return x
66
+ return (
67
+ x[:, :, :, None, :]
68
+ .expand(bs, slen, n_kv_heads, n_rep, head_dim)
69
+ .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
70
+ )
71
+
72
+
73
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor, seq_dim: int):
74
+ """
75
+ Reshape frequency tensor for broadcasting it with another tensor.
76
+
77
+ This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
78
+ for the purpose of broadcasting the frequency tensor during element-wise operations.
79
+
80
+ Args:
81
+ freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
82
+ x (torch.Tensor): Target tensor for broadcasting compatibility.
83
+ seq_dim (int): Sequence dimension index.
84
+
85
+ Returns:
86
+ torch.Tensor: Reshaped frequency tensor.
87
+ """
88
+ ndim = x.ndim
89
+ assert 0 <= seq_dim < ndim
90
+ assert freqs_cis.shape == (
91
+ x.shape[seq_dim],
92
+ x.shape[-3],
93
+ 2,
94
+ 2,
95
+ ), f"freqs_cis vs x: {(freqs_cis.shape, x.shape)}"
96
+ shape = [
97
+ d if i == seq_dim or i == ndim - 3 else 1 for i, d in enumerate(x.shape[:-2])
98
+ ] + [2, 2]
99
+ return freqs_cis.view(*shape)
100
+
101
+
102
+ def apply_rotary_emb(
103
+ xq: torch.Tensor,
104
+ xk: torch.Tensor,
105
+ seq_dim: int,
106
+ freqs_cis: torch.Tensor,
107
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
108
+ xq_ = xq.reshape(*xq.shape[:-1], -1, 1, 2) # B S H D -> B S H D/2 1 2
109
+ xk_ = xk.reshape(*xk.shape[:-1], -1, 1, 2) # B S H D -> B S H D/2 1 2
110
+ freqs_cis = reshape_for_broadcast(
111
+ freqs_cis, xq_, seq_dim
112
+ ).float() # S D/2 2 2 -> 1 S 1 D/2 2 2
113
+ xq_out = (xq_ * freqs_cis).sum(5).flatten(3)
114
+ xk_out = (xk_ * freqs_cis).sum(5).flatten(3)
115
+ return xq_out.type_as(xq), xk_out.type_as(xk)
116
+
117
+
118
+ def causal_mask(b, h, q_idx, kv_idx):
119
+ return q_idx >= kv_idx
120
+
121
+
122
+ def lengths_to_start_ids(lengths):
123
+ doc_start = lengths.cumsum(0)
124
+ doc_start = doc_start.roll(1)
125
+ doc_start[0] = 0
126
+ return doc_start
127
+
128
+
129
+ def lengths_to_local_ids(lengths):
130
+ assert lengths.ndim == 1
131
+ nb_seqs = lengths.size(0)
132
+ total_seqlen = lengths.sum()
133
+ # This gives the document id of each token
134
+ doc_id = torch.repeat_interleave(lengths)
135
+ # Compute document start for each document
136
+ doc_start = lengths_to_start_ids(lengths)
137
+ # Compute document start for each token
138
+ doc_start = doc_start[doc_id]
139
+ # Compute the position of each token within each document
140
+ tok_id = torch.arange(total_seqlen, device=lengths.device) - doc_start
141
+
142
+ return doc_id, tok_id
143
+
144
+
145
+ def generate_doc_mask_mod(
146
+ mask_mod: _mask_mod_signature,
147
+ lengths: torch.Tensor,
148
+ kv_lengths: Optional[torch.Tensor] = None,
149
+ ) -> _mask_mod_signature:
150
+ """Generates mask mods that apply to inputs to flex attention in the sequence stacked
151
+ format.
152
+
153
+ Args:
154
+ mask_mod: The mask mod to apply to the documents
155
+ lengths: Lengths of each document
156
+
157
+ Note:
158
+ What is the sequence stacked format? When assembling batches of inputs, we
159
+ take multiple sequences and stack them together to form 1 large sequence. We then
160
+ use masking to ensure that the attention scores are only applied to tokens within
161
+ the same document.
162
+
163
+ Example:
164
+
165
+ - Square mask
166
+ doc_mask lengths
167
+ a a b b b c c 2 3 2
168
+ a 1 0 0 0 0 0 0
169
+ a 1 1 0 0 0 0 0
170
+ b 0 0 1 0 0 0 0
171
+ b 0 0 1 1 0 0 0
172
+ b 0 0 1 1 1 0 0
173
+ c 0 0 0 0 0 1 0
174
+ c 0 0 0 0 0 1 1
175
+
176
+ """
177
+ kv_lengths = kv_lengths if kv_lengths is not None else lengths
178
+ q_document_id, q_token_id = lengths_to_local_ids(lengths)
179
+ kv_document_id, kv_token_id = lengths_to_local_ids(kv_lengths)
180
+ q_max_idx = lengths.sum() - 1
181
+ kv_max_idx = kv_lengths.sum() - 1
182
+
183
+ def doc_mask_mod(b, h, q_idx, kv_idx):
184
+ q_idx_cap = torch.minimum(q_max_idx, q_idx)
185
+ kv_idx_cap = torch.minimum(kv_max_idx, kv_idx)
186
+ valid_idx = (q_idx <= q_max_idx) & (kv_idx <= kv_max_idx)
187
+ same_doc = q_document_id[q_idx_cap] == kv_document_id[kv_idx_cap]
188
+ q_logical = q_token_id[q_idx_cap]
189
+ kv_logical = kv_token_id[kv_idx_cap]
190
+ inner_mask = mask_mod(b, h, q_logical, kv_logical)
191
+ return same_doc & inner_mask & valid_idx
192
+
193
+ return doc_mask_mod
194
+
195
+
196
+ # Rotary embedding as in xformers; check whether the torchtrain implementation is better. It might also be useful to make this work with batch*seqlen collapsed.
197
+ class RotaryEmbedding(torch.nn.Module):
198
+ """
199
+ RotaryEmbedding Module
200
+ """
201
+
202
+ def __init__(
203
+ self,
204
+ theta: float,
205
+ head_dim: int,
206
+ max_seqlen: int = 1024,
207
+ scale_factor: int = 1,
208
+ low_freq_factor: int = 1,
209
+ high_freq_factor: int = 32,
210
+ old_context_len: int = 8192,
211
+ ):
212
+ super().__init__()
213
+
214
+ self.theta = theta
215
+ self.head_dim = head_dim
216
+ self.max_seqlen = max_seqlen
217
+ self.scale_factor = scale_factor
218
+ self.low_freq_factor = low_freq_factor
219
+ self.high_freq_factor = high_freq_factor
220
+ self.old_context_len = old_context_len
221
+ if scale_factor != 1:
222
+ self.low_freq_wavelen = old_context_len / low_freq_factor
223
+ self.high_freq_wavelen = old_context_len / high_freq_factor
224
+ assert self.low_freq_wavelen >= self.high_freq_wavelen
225
+
226
+ def reset_parameters(self):
227
+ self.register_buffer(
228
+ "freqs_cis",
229
+ self.precompute_freqs_cis(
230
+ dim=self.head_dim, end=self.max_seqlen, theta=self.theta
231
+ ),
232
+ persistent=False,
233
+ )
234
+
235
+ def apply_scaling(self, freqs):
236
+ if self.scale_factor == 1:
237
+ return freqs
238
+ new_freqs = []
239
+ for freq in freqs:
240
+ wavelen = 2 * math.pi / freq
241
+ if wavelen < self.high_freq_wavelen:
242
+ new_freqs.append(freq)
243
+ elif wavelen > self.low_freq_wavelen:
244
+ new_freqs.append(freq / self.scale_factor)
245
+ else:
246
+ assert self.low_freq_wavelen != self.high_freq_wavelen
247
+ smooth = (self.old_context_len / wavelen - self.low_freq_factor) / (
248
+ self.high_freq_factor - self.low_freq_factor
249
+ )
250
+ new_freqs.append(
251
+ (1 - smooth) * freq / self.scale_factor + smooth * freq
252
+ )
253
+ return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)
254
+
255
+ def precompute_freqs_cis(
256
+ self,
257
+ dim: int,
258
+ end: int,
259
+ theta: float = 10000.0,
260
+ ):
261
+ """
262
+ Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
263
+
264
+ This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
265
+ and the end index 'end'. The 'theta' parameter scales the frequencies.
266
+ The returned tensor contains complex values in complex64 data type.
267
+
268
+ Args:
269
+ dim (int): Dimension of the frequency tensor.
270
+ end (int): End index for precomputing frequencies.
271
+ theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
272
+
273
+ Returns:
274
+ torch.Tensor: Precomputed frequency tensor with complex exponentials.
275
+ """
276
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
277
+ freqs = self.apply_scaling(freqs)
278
+
279
+ t = torch.arange(end, device=freqs.device)
280
+ freqs = torch.outer(t, freqs).float()
281
+
282
+ cos, sin = freqs.cos(), freqs.sin()
283
+
284
+ return torch.stack((cos, -sin, sin, cos), dim=-1).view(*freqs.size(), 2, 2)
285
+
286
+ def forward(
287
+ self, seqlen: Optional[int] = None, tok_idx: Optional[torch.Tensor] = None
288
+ ):
289
+ """
290
+ Return freqs_cis corresponding to consecutive seqlen positions or the corresponding tok_idx positions
291
+ Args:
292
+ seqlen (int): Contiguous sequence length
293
+ tok_idx (torch.Tensor[int]): Position indices of each token; if provided, this overrides seqlen
294
+
295
+ Returns:
296
+ torch.Tensor: freqs_cis for the requested positions
297
+ """
298
+ test = (seqlen is not None) or (tok_idx is not None)
299
+ assert test, "Should provide at least seqlen or tok_idx"
300
+ if tok_idx is not None:
301
+ return self.freqs_cis[tok_idx]
302
+ elif seqlen is not None:
303
+ return self.freqs_cis[0:seqlen]
304
+
305
+
306
+ class RMSNorm(nn.Module):
307
+ """
308
+ Initialize the RMSNorm normalization layer.
309
+
310
+ Args:
311
+ dim (int): The dimension of the input tensor.
312
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
313
+
314
+ Attributes:
315
+ eps (float): A small value added to the denominator for numerical stability.
316
+ weight (nn.Parameter): Learnable scaling parameter.
317
+
318
+ """
319
+
320
+ def __init__(self, dim: int, eps: float = 1e-6):
321
+ super().__init__()
322
+ self.eps = eps
323
+ self.weight = nn.Parameter(torch.ones(dim))
324
+
325
+ def _norm(self, x: torch.Tensor):
326
+ return x * torch.rsqrt((x * x).mean(-1, keepdim=True) + self.eps)
327
+
328
+ def forward(self, x: torch.Tensor):
329
+ x = probe.log_stats(x, "resid")
330
+ output = self._norm(x.float())
331
+ return (output * self.weight.float()).type_as(x)
332
+
333
+ def reset_parameters(self):
334
+ torch.nn.init.ones_(self.weight) # type: ignore
335
+
336
+
337
+ class TiedLinear(nn.Module):
338
+ def __init__(self, tied_module: nn.Module) -> None:
339
+ super().__init__()
340
+ self.tied_module = tied_module
341
+ if not hasattr(tied_module, "weight"):
342
+ raise AttributeError(
343
+ "Provided module does not have attribute 'weight'. Please check your tied_module."
344
+ )
345
+
346
+ def __call__(self, x: torch.Tensor) -> torch.Tensor:
347
+ return F.linear(x, self.tied_module.weight)
348
+
349
+
350
+ class Attention(nn.Module):
351
+ def __init__(
352
+ self,
353
+ dim: int,
354
+ head_dim: int,
355
+ n_heads: int,
356
+ n_kv_heads: int,
357
+ rope_theta: float,
358
+ ):
359
+ super().__init__()
360
+
361
+ self.dim = dim
362
+ self.head_dim = head_dim
363
+ self.rope_theta = rope_theta
364
+
365
+ self.n_heads = n_heads
366
+ self.n_kv_heads = n_kv_heads
367
+ self.heads_per_group = self.n_heads // self.n_kv_heads
368
+
369
+ self.wq = nn.Linear(
370
+ dim,
371
+ n_heads * head_dim,
372
+ bias=False,
373
+ )
374
+ self.wk = nn.Linear(
375
+ dim,
376
+ n_kv_heads * head_dim,
377
+ bias=False,
378
+ )
379
+ self.wv = nn.Linear(
380
+ dim,
381
+ n_kv_heads * head_dim,
382
+ bias=False,
383
+ )
384
+
385
+ self.wo = nn.Linear(
386
+ n_heads * head_dim,
387
+ dim,
388
+ bias=False,
389
+ )
390
+
391
+ def forward(
392
+ self,
393
+ x: torch.Tensor,
394
+ freq_cis: torch.Tensor,
395
+ tok_idx: Optional[torch.Tensor] = None,
396
+ mask: Optional[Union[BlockMask, AttentionBias, str]] = None,
397
+ attn_impl: str = "sdpa",
398
+ ) -> torch.Tensor:
399
+ # B S D
400
+ bsz, seq_len, dim = x.shape
401
+ xq = self.wq(x.view_as(x))
402
+ xk = self.wk(x.view_as(x))
403
+ xv = self.wv(x.view_as(x))
404
+
405
+ output_shape = xq.shape
406
+ # B S D -> B S H D
407
+ xq = xq.view(bsz, seq_len, self.n_heads, self.head_dim)
408
+ xk = xk.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
409
+ xv = xv.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
410
+
411
+ xq, xk = apply_rotary_emb(xq, xk, 1, freq_cis[0:seq_len])
412
+
413
+ # This condition helps us be easily compatible
414
+ # with inference by adding a pluggable KVCache
415
+ if hasattr(self, "kv_cache"):
416
+ xk, xv = self.kv_cache.update(xk, xv, tok_idx)
417
+
418
+ xk = repeat_kv(xk, self.heads_per_group, dim=2)
419
+ xv = repeat_kv(xv, self.heads_per_group, dim=2)
420
+
421
+ if attn_impl == "flex_attention":
422
+ assert mask is None or isinstance(mask, BlockMask)
423
+ xq, xk, xv = map(lambda e: e.transpose(1, 2), (xq, xk, xv))
424
+ output = flex_attention(xq, xk, xv, block_mask=mask)
425
+ output = output.transpose(1, 2).contiguous() # B H S D -> B S H D
426
+
427
+ elif attn_impl == "fmha":
428
+ assert mask is None or isinstance(mask, AttentionBias)
429
+ output = fmha.memory_efficient_attention(xq, xk, xv, attn_bias=mask)
430
+ # This uses B S H D instead of B H S D of pytorch
431
+
432
+ elif attn_impl == "sdpa":
433
+ xq, xk, xv = map(lambda e: e.transpose(1, 2), (xq, xk, xv))
434
+ assert mask is None or isinstance(mask, (str, torch.Tensor))
435
+ is_causal = (mask == "causal") if isinstance(mask, str) else False
436
+ mask = mask if isinstance(mask, torch.Tensor) else None
437
+ output = F.scaled_dot_product_attention(
438
+ xq,
439
+ xk,
440
+ xv,
441
+ is_causal=is_causal,
442
+ attn_mask=mask,
443
+ )
444
+ output = output.transpose(1, 2).contiguous() # B H S D -> B S H D
445
+ else:
446
+ raise NotImplementedError(
447
+ f"Attention implementation {attn_impl} not supported"
448
+ )
449
+
450
+ output = self.wo(output.reshape(output_shape))
451
+
452
+ return output
453
+
454
+ def reset_parameters(self, init_std=None, factor=1.0):
455
+ init_std = init_std or (self.dim ** (-0.5))
456
+
457
+ for w in [self.wq, self.wk, self.wv]:
458
+ nn.init.trunc_normal_(
459
+ w.weight,
460
+ mean=0.0,
461
+ std=init_std,
462
+ a=-3 * init_std,
463
+ b=3 * init_std,
464
+ )
465
+
466
+ nn.init.trunc_normal_(
467
+ self.wo.weight,
468
+ mean=0.0,
469
+ std=init_std / factor,
470
+ a=-3 * init_std,
471
+ b=3 * init_std,
472
+ )
473
+
474
+
475
+ class FeedForward(nn.Module):
476
+ def __init__(
477
+ self,
478
+ dim: int,
479
+ hidden_dim: int,
480
+ multiple_of: int,
481
+ ffn_dim_multiplier: Optional[float],
482
+ mp_size: int = 1,
483
+ ):
484
+ super().__init__()
485
+
486
+ hidden_dim = int(2 * hidden_dim / 3)
487
+ if ffn_dim_multiplier is not None:
488
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
489
+ hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
490
+ assert hidden_dim % mp_size == 0
491
+
492
+ self.dim = dim
493
+ self.hidden_dim = hidden_dim
494
+
495
+ self.w1 = nn.Linear(
496
+ dim,
497
+ hidden_dim,
498
+ bias=False,
499
+ )
500
+ self.w3 = nn.Linear(
501
+ dim,
502
+ hidden_dim,
503
+ bias=False,
504
+ )
505
+ self.w2 = nn.Linear(
506
+ hidden_dim,
507
+ dim,
508
+ bias=False,
509
+ )
510
+
511
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
512
+ # B S D
513
+ x1 = self.w1(x.view_as(x))
514
+ x3 = self.w3(x.view_as(x))
515
+ output = self.w2(F.silu(x1) * x3)
516
+ return output
517
+
518
+ def reset_parameters(self, init_std=None, factor=1.0):
519
+ in_init_std = init_std or (self.dim ** (-0.5))
520
+ out_init_std = init_std or (self.hidden_dim ** (-0.5))
521
+ in_init_std = in_init_std
522
+ out_init_std = out_init_std / factor
523
+ for w in [self.w1, self.w3]:
524
+ nn.init.trunc_normal_(
525
+ w.weight,
526
+ mean=0.0,
527
+ std=in_init_std,
528
+ a=-3 * in_init_std,
529
+ b=3 * in_init_std,
530
+ )
531
+ nn.init.trunc_normal_(
532
+ self.w2.weight,
533
+ mean=0.0,
534
+ std=out_init_std,
535
+ a=-3 * out_init_std,
536
+ b=3 * out_init_std,
537
+ )
538
+
539
+
540
+ class TransformerBlock(nn.Module):
541
+ def __init__(self, args: BaseTransformerArgs):
542
+ super().__init__()
543
+
544
+ assert (args.head_dim is not None) or (
545
+ args.n_heads is not None
546
+ ), "Should specify at least head_dim or n_heads"
547
+ self.head_dim = args.head_dim or args.dim // args.n_heads
548
+ self.n_heads = args.n_heads or args.dim // args.head_dim
549
+ self.n_kv_heads = args.n_kv_heads or self.n_heads
550
+
551
+ assert args.n_heads % self.n_kv_heads == 0
552
+ assert args.dim % args.n_heads == 0
553
+
554
+ self.attention = Attention(
555
+ dim=args.dim,
556
+ head_dim=self.head_dim,
557
+ n_heads=self.n_heads,
558
+ n_kv_heads=self.n_kv_heads,
559
+ rope_theta=args.rope_theta,
560
+ )
561
+ self.feed_forward = FeedForward(
562
+ dim=args.dim,
563
+ hidden_dim=4 * args.dim,
564
+ multiple_of=args.multiple_of,
565
+ ffn_dim_multiplier=args.ffn_dim_multiplier,
566
+ )
567
+ self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
568
+ self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
569
+
570
+ def forward(
571
+ self,
572
+ x: torch.Tensor,
573
+ freq_cis: torch.Tensor,
574
+ tok_idx: Optional[torch.Tensor] = None,
575
+ mask: Optional[Union[BlockMask, AttentionBias, str]] = None,
576
+ attn_impl: str = "sdpa",
577
+ ) -> torch.Tensor:
578
+
579
+ h = x + self.attention(
580
+ self.attention_norm(x),
581
+ freq_cis,
582
+ tok_idx=tok_idx,
583
+ mask=mask,
584
+ attn_impl=attn_impl,
585
+ )
586
+ out = h + self.feed_forward(self.ffn_norm(h))
587
+ return out
588
+
589
+ def init_weights(self, init_std=None, factor=1.0):
590
+ self.attention.reset_parameters(init_std, factor)
591
+ self.attention_norm.reset_parameters()
592
+
593
+ self.feed_forward.reset_parameters(init_std, factor)
594
+ self.ffn_norm.reset_parameters()
595
+
596
+
597
+ class BaseTransformer(nn.Module):
598
+ def __init__(self, args: BaseTransformerArgs):
599
+ super().__init__()
600
+ self.dim = args.dim
601
+ self.init_base_std = args.init_base_std
602
+ self.init_std_factor = InitStdFactor(args.init_std_factor)
603
+ self.max_seqlen = args.max_seqlen
604
+ self.rope_embeddings = RotaryEmbedding(
605
+ theta=args.rope_theta,
606
+ head_dim=args.head_dim or args.dim // args.n_heads,
607
+ max_seqlen=args.max_seqlen,
608
+ scale_factor=args.rope_scale_factor,
609
+ low_freq_factor=args.low_freq_factor,
610
+ high_freq_factor=args.high_freq_factor,
611
+ old_context_len=args.old_context_len,
612
+ )
613
+
614
+ self.layers = nn.ModuleList()
615
+ for _ in range(args.n_layers):
616
+ self.layers.append(TransformerBlock(args))
617
+
618
+ def forward(
619
+ self,
620
+ h,
621
+ tok_idx: Optional[torch.Tensor] = None,
622
+ mask: Optional[Union[BlockMask, AttentionBias, str]] = None,
623
+ attn_impl: str = "sdpa",
624
+ ):
625
+
626
+ freq_cis = self.rope_embeddings(seqlen=self.max_seqlen, tok_idx=tok_idx)
627
+
628
+ for i, layer in enumerate(self.layers):
629
+ h = layer(h, freq_cis, tok_idx=tok_idx, mask=mask, attn_impl=attn_impl)
630
+ return h
631
+
632
+ def reset_parameters(self):
633
+ # Either use fixed base std or sqrt model dim
634
+ self.rope_embeddings.reset_parameters()
635
+
636
+ def init_weights(self):
637
+ self.reset_parameters()
638
+ for depth, layer in enumerate(self.layers):
639
+ factor = {
640
+ InitStdFactor.CURRENT_DEPTH: (2 * (depth + 1)) ** 0.5,
641
+ InitStdFactor.GLOBAL_DEPTH: (2 * (len(self.layers) + 1)) ** 0.5,
642
+ InitStdFactor.DIM_RATIO: self.dim / 4096,
643
+ InitStdFactor.DISABLED: 1.0,
644
+ }[self.init_std_factor]
645
+
646
+ layer.init_weights(self.init_base_std, factor)
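
The freqs_cis buffer built by precompute_freqs_cis above stores one 2x2 rotation matrix per (position, frequency) pair rather than a complex tensor. A standalone sketch of that layout, mirroring the code above with illustrative values (head_dim=64, max_seqlen=16, theta=10000, no scaling) and without importing the repo:

import torch

dim, end, theta = 64, 16, 10000.0
# Inverse frequencies for every pair of channels, as in precompute_freqs_cis.
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: dim // 2].float() / dim))
angles = torch.outer(torch.arange(end), freqs).float()
cos, sin = angles.cos(), angles.sin()
# Stack [cos, -sin, sin, cos] and view the result as 2x2 rotation matrices.
freqs_cis = torch.stack((cos, -sin, sin, cos), dim=-1).view(end, dim // 2, 2, 2)
print(freqs_cis.shape)  # torch.Size([16, 32, 2, 2])
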
core/transforms/image_transform.py ADDED
@@ -0,0 +1,409 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import math
4
+ from functools import reduce
5
+ from logging import getLogger
6
+ from typing import Any, Callable, Tuple
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torchvision.transforms as tv
11
+ from PIL import Image
12
+ from torchvision.transforms import functional as F
13
+ from torchvision.transforms.functional import InterpolationMode
14
+
15
+ logger = getLogger()
16
+
17
+
18
+ MEAN = (0.5, 0.5, 0.5)
19
+ STD = (0.5, 0.5, 0.5)
20
+
21
+
22
+ def get_image_transform(
23
+ vision_input_type: str = "vanilla",
24
+ image_res: int = 336,
25
+ max_num_tiles: int = 1,
26
+ normalize_img: bool = True,
27
+ ) -> Callable:
28
+
29
+ if vision_input_type == "thumb+tile":
30
+ transforms = VariableSizeImageTransform(
31
+ size=image_res,
32
+ max_num_tiles=max_num_tiles,
33
+ normalize_img=normalize_img,
34
+ use_thumbnail="before",
35
+ )
36
+ else:
37
+ transforms = ImageTransform(
38
+ size=image_res,
39
+ normalize_img=normalize_img,
40
+ )
41
+
42
+ logger.info(
43
+ f"Initalized transforms with: vision_input_type: '{vision_input_type}' and max_num_tiles: {max_num_tiles}."
44
+ )
45
+
46
+ return transforms
47
+
48
+
49
+ class ImageTransform(object):
50
+ """
51
+ Image transform resizes the input to a fixed square of the given size (bicubic), converts it to a tensor, and normalizes it.
52
+ """
53
+
54
+ def __init__(
55
+ self,
56
+ size: int = 336,
57
+ normalize_img: bool = True,
58
+ ) -> None:
59
+ self.size = size
60
+ self._mean = MEAN
61
+ self._std = STD
62
+
63
+ logger.info(f"ImageTransform size: {self.size}")
64
+
65
+ self.to_tensor = tv.ToTensor()
66
+ self.normalize = (
67
+ tv.Normalize(
68
+ mean=self._mean,
69
+ std=self._std,
70
+ inplace=True,
71
+ )
72
+ if normalize_img
73
+ else lambda x: x
74
+ )
75
+
76
+ def __call__(self, image: Image.Image):
77
+ w, h = image.size
78
+ image = F.resize(
79
+ image, (self.size, self.size), interpolation=InterpolationMode.BICUBIC
80
+ )
81
+ image = self.to_tensor(image)
82
+ image = self.normalize(image)
83
+
84
+ # Add chunk dim to make it compatible with existing dataloaders
85
+ image = image.view(1, 3, self.size, self.size)
86
+ return image, (w, h)
87
+
88
+ def _transform_torch_tensor(self, image: torch.Tensor):
89
+ h, w = image.shape[-2:] # Image shape (C, H, W) or (N, C, H, W)
90
+ image = F.resize(
91
+ image, size=(self.size, self.size), interpolation=InterpolationMode.BICUBIC
92
+ )
93
+ image = (
94
+ image.to(torch.float32) / 255.0
95
+ ) # Convert to float and scale to [0, 1] range
96
+ image = self.normalize(image)
97
+ return image, (w, h)
98
+
99
+
100
+ class VariableSizeImageTransform(object):
101
+ """
102
+ The variable size image transform will resize the image dynamically
103
+ based on the image aspect ratio and the number of image chunks we allow.
104
+
105
+ The algorithm will not upsample low-res images to fit a certain aspect
106
+ ratio, because that leads to a significant degradation in image quality.
107
+
108
+ For example, if an input image is of size 300x800, and we want to allow
109
+ a maximum of 16 image chunks, it will find the closest aspect ratio that
110
+ is allowed within 16 image chunks, i.e., 2:5 = 2 horizontal patches and
111
+ 5 vertical patches, giving a total of 10 chunks.
112
+
113
+ The image will then be resized to products of the base size (default is
114
+ 224px because MetaCLIP takes that), so in this case it will be resized to
115
+ 2*224:5*224 = 448:1120, where we maintain the original aspect ratio and
116
+ pad with the mean value for the rest. This approach minimizes the amount
117
+ of padding required for any arbitrary resolution.
118
+
119
+ The final output will therefore be of shape (11, 3, 224, 224), where 10
120
+ patches are coming from the resizing and chunking, and the first patch
121
+ is a downsampled version of the image that preserves aspect ratios.
122
+ """
123
+
124
+ def __init__(
125
+ self,
126
+ size: int = 336,
127
+ normalize_img: bool = True,
128
+ max_num_tiles: int = 1,
129
+ use_thumbnail: str = "no",
130
+ area_limit: bool = False,
131
+ ) -> None:
132
+ self.size = size
133
+ self._mean = MEAN
134
+ self._std = STD
135
+
136
+ logger.info(f"VariableSizeImageTransform size: {self.size}")
137
+
138
+ self.to_tensor = tv.ToTensor()
139
+ self.normalize = (
140
+ tv.Normalize(
141
+ mean=self._mean,
142
+ std=self._std,
143
+ inplace=True,
144
+ )
145
+ if normalize_img
146
+ else lambda x: x
147
+ )
148
+ self.area_limit = area_limit
149
+ self.max_num_tiles = max_num_tiles
150
+ self.use_thumbnail = use_thumbnail
151
+ if self.use_thumbnail != "no":
152
+ self.thumbnail_transform = ImageTransform(
153
+ size=self.size,
154
+ normalize_img=normalize_img,
155
+ )
156
+
157
+ @staticmethod
158
+ def _factors(n: int):
159
+ """Return all factors of a number."""
160
+ return set(
161
+ reduce(
162
+ list.__add__,
163
+ ([i, n // i] for i in range(1, int(n**0.5) + 1) if n % i == 0),
164
+ )
165
+ )
166
+
167
+ def _find_supported_aspect_ratios(self):
168
+ """
169
+ This function computes all the allowed aspect ratios for a fixed
170
+ number of input chunks.
171
+
172
+ For example, with `num_tiles=5`, it will return:
173
+ {
174
+ 0.2: [(1, 5)],
175
+ 5.0: [(5, 1)],
176
+ 0.25: [(1, 4)],
177
+ 1.0: [(2, 2), (1, 1)],
178
+ 4.0: [(4, 1)],
179
+ 0.3333333333333333: [(1, 3)],
180
+ 3.0: [(3, 1)],
181
+ 0.5: [(1, 2)],
182
+ 2.0: [(2, 1)]
183
+ }
184
+ """
185
+ asp_dict = {}
186
+ for chunk_size in range(self.max_num_tiles, 0, -1):
187
+ _factors = sorted(VariableSizeImageTransform._factors(chunk_size))
188
+ _asp_ratios = [(x, chunk_size // x) for x in _factors]
189
+ for ratio in _asp_ratios:
190
+ k = ratio[0] / ratio[1]
191
+ if k not in asp_dict:
192
+ asp_dict[k] = [ratio]
193
+ else:
194
+ asp_dict[k].append(ratio)
195
+ return asp_dict
196
+
197
+ def _find_closest_aspect_ratio(self, img_width: int, img_height: int) -> Tuple:
198
+ """
199
+ Given an image width, height and target number of chunks
200
+ this function will find the closest supported aspect ratio.
201
+ """
202
+ tgt_ar = img_width / img_height
203
+ asp_dict = self._find_supported_aspect_ratios()
204
+ cl_d, cl_p = 1e23, None
205
+ if tgt_ar >= 1:
206
+ cl_p = min(
207
+ [k for k in asp_dict.keys() if k <= tgt_ar],
208
+ key=lambda x: abs(x - tgt_ar),
209
+ )
210
+ v = asp_dict[cl_p]
211
+ # select width
212
+ widths = [(idx, self.size * vv[0]) for idx, vv in enumerate(v)]
213
+ tgt_idx = max(widths, key=lambda x: x[1])[0]
214
+ else:
215
+ cl_p = min(
216
+ [k for k in asp_dict.keys() if k > tgt_ar],
217
+ key=lambda x: abs(1 / x - 1 / tgt_ar),
218
+ )
219
+ v = asp_dict[cl_p]
220
+ # select height
221
+ heights = [(idx, self.size * vv[1]) for idx, vv in enumerate(v)]
222
+ tgt_idx = max(heights, key=lambda x: x[1])[0]
223
+ out = v[tgt_idx]
224
+ return out
225
+
226
+ def _resize(
227
+ self, image: Image.Image, target_width: int, target_height: int
228
+ ) -> Image.Image:
229
+ # Resize longer edge to given size.
230
+ w, h = image.size
231
+ scale = w / h
232
+
233
+ if scale > 1.0:
234
+ # width > height
235
+ new_w = target_width
236
+ new_h = math.floor(new_w / scale)
237
+ else:
238
+ # height >= width
239
+ new_h = target_height
240
+ new_w = math.floor(new_h * scale)
241
+
242
+ image = F.resize(image, (new_h, new_w))
243
+ return image
244
+
245
+ def _pad(self, image: Image.Image, new_width: int, new_height: int) -> Image.Image:
246
+ mean_per_channel = tuple(
247
+ np.clip(np.array(image).mean(axis=(0, 1)), 0, 255).astype(np.uint8)
248
+ )
249
+ new_im = Image.new(mode="RGB", size=(new_width, new_height), color=(0, 0, 0)) # type: ignore
250
+ new_im.paste(image)
251
+ return new_im
252
+
253
+ def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor:
254
+ # Split image into number of required tiles (width x height)
255
+ num_channels, height, width = image.size()
256
+ image = image.view(num_channels, nch, height // nch, ncw, width // ncw)
257
+ # Permute dimensions to reorder the axes
258
+ image = image.permute(1, 3, 0, 2, 4).contiguous()
259
+ # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2)
260
+ image = image.view(ncw * nch, num_channels, height // nch, width // ncw)
261
+ return image
262
+
263
+ def _get_image_height_width(
264
+ self, image_width: int, image_height: int, target_width: int, target_height: int
265
+ ) -> Tuple[int, int]:
266
+ """
267
+ Given image width, height and target width, height for the canvas, return the dimensions of how the image would be resized
268
+ with aspect ratio preservation.
269
+ """
270
+ scale = image_width / image_height
271
+
272
+ if scale > 1.0:
273
+ # Width is larger than height
274
+
275
+ # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas.
276
+ rescaling_factor = min(
277
+ target_width / image_width, target_height / image_height
278
+ )
279
+
280
+ # Set new width to target width and height to the rescaled height.
281
+ new_w = rescaling_factor * image_width
282
+ new_h = math.floor(new_w / scale)
283
+
284
+ else:
285
+ # Height is larger than width
286
+
287
+ # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas.
288
+ rescaling_factor = min(
289
+ target_width / image_width, target_height / image_height
290
+ )
291
+
292
+ # Set new height to target height and width to the rescaled width.
293
+ new_h = rescaling_factor * image_height
294
+ new_w = math.floor(new_h * scale)
295
+
296
+ return new_w, new_h
297
+
298
+ def _fit_image_to_canvas(
299
+ self, img_width: int, img_height: int, area_limit: bool
300
+ ) -> Any:
301
+ """
302
+ Given an image width, height and target number of chunks this function will see if the image
303
+ can be fit into any of the canvases that can be build from arranging the tiles in a grid.
304
+ If the image can be fit onto several canvases, it will return the canvas where the shorter edge
305
+ of the image will be largest.
306
+
307
+ If area_limit is set to True, the tie-breaking prefers the canvas where area is less than 2x the original area.
308
+ """
309
+ # Initialize the optimal canvas to None. If no canvas is found where image fits, function returns None.
310
+ optimal_canvas = None
311
+ optimal_image_width_height = None
312
+
313
+ scale = img_width / img_height
314
+
315
+ # Gather all potential supported image resolutions and iterate through them to find best match
316
+ potential_arrangements = [
317
+ item
318
+ for sublist in self._find_supported_aspect_ratios().values()
319
+ for item in sublist
320
+ ]
321
+ for n_w, n_h in potential_arrangements:
322
+ # Compute the canvas size
323
+ canvas_width, canvas_height = n_w * self.size, n_h * self.size
324
+
325
+ # Check if image can fit into the canvas without downsampling
326
+ if canvas_width >= img_width and canvas_height >= img_height:
327
+ # If we did not find a good canvas yet, we will use the current one
328
+ if optimal_canvas is None:
329
+ # Set optimal canvas and determine the actual image height and width in the canvas with aspect ratio preserving resampling
330
+ optimal_canvas = (n_w, n_h)
331
+ optimal_image_width_height = self._get_image_height_width(
332
+ image_width=img_width,
333
+ image_height=img_height,
334
+ target_width=n_w * self.size,
335
+ target_height=n_h * self.size,
336
+ )
337
+ else:
338
+ # If we already found an optimal canvas before, we will check if the shorter edge of the image will be larger than the current optimal canvas.
339
+ # This means we can potentially upsample the image resolution which is beneficial to performance.
340
+ image_width_height = self._get_image_height_width(
341
+ image_width=img_width,
342
+ image_height=img_height,
343
+ target_width=n_w * self.size,
344
+ target_height=n_h * self.size,
345
+ )
346
+ if area_limit:
347
+ # Prioritize aspect ratio, and choose best within area limit when tied.
348
+ curr_scale = image_width_height[0] / image_width_height[1]
349
+ optim_scale = (
350
+ optimal_image_width_height[0]
351
+ / optimal_image_width_height[1]
352
+ )
353
+ if abs(scale - curr_scale) < abs(scale - optim_scale):
354
+ # 1. optimize aspect ratio
355
+ optimal_canvas = (n_w, n_h)
356
+ optimal_image_width_height = image_width_height
357
+ elif abs(scale - curr_scale) == abs(scale - optim_scale):
358
+ # 2. optimize area
359
+ if (
360
+ image_width_height[0] * image_width_height[1]
361
+ < 2 * img_width * img_height
362
+ ):
363
+ # 2.1 area is less than 2x the original area
364
+ optimal_canvas = (n_w, n_h)
365
+ optimal_image_width_height = image_width_height
366
+ else:
367
+ # NOTE: L3V dynamic tiling. Prioritize the biggest canvas.
368
+ if (
369
+ scale < 1.0
370
+ and (image_width_height[0] >= optimal_image_width_height[0])
371
+ ) or (
372
+ scale >= 1.0
373
+ and (image_width_height[1] >= optimal_image_width_height[1])
374
+ ):
375
+ optimal_canvas = (n_w, n_h)
376
+ optimal_image_width_height = image_width_height
377
+ return optimal_canvas
378
+
379
+ def __call__(self, image: Image.Image) -> Tuple[Any, Any]:
380
+ assert isinstance(image, Image.Image), type(image)
381
+ if self.use_thumbnail != "no":
382
+ thumbnail = self.thumbnail_transform(image)[0]
383
+
384
+ w, h = image.size
385
+ # Check if the image can be fit to the canvas without downsampling
386
+ ar = self._fit_image_to_canvas(
387
+ img_width=w, img_height=h, area_limit=self.area_limit
388
+ )
389
+ if ar is None:
390
+ # If we did not find a canvas, we have to find the closest aspect ratio and downsample the image
391
+ ar = self._find_closest_aspect_ratio(img_width=w, img_height=h)
392
+
393
+ image = F.resize(
394
+ image,
395
+ (ar[1] * self.size, ar[0] * self.size), # (h, w)
396
+ interpolation=InterpolationMode.BICUBIC,
397
+ )
398
+ image = self._pad(image, ar[0] * self.size, ar[1] * self.size)
399
+ image = self.to_tensor(image)
400
+ image = self.normalize(image)
401
+ image = self._split(image, ar[0], ar[1]) # type: ignore
402
+ if self.use_thumbnail == "before":
403
+ image = torch.cat((thumbnail, image), dim=0)
404
+ elif self.use_thumbnail == "after":
405
+ image = torch.cat((image, thumbnail), dim=0)
406
+ elif self.use_thumbnail == "both":
407
+ image = torch.cat((thumbnail, image, thumbnail), dim=0)
408
+
409
+ return image, ar
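
A usage sketch for the tiling transform defined above (assumes Pillow and torchvision are installed and the repo root is on PYTHONPATH; the 800x300 blank image is just a stand-in for a real photo):

from PIL import Image
from core.transforms.image_transform import get_image_transform

transform = get_image_transform(
    vision_input_type="thumb+tile", image_res=448, max_num_tiles=4
)
image = Image.new("RGB", (800, 300))
tiles, ar = transform(image)
# `ar` is the chosen (tiles_wide, tiles_high) grid; with use_thumbnail="before"
# the thumbnail is prepended, so `tiles` has shape (tiles_wide * tiles_high + 1, 3, 448, 448).
print(tiles.shape, ar)
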
core/utils.py ADDED
@@ -0,0 +1,40 @@
1
+ import math
2
+ from dataclasses import dataclass
3
+ from functools import partial
4
+ from typing import Callable, Optional
5
+
6
+ import torch
7
+
8
+
9
+ @dataclass
10
+ class InitArgs:
11
+ use_gaussian: bool = True # gaussian vs uniform
12
+ coeff_std: Optional[float] = None # std coeff multiplier
13
+ no_init: bool = False
14
+
15
+
16
+ def get_init_fn(
17
+ args: InitArgs, input_dim: int, init_depth: Optional[int]
18
+ ) -> Callable[[torch.Tensor], torch.Tensor]:
19
+ """
20
+ Init functions.
21
+ """
22
+ if args.no_init:
23
+ return lambda x: x
24
+
25
+ # standard deviation
26
+ std = 1 / math.sqrt(input_dim)
27
+ std = std if args.coeff_std is None else (args.coeff_std * std)
28
+
29
+ # rescale with depth
30
+ if init_depth is not None:
31
+ std = std / math.sqrt(2 * init_depth)
32
+
33
+ # gaussian vs uniform
34
+ if args.use_gaussian:
35
+ return partial(
36
+ torch.nn.init.trunc_normal_, mean=0.0, std=std, a=-3 * std, b=3 * std
37
+ )
38
+ else:
39
+ bound = math.sqrt(3) * std # ensure the standard deviation is `std`
40
+ return partial(torch.nn.init.uniform_, a=-bound, b=bound)
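
A small sketch of how get_init_fn is meant to be used (the 1024-wide weight and depth of 12 are illustrative values, not taken from any config in this commit):

import torch
from core.utils import InitArgs, get_init_fn

args = InitArgs(use_gaussian=True, coeff_std=None)
init_fn = get_init_fn(args, input_dim=1024, init_depth=12)
w = torch.empty(1024, 1024)
init_fn(w)  # in-place truncated normal, clipped to +/- 3 std
# std is 1/sqrt(1024) rescaled by 1/sqrt(2 * 12), roughly 0.0064 before truncation.
print(w.std())
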
core/vision_encoder/__init__.py ADDED
File without changes
core/vision_encoder/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (159 Bytes). View file
 
core/vision_encoder/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (218 Bytes). View file
 
core/vision_encoder/__pycache__/config.cpython-312.pyc ADDED
Binary file (4.31 kB). View file
 
core/vision_encoder/__pycache__/config.cpython-313.pyc ADDED
Binary file (4.26 kB). View file
 
core/vision_encoder/__pycache__/pe.cpython-312.pyc ADDED
Binary file (42.9 kB). View file
 
core/vision_encoder/__pycache__/pe.cpython-313.pyc ADDED
Binary file (38.1 kB). View file
 
core/vision_encoder/__pycache__/pe_lora.cpython-312.pyc ADDED
Binary file (35.5 kB). View file
 
core/vision_encoder/__pycache__/rope.cpython-312.pyc ADDED
Binary file (14.6 kB). View file
 
core/vision_encoder/__pycache__/rope.cpython-313.pyc ADDED
Binary file (14.6 kB). View file
 
core/vision_encoder/__pycache__/tokenizer.cpython-312.pyc ADDED
Binary file (17.2 kB). View file
 
core/vision_encoder/__pycache__/tokenizer.cpython-313.pyc ADDED
Binary file (17.3 kB). View file
 
core/vision_encoder/__pycache__/transforms.cpython-312.pyc ADDED
Binary file (3.51 kB). View file
 
core/vision_encoder/__pycache__/transforms.cpython-313.pyc ADDED
Binary file (3.3 kB). View file
 
core/vision_encoder/config.py ADDED
@@ -0,0 +1,260 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ """
4
+ Include all available vision encoder configurations.
5
+ """
6
+
7
+ from dataclasses import dataclass, replace
8
+
9
+ from typing import Optional
10
+
11
+ from huggingface_hub import hf_hub_download
12
+
13
+
14
+
15
+ def fetch_pe_checkpoint(name: str, path: Optional[str] = None):
16
+ path = path or f"hf://facebook/{name}:{name}.pt"
17
+
18
+ if path.startswith("hf://"):
19
+ # Load from huggingface
20
+ path = path[len("hf://"):]
21
+ repo, file = path.split(":")
22
+
23
+ return hf_hub_download(repo_id=repo, filename=file)
24
+ else:
25
+ return path
26
+
27
+
28
+
29
+
30
+ @dataclass
31
+ class PEConfig:
32
+ """ Vision Tower Config. """
33
+ patch_size: int
34
+ width: int
35
+ layers: int
36
+ heads: int
37
+ mlp_ratio: float
38
+ output_dim: Optional[int]
39
+
40
+ ls_init_value: float = None
41
+ drop_path: float = 0.0
42
+
43
+ image_size: int = 224
44
+ use_abs_posemb: bool = True
45
+ use_cls_token: bool = False
46
+ use_rope2d: bool = True
47
+
48
+ pool_type: str = "attn"
49
+ attn_pooler_heads: int = 8
50
+
51
+ use_ln_pre: bool = True
52
+ use_ln_post: bool = True
53
+
54
+
55
+ @dataclass
56
+ class PETextConfig:
57
+ """ Text Tower Config. """
58
+ context_length: int
59
+ width: int
60
+ heads: int
61
+ layers: int
62
+
63
+ output_dim: int
64
+
65
+ mlp_ratio: float = 4.0
66
+ vocab_size: int = 49408
67
+
68
+
69
+
70
+
71
+ PE_VISION_CONFIG = {}
72
+ PE_TEXT_CONFIG = {}
73
+
74
+
75
+
76
+ #########################################
77
+ # PE CORE #
78
+ #########################################
79
+
80
+ PE_VISION_CONFIG["PE-Core-G14-448"] = PEConfig(
81
+ image_size=448,
82
+ patch_size=14,
83
+ width=1536,
84
+ layers=50,
85
+ heads=16,
86
+ mlp_ratio=8960 / 1536,
87
+ pool_type="attn",
88
+ output_dim=1280,
89
+ use_cls_token=False,
90
+ )
91
+ PE_TEXT_CONFIG["PE-Core-G14-448"] = PETextConfig(
92
+ context_length=72,
93
+ width=1280,
94
+ heads=20,
95
+ layers=24,
96
+ output_dim=1280
97
+ )
98
+
99
+
100
+ PE_VISION_CONFIG["PE-Core-L14-336"] = PEConfig(
101
+ image_size=336,
102
+ patch_size=14,
103
+ width=1024,
104
+ layers=24,
105
+ heads=16,
106
+ mlp_ratio=4.0,
107
+ pool_type="attn",
108
+ output_dim=1024,
109
+ use_cls_token=True,
110
+ )
111
+ PE_TEXT_CONFIG["PE-Core-L14-336"] = PETextConfig(
112
+ context_length=32,
113
+ width=1024,
114
+ heads=16,
115
+ layers=24,
116
+ output_dim=1024
117
+ )
118
+
119
+
120
+ PE_VISION_CONFIG["PE-Core-B16-224"] = PEConfig(
121
+ image_size=224,
122
+ patch_size=16,
123
+ width=768,
124
+ layers=12,
125
+ heads=12,
126
+ mlp_ratio=4.0,
127
+ pool_type="attn",
128
+ output_dim=1024,
129
+ use_cls_token=True,
130
+ )
131
+ PE_TEXT_CONFIG["PE-Core-B16-224"] = PE_TEXT_CONFIG["PE-Core-L14-336"]
132
+
133
+
134
+
135
+
136
+ PE_VISION_CONFIG["PE-Core-S16-384"] = PEConfig(
137
+ image_size=384,
138
+ patch_size=16,
139
+ width=384,
140
+ layers=12,
141
+ heads=6,
142
+ mlp_ratio=4.0,
143
+ pool_type="attn",
144
+ output_dim=512,
145
+ use_cls_token=True,
146
+ )
147
+ PE_TEXT_CONFIG["PE-Core-S16-384"] = PETextConfig(
148
+ context_length=32,
149
+ width=512,
150
+ heads=8,
151
+ layers=12,
152
+ output_dim=512
153
+ )
154
+
155
+
156
+
157
+ PE_VISION_CONFIG["PE-Core-T16-384"] = PEConfig(
158
+ image_size=384,
159
+ patch_size=16,
160
+ width=192,
161
+ layers=12,
162
+ heads=3,
163
+ mlp_ratio=4.0,
164
+ pool_type="attn",
165
+ output_dim=512,
166
+ use_cls_token=True,
167
+ )
168
+ PE_TEXT_CONFIG["PE-Core-T16-384"] = PE_TEXT_CONFIG["PE-Core-S16-384"]
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+ #########################################
177
+ # PE Lang #
178
+ #########################################
179
+
180
+ PE_VISION_CONFIG["PE-Lang-G14-448"] = replace(
181
+ PE_VISION_CONFIG["PE-Core-G14-448"],
182
+ image_size=448,
183
+ pool_type="none",
184
+ use_ln_post=False,
185
+ output_dim=None,
186
+ ls_init_value=0.1,
187
+ layers=47,
188
+ )
189
+
190
+ PE_VISION_CONFIG["PE-Lang-L14-448"] = replace(
191
+ PE_VISION_CONFIG["PE-Core-L14-336"],
192
+ image_size=448,
193
+ pool_type="none",
194
+ use_ln_post=False,
195
+ output_dim=None,
196
+ ls_init_value=0.1,
197
+ layers=23
198
+ )
199
+
200
+
201
+ # Stage 2 checkpoints for PLM-8B and PLM-3B respectively. Pretrained with tiling.
202
+ # Use these checkpoints if you're building a model that uses tiling downstream!
203
+ PE_VISION_CONFIG["PE-Lang-G14-448-Tiling"] = PE_VISION_CONFIG["PE-Lang-G14-448"]
204
+ PE_VISION_CONFIG["PE-Lang-L14-448-Tiling"] = PE_VISION_CONFIG["PE-Lang-L14-448"]
205
+
206
+
207
+
208
+
209
+
210
+
211
+
212
+
213
+ #########################################
214
+ # PE Spatial #
215
+ #########################################
216
+
217
+ PE_VISION_CONFIG["PE-Spatial-G14-448"] = replace(
218
+ PE_VISION_CONFIG["PE-Core-G14-448"],
219
+ image_size=448,
220
+ pool_type="none",
221
+ use_ln_post=False,
222
+ output_dim=None,
223
+ ls_init_value=0.1,
224
+ )
225
+
226
+ # No layerscale on the smaller spatial models
227
+ PE_VISION_CONFIG["PE-Spatial-L14-448"] = replace(
228
+ PE_VISION_CONFIG["PE-Core-L14-336"],
229
+ image_size=448,
230
+ pool_type="none",
231
+ use_ln_post=False,
232
+ output_dim=None,
233
+ )
234
+
235
+
236
+ PE_VISION_CONFIG["PE-Spatial-B16-512"] = replace(
237
+ PE_VISION_CONFIG["PE-Core-B16-224"],
238
+ image_size=512,
239
+ pool_type="none",
240
+ use_ln_post=False,
241
+ output_dim=None,
242
+ )
243
+
244
+
245
+ PE_VISION_CONFIG["PE-Spatial-S16-512"] = replace(
246
+ PE_VISION_CONFIG["PE-Core-S16-384"],
247
+ image_size=512,
248
+ pool_type="none",
249
+ use_ln_post=False,
250
+ output_dim=None,
251
+ )
252
+
253
+
254
+ PE_VISION_CONFIG["PE-Spatial-T16-512"] = replace(
255
+ PE_VISION_CONFIG["PE-Core-T16-384"],
256
+ image_size=512,
257
+ pool_type="none",
258
+ use_ln_post=False,
259
+ output_dim=None,
260
+ )
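
A usage sketch for the config registry and checkpoint resolver above (the local checkpoint path is hypothetical; only hf://repo:file style paths trigger a download through hf_hub_download):

from core.vision_encoder.config import PE_VISION_CONFIG, fetch_pe_checkpoint

cfg = PE_VISION_CONFIG["PE-Core-L14-336"]
print(cfg.image_size, cfg.patch_size, cfg.output_dim)  # 336 14 1024
# Local paths are returned unchanged; with path=None the default is "hf://facebook/<name>:<name>.pt".
ckpt = fetch_pe_checkpoint("PE-Core-L14-336", path="checkpoints/PE-Core-L14-336.pt")
print(ckpt)  # checkpoints/PE-Core-L14-336.pt
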
core/vision_encoder/pe.py ADDED
@@ -0,0 +1,833 @@
1
+ from collections import OrderedDict
2
+ from dataclasses import asdict
3
+ from functools import partial
4
+ from logging import getLogger
5
+ from typing import Callable, Optional, Literal
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ from einops import rearrange
11
+ from timm.layers import DropPath
12
+ from torch.nn import functional as F
13
+ from torch.nn.init import constant_, xavier_uniform_
14
+ from torch.nn.parameter import Parameter
15
+ from torch.utils.checkpoint import checkpoint
16
+ import types
17
+ from core.vision_encoder.rope import Rope2D
18
+ from core.vision_encoder.config import PEConfig, PETextConfig, PE_VISION_CONFIG, PE_TEXT_CONFIG, fetch_pe_checkpoint
19
+
20
+
21
+
22
+ logger = getLogger()
23
+
24
+
25
+
26
+ class LayerScale(nn.Module):
27
+ def __init__(self, dim, init_values=1e-5, inplace=False):
28
+ super().__init__()
29
+ self.inplace = inplace
30
+ self.dim = dim
31
+ self.init_values = init_values
32
+
33
+ def forward(self, x):
34
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
35
+
36
+ def init_tensors(self):
37
+ self.gamma = nn.Parameter(self.init_values * torch.ones(self.dim))
38
+
39
+
40
+ class AttentionPooling(nn.Module):
41
+ def __init__(
42
+ self,
43
+ embed_dim: int,
44
+ num_heads: int,
45
+ num_probe: int = 1,
46
+ mlp_ratio: int = 4,
47
+ act_layer: Callable = nn.GELU,
48
+ norm_layer: Callable = nn.LayerNorm,
49
+ ):
50
+ super().__init__()
51
+ self.embed_dim = embed_dim
52
+ self.num_heads = num_heads
53
+ self.probe = nn.Parameter(torch.randn(1, num_probe, self.embed_dim))
54
+ self.attn = nn.MultiheadAttention(self.embed_dim, self.num_heads, batch_first=True)
55
+ self.layernorm = norm_layer(embed_dim)
56
+ self.mlp_width = int(embed_dim * mlp_ratio)
57
+ self.mlp = nn.Sequential(
58
+ OrderedDict(
59
+ [
60
+ ("c_fc", nn.Linear(self.embed_dim, self.mlp_width)),
61
+ ("gelu", act_layer()),
62
+ ("c_proj", nn.Linear(self.mlp_width, self.embed_dim)),
63
+ ]
64
+ )
65
+ )
66
+ self._is_converted = False
67
+
68
+ def forward(self, x: torch.Tensor):
69
+ # This is the original forward method that will be replaced.
70
+ batch, _, _ = x.shape
71
+ q = self.probe.repeat((batch, 1, 1)).to(x.dtype)
72
+ x_attn = self.attn(q, x, x, need_weights=False)[0]
73
+ x = x_attn + self.mlp(self.layernorm(x_attn))
74
+ return x
75
+
76
+
77
+
78
+ class SelfAttention(nn.Module):
79
+ r"""
80
+ Implements sequence-packed attention and RoPE
81
+ """
82
+
83
+ def __init__(
84
+ self,
85
+ embed_dim: int,
86
+ num_heads: int,
87
+ rope: Optional[nn.Module] = None,
88
+ ):
89
+ super(SelfAttention, self).__init__()
90
+ self.embed_dim = embed_dim
91
+
92
+ self.num_heads = num_heads
93
+ self.head_dim = embed_dim // num_heads
94
+ assert (
95
+ self.head_dim * num_heads == self.embed_dim
96
+ ), "embed_dim must be divisible by num_heads"
97
+
98
+ # To make this compatible with nn.MultiheadAttention
99
+ self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
100
+ self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
101
+
102
+ self.in_proj = nn.Linear(embed_dim, 3 * embed_dim, bias=True)
103
+
104
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)
105
+
106
+ self.rope = rope
107
+ self.scale = self.head_dim ** (-0.5)
108
+
109
+ def init_tensors(self):
110
+ xavier_uniform_(self.in_proj_weight)
111
+ constant_(self.in_proj_bias, 0.0)
112
+ constant_(self.out_proj.bias, 0.0)
113
+
114
+
115
+ def del_muda(self):
116
+ del self.in_proj_weight
117
+ del self.in_proj_bias
118
+
119
+ def migrate_weights(self):
120
+ """
121
+ MUST be called *after* loading the state_dict.
122
+ This copies the weights from the old Parameters to the new nn.Linear layer.
123
+ """
124
+ # Use torch.no_grad() to ensure this is done without tracking gradients
125
+ with torch.no_grad():
126
+ self.in_proj.weight.copy_(self.in_proj_weight)
127
+ self.in_proj.bias.copy_(self.in_proj_bias)
128
+
129
+ # del self.in_proj_weight
130
+ # del self.in_proj_bias
131
+ # print("Migration complete. Old parameters have been removed.")
132
+
133
+ def forward(self, x, attn_mask=None, need_weights=False):
134
+ batch, seq, embed_dim = x.shape
135
+
136
+ #proj = F.linear(x, self.in_proj_weight, self.in_proj_bias)
137
+ proj = self.in_proj(x)
138
+ # reshape to (3, E) rather than (E, 3); this is deliberate for better memory coalescing and keeps the same order as chunk()
139
+ proj = (
140
+ proj.unflatten(-1, (3, embed_dim))
141
+ .unsqueeze(0)
142
+ .transpose(0, -2)
143
+ .squeeze(-2)
144
+ .contiguous()
145
+ )
146
+ q, k, v = proj[0], proj[1], proj[2]
147
+
148
+ # Use "q_" so that we don't accidentally quit in pdb :)
149
+ q = rearrange(q, "b s (h d) -> b h s d", h=self.num_heads)
150
+ k = rearrange(k, "b s (h d) -> b h s d", h=self.num_heads)
151
+ v = rearrange(v, "b s (h d) -> b h s d", h=self.num_heads)
152
+
153
+ if self.rope:
154
+ q, k = self.rope(q, k)
155
+
156
+ if not need_weights:
157
+ # Original efficient path
158
+ attn = F.scaled_dot_product_attention(
159
+ q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False, scale=self.scale
160
+ )
161
+ attn = rearrange(attn, "b h s d -> b s (h d)")
162
+ return self.out_proj(attn)
163
+ else:
164
+ # Path to get attention weights
165
+ q_scaled = q * self.scale
166
+ # attn_weights shape: (batch, num_heads, seq_len, seq_len)
167
+ attn_weights = torch.matmul(q_scaled, k.transpose(-2, -1))
168
+
169
+ if attn_mask is not None:
170
+ attn_weights += attn_mask
171
+
172
+ attn_weights = F.softmax(attn_weights, dim=-1)
173
+
174
+ attn_output = torch.matmul(attn_weights, v)
175
+ attn_output = rearrange(attn_output, "b h s d -> b s (h d)")
176
+
177
+ output = self.out_proj(attn_output)
178
+ return output, attn_weights
179
+
180
+
181
+ class ResidualAttentionBlock(nn.Module):
182
+ def __init__(
183
+ self,
184
+ d_model: int,
185
+ n_head: int,
186
+ mlp_ratio: float = 4.0,
187
+ ls_init_value: float = None,
188
+ act_layer: Callable = nn.GELU,
189
+ norm_layer: Callable = nn.LayerNorm,
190
+ drop_path: float = 0.0,
191
+ rope: Optional[nn.Module] = None,
192
+ ):
193
+ super().__init__()
194
+
195
+ if rope:
196
+ self.attn = SelfAttention(d_model, n_head, rope=rope)
197
+ else:
198
+ self.attn = nn.MultiheadAttention(d_model, n_head, batch_first=True)
199
+
200
+ self.ls_1 = (
201
+ LayerScale(d_model, ls_init_value)
202
+ if ls_init_value is not None
203
+ else nn.Identity()
204
+ )
205
+ self.ls_2 = (
206
+ LayerScale(d_model, ls_init_value)
207
+ if ls_init_value is not None
208
+ else nn.Identity()
209
+ )
210
+
211
+ self.ln_1 = norm_layer(d_model)
212
+ self.ln_2 = norm_layer(d_model)
213
+
214
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
215
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
216
+
217
+ mlp_width = int(d_model * mlp_ratio)
218
+ self.mlp = nn.Sequential(
219
+ OrderedDict(
220
+ [
221
+ ("c_fc", nn.Linear(d_model, mlp_width)),
222
+ ("gelu", act_layer()),
223
+ ("c_proj", nn.Linear(mlp_width, d_model)),
224
+ ]
225
+ )
226
+ )
227
+
228
+ def _call_attn(
229
+ self,
230
+ q_x: torch.Tensor,
231
+ attn_mask: Optional[torch.Tensor] = None,
232
+ need_weights: bool = False,
233
+ ):
234
+
235
+ if attn_mask is not None:
236
+ if not attn_mask.dtype == torch.bool:
237
+ attn_mask = attn_mask.to(q_x.dtype)
238
+
239
+ if isinstance(self.attn, SelfAttention):
240
+ # Pass the flag to your custom SelfAttention
241
+ return self.attn(q_x, attn_mask=attn_mask, need_weights=need_weights)
242
+ else:
243
+ # Standard nn.MultiheadAttention
244
+ return self.attn(q_x, q_x, q_x, attn_mask=attn_mask, need_weights=need_weights)[0]
245
+
246
+ def forward(
247
+ self,
248
+ x: torch.Tensor,
249
+ attn_mask: Optional[torch.Tensor] = None,
250
+ need_weights: bool = False,
251
+ ):
252
+ attn_result = self._call_attn(self.ln_1(x), attn_mask=attn_mask, need_weights=need_weights)
253
+
254
+ attn_weights = None
255
+ if need_weights:
256
+ # Unpack the output and the weights
257
+ attn_output, attn_weights = attn_result
258
+ else:
259
+ attn_output = attn_result
260
+
261
+ x = x + self.drop_path1(self.ls_1(attn_output))
262
+ x = x + self.drop_path2(self.ls_2(self.mlp(self.ln_2(x))))
263
+
264
+ if need_weights:
265
+ return x, attn_weights # Return weights
266
+
267
+ return x
268
+
269
+ def del_muda(self):
270
+ self.attn.del_muda()
271
+
272
+ class Transformer(nn.Module):
273
+ def __init__(
274
+ self,
275
+ width: int,
276
+ layers: int,
277
+ heads: int,
278
+ mlp_ratio: float = 4.0,
279
+ ls_init_value: float = None,
280
+ act_layer: Callable = nn.GELU,
281
+ norm_layer: Callable = nn.LayerNorm,
282
+ drop_path: float = 0.0,
283
+ rope: Optional[nn.Module] = None,
284
+ ):
285
+ super().__init__()
286
+ self.width = width
287
+ self.layers = layers
288
+ self.grad_checkpointing = False
289
+
290
+ self.resblocks = nn.ModuleList(
291
+ [
292
+ ResidualAttentionBlock(
293
+ width,
294
+ heads,
295
+ mlp_ratio,
296
+ ls_init_value=ls_init_value,
297
+ act_layer=act_layer,
298
+ norm_layer=norm_layer,
299
+ drop_path=drop_path,
300
+ rope=rope,
301
+ )
302
+ for _ in range(layers)
303
+ ]
304
+ )
305
+
306
+ @torch.jit.ignore
307
+ def set_grad_checkpointing(self, enable=True):
308
+ self.grad_checkpointing = enable
309
+
310
+ @torch.jit.ignore
311
+ def truncate(self, layer_idx: int):
312
+ """ Delete layers so the last layer is the given layer index. """
313
+ self.layers = ((self.layers + layer_idx) % self.layers) + 1
314
+ self.resblocks = nn.ModuleList(self.resblocks[:self.layers])
315
+
316
+ def del_muda(self):
317
+ for resblock in self.resblocks:
318
+ resblock.del_muda()
319
+
320
+ def forward(
321
+ self,
322
+ x: torch.Tensor,
323
+ attn_mask: Optional[torch.Tensor] = None,
324
+ layer_idx: int = -1,
325
+ need_weights: bool = False, # Add need_weights flag
326
+ ):
327
+ stop_idx = (self.layers + layer_idx) % self.layers
328
+
329
+ attention_maps = [] # List to store maps from each layer
330
+
331
+ for i, r in enumerate(self.resblocks):
332
+ if self.grad_checkpointing and not torch.jit.is_scripting():
333
+ if need_weights:
334
+ raise ValueError("Cannot get attention maps with gradient checkpointing enabled.")
335
+ x = checkpoint(r, x, attn_mask, use_reentrant=False)
336
+ else:
337
+ if need_weights:
338
+ x, attn_map = r(x, attn_mask=attn_mask, need_weights=True)
339
+ attention_maps.append(attn_map)
340
+ else:
341
+ x = r(x, attn_mask=attn_mask, need_weights=False)
342
+
343
+ if i == stop_idx:
344
+ break
345
+
346
+ if need_weights:
347
+ return x, attention_maps # Return the list of maps
348
+
349
+ return x
350
+
351
+
352
+ class VisionTransformer(nn.Module):
353
+ def __init__(
354
+ self,
355
+ patch_size: int,
356
+ width: int,
357
+ layers: int,
358
+ heads: int,
359
+ mlp_ratio: float,
360
+ act_layer: Callable = nn.GELU,
361
+ norm_layer: Callable = partial(nn.LayerNorm, eps=1e-5),
362
+ use_ln_pre: bool = True,
363
+ use_ln_post: bool = True,
364
+ ls_init_value: float = None,
365
+ drop_path: float = 0.0,
366
+ image_size: int = 448, # Pretrain image size only; you can pass in any image size
367
+ use_abs_posemb: bool = True,
368
+ use_rope2d: bool = True,
369
+ use_cls_token: bool = False,
370
+ output_dim: Optional[int] = 1280,
371
+ attn_pooler_heads: int = 8,
372
+ pool_type: Literal["attn", "tok", "avg", "none"] = "attn",
373
+ ):
374
+ super().__init__()
375
+ assert pool_type in ("attn", "tok", "avg", "none")
376
+ self.pool_type = pool_type
377
+ self.patch_size = patch_size
378
+
379
+ self.output_dim = output_dim or width
380
+ self.proj_dim = output_dim
381
+ self.heads = heads
382
+ self.width = width
383
+ self.layers = layers
384
+
385
+ self.use_abs_posemb = use_abs_posemb
386
+ self.use_cls_token = use_cls_token
387
+ self.use_rope2d = use_rope2d
388
+ self.image_size = image_size
389
+
390
+ self.conv1 = nn.Conv2d(
391
+ in_channels=3,
392
+ out_channels=width,
393
+ kernel_size=patch_size,
394
+ stride=patch_size,
395
+ bias=False,
396
+ )
397
+ self.rope = (
398
+ Rope2D(
399
+ dim=width // heads,
400
+ use_cls_token=self.use_cls_token,
401
+ )
402
+ if self.use_rope2d
403
+ else None
404
+ )
405
+
406
+ self.ln_pre = norm_layer(width) if use_ln_pre else nn.Identity()
407
+ self.ln_post = norm_layer(self.width) if use_ln_post else nn.Identity()
408
+
409
+ self.transformer = Transformer(
410
+ width,
411
+ layers,
412
+ heads,
413
+ mlp_ratio,
414
+ ls_init_value=ls_init_value,
415
+ act_layer=act_layer,
416
+ norm_layer=norm_layer,
417
+ drop_path=drop_path,
418
+ rope=self.rope,
419
+ )
420
+
421
+ if pool_type == "attn":
422
+ self.attn_pool = AttentionPooling(
423
+ embed_dim=width,
424
+ num_heads=attn_pooler_heads,
425
+ act_layer=act_layer,
426
+ norm_layer=norm_layer,
427
+ )
428
+ else:
429
+ self.attn_pool = None
430
+
431
+ self.init_tensors()
432
+
433
+
434
+ def del_muda(self):
435
+ self.transformer.del_muda()
436
+
437
+ def delete_attn_pool(self):
438
+ del self.attn_pool
439
+
440
+
441
+ def init_tensors(self):
442
+ def init_submodule_tensors(module):
443
+ for name, child in module.named_children():
444
+ if hasattr(child, "init_tensors"):
445
+ logger.debug(f"Initializing tensors for submodule: {name}")
446
+ child.init_tensors()
447
+ init_submodule_tensors(child)
448
+
449
+ init_submodule_tensors(self)
450
+ self.rope.init_tensors()
451
+
452
+ # class embeddings and positional embeddings
453
+ init_scale = self.width**-0.5
454
+
455
+ if self.use_cls_token:
456
+ self.class_embedding = nn.Parameter(init_scale * torch.randn(self.width))
457
+
458
+ if self.use_abs_posemb:
459
+ self.posemb_grid_size = self.image_size // self.patch_size
460
+ self.positional_embedding = nn.Parameter(
461
+ init_scale
462
+ * torch.randn(
463
+ int(self.use_cls_token) + self.posemb_grid_size**2, self.width
464
+ )
465
+ )
466
+
467
+ if self.proj_dim is not None:
468
+ self.proj = nn.Parameter(
469
+ init_scale * torch.randn(self.width, self.proj_dim)
470
+ )
471
+
472
+
473
+ def load_ckpt(self, ckpt_path: str, verbose: bool = True):
474
+ _sd = torch.load(ckpt_path, weights_only=True)
475
+ if "state_dict" in _sd:
476
+ _sd = _sd["state_dict"]
477
+ elif "weights" in _sd:
478
+ _sd = _sd["weights"]
479
+
480
+ # for backwards compatibility
481
+ _sd = {k.replace("module.", ""): v for k, v in _sd.items()}
482
+ if any(k.startswith("visual.") for k in _sd):
483
+ _sd = {k.replace("visual.", ""): v for k, v in _sd.items() if "visual" in k}
484
+
485
+ m, u = self.load_state_dict(_sd, strict=False)
486
+
487
+ if verbose or (m or u):
488
+ logger.info(f"Missing keys for loading vision encoder: {m}")
489
+ logger.info(f"Unexpected keys for loading vision encoder: {u}")
490
+ print(f"Missing keys for loading vision encoder: {m}")
491
+ print(f"Unexpected keys for loading vision encoder: {u}")
492
+
493
+
494
+ def truncate(self, layer_idx: int):
495
+ """ Delete layers so the last layer is the given layer index. """
496
+ self.transformer.truncate(layer_idx)
497
+ self.layers = self.transformer.layers
498
+
499
+
500
+ @classmethod
501
+ def from_config(
502
+ cls,
503
+ name: str,
504
+ pretrained: bool = False,
505
+ checkpoint_path: Optional[str] = None,
506
+ **kwdargs
507
+ ):
508
+ if name not in PE_VISION_CONFIG:
509
+ raise RuntimeError(f"{name} not found in configs.")
510
+
511
+ args = asdict(PE_VISION_CONFIG[name])
512
+ args.update(kwdargs)
513
+
514
+ model = cls(**args)
515
+ if pretrained:
516
+ model.load_ckpt(fetch_pe_checkpoint(name, checkpoint_path))
517
+
518
+ return model
519
+
520
+ @classmethod
521
+ def available_configs(cls):
522
+ return list(PE_VISION_CONFIG.keys())
523
+
524
+
525
+ @torch.jit.ignore
526
+ def set_grad_checkpointing(self, enable=True):
527
+ self.transformer.set_grad_checkpointing(enable=enable)
528
+
529
+ def _sample_abs_posemb(self, grid_h: int, grid_w: int):
530
+ """Interpolates the absolute position embedding if necessary."""
531
+ if self.posemb_grid_size == grid_h and self.posemb_grid_size == grid_w:
532
+ return self.positional_embedding[None, ...]
533
+
534
+ pos_embed = self.positional_embedding
535
+ if self.use_cls_token:
536
+ cls_token_embed, pos_embed = pos_embed[:1], pos_embed[1:]
537
+
538
+ pos_embed = (
539
+ pos_embed.reshape(1, self.posemb_grid_size, self.posemb_grid_size, -1)
540
+ .permute(0, 3, 1, 2)
541
+ .contiguous()
542
+ )
543
+ pos_embed = F.interpolate(
544
+ pos_embed, size=(grid_h, grid_w), mode="bilinear", align_corners=False
545
+ )
546
+ pos_embed = pos_embed.permute(0, 2, 3, 1).reshape(-1, self.width).contiguous()
547
+
548
+ if self.use_cls_token:
549
+ pos_embed = torch.cat([cls_token_embed, pos_embed], dim=0)
550
+
551
+ return pos_embed[None, ...]
552
+
553
+ def _pool(self, x: torch.Tensor):
554
+ if self.pool_type == "tok":
555
+ return x[:, 0]
556
+ elif self.pool_type == "avg":
557
+ return x.mean(dim=1)
558
+ elif self.pool_type == "attn":
559
+ return self.attn_pool(x).squeeze(1)
560
+ elif self.pool_type == "none":
561
+ return x
562
+ else:
563
+ raise NotImplementedError
564
+
565
+ def forward_features(
566
+ self,
567
+ x: torch.Tensor,
568
+ norm: bool = False,
569
+ layer_idx: int = -1,
570
+ strip_cls_token: bool = False,
571
+ need_weights: bool = False, # Add need_weights flag
572
+ ):
573
+ batch, _, h, w = x.shape
574
+ grid_h, grid_w = h // self.patch_size, w // self.patch_size
575
+
576
+ x = self.conv1(x)
577
+ x = x.permute(0, 2, 3, 1).reshape(batch, -1, self.width)
578
+
579
+ if self.use_cls_token:
580
+ x = torch.cat(
581
+ [self.class_embedding.view(1, 1, -1).expand(batch, -1, -1), x],
582
+ dim=1,
583
+ )
584
+
585
+ if self.use_abs_posemb:
586
+ x = x + self._sample_abs_posemb(grid_h, grid_w)
587
+
588
+ if self.use_rope2d:
589
+ self.rope.update_grid(x.device, grid_h, grid_w)
590
+
591
+ x = self.ln_pre(x)
592
+
593
+ # Get output from the transformer
594
+ transformer_output = self.transformer(x, layer_idx=layer_idx, need_weights=need_weights)
595
+
596
+ attention_maps = None
597
+ if need_weights:
598
+ x, attention_maps = transformer_output
599
+ else:
600
+ x = transformer_output
601
+
602
+ if norm:
603
+ x = self.ln_post(x)
604
+
605
+ if strip_cls_token and self.use_cls_token:
606
+ x = x[:, 1:, :]
607
+
608
+ if need_weights:
609
+ return x, attention_maps # Return maps
610
+
611
+ return x
612
+
613
+ def forward(self, x: torch.Tensor, **kwargs):
614
+ x = self.forward_features(x, norm=True, **kwargs)
615
+ x = self._pool(x)
616
+
617
+ if self.proj_dim is not None:
618
+ x = x @ self.proj
619
+
620
+ return x
621
+
622
+
623
+
624
+ class TextTransformer(nn.Module):
625
+ def __init__(
626
+ self,
627
+ context_length: int = 72,
628
+ vocab_size: int = 49408,
629
+ width: int = 512,
630
+ heads: int = 8,
631
+ layers: int = 12,
632
+ mlp_ratio: float = 4.0,
633
+ ls_init_value: float = None,
634
+ output_dim: int = 1280,
635
+ no_causal_mask: bool = False,
636
+ pad_id: int = 0,
637
+ pool_type: str = "argmax",
638
+ proj_bias: bool = False,
639
+ act_layer: Callable = nn.GELU,
640
+ norm_layer: Callable = partial(nn.LayerNorm, eps=1e-5),
641
+ output_tokens: bool = False,
642
+ use_ln_post: bool = True,
643
+ ):
644
+ super().__init__()
645
+ assert pool_type in ("first", "last", "argmax", "none")
646
+ self.pool_type = pool_type
647
+ self.output_tokens = output_tokens
648
+ self.num_pos = self.context_length = context_length
649
+ self.vocab_size = vocab_size
650
+ self.width = width
651
+ self.output_dim = output_dim
652
+ self.heads = heads
653
+ self.pad_id = pad_id
654
+ self.layers = layers
655
+
656
+ self.token_embedding = nn.Embedding(vocab_size, width)
657
+ self.positional_embedding = nn.Parameter(torch.empty(self.num_pos, width))
658
+
659
+ self.transformer = Transformer(
660
+ width=width,
661
+ layers=layers,
662
+ heads=heads,
663
+ mlp_ratio=mlp_ratio,
664
+ ls_init_value=ls_init_value,
665
+ act_layer=act_layer,
666
+ norm_layer=norm_layer,
667
+ )
668
+
669
+ self.ln_final = norm_layer(width) if use_ln_post else nn.Identity()
670
+
671
+ if no_causal_mask:
672
+ self.attn_mask = None
673
+ else:
674
+ self.register_buffer(
675
+ "attn_mask", self.build_causal_mask(), persistent=False
676
+ )
677
+
678
+ if pool_type == "attn" or pool_type == "attn_eos":
679
+ self.attn_pool = AttentionPooling(
680
+ embed_dim=width,
681
+ num_heads=heads,
682
+ act_layer=act_layer,
683
+ norm_layer=norm_layer,
684
+ )
685
+ else: # argmax
686
+ self.attn_pool = None
687
+
688
+ if proj_bias:
689
+ self.text_projection = nn.Linear(width, output_dim)
690
+ else:
691
+ self.text_projection = nn.Parameter(torch.empty(width, output_dim))
692
+
693
+ def build_causal_mask(self):
694
+ # lazily create causal attention mask, with full attention between the tokens
695
+ # pytorch uses additive attention mask; fill with -inf
696
+ mask = torch.empty(self.num_pos, self.num_pos)
697
+ mask.fill_(float("-inf"))
698
+ mask.triu_(1) # zero out the lower diagonal
699
+ return mask
700
+
701
+ def load_ckpt(self, ckpt_path: str, verbose: bool = True):
702
+ _sd = torch.load(ckpt_path, weights_only=True)
703
+ if "state_dict" in _sd:
704
+ _sd = _sd["state_dict"]
705
+ elif "weights" in _sd:
706
+ _sd = _sd["weights"]
707
+
708
+ _sd = {k.replace("module.", ""): v for k, v in _sd.items()}
709
+
710
+ m, u = self.load_state_dict(_sd, strict=False)
711
+
712
+ if verbose or (m or u):
713
+ logger.info(f"Missing keys for loading model: {m}")
714
+ logger.info(f"Unexpected keys for loading model: {u}")
715
+ print(f"Missing keys for loading model: {m}")
716
+ print(f"Unexpected keys for loading model: {u}")
717
+
718
+ def build_cls_mask(self, text):
719
+ cls_mask = (text != self.pad_id).unsqueeze(1)
720
+ cls_mask = F.pad(cls_mask, (1, 0, cls_mask.shape[2], 0), value=True)
721
+ additive_mask = torch.empty(cls_mask.shape, device=cls_mask.device)
722
+ additive_mask.fill_(0)
723
+ additive_mask.masked_fill_(~cls_mask, float("-inf"))
724
+ additive_mask = torch.repeat_interleave(additive_mask, self.heads, 0)
725
+ return additive_mask
726
+
727
+ def text_global_pool(
728
+ self, x, text: Optional[torch.Tensor] = None, pool_type: str = "argmax"
729
+ ):
730
+ if pool_type == "first":
731
+ pooled, tokens = x[:, 0], x[:, 1:]
732
+ elif pool_type == "last":
733
+ pooled, tokens = x[:, -1], x[:, :-1]
734
+ elif pool_type == "argmax":
735
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
736
+ assert text is not None
737
+ pooled, tokens = x[torch.arange(x.shape[0]), text.argmax(dim=-1)], x
738
+ else:
739
+ pooled = tokens = x
740
+
741
+ return pooled, tokens
742
+
743
+ def forward(self, text):
744
+ seq_len = text.shape[1]
745
+ x = self.token_embedding(
746
+ text
747
+ )
748
+ attn_mask = self.attn_mask
749
+ if attn_mask is not None:
750
+ attn_mask = attn_mask[:seq_len, :seq_len]
751
+
752
+ x = x + self.positional_embedding[:seq_len]
753
+ x = self.transformer(x, attn_mask=attn_mask)
754
+
755
+ x = self.ln_final(x)
756
+ pooled, tokens = self.text_global_pool(x, text, pool_type=self.pool_type)
757
+
758
+ if self.text_projection is not None:
759
+ if isinstance(self.text_projection, nn.Linear):
760
+ pooled = self.text_projection(pooled)
761
+ else:
762
+ pooled = pooled @ self.text_projection
763
+
764
+ if self.output_tokens:
765
+ return pooled, tokens
766
+
767
+ return pooled
768
+
769
+
770
+
771
+
772
+ class CLIP(TextTransformer):
773
+ def __init__(
774
+ self,
775
+ vision_cfg: PEConfig,
776
+ text_cfg: PETextConfig,
777
+ init_logit_scale: float = np.log(1 / 0.07)
778
+ ):
779
+ super(CLIP, self).__init__(**asdict(text_cfg))
780
+ self.visual = VisionTransformer(**asdict(vision_cfg))
781
+ self.image_size = self.visual.image_size # For ease of use
782
+ self.logit_scale = nn.Parameter(torch.ones([]) * init_logit_scale)
783
+
784
+
785
+ def encode_image(self, image, normalize: bool = False):
786
+ x = self.visual(image)
787
+ return F.normalize(x, dim=-1) if normalize else x
788
+
789
+ def encode_video(self, video, normalize: bool = False): # b n c h w
790
+ b, n, c, h, w = video.shape
791
+ frms = video.reshape(b * n, c, h, w)
792
+ frm_feats = self.encode_image(frms, normalize=normalize)
793
+ video_feats = frm_feats.reshape(b, n, -1)
794
+ video_feats = video_feats.mean(dim=1)
795
+ return video_feats
796
+
797
+ def encode_text(self, text, normalize: bool = False):
798
+ x = super().forward(text)
799
+ return F.normalize(x, dim=-1) if normalize else x
800
+
801
+ def forward(
802
+ self,
803
+ image: Optional[torch.Tensor] = None,
804
+ text: Optional[torch.Tensor] = None,
805
+ ):
806
+ image_features = (
807
+ self.encode_image(image, normalize=True) if image is not None else None
808
+ )
809
+ text_features = (
810
+ self.encode_text(text, normalize=True) if text is not None else None
811
+ )
812
+ return image_features, text_features, self.logit_scale.exp()
813
+
814
+
815
+ @classmethod
816
+ def from_config(
817
+ cls,
818
+ name: str,
819
+ pretrained: bool = False,
820
+ checkpoint_path: Optional[str] = None # To load your own
821
+ ):
822
+ if name not in PE_VISION_CONFIG or name not in PE_TEXT_CONFIG:
823
+ raise RuntimeError(f"{name} not found in configs.")
824
+
825
+ model = cls(PE_VISION_CONFIG[name], PE_TEXT_CONFIG[name])
826
+ if pretrained:
827
+ model.load_ckpt(fetch_pe_checkpoint(name, checkpoint_path))
828
+
829
+ return model
830
+
831
+ @classmethod
832
+ def available_configs(cls):
833
+ return [k for k in PE_VISION_CONFIG if k in PE_TEXT_CONFIG]
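
A minimal usage sketch for the CLIP wrapper above (illustrative only, not part of this commit; the config name below is an assumption and should be replaced with an entry from CLIP.available_configs()):

from core.vision_encoder.pe import CLIP
from core.vision_encoder.transforms import get_image_transform_fix

name = "PE-Core-L14-336"                      # assumed config name; check CLIP.available_configs()
model = CLIP.from_config(name, pretrained=True).eval()
transform = get_image_transform_fix(model.image_size)

# image: a PIL.Image, text: a pre-tokenized LongTensor of shape (batch, context_length)
# img_feat = model.encode_image(transform(image).unsqueeze(0), normalize=True)
# txt_feat = model.encode_text(text, normalize=True)
# logits = model.logit_scale.exp() * img_feat @ txt_feat.T
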
core/vision_encoder/rope.py ADDED
@@ -0,0 +1,347 @@
1
+ from math import pi
2
+ from typing import Literal, Optional, Union
3
+
4
+ import torch
5
+ from einops import rearrange, repeat
6
+ from torch import Tensor, broadcast_tensors, einsum, nn
7
+ from torch.amp import autocast
8
+ from torch.nn import Module
9
+
10
+ # helper functions
11
+
12
+
13
+ def exists(val):
14
+ return val is not None
15
+
16
+
17
+ def default(val, d):
18
+ return val if exists(val) else d
19
+
20
+
21
+ # broadcat, as tortoise-tts was using it
22
+
23
+
24
+ def broadcat(tensors, dim=-1):
25
+ broadcasted_tensors = broadcast_tensors(*tensors)
26
+ return torch.cat(broadcasted_tensors, dim=dim)
27
+
28
+
29
+ # rotary embedding helper functions
30
+
31
+
32
+ def rotate_half(x):
33
+ x = rearrange(x, "... (d r) -> ... d r", r=2)
34
+ x1, x2 = x.unbind(dim=-1)
35
+ x = torch.stack((-x2, x1), dim=-1)
36
+ return rearrange(x, "... d r -> ... (d r)")
37
+
38
+
39
+ @autocast("cuda", enabled=False)
40
+ def apply_rotary_emb(freqs, t, start_index=0, scale=1.0, seq_dim=-2):
41
+ dtype = t.dtype
42
+
43
+ if t.ndim == 3:
44
+ seq_len = t.shape[seq_dim]
45
+ freqs = freqs[-seq_len:]
46
+
47
+ rot_dim = freqs.shape[-1]
48
+ end_index = start_index + rot_dim
49
+
50
+ assert (
51
+ rot_dim <= t.shape[-1]
52
+ ), f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}"
53
+
54
+ t_left, t, t_right = (
55
+ t[..., :start_index],
56
+ t[..., start_index:end_index],
57
+ t[..., end_index:],
58
+ )
59
+ t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
60
+ out = torch.cat((t_left, t, t_right), dim=-1)
61
+
62
+ return out.type(dtype)
63
+
64
+
65
+ # learned rotation helpers
66
+
67
+
68
+ def apply_learned_rotations(rotations, t, start_index=0, freq_ranges=None):
69
+ if exists(freq_ranges):
70
+ rotations = einsum("..., f -> ... f", rotations, freq_ranges)
71
+ rotations = rearrange(rotations, "... r f -> ... (r f)")
72
+
73
+ rotations = repeat(rotations, "... n -> ... (n r)", r=2)
74
+ return apply_rotary_emb(rotations, t, start_index=start_index)
75
+
76
+
77
+ # classes
78
+
79
+
80
+ class RotaryEmbedding(Module):
81
+ def __init__(
82
+ self,
83
+ dim,
84
+ custom_freqs: Optional[Tensor] = None,
85
+ freqs_for: Union[
86
+ Literal["lang"], Literal["pixel"], Literal["constant"]
87
+ ] = "lang",
88
+ theta=10000,
89
+ max_freq=10,
90
+ num_freqs=1,
91
+ learned_freq=False,
92
+ use_xpos=False,
93
+ xpos_scale_base=512,
94
+ interpolate_factor=1.0,
95
+ theta_rescale_factor=1.0,
96
+ seq_before_head_dim=False,
97
+ cache_if_possible=True,
98
+ ):
99
+ super().__init__()
100
+ # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
101
+ # has some connection to NTK literature
102
+ # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
103
+
104
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
105
+
106
+ self.freqs_for = freqs_for
107
+
108
+ if exists(custom_freqs):
109
+ freqs = custom_freqs
110
+ elif freqs_for == "lang":
111
+ freqs = 1.0 / (
112
+ theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
113
+ )
114
+ elif freqs_for == "pixel":
115
+ freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi
116
+ elif freqs_for == "constant":
117
+ freqs = torch.ones(num_freqs).float()
118
+
119
+ self.cache_if_possible = cache_if_possible
120
+
121
+ self.tmp_store("cached_freqs", None)
122
+ self.tmp_store("cached_scales", None)
123
+
124
+ self.freqs = nn.Parameter(freqs, requires_grad=learned_freq)
125
+
126
+ self.learned_freq = learned_freq
127
+
128
+ # dummy for device
129
+
130
+ self.tmp_store("dummy", torch.tensor(0))
131
+
132
+ # default sequence dimension
133
+
134
+ self.seq_before_head_dim = seq_before_head_dim
135
+ self.default_seq_dim = -3 if seq_before_head_dim else -2
136
+
137
+ # interpolation factors
138
+
139
+ assert interpolate_factor >= 1.0
140
+ self.interpolate_factor = interpolate_factor
141
+
142
+ # xpos
143
+
144
+ self.use_xpos = use_xpos
145
+ if not use_xpos:
146
+ self.tmp_store("scale", None)
147
+ return
148
+
149
+ scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
150
+
151
+ self.scale_base = xpos_scale_base
152
+ self.tmp_store("scale", scale)
153
+
154
+ # add apply_rotary_emb as static method
155
+
156
+ self.apply_rotary_emb = staticmethod(apply_rotary_emb)
157
+
158
+ @property
159
+ def device(self):
160
+ return self.dummy.device
161
+
162
+ def tmp_store(self, key, value):
163
+ self.register_buffer(key, value, persistent=False)
164
+
165
+ def get_seq_pos(self, seq_len, device, dtype, offset=0):
166
+ return (
167
+ torch.arange(seq_len, device=device, dtype=dtype) + offset
168
+ ) / self.interpolate_factor
169
+
170
+ def rotate_queries_or_keys(self, t, seq_dim=None, offset=0):
171
+ seq_dim = default(seq_dim, self.default_seq_dim)
172
+
173
+ assert (
174
+ not self.use_xpos
175
+ ), "you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings"
176
+
177
+ device, dtype, seq_len = t.device, t.dtype, t.shape[seq_dim]
178
+
179
+ freqs = self.forward(
180
+ self.get_seq_pos(seq_len, device=device, dtype=dtype, offset=offset),
181
+ seq_len=seq_len,
182
+ offset=offset,
183
+ )
184
+
185
+ if seq_dim == -3:
186
+ freqs = rearrange(freqs, "n d -> n 1 d")
187
+
188
+ return apply_rotary_emb(freqs, t, seq_dim=seq_dim)
189
+
190
+ def rotate_queries_with_cached_keys(self, q, k, seq_dim=None, offset=0):
191
+ seq_dim = default(seq_dim, self.default_seq_dim)
192
+
193
+ q_len, k_len = q.shape[seq_dim], k.shape[seq_dim]
194
+ assert q_len <= k_len
195
+
196
+ rotated_q = self.rotate_queries_or_keys(
197
+ q, seq_dim=seq_dim, offset=k_len - q_len + offset
198
+ )
199
+ rotated_k = self.rotate_queries_or_keys(k, seq_dim=seq_dim, offset=offset)
200
+
201
+ rotated_q = rotated_q.type(q.dtype)
202
+ rotated_k = rotated_k.type(k.dtype)
203
+
204
+ return rotated_q, rotated_k
205
+
206
+ def rotate_queries_and_keys(self, q, k, seq_dim=None):
207
+ seq_dim = default(seq_dim, self.default_seq_dim)
208
+
209
+ assert self.use_xpos
210
+ device, dtype, seq_len = q.device, q.dtype, q.shape[seq_dim]
211
+
212
+ seq = self.get_seq_pos(seq_len, dtype=dtype, device=device)
213
+
214
+ freqs = self.forward(seq, seq_len=seq_len)
215
+ scale = self.get_scale(seq, seq_len=seq_len).to(dtype)
216
+
217
+ if seq_dim == -3:
218
+ freqs = rearrange(freqs, "n d -> n 1 d")
219
+ scale = rearrange(scale, "n d -> n 1 d")
220
+
221
+ rotated_q = apply_rotary_emb(freqs, q, scale=scale, seq_dim=seq_dim)
222
+ rotated_k = apply_rotary_emb(freqs, k, scale=scale**-1, seq_dim=seq_dim)
223
+
224
+ rotated_q = rotated_q.type(q.dtype)
225
+ rotated_k = rotated_k.type(k.dtype)
226
+
227
+ return rotated_q, rotated_k
228
+
229
+ def get_scale(self, t: Tensor, seq_len: Optional[int] = None, offset=0):
230
+ assert self.use_xpos
231
+
232
+ should_cache = self.cache_if_possible and exists(seq_len)
233
+
234
+ if (
235
+ should_cache
236
+ and exists(self.cached_scales)
237
+ and (seq_len + offset) <= self.cached_scales.shape[0]
238
+ ):
239
+ return self.cached_scales[offset : (offset + seq_len)]
240
+
241
+ scale = 1.0
242
+ if self.use_xpos:
243
+ power = (t - len(t) // 2) / self.scale_base
244
+ scale = self.scale ** rearrange(power, "n -> n 1")
245
+ scale = torch.cat((scale, scale), dim=-1)
246
+
247
+ if should_cache:
248
+ self.tmp_store("cached_scales", scale)
249
+
250
+ return scale
251
+
252
+ def get_axial_freqs(self, *dims):
253
+ Colon = slice(None)
254
+ all_freqs = []
255
+
256
+ for ind, dim in enumerate(dims):
257
+ if self.freqs_for == "pixel":
258
+ pos = torch.linspace(-1, 1, steps=dim, device=self.device)
259
+ else:
260
+ pos = torch.arange(dim, device=self.device)
261
+
262
+ freqs = self.forward(pos, seq_len=dim)
263
+
264
+ all_axis = [None] * len(dims)
265
+ all_axis[ind] = Colon
266
+
267
+ new_axis_slice = (Ellipsis, *all_axis, Colon)
268
+ all_freqs.append(freqs[new_axis_slice])
269
+
270
+ all_freqs = broadcast_tensors(*all_freqs)
271
+ return torch.cat(all_freqs, dim=-1)
272
+
273
+ @autocast("cuda", enabled=False)
274
+ def forward(self, t: Tensor, seq_len=None, offset=0):
275
+ should_cache = (
276
+ self.cache_if_possible
277
+ and not self.learned_freq
278
+ and exists(seq_len)
279
+ and self.freqs_for != "pixel"
280
+ )
281
+
282
+ if (
283
+ should_cache
284
+ and exists(self.cached_freqs)
285
+ and (offset + seq_len) <= self.cached_freqs.shape[0]
286
+ ):
287
+ return self.cached_freqs[offset : (offset + seq_len)].detach()
288
+
289
+ freqs = self.freqs
290
+
291
+ freqs = einsum("..., f -> ... f", t.type(freqs.dtype), freqs)
292
+ freqs = repeat(freqs, "... n -> ... (n r)", r=2)
293
+
294
+ if should_cache:
295
+ self.tmp_store("cached_freqs", freqs.detach())
296
+
297
+ return freqs
298
+
299
+
300
+
301
+
302
+
303
+ class Rope2D:
304
+ """ Helper class to apply RoPE2D as well as interpolate on the fly. """
305
+
306
+ def __init__(self, dim, use_cls_token=False):
307
+ self.dim = dim
308
+ self.use_cls_token = use_cls_token
309
+ self.grid_size = None
310
+ self.freq = None
311
+
312
+ def init_tensors(self):
313
+ self.rope = RotaryEmbedding(self.dim // 2)
314
+
315
+ def update_grid(self, device, grid_h, grid_w):
316
+ if self.grid_size != (grid_h, grid_w):
317
+ self.grid_size = (grid_h, grid_w)
318
+
319
+ self.rope = self.rope.to(device)
320
+
321
+ if self.use_cls_token:
322
+ # +1 to leave space for the cls token to be (0, 0)
323
+ grid_y_range = torch.arange(grid_h, device=device) + 1
324
+ grid_x_range = torch.arange(grid_w, device=device) + 1
325
+ else:
326
+ grid_y_range = torch.arange(grid_h, device=device)
327
+ grid_x_range = torch.arange(grid_w, device=device)
328
+
329
+ freqs_y = self.rope(grid_y_range)[:, None].expand(grid_h, grid_w, -1)
330
+ freqs_x = self.rope(grid_x_range)[None, :].expand(grid_h, grid_w, -1)
331
+ freq = torch.cat([freqs_x, freqs_y], dim=-1).reshape(grid_h * grid_w, -1)
332
+
333
+ if self.use_cls_token:
334
+ freq = torch.cat(
335
+ [torch.zeros(1, freq.shape[-1], device=device), freq], dim=0
336
+ )
337
+
338
+ self.freq = freq[None, ...]
339
+
340
+ self.freq = self.freq.to(device)
341
+
342
+ def __call__(self, q, k):
343
+ # batch, heads, seq, dim = q.shape
344
+ q = apply_rotary_emb(self.freq[:, None, :, :], q)
345
+ k = apply_rotary_emb(self.freq[:, None, :, :], k)
346
+
347
+ return q, k
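
A minimal sketch of how Rope2D is driven by the vision encoder above (illustrative only, not part of this commit; shapes follow the (batch, heads, seq, dim) comment in __call__):

import torch
from core.vision_encoder.rope import Rope2D

head_dim, grid_h, grid_w = 64, 16, 16
rope = Rope2D(head_dim, use_cls_token=True)
rope.init_tensors()
rope.update_grid(torch.device("cpu"), grid_h, grid_w)

seq = grid_h * grid_w + 1                     # +1 for the cls token, which sits at position (0, 0)
q = torch.randn(2, 8, seq, head_dim)
k = torch.randn(2, 8, seq, head_dim)
q, k = rope(q, k)                             # rotary embeddings applied per 2-D patch position
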
core/vision_encoder/transforms.py ADDED
@@ -0,0 +1,86 @@
1
+ import torchvision.transforms as T
2
+
3
+
4
+
5
+ def get_image_transform(
6
+ image_size: int,
7
+ center_crop: bool = False,
8
+ interpolation: T.InterpolationMode = T.InterpolationMode.BILINEAR # We used bilinear during training
9
+ ):
10
+ if center_crop:
11
+ crop = [
12
+ T.Resize(image_size, interpolation=interpolation),
13
+ T.CenterCrop(image_size)
14
+ ]
15
+ else:
16
+ # "Squash": most versatile
17
+ crop = [
18
+ T.Resize((image_size, image_size), interpolation=interpolation)
19
+ ]
20
+
21
+ return T.Compose(crop + [
22
+ T.Lambda(lambda x: x.convert("RGB")),
23
+ T.ToTensor(),
24
+ T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5], inplace=True),
25
+ ])
26
+
27
+
28
+
29
+
30
+
31
+ import torchvision.transforms as T
32
+
33
+ from PIL import Image
34
+
35
+ def get_image_transform(
36
+ image_size: int,
37
+ center_crop: bool = False,
38
+ interpolation: T.InterpolationMode = T.InterpolationMode.BILINEAR # We used bilinear during training
39
+ ):
40
+ if center_crop:
41
+ crop = [
42
+ T.Resize(image_size, interpolation=interpolation),
43
+ T.CenterCrop(image_size)
44
+ ]
45
+ else:
46
+ # "Squash": most versatile
47
+ crop = [
48
+ T.Resize((image_size, image_size), interpolation=interpolation)
49
+ ]
50
+
51
+ return T.Compose(crop + [
52
+ T.Lambda(lambda x: x.convert("RGB")),
53
+ T.ToTensor(),
54
+ T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5], inplace=True),
55
+ ])
56
+
57
+
58
+ def _convert_to_rgb(image: Image.Image) -> Image.Image:
59
+ """Converts a PIL Image to RGB format."""
60
+ return image.convert("RGB")
61
+
62
+
63
+ def get_image_transform_fix(
64
+ image_size: int,
65
+ center_crop: bool = False,
66
+ interpolation: T.InterpolationMode = T.InterpolationMode.BILINEAR
67
+ ):
68
+ if center_crop:
69
+ crop = [
70
+ T.Resize(image_size, interpolation=interpolation),
71
+ T.CenterCrop(image_size)
72
+ ]
73
+ else:
74
+ # "Squash": most versatile
75
+ crop = [
76
+ T.Resize((image_size, image_size), interpolation=interpolation)
77
+ ]
78
+
79
+ return T.Compose(crop + [
80
+ T.Lambda(_convert_to_rgb),
81
+ T.ToTensor(),
82
+ T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5], inplace=True),
83
+ ])
84
+
85
+ def get_text_tokenizer(context_length: int):
86
+ return SimpleTokenizer(context_length=context_length)
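
Note that get_image_transform is defined twice verbatim above; get_image_transform_fix swaps the inline lambda for the module-level _convert_to_rgb, which keeps the transform picklable for multi-worker DataLoaders (this rationale is inferred from the "_fix" naming). SimpleTokenizer is referenced by get_text_tokenizer but never imported in this file; it is presumably provided by the tokenizer module alongside this one. A minimal usage sketch (illustrative only, not part of this commit):

from PIL import Image
from core.vision_encoder.transforms import get_image_transform_fix

transform = get_image_transform_fix(image_size=448, center_crop=False)
img = Image.new("RGB", (640, 480))
tensor = transform(img)   # tensor of shape (3, 448, 448), normalized to [-1, 1]
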
core/vision_projector/base.py ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from abc import ABC, abstractmethod
4
+
5
+ from torch import nn
6
+
7
+
8
+ class BaseProjector(nn.Module, ABC):
9
+ def __init__(self):
10
+ super().__init__()
11
+ self.adaptive_avg_pool = None
12
+
13
+ @abstractmethod
14
+ def setup_projector(self):
15
+ """
16
+ Setup the vision_projector attribute in subclasses.
17
+ """
18
+ pass
19
+
20
+ def forward(self, x):
21
+ x = x.permute(1, 0, 2) # NLD -> LND
22
+ x = self.projector(x)
23
+ x = x.permute(1, 0, 2)
24
+ if self.adaptive_avg_pool is not None:
25
+ x = self.adaptive_avg_pool(x)
26
+ return x
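
A minimal subclass sketch (illustrative only, not part of this commit) showing the contract BaseProjector expects: setup_projector defines self.projector, and forward applies it token-wise with an optional pooling step:

import torch
from torch import nn
from core.vision_projector.base import BaseProjector

class LinearProjector(BaseProjector):
    def __init__(self, in_dim: int = 1024, out_dim: int = 4096):
        super().__init__()
        self.in_dim, self.out_dim = in_dim, out_dim
        self.setup_projector()

    def setup_projector(self):
        self.projector = nn.Linear(self.in_dim, self.out_dim)

tokens = torch.randn(2, 256, 1024)            # (batch, vision tokens, vision width)
print(LinearProjector()(tokens).shape)        # torch.Size([2, 256, 4096])
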
core/vision_projector/mlp.py ADDED
@@ -0,0 +1,62 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from torch import nn
7
+
8
+ from core.utils import get_init_fn
9
+ from core.vision_projector.base import BaseProjector
10
+
11
+
12
+ class AdaptiveAvgPooling(nn.Module):
13
+ def __init__(self, pooling_ratio=2):
14
+ super(AdaptiveAvgPooling, self).__init__()
15
+ self.pooling_ratio = pooling_ratio
16
+
17
+ def forward(self, x):
18
+ b, num_tokens, c = x.shape
19
+ h = int(math.sqrt(num_tokens))
20
+ assert h * h == num_tokens
21
+
22
+ shape = (h // self.pooling_ratio, h // self.pooling_ratio)
23
+ x = x.permute(0, 2, 1).reshape(b, -1, h, h)
24
+ x = F.adaptive_avg_pool2d(x, shape)
25
+ x = x.flatten(2).transpose(1, 2)
26
+
27
+ return x
28
+
29
+
30
+ class MLPProjector(BaseProjector):
31
+ def __init__(self, args):
32
+ super().__init__()
33
+ self.setup_projector(args)
34
+ self.pooling_ratio = args.pooling_ratio
35
+ self.adaptive_avg_pool = AdaptiveAvgPooling(pooling_ratio=args.pooling_ratio)
36
+ self.remove_vision_class_token = args.remove_vision_class_token
37
+
38
+ def init_tensors(self):
39
+ self.init_method(self.projector[0].weight)
40
+ self.init_method(self.projector[0].bias)
41
+ self.init_method(self.projector[2].weight)
42
+ self.init_method(self.projector[2].bias)
43
+
44
+ def setup_projector(self, args):
45
+ self.init_method = get_init_fn(args.mlp_init, args.dim, init_depth=None)
46
+ input_size = args.vision_model["width"]
47
+ output_size = args.dim
48
+ self.projector = nn.Sequential(
49
+ nn.Linear(
50
+ in_features=input_size,
51
+ out_features=output_size,
52
+ bias=True,
53
+ dtype=torch.get_default_dtype(),
54
+ ),
55
+ nn.GELU(),
56
+ nn.Linear(
57
+ in_features=output_size,
58
+ out_features=output_size,
59
+ bias=True,
60
+ dtype=torch.get_default_dtype(),
61
+ ),
62
+ )
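
A quick shape check for AdaptiveAvgPooling above (illustrative only, not part of this commit): a square grid of vision tokens is average-pooled by pooling_ratio along each side, so 16x16 = 256 tokens become 8x8 = 64:

import torch
from core.vision_projector.mlp import AdaptiveAvgPooling

tokens = torch.randn(2, 256, 1024)            # (batch, 16x16 tokens, width)
pool = AdaptiveAvgPooling(pooling_ratio=2)
print(pool(tokens).shape)                     # torch.Size([2, 64, 1024])
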
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch
2
+ torchvision
3
+ opencv-python-headless
4
+ Pillow
5
+ numpy
6
+ einops
7
+ peft
8
+ python-dotenv
9
+ tqdm
10
+ gradio
setup.py ADDED
@@ -0,0 +1,7 @@
1
+ from setuptools import setup, find_packages
2
+
3
+ setup(
4
+ name='pe_adaptation',
5
+ version='0.1',
6
+ packages=find_packages(),
7
+ )
src/model.py ADDED
@@ -0,0 +1,809 @@
1
+ from einops import rearrange
2
+ from torch.nn import functional as F
3
+ from dotenv import load_dotenv
4
+ import os
5
+ import sys
6
+
7
+
8
+
9
+ from core.vision_encoder.pe import SelfAttention, AttentionPooling
10
+ import torch.nn as nn
11
+ from typing import Dict, List
12
+ from utils.task_config import Task
13
+ import torch
14
+ from typing import Optional, Union, Mapping, OrderedDict
15
+ from src.dlora import *
16
+ from peft import PeftModel, get_peft_model, LoraConfig
17
+ DROPOUT_P = 0.5
18
+
19
+
20
+ class MTLModel(nn.Module):
21
+ def __init__(self, backbone, tasks: List[Task],
22
+ rank: int = 64,
23
+ use_lora: bool = True,
24
+ truncate_idx: int = 22,
25
+ last_lora_layers: int = -99,
26
+ lora_dropout: float = 0.5,
27
+ use_mtl_lora :bool = False,
28
+ use_deep_head:bool = False,
29
+ use_batch_norm:bool = True,
30
+ use_mtl_attn_pool: bool = True,
31
+ use_dora:bool = True):
32
+
33
+ super().__init__()
34
+ self.use_mtl_attn_pool=use_mtl_attn_pool
35
+ self.tasks = tasks
36
+ self.use_mtl_lora = use_mtl_lora
37
+ self.use_deep_head= use_deep_head
38
+ self.use_lora = use_lora
39
+ self.use_mtlora = use_mtl_lora
40
+ output_dim = backbone.output_dim
41
+ # log_vars is for uncertainty weighting
42
+ self.log_vars = nn.Parameter(torch.zeros(len(tasks)))
43
+ task_names = [task.name for task in tasks]
44
+ self.backbone = backbone
45
+ width = backbone.width
46
+ heads = backbone.heads
47
+ rope = backbone.rope
48
+
49
+ if self.use_mtl_lora:
50
+ # save the last residual attention block; its weights seed the new MTL-LoRA version
51
+ orig_last_block = backbone.transformer.resblocks[-1]
52
+ self.ln_post = backbone.ln_post
53
+
54
+ # save the attention pooling block; its weights seed the task-specific attention pooling layers
55
+ orig_attn_pool = backbone.attn_pool.to('cuda')
56
+
57
+ self.backbone.truncate(layer_idx=truncate_idx) # the 23rd block becomes the last one (its index is 22)
58
+
59
+ # mtl block that produces t-task specific features maps, plus a shared one
60
+ self.mtl_layer = MTLoRAResidualAttentionBlock(
61
+ d_model=width,
62
+ n_head=heads,
63
+ rope=rope,
64
+ r={'shared': rank, **{name: rank for name in task_names}},
65
+ tasks=task_names,
66
+ shared_mode='matrix' ,
67
+ lora_shared_scale=0.0 # We do not use the shared matrix, so its scale is set to 0
68
+ )
69
+
70
+
71
+ self.mtl_layer.load_from_original_block(orig_last_block)
72
+ print("MTL-LoRA final block created and initialized from pretrained weights.")
73
+
74
+
75
+ if self.use_mtl_attn_pool:
76
+ self.attn_pool = MTLoRAAttentionPooling(
77
+ embed_dim=width,
78
+ num_heads=8,
79
+ tasks=task_names,
80
+ r={'shared': rank, **{name: rank for name in task_names}},
81
+ lora_dropout=lora_dropout,
82
+ lora_task_scale=1.0,
83
+ lora_shared_scale=0.0
84
+ )
85
+ self.attn_pool.load_from_original(orig_attn_pool)
86
+ else:
87
+ self.task_specific_attn_pool = nn.ModuleDict({
88
+ task.name: AttentionPooling(embed_dim=width, num_heads=8)
89
+ for task in self.tasks
90
+ })
91
+ for task in self.tasks:
92
+ self.task_specific_attn_pool[task.name].load_state_dict(orig_attn_pool.state_dict())
93
+ print("Task-specific Attention Pooling layers created and initialized.")
94
+
95
+
96
+ del self.backbone.attn_pool
97
+
98
+
99
+
100
+ if use_lora:
101
+ # You can modify this list if you want to target only attention layers or mlp layers
102
+ target_layers = ["attn.in_proj", "attn.out_proj", "mlp.c_fc", "mlp.c_proj"]
103
+ target_modules = []
104
+ for name, param in self.backbone.named_modules():
105
+ if not isinstance(param, nn.Linear):
106
+ continue
107
+ is_target_layer = any(s in name for s in target_layers)
108
+ if is_target_layer:
109
+ if "attn_pool" in name:
110
+ target_modules.append(name)
111
+ elif "transformer.resblocks" in name:
112
+ layer_idx = int(name.split('.')[2])
113
+ if layer_idx >= last_lora_layers:
114
+ target_modules.append(name)
115
+
116
+ lora_config = LoraConfig(
117
+ r=rank,
118
+ lora_alpha=rank,
119
+ target_modules= target_modules,
120
+ use_dora=use_dora,
121
+ lora_dropout=lora_dropout,
122
+ bias = "none"
123
+ )
124
+
125
+ self.backbone = get_peft_model(self.backbone,lora_config)
126
+ print("PEFT LoRA module added")
127
+
128
+
129
+ if self.use_deep_head == False:
130
+ self.prediction_layers = nn.ModuleDict({
131
+ task.name: nn.Sequential(
132
+ nn.BatchNorm1d(backbone.output_dim) if use_batch_norm else nn.Identity(),
133
+ nn.Dropout(p=DROPOUT_P),
134
+ nn.Linear( backbone.output_dim, len(task.class_labels))
135
+ )
136
+ for task in self.tasks
137
+ })
138
+ print("Task-specific prediction heads created.")
139
+ else:
140
+ self.prediction_layers = nn.ModuleDict({
141
+ task.name: nn.Sequential(
142
+ nn.BatchNorm1d(backbone.output_dim) if use_batch_norm else nn.Identity(),
143
+ nn.Dropout(p=DROPOUT_P),
144
+ nn.Linear(backbone.output_dim, backbone.output_dim),
145
+ nn.GELU(),
146
+ nn.Linear(backbone.output_dim, len(task.class_labels)),
147
+ )
148
+ for task in self.tasks
149
+ })
150
+ print("Task-specific prediction deep-heads created.")
151
+
152
+
153
+ self.backbone.del_muda()
154
+
155
+
156
+
157
+ def enable_gradient_checkpointing(self):
158
+ """Call this method after setting up parameter requires_grad"""
159
+ backbone_has_trainable = any(param.requires_grad for param in self.backbone.parameters())
160
+ if backbone_has_trainable:
161
+ self.backbone.set_grad_checkpointing()
162
+ print("Gradient checkpointing enabled for backbone (has trainable parameters)")
163
+ else:
164
+ print("Gradient checkpointing not enabled - backbone has no trainable parameters")
165
+
166
+ def forward(self, x: torch.Tensor):
167
+ if self.use_mtl_lora:
168
+ return self._forward_mtl_block(x)
169
+ else:
170
+ return self._forward_shared(x)
171
+
172
+ def _forward_shared(self, x: torch.Tensor):
173
+ logits = {}
174
+
175
+ #if self.attention_specific_pool == True:
176
+ # features = self.backbone.forward_features(x, norm=True, strip_cls_token=False)
177
+ # for task in self.tasks:
178
+ #
179
+ # pooled_feat = self.task_specific_attn_pool[task_name](features)
180
+ # pooled_feat = pooled_feat.squeeze(1)
181
+ # logits[task_name] = self.prediction_layers[task_name](pooled_feat)
182
+ #else:
183
+ features = self.backbone(x)
184
+ # print(features.shape)
185
+ for task in self.tasks:
186
+ logits[task.name] = self.prediction_layers[task.name](features)
187
+
188
+
189
+ return logits
190
+
191
+ def _forward_mtl_block(self, x: torch.Tensor, return_feat=False, feat_to_return="None"):
192
+ # Shared feature map from the backbone
193
+ # norm=False, because normalization is "trained" on the feature map of the output of the last ResidualAttentionBlock
194
+ # so we will normalize the task specific feature map, instead of the shared one
195
+ # strip_cls_token=False, because in the PE paper it has been shown to be beneficial to keep it
196
+ features = self.backbone.forward_features(x, norm=False, strip_cls_token=False)
197
+
198
+ # Equal for each task, as our mtl layer follows a task-agnostic layer
199
+ task_features_input = {task.name: features for task in self.tasks}
200
+
201
+ # Also returns a shared feature map, which is discarded.
202
+ # task_features is a dictionary: the key is the task name, and the value is a tensor of shape (batch_size, n_tokens, d_model)
203
+ # representing that task's feature map
204
+ _, task_features = self.mtl_layer(features, x_tasks=task_features_input)
205
+
206
+ normalized_task_features = {
207
+ task.name: self.ln_post(task_features[task.name])
208
+ for task in self.tasks
209
+ }
210
+
211
+ if self.use_mtl_attn_pool:
212
+ pooled_features = self.attn_pool(normalized_task_features)
213
+ else:
214
+ pooled_features = {}
215
+ for task in self.tasks:
216
+ feat = normalized_task_features[task.name]
217
+ pooled_features[task.name] = self.task_specific_attn_pool[task.name](feat)
218
+
219
+ # this stuff is for pca/tsne visualization
220
+ if return_feat:
221
+ if feat_to_return == "Age":
222
+ return pooled_features['Age']
223
+ elif feat_to_return == "Emotion":
224
+ return pooled_features['Emotion']
225
+ elif feat_to_return == "Gender":
226
+ return pooled_features['Gender']
227
+
228
+
229
+ logits = {}
230
+ for task in self.tasks:
231
+ # Squeeze the pooling dimension (1)
232
+ pooled_feat = pooled_features[task.name].squeeze(1) # (batch, 1, d_model) -> (batch, d_model)
233
+ logits[task.name] = self.prediction_layers[task.name](pooled_feat)
234
+
235
+ return logits
236
+
237
+ def save_whole_model(self, filepath: str):
238
+ print(f"Saving model state_dict to {filepath}")
239
+ torch.save(self.state_dict(), filepath)
240
+
241
+ def load_model(self, filepath:str,map_location='cuda'):
242
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
243
+ if self.use_lora or self.use_mtlora:
244
+ self.backbone.merge_and_unload()
245
+ self.to(device)
246
+ state_dict = torch.load(filepath, map_location=map_location)
247
+ self.load_state_dict(state_dict, strict=True)
248
+
249
+ def save_adapters_peft(self, save_directory: str):
250
+
251
+ print(f"Saving adapters to directory: {save_directory}")
252
+ os.makedirs(save_directory, exist_ok=True)
253
+
254
+ custom_layers_state_dict = {
255
+ 'prediction_layers': self.prediction_layers.state_dict()
256
+ }
257
+
258
+ if self.use_lora:
259
+ self.backbone.save_pretrained(save_directory)
260
+
261
+ if self.use_mtlora:
262
+ custom_layers_state_dict['mtl_layer'] = self.mtl_layer.state_dict()
263
+ #custom_layers_state_dict['task_specific_attn_pooling'] = self.task_specific_attn_pool.state_dict()
264
+ custom_layers_state_dict['mtl_attn_pool'] = self.attn_pool.state_dict()
265
+
266
+
267
+ torch.save(custom_layers_state_dict, os.path.join(save_directory, 'custom_layers.pt'))
268
+ print("Successfully saved PEFT backbone and custom task heads.")
269
+
270
+ def load_heads(self, filepaths: List[str],device='cuda'):
271
+
272
+ for ckpt in filepaths:
273
+ checkpoint = torch.load(ckpt, map_location=device)
274
+ model_state_dict = self.state_dict()
275
+
276
+ if "prediction_layers" in checkpoint:
277
+ for loaded_key, value in checkpoint["prediction_layers"].items():
278
+ new_key = loaded_key
279
+
280
+ # Remap prefix: 'heads.emotion.' -> 'prediction_layers.Emotion.'
281
+ if new_key.startswith('heads.emotion.'):
282
+ new_key = new_key.replace('heads.emotion.', 'prediction_layers.Emotion.')
283
+
284
+ if new_key.startswith('heads.age.'):
285
+ new_key = new_key.replace('heads.age.', 'prediction_layers.Age.')
286
+
287
+ if new_key.startswith('heads.gender.'):
288
+ new_key = new_key.replace('heads.gender.', 'prediction_layers.Gender.')
289
+
290
+ # Remap final layer index for deep head: '.5.' -> '.4.'
291
+ if '.5.' in new_key:
292
+ new_key = new_key.replace('.5.', '.4.')
293
+
294
+ if new_key in model_state_dict:
295
+ if model_state_dict[new_key].shape == value.shape:
296
+ model_state_dict[new_key].copy_(value)
297
+
298
+ def load_adapters_peft(self, load_directory: str, custom_head_name:str = 'custom_layers.pt'):
299
+
300
+ print(f"Loading adapters from directory: {load_directory}")
301
+ if self.use_lora:
302
+ self.backbone = self.backbone.merge_and_unload()
303
+ self.backbone = PeftModel.from_pretrained(self.backbone, load_directory)
304
+
305
+ custom_layers_path = os.path.join(load_directory, custom_head_name)
306
+ if not os.path.exists(custom_layers_path):
307
+ raise FileNotFoundError(f"Custom task heads file not found at {custom_layers_path}")
308
+
309
+ checkpoint = torch.load(custom_layers_path, map_location=("cuda" if torch.cuda.is_available() else "cpu"))
310
+
311
+ self.prediction_layers.load_state_dict(checkpoint['prediction_layers'])
312
+
313
+ if self.use_mtlora:
314
+ try:
315
+ self.mtl_layer.load_state_dict(checkpoint['mtl_layer'][0])
316
+ except KeyError:
317
+ self.mtl_layer.load_state_dict(checkpoint['mtl_layer'])
318
+ self.attn_pool.load_state_dict(checkpoint['mtl_attn_pool'])
319
+
320
+ print("Successfully loaded PEFT backbone and custom task heads.")
321
+
322
+ def save_trained(self, filepath: str):
323
+
324
+ trainable_param_names = {name for name, param in self.named_parameters() if param.requires_grad}
325
+ trainable_module_paths = {'.'.join(name.split('.')[:-1]) for name in trainable_param_names}
326
+
327
+ state_to_save = {}
328
+ full_state_dict = self.state_dict()
329
+
330
+ for key, value in full_state_dict.items():
331
+ if key in trainable_param_names:
332
+ state_to_save[key] = value
333
+ continue
334
+
335
+
336
+ current_module_path = '.'.join(key.split('.')[:-1])
337
+ if current_module_path in trainable_module_paths:
338
+ state_to_save[key] = value
339
+
340
+ print(f"Saving {len(state_to_save)} state entries (parameters and buffers) to {filepath}")
341
+ torch.save(state_to_save, filepath)
342
+
343
+
344
+ def load_trained_legacy(self, filepath: str, device='cuda'):
345
+ """The training of some checkpoint where done with a different model class,
346
+ so there is the need of remapping the key names, so they match with this new model class"""
347
+ print(f"Loading trained states from structured checkpoint: {filepath}")
348
+
349
+ checkpoint = torch.load(filepath, map_location=device)
350
+
351
+ model_state_dict = self.state_dict()
352
+
353
+ loaded_keys_count = 0
354
+ skipped_keys = []
355
+ remapped_keys_examples = {}
356
+
357
+ if "backbone_state_dict" in checkpoint:
358
+ print("\n--- Processing Backbone Weights ---")
359
+ for loaded_key, value in checkpoint["backbone_state_dict"].items():
360
+ new_key = loaded_key
361
+
362
+ if new_key.startswith('strategy.backbone.'):
363
+ new_key = new_key.replace('strategy.backbone.', 'backbone.')
364
+
365
+ if 'attn.in_proj_weight' in new_key and 'attn.in_proj.weight' not in new_key:
366
+ new_key = new_key.replace('attn.in_proj_weight', 'attn.in_proj.weight')
367
+ if 'attn.in_proj_bias' in new_key and 'attn.in_proj.bias' not in new_key:
368
+ new_key = new_key.replace('attn.in_proj_bias', 'attn.in_proj.bias')
369
+
370
+ if new_key in model_state_dict:
371
+ if model_state_dict[new_key].shape == value.shape:
372
+ model_state_dict[new_key].copy_(value)
373
+ loaded_keys_count += 1
374
+ if loaded_key != new_key and len(remapped_keys_examples) < 5:
375
+ remapped_keys_examples[loaded_key] = new_key
376
+ else:
377
+ skipped_keys.append(f"{loaded_key} (Shape Mismatch: Model {model_state_dict[new_key].shape} vs Ckpt {value.shape})")
378
+ else:
379
+ skipped_keys.append(f"{loaded_key} (as {new_key}) -> Not found in model")
380
+
381
+ if "prediction_layers" in checkpoint:
382
+ print("\n--- Processing Prediction Head Weights ---")
383
+ for loaded_key, value in checkpoint["prediction_layers"].items():
384
+ new_key = loaded_key
385
+
386
+ if new_key.startswith('heads.emotion.'):
387
+ new_key = new_key.replace('heads.emotion.', 'prediction_layers.Emotion.')
388
+
389
+ if new_key.startswith('heads.age.'):
390
+ new_key = new_key.replace('heads.age.', 'prediction_layers.Age.')
391
+
392
+ if new_key.startswith('heads.gender.'):
393
+ new_key = new_key.replace('heads.gender.', 'prediction_layers.Gender.')
394
+
395
+ if '.5.' in new_key:
396
+ new_key = new_key.replace('.5.', '.4.')
397
+
398
+ # Validate, load, and update trackers
399
+ if new_key in model_state_dict:
400
+ if model_state_dict[new_key].shape == value.shape:
401
+ model_state_dict[new_key].copy_(value)
402
+ loaded_keys_count += 1
403
+ if loaded_key != new_key and len(remapped_keys_examples) < 10:
404
+ remapped_keys_examples[loaded_key] = new_key
405
+ else:
406
+ skipped_keys.append(f"{loaded_key} (Shape Mismatch: Model {model_state_dict[new_key].shape} vs Ckpt {value.shape})")
407
+ else:
408
+ skipped_keys.append(f"{loaded_key} (as {new_key}) -> Not found in model")
409
+
410
+ if "attn_pool" in checkpoint:
411
+ print("\n--- Processing Attention Pool Weights ---")
412
+ for loaded_key, value in checkpoint["attn_pool"].items():
413
+ # The attn_pool keys in the source file also have the 'strategy.backbone' prefix
414
+ new_key = loaded_key.replace('strategy.backbone.attn_pool.', 'backbone.attn_pool.')
415
+
416
+ # Validate, load, and update trackers
417
+ if new_key in model_state_dict:
418
+ if model_state_dict[new_key].shape == value.shape:
419
+ model_state_dict[new_key].copy_(value)
420
+ loaded_keys_count += 1
421
+ if loaded_key != new_key and len(remapped_keys_examples) < 15:
422
+ remapped_keys_examples[loaded_key] = new_key
423
+ else:
424
+ skipped_keys.append(f"{loaded_key} (Shape Mismatch: Model {model_state_dict[new_key].shape} vs Ckpt {value.shape})")
425
+ else:
426
+ skipped_keys.append(f"{loaded_key} (as {new_key}) -> Not found in model")
427
+
428
+
429
+
430
+
431
+ if loaded_keys_count == 0:
432
+ print('Loaded 0 keys from the structured checkpoint; falling back to a plain state_dict load')
433
+ self.load_state_dict(torch.load(filepath, map_location=device), strict=False)
434
+
435
+
436
+
437
+ class MTLoRAResidualAttentionBlock(nn.Module):
438
+ """Adaptation of Perception Encoder ResidualAttentionBlock with MTLora, to produce t-task specific feature-maps and a shared feature map"""
439
+ def __init__(
440
+ self,
441
+ d_model: int,
442
+ n_head: int,
443
+ mlp_ratio: float = 4.0,
444
+ ls_init_value: float = None,
445
+ act_layer = nn.GELU,
446
+ norm_layer = nn.LayerNorm,
447
+ drop_path: float = 0.0,
448
+ rope: Optional[nn.Module] = None,
449
+ r: Union[int, Mapping[str, int]] = 0,
450
+ lora_shared_scale: float = 1.0,
451
+ lora_task_scale: float = 1.0,
452
+ lora_dropout: float = DROPOUT_P,
453
+ tasks=None,
454
+ trainable_scale_shared=False,
455
+ trainable_scale_per_task=False,
456
+ shared_mode: str = 'matrix',
457
+ ):
458
+ super().__init__()
459
+ self.tasks = tasks
460
+ self.num_heads = n_head
461
+ self.head_dim = d_model // n_head
462
+ self.scale = self.head_dim ** -0.5
463
+ self.rope = rope
464
+
465
+ task_scales = {t: lora_task_scale for t in tasks}
466
+
467
+
468
+ # MultiTask Lora for QKV matrices
469
+ # (MTLoRAQKV does not actually compute attention, but returns the shared QKV matrices and the task-specific QKV matrices)
470
+ self.attn = MTLoRAQKV(
471
+ in_features=d_model,
472
+ out_features=d_model,
473
+ r=r, lora_shared_scale=lora_shared_scale, lora_task_scale=task_scales,
474
+ lora_dropout=lora_dropout, tasks=tasks, trainable_scale_shared=trainable_scale_shared,
475
+ trainable_scale_per_task=trainable_scale_per_task, shared_mode=shared_mode
476
+ )
477
+
478
+ # MultiTask Lora for projection matrices in mha
479
+ self.out_proj = MTLoRALinear(
480
+ in_features=d_model,
481
+ out_features=d_model,
482
+ r=r, lora_shared_scale=lora_shared_scale, lora_task_scale=task_scales,
483
+ lora_dropout=lora_dropout, tasks=tasks, trainable_scale_shared=trainable_scale_shared,
484
+ trainable_scale_per_task=trainable_scale_per_task, shared_mode=shared_mode
485
+ )
486
+
487
+ self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
488
+ self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
489
+
490
+ self.ln_1 = norm_layer(d_model)
491
+ self.ln_2 = norm_layer(d_model)
492
+
493
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
494
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
495
+
496
+ # LoRA-enabled MLP
497
+ mlp_width = int(d_model * mlp_ratio)
498
+ self.mlp = nn.Sequential(
499
+ OrderedDict([
500
+ ("c_fc", MTLoRALinear(
501
+ d_model, mlp_width, r=r, lora_shared_scale=lora_shared_scale,
502
+ lora_task_scale=task_scales, lora_dropout=lora_dropout, tasks=tasks,
503
+ trainable_scale_shared=trainable_scale_shared, trainable_scale_per_task=trainable_scale_per_task,
504
+ shared_mode=shared_mode
505
+ )),
506
+ ("gelu", act_layer()),
507
+ ("c_proj", MTLoRALinear(
508
+ mlp_width, d_model, r=r, lora_shared_scale=lora_shared_scale,
509
+ lora_task_scale=task_scales, lora_dropout=lora_dropout, tasks=tasks,
510
+ trainable_scale_shared=trainable_scale_shared, trainable_scale_per_task=trainable_scale_per_task,
511
+ shared_mode=shared_mode
512
+ )),
513
+ ])
514
+ )
515
+
516
+ def _call_attn(
517
+ self,
518
+ x_shared: torch.Tensor,
519
+ attn_mask: Optional[torch.Tensor] = None,
520
+ x_tasks: Optional[Dict[str, torch.Tensor]] = None,
521
+ ):
522
+ # s is the number of patches/tokens, sequence length
523
+ proj, proj_tasks = self.attn(x_shared, x_tasks) # proj is (b s 3*d_model), proj_tasks is dict of (b s 3*d_model), one entry per task
524
+
525
+ def compute_attention(projection_tensor):
526
+ # Reshape Q, K, V
527
+ # projection_tensor is (b s 3*d_model), need to split and rearrange
528
+ _, s, _ = projection_tensor.shape
529
+ # output_features from MTLoRAQKV is d_model, so 3 * d_model
530
+ split_size = self.attn.q.linear.out_features # This should be d_model
531
+
532
+ # Unflatten into (b s 3 d_model) then transpose to get (3 b s d_model)
533
+ q, k, v = projection_tensor.unflatten(-1, (3, split_size)).permute(2, 0, 1, 3).contiguous()
534
+ # Rearrange for multi-head attention (b h s d)
535
+ q = rearrange(q, "b s (h d) -> b h s d", h=self.num_heads)
536
+ k = rearrange(k, "b s (h d) -> b h s d", h=self.num_heads)
537
+ v = rearrange(v, "b s (h d) -> b h s d", h=self.num_heads)
538
+
539
+ if self.rope:
540
+ q, k = self.rope(q, k)
541
+
542
+ attn_output = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, scale=self.scale)
543
+ return rearrange(attn_output, "b h s d -> b s (h d)")
544
+
545
+ # Process shared path
546
+ attn_result = compute_attention(proj)
547
+
548
+ # Process task-specific paths
549
+ attn_tasks_results = {}
550
+ if proj_tasks:
551
+ for task, task_proj in proj_tasks.items():
552
+ attn_tasks_results[task] = compute_attention(task_proj)
553
+
554
+ # Apply output projection
555
+ # out_proj is an MTLoRALinear, so its forward expects (x, x_tasks)
556
+ shared_out, tasks_out = self.out_proj(attn_result, x_tasks=attn_tasks_results if attn_tasks_results else None)
557
+
558
+ return shared_out, tasks_out
559
+
560
+ def forward(
561
+ self,
562
+ x: torch.Tensor,
563
+ attn_mask: Optional[torch.Tensor] = None,
564
+ x_tasks: Optional[Dict[str, torch.Tensor]] = None,
565
+ ):
566
+ # Attention block
567
+ norm_x = self.ln_1(x)
568
+ norm_x_tasks = {task: self.ln_1(x_tasks[task]) for task in self.tasks} if x_tasks else None
569
+
570
+ attn_out, attn_tasks_out = self._call_attn(norm_x, attn_mask=attn_mask, x_tasks=norm_x_tasks)
571
+
572
+ x = x + self.drop_path1(self.ls_1(attn_out))
573
+ if attn_tasks_out and x_tasks:
574
+ for task in self.tasks:
575
+ x_tasks[task] = x_tasks[task] + self.drop_path1(self.ls_1(attn_tasks_out[task]))
576
+
577
+ # MLP block
578
+ norm_x = self.ln_2(x)
579
+ norm_x_tasks = {task: self.ln_2(x_tasks[task]) for task in self.tasks} if x_tasks else None
580
+
581
+ # The MTLoRALinear forward needs to be called directly for the sequential MLP
582
+ mlp_fc_out, mlp_fc_tasks_out = self.mlp.c_fc(norm_x, norm_x_tasks)
583
+ gelu_out = self.mlp.gelu(mlp_fc_out)
584
+ gelu_tasks_out = {task: self.mlp.gelu(mlp_fc_tasks_out[task]) for task in self.tasks} if mlp_fc_tasks_out else None
585
+
586
+ mlp_proj_out, mlp_proj_tasks_out = self.mlp.c_proj(gelu_out, gelu_tasks_out)
587
+
588
+ x = x + self.drop_path2(self.ls_2(mlp_proj_out))
589
+ if mlp_proj_tasks_out and x_tasks:
590
+ for task in self.tasks:
591
+ x_tasks[task] = x_tasks[task] + self.drop_path2(self.ls_2(mlp_proj_tasks_out[task]))
592
+
593
+ return x, x_tasks
594
+
595
+ def load_from_original_block(self, original_block):
596
+ """
597
+ Initializes the weights of this block from a pre-trained ResidualAttentionBlock.
598
+ The LoRA-specific parameters are reset to their initial state.
599
+ """
600
+ with torch.no_grad():
601
+ # Copy LayerNorm and LayerScale weights
602
+ self.ln_1.load_state_dict(original_block.ln_1.state_dict())
603
+ self.ln_2.load_state_dict(original_block.ln_2.state_dict())
604
+ self.ls_1.load_state_dict(original_block.ls_1.state_dict())
605
+ self.ls_2.load_state_dict(original_block.ls_2.state_dict())
606
+
607
+ # Copy MLP weights into the .linear attribute of the MTLoRALinear layers
608
+ self.mlp.c_fc.linear.load_state_dict(original_block.mlp.c_fc.state_dict())
609
+ self.mlp.c_proj.linear.load_state_dict(original_block.mlp.c_proj.state_dict())
610
+
611
+ # Copy Attention weights
612
+ # Both SelfAttention and nn.MultiheadAttention store QKV weights combined
613
+ if isinstance(original_block.attn, SelfAttention):
614
+ # Using migrate_weights ensures the Parameters are copied to the Linear layer first
615
+ # Then we can extract from the Linear layer
616
+ original_block.attn.migrate_weights() # Ensure weights are in .in_proj and .out_proj
617
+
618
+ # Split the combined weight and bias tensors into Q, K, V from .in_proj
619
+ qkv_weight = original_block.attn.in_proj.weight
620
+ qkv_bias = original_block.attn.in_proj.bias
621
+
622
+ q_w, k_w, v_w = qkv_weight.chunk(3)
623
+ q_b, k_b, v_b = qkv_bias.chunk(3)
624
+
625
+ # Load into the .linear attributes of the MTLoRAQKV module
626
+ self.attn.q.linear.weight.copy_(q_w)
627
+ self.attn.q.linear.bias.copy_(q_b)
628
+
629
+ self.attn.k.linear.weight.copy_(k_w)
630
+ self.attn.k.linear.bias.copy_(k_b)
631
+
632
+ self.attn.v.linear.weight.copy_(v_w)
633
+ self.attn.v.linear.bias.copy_(v_b)
634
+
635
+ # Load the output projection weights
636
+ self.out_proj.linear.load_state_dict(original_block.attn.out_proj.state_dict())
637
+ elif isinstance(original_block.attn, nn.MultiheadAttention):
638
+ self.attn.q.linear.weight.copy_(original_block.attn.in_proj_weight[:self.attn.q.linear.out_features, :])
639
+ self.attn.q.linear.bias.copy_(original_block.attn.in_proj_bias[:self.attn.q.linear.out_features])
640
+
641
+ self.attn.k.linear.weight.copy_(original_block.attn.in_proj_weight[self.attn.q.linear.out_features:2*self.attn.q.linear.out_features, :])
642
+ self.attn.k.linear.bias.copy_(original_block.attn.in_proj_bias[self.attn.q.linear.out_features:2*self.attn.q.linear.out_features])
643
+
644
+ self.attn.v.linear.weight.copy_(original_block.attn.in_proj_weight[2*self.attn.q.linear.out_features:3*self.attn.q.linear.out_features, :])
645
+ self.attn.v.linear.bias.copy_(original_block.attn.in_proj_bias[2*self.attn.q.linear.out_features:3*self.attn.q.linear.out_features])
646
+
647
+ self.out_proj.linear.weight.copy_(original_block.attn.out_proj.weight)
648
+ self.out_proj.linear.bias.copy_(original_block.attn.out_proj.bias)
649
+
650
+ else:
651
+ raise TypeError(f"Unsupported attention module type in original_block: {type(original_block.attn)}")
652
+
653
+
654
+ # After loading pretrained weights, re-initialize LoRA-specific parameters
655
+ # This ensures that at the start of finetuning, the LoRA adjustment is zero.
656
+ self.attn.reset_parameters()
657
+ self.out_proj.reset_parameters()
658
+ self.mlp.c_fc.reset_parameters()
659
+ self.mlp.c_proj.reset_parameters()
660
+
661
+ print("Successfully loaded weights from original ResidualAttentionBlock and reset LoRA parameters.")
662
+
663
+
664
+ class MTLoRAAttentionPooling(nn.Module):
665
+ """
666
+ A MT-LoRA equivalent of the AttentionPooling transformer block.
667
+
668
+ This module replicates the full original architecture:
669
+ 1. Task-specific probes for attention pooling.
670
+ 2. MT-LoRA enabled Q/K/V and Output projections.
671
+ 3. A LayerNorm layer.
672
+ 4. An MLP block with MT-LoRA enabled linear layers.
673
+ 5. A final residual connection, matching the original's structure.
674
+ """
675
+ def __init__(
676
+ self,
677
+ embed_dim: int,
678
+ num_heads: int,
679
+ tasks: List[str],
680
+ r: Union[int, Mapping[str, int]] = 0,
681
+ lora_shared_scale: float = 1.0,
682
+ lora_task_scale: float = 1.0,
683
+ lora_dropout: float = 0.0,
684
+ mlp_ratio: int = 4,
685
+ act_layer = nn.GELU,
686
+ norm_layer = nn.LayerNorm,
687
+ ):
688
+ super().__init__()
689
+ self.tasks = tasks
690
+ self.num_heads = num_heads
691
+
692
+ self.probe = nn.ParameterDict({
693
+ task: nn.Parameter(torch.randn(1, 1, embed_dim))
694
+ for task in tasks
695
+ })
696
+
697
+ task_scales = {t: lora_task_scale for t in tasks}
698
+
699
+ self.q_proj = MTLoRALinear(
700
+ embed_dim, embed_dim, r=r, lora_shared_scale=lora_shared_scale, lora_task_scale=task_scales,
701
+ lora_dropout=lora_dropout, tasks=tasks
702
+ )
703
+ self.k_proj = MTLoRALinear(
704
+ embed_dim, embed_dim, r=r, lora_shared_scale=lora_shared_scale, lora_task_scale=task_scales,
705
+ lora_dropout=lora_dropout, tasks=tasks
706
+ )
707
+ self.v_proj = MTLoRALinear(
708
+ embed_dim, embed_dim, r=r, lora_shared_scale=lora_shared_scale, lora_task_scale=task_scales,
709
+ lora_dropout=lora_dropout, tasks=tasks
710
+ )
711
+ self.out_proj = MTLoRALinear(
712
+ embed_dim, embed_dim, r=r, lora_shared_scale=lora_shared_scale, lora_task_scale=task_scales,
713
+ lora_dropout=lora_dropout, tasks=tasks
714
+ )
715
+
716
+ self.layernorm = norm_layer(embed_dim)
717
+ mlp_width = int(embed_dim * mlp_ratio)
718
+ self.mlp = nn.Sequential(
719
+ OrderedDict([
720
+ ("c_fc", MTLoRALinear(
721
+ embed_dim, mlp_width, r=r, lora_shared_scale=lora_shared_scale,
722
+ lora_task_scale=task_scales, lora_dropout=lora_dropout, tasks=tasks
723
+ )),
724
+ ("gelu", nn.GELU()),
725
+ ("c_proj", MTLoRALinear(
726
+ mlp_width, embed_dim, r=r, lora_shared_scale=lora_shared_scale,
727
+ lora_task_scale=task_scales, lora_dropout=lora_dropout, tasks=tasks
728
+ )),
729
+ ])
730
+ )
731
+
732
+ def load_from_original(self, original_pool: AttentionPooling):
733
+ """Initializes all weights from the pretrained AttentionPooling block."""
734
+ with torch.no_grad():
735
+ original_attn = original_pool.attn
736
+
737
+ for task in self.tasks:
738
+ self.probe[task].copy_(original_pool.probe)
739
+
740
+ q_w, k_w, v_w = original_attn.in_proj_weight.chunk(3)
741
+ q_b, k_b, v_b = original_attn.in_proj_bias.chunk(3)
742
+
743
+ self.q_proj.linear.weight.copy_(q_w)
744
+ self.q_proj.linear.bias.copy_(q_b)
745
+ self.k_proj.linear.weight.copy_(k_w)
746
+ self.k_proj.linear.bias.copy_(k_b)
747
+ self.v_proj.linear.weight.copy_(v_w)
748
+ self.v_proj.linear.bias.copy_(v_b)
749
+
750
+ self.out_proj.linear.load_state_dict(original_attn.out_proj.state_dict())
751
+
752
+ self.layernorm.load_state_dict(original_pool.layernorm.state_dict())
753
+
754
+ self.mlp.c_fc.linear.load_state_dict(original_pool.mlp.c_fc.state_dict())
755
+ self.mlp.c_proj.linear.load_state_dict(original_pool.mlp.c_proj.state_dict())
756
+
757
+ self.q_proj.reset_parameters()
758
+ self.k_proj.reset_parameters()
759
+ self.v_proj.reset_parameters()
760
+ self.out_proj.reset_parameters()
761
+ self.mlp.c_fc.reset_parameters()
762
+ self.mlp.c_proj.reset_parameters()
763
+ print("Full MT-LoRA Attention Pooling block created and initialized from pretrained weights.")
764
+
765
+ def forward(self, x_tasks: Dict[str, torch.Tensor]):
766
+ """
767
+ Forward pass that correctly handles unique inputs for each task.
768
+
769
+ In this version, K and V are calculated inside the loop based on
770
+ the task-specific input 'x', and each task has its own probe.
771
+ """
772
+
773
+
774
+ final_outputs = {}
775
+ for task, x in x_tasks.items():
776
+ B, N, C = x.shape
777
+ probe = self.probe[task].repeat(B, 1, 1)
778
+
779
+
780
+ _, q_task_dict = self.q_proj(probe, x_tasks={task: probe})
781
+ q = q_task_dict[task]
782
+
783
+ _, k_task_dict = self.k_proj(x, x_tasks={task: x})
784
+ k = k_task_dict[task]
785
+
786
+ _, v_task_dict = self.v_proj(x, x_tasks={task: x})
787
+ v = v_task_dict[task]
788
+
789
+ q = rearrange(q, 'b n (h d) -> b h n d', h=self.num_heads)
790
+ k = rearrange(k, 'b n (h d) -> b h n d', h=self.num_heads)
791
+ v = rearrange(v, 'b n (h d) -> b h n d', h=self.num_heads)
792
+
793
+ attn_out = F.scaled_dot_product_attention(q, k, v)
794
+ attn_out_rearranged = rearrange(attn_out, 'b h n d -> b n (h d)')
795
+
796
+ _, out_proj_dict = self.out_proj(attn_out_rearranged, x_tasks={task: attn_out_rearranged})
797
+ x_attn = out_proj_dict[task]
798
+
799
+ norm_attn = self.layernorm(x_attn)
800
+
801
+ _, fc_task_dict = self.mlp.c_fc(norm_attn, x_tasks={task: norm_attn})
802
+ gelu_out = self.mlp.gelu(fc_task_dict[task])
803
+ _, proj_task_dict = self.mlp.c_proj(gelu_out, x_tasks={task: gelu_out})
804
+ mlp_out = proj_task_dict[task]
805
+
806
+ final_outputs[task] = x_attn + mlp_out
807
+
808
+ return final_outputs
809
+
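
MTLModel.log_vars is declared "for uncertainty weighting", but the training loop is not part of this file; below is a minimal sketch of how such a loss is typically formed (Kendall-style homoscedastic weighting; treat the exact formulation as an assumption, not this repo's training code):

import torch
import torch.nn.functional as F

def uncertainty_weighted_loss(model, logits, targets):
    # logits/targets: dicts keyed by task.name, as produced by MTLModel.forward
    total = torch.zeros((), device=model.log_vars.device)
    for i, task in enumerate(model.tasks):
        task_loss = F.cross_entropy(logits[task.name], targets[task.name])
        precision = torch.exp(-model.log_vars[i])   # exp(-log_var) down-weights noisy tasks
        total = total + precision * task_loss + model.log_vars[i]
    return total
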
utils/__pycache__/commons.cpython-313.pyc ADDED
Binary file (8.06 kB)

utils/__pycache__/dataset.cpython-313.pyc ADDED
Binary file (26.6 kB)

utils/__pycache__/face_detector.cpython-313.pyc ADDED
Binary file (6.43 kB)

utils/__pycache__/task_config.cpython-313.pyc ADDED
Binary file (1.19 kB)

utils/commons.py ADDED
@@ -0,0 +1,158 @@
 
1
+ """Contains functions used for loading and logging models."""
2
+
3
+ import sys
4
+ import os
5
+ from transformers import AutoModel, AutoProcessor
6
+
8
+ import core.vision_encoder.pe as pe
9
+ import core.vision_encoder.transforms as transforms_pe
10
+ from core.vision_encoder.config import PE_VISION_CONFIG
11
+ import torchvision.transforms as transforms
12
+ from PIL import Image
13
+ import requests
14
+
15
+
16
+ def print_trainable_params(model):
17
+ total_params = sum(p.numel() for p in model.parameters())
18
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
19
+ percent = (trainable_params / total_params * 100) if total_params > 0 else 0
20
+ print("\n--- Summary ---")
21
+ print(f"Trainable parameters: {trainable_params:,}")
22
+ print(f"Total parameters: {total_params:,}")
23
+ print(f"Percentage: {percent:.2f}%")
24
+
25
+
26
+ def get_backbone_pe(version, print_info=False, apply_migration_flag=False, pretrained=True):
28
+ """
29
+ Load a PE ViT model; return the model, its image transform, and the output embedding dimension (size of the last token's embedding).
30
+ """
30
+ print(f'Loading {version}...')
31
+ backbone = pe.VisionTransformer.from_config(version, pretrained=pretrained)
32
+ backbone_config = PE_VISION_CONFIG[version]
33
+ transform = transforms_pe.get_image_transform_fix(image_size=backbone_config.image_size)
34
+
35
+ print("\nYou can ignore the Missing keys list above.")
36
+ print(f"Applying migration = {apply_migration_flag}")
37
+
38
+ if print_info:
39
+ attnpool = backbone.attn_pool
40
+ print(f'embed_dim={attnpool.embed_dim}\nnum_heads={attnpool.num_heads}')
41
+ print(f'OUTPUT DIM = {backbone_config.output_dim}')
42
+
43
+ def apply_migration(m):
44
+ if isinstance(m, pe.SelfAttention):
45
+ m.migrate_weights()
46
+
47
+ if apply_migration_flag: # no migration should be applied when testing/resuming
48
+ print('[MIGRATION] Migrating weights for PEFT compatibility')
49
+ backbone.apply(apply_migration)
50
+
51
+ return backbone, transform, backbone_config.output_dim
52
+
53
+
54
+ def get_backbone_dinov3(model_name: str="facebook/dinov3-vitb16-pretrain-lvd1689m", print_info=False):
55
+ print(f"Loading Hugging Face model: {model_name}")
56
+ processor = AutoProcessor.from_pretrained(model_name)
57
+
58
+ # Extract image processing configuration from the loaded processor
59
+ image_processor_config = processor
60
+ image_size = image_processor_config.size['height']
61
+ image_mean = image_processor_config.image_mean
62
+ image_std = image_processor_config.image_std
63
+
64
+ transform = transforms.Compose([
65
+ transforms.Lambda(_convert_to_rgb),
66
+ transforms.Resize((image_size, image_size), antialias=True),
67
+ transforms.ToTensor(),
68
+ transforms.Normalize(mean=image_mean, std=image_std)
69
+ ])
70
+
71
+ # Load the model and return only the vision backbone
72
+ vision_model = AutoModel.from_pretrained(model_name)
73
+
74
+ if print_info:
75
+ print(f'\nVISION CONFIGS:\n{vision_model.config}')
76
+ print(f'\n\n\n{vision_model}')
77
+
78
+
79
+ return vision_model, transform, vision_model.config.hidden_size
80
+
81
+
82
+ def get_backbone_siglip2(model_name: str = 'google/siglip2-base-patch16-224', print_info=False):
83
+ """
84
+ Load a SigLIP2 ViT model; return the vision backbone, its image transform, and the output embedding dimension (size of the last token's embedding).
85
+ """
86
+ print(f"Loading Hugging Face model: {model_name}")
87
+ processor = AutoProcessor.from_pretrained(model_name)
88
+
89
+
90
+ # Extract image processing configuration from the loaded processor
91
+ image_processor_config = processor.image_processor
92
+ image_size = image_processor_config.size['height']
93
+ image_mean = image_processor_config.image_mean
94
+ image_std = image_processor_config.image_std
95
+
96
+ transform = transforms.Compose([
97
+ transforms.Lambda(_convert_to_rgb),
98
+ transforms.Resize((image_size, image_size), antialias=True),
99
+ transforms.ToTensor(),
100
+ transforms.Normalize(mean=image_mean, std=image_std)
101
+ ])
102
+
103
+ # Load the model and return only the vision backbone
104
+ model = AutoModel.from_pretrained(model_name)
105
+ vision_model = model.vision_model
106
+
107
+ if print_info:
108
+ print(f'\nVISION CONFIGS:\n{vision_model.config}')
109
+ print(f'\n\n***************MHAP\n{vision_model.head}')
110
+
111
+
112
+ return vision_model, transform, vision_model.config.hidden_size
113
+
114
+ def _convert_to_rgb(image: Image.Image) -> Image.Image:
115
+ """Converts a PIL Image to RGB format."""
116
+ return image.convert("RGB")
117
+
118
+
119
+ def get_backbone(version: str, apply_migration: bool = False):
120
+ """
121
+ Returns a vision transformer backbone.
122
+ Args:
123
+ version: Name of the backbone to use (PE-Core, SigLIP2, or DINOv3)
124
+ apply_migration: if True, migrate PE attention weights for PEFT compatibility (PE backbones only)
125
+ """
126
+ if 'PE-Core-' in version:
127
+ return get_backbone_pe(version, False, apply_migration)
128
+ elif 'siglip2' in version:
129
+ print('[LOADING SIGLIP2]')
130
+ return get_backbone_siglip2(version)
131
+ elif 'dinov3' in version:
132
+ return get_backbone_dinov3(version)
133
+
134
+
135
+
136
+ def send_telegram_message(message: str):
137
+ """Sends a message to a Telegram chat using credentials from the environment."""
139
+ # Read the bot token from the environment; the chat id is currently hard-coded.
139
+ token = os.getenv("BOT_TOKEN")
140
+ chat_id = "1220514183"
141
+
142
+ if not token or not chat_id:
143
+ # Silently fail if credentials are not set
144
+ return
145
+
146
+ api_url = f"https://api.telegram.org/bot{token}/sendMessage"
147
+ payload = {
148
+ 'chat_id': chat_id,
149
+ 'text': message,
150
+ 'parse_mode': 'Markdown' # For nice formatting (bold, italics, etc.)
151
+ }
152
+
153
+ try:
154
+ response = requests.post(api_url, data=payload, timeout=10)
155
+ response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)
156
+ except requests.exceptions.RequestException as e:
157
+ # Don't crash the training loop if Telegram is down
158
+ print(f"\nWarning: Could not send Telegram message. Error: {e}")
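A minimal sketch of how these helpers compose; the checkpoint name is the SigLIP2 default used above, and downloading the weights from the Hugging Face Hub is assumed to work:

from utils.commons import get_backbone, print_trainable_params

backbone, transform, embed_dim = get_backbone('google/siglip2-base-patch16-224')
print(f'Output embedding dimension: {embed_dim}')
print_trainable_params(backbone)  # prints trainable vs. total parameter counts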
utils/deploy.prototxt ADDED
@@ -0,0 +1,1789 @@
1
+ input: "data"
2
+ input_shape {
3
+ dim: 1
4
+ dim: 3
5
+ dim: 300
6
+ dim: 300
7
+ }
8
+
9
+ layer {
10
+ name: "data_bn"
11
+ type: "BatchNorm"
12
+ bottom: "data"
13
+ top: "data_bn"
14
+ param {
15
+ lr_mult: 0.0
16
+ }
17
+ param {
18
+ lr_mult: 0.0
19
+ }
20
+ param {
21
+ lr_mult: 0.0
22
+ }
23
+ }
24
+ layer {
25
+ name: "data_scale"
26
+ type: "Scale"
27
+ bottom: "data_bn"
28
+ top: "data_bn"
29
+ param {
30
+ lr_mult: 1.0
31
+ decay_mult: 1.0
32
+ }
33
+ param {
34
+ lr_mult: 2.0
35
+ decay_mult: 1.0
36
+ }
37
+ scale_param {
38
+ bias_term: true
39
+ }
40
+ }
41
+ layer {
42
+ name: "conv1_h"
43
+ type: "Convolution"
44
+ bottom: "data_bn"
45
+ top: "conv1_h"
46
+ param {
47
+ lr_mult: 1.0
48
+ decay_mult: 1.0
49
+ }
50
+ param {
51
+ lr_mult: 2.0
52
+ decay_mult: 1.0
53
+ }
54
+ convolution_param {
55
+ num_output: 32
56
+ pad: 3
57
+ kernel_size: 7
58
+ stride: 2
59
+ weight_filler {
60
+ type: "msra"
61
+ variance_norm: FAN_OUT
62
+ }
63
+ bias_filler {
64
+ type: "constant"
65
+ value: 0.0
66
+ }
67
+ }
68
+ }
69
+ layer {
70
+ name: "conv1_bn_h"
71
+ type: "BatchNorm"
72
+ bottom: "conv1_h"
73
+ top: "conv1_h"
74
+ param {
75
+ lr_mult: 0.0
76
+ }
77
+ param {
78
+ lr_mult: 0.0
79
+ }
80
+ param {
81
+ lr_mult: 0.0
82
+ }
83
+ }
84
+ layer {
85
+ name: "conv1_scale_h"
86
+ type: "Scale"
87
+ bottom: "conv1_h"
88
+ top: "conv1_h"
89
+ param {
90
+ lr_mult: 1.0
91
+ decay_mult: 1.0
92
+ }
93
+ param {
94
+ lr_mult: 2.0
95
+ decay_mult: 1.0
96
+ }
97
+ scale_param {
98
+ bias_term: true
99
+ }
100
+ }
101
+ layer {
102
+ name: "conv1_relu"
103
+ type: "ReLU"
104
+ bottom: "conv1_h"
105
+ top: "conv1_h"
106
+ }
107
+ layer {
108
+ name: "conv1_pool"
109
+ type: "Pooling"
110
+ bottom: "conv1_h"
111
+ top: "conv1_pool"
112
+ pooling_param {
113
+ kernel_size: 3
114
+ stride: 2
115
+ }
116
+ }
117
+ layer {
118
+ name: "layer_64_1_conv1_h"
119
+ type: "Convolution"
120
+ bottom: "conv1_pool"
121
+ top: "layer_64_1_conv1_h"
122
+ param {
123
+ lr_mult: 1.0
124
+ decay_mult: 1.0
125
+ }
126
+ convolution_param {
127
+ num_output: 32
128
+ bias_term: false
129
+ pad: 1
130
+ kernel_size: 3
131
+ stride: 1
132
+ weight_filler {
133
+ type: "msra"
134
+ }
135
+ bias_filler {
136
+ type: "constant"
137
+ value: 0.0
138
+ }
139
+ }
140
+ }
141
+ layer {
142
+ name: "layer_64_1_bn2_h"
143
+ type: "BatchNorm"
144
+ bottom: "layer_64_1_conv1_h"
145
+ top: "layer_64_1_conv1_h"
146
+ param {
147
+ lr_mult: 0.0
148
+ }
149
+ param {
150
+ lr_mult: 0.0
151
+ }
152
+ param {
153
+ lr_mult: 0.0
154
+ }
155
+ }
156
+ layer {
157
+ name: "layer_64_1_scale2_h"
158
+ type: "Scale"
159
+ bottom: "layer_64_1_conv1_h"
160
+ top: "layer_64_1_conv1_h"
161
+ param {
162
+ lr_mult: 1.0
163
+ decay_mult: 1.0
164
+ }
165
+ param {
166
+ lr_mult: 2.0
167
+ decay_mult: 1.0
168
+ }
169
+ scale_param {
170
+ bias_term: true
171
+ }
172
+ }
173
+ layer {
174
+ name: "layer_64_1_relu2"
175
+ type: "ReLU"
176
+ bottom: "layer_64_1_conv1_h"
177
+ top: "layer_64_1_conv1_h"
178
+ }
179
+ layer {
180
+ name: "layer_64_1_conv2_h"
181
+ type: "Convolution"
182
+ bottom: "layer_64_1_conv1_h"
183
+ top: "layer_64_1_conv2_h"
184
+ param {
185
+ lr_mult: 1.0
186
+ decay_mult: 1.0
187
+ }
188
+ convolution_param {
189
+ num_output: 32
190
+ bias_term: false
191
+ pad: 1
192
+ kernel_size: 3
193
+ stride: 1
194
+ weight_filler {
195
+ type: "msra"
196
+ }
197
+ bias_filler {
198
+ type: "constant"
199
+ value: 0.0
200
+ }
201
+ }
202
+ }
203
+ layer {
204
+ name: "layer_64_1_sum"
205
+ type: "Eltwise"
206
+ bottom: "layer_64_1_conv2_h"
207
+ bottom: "conv1_pool"
208
+ top: "layer_64_1_sum"
209
+ }
210
+ layer {
211
+ name: "layer_128_1_bn1_h"
212
+ type: "BatchNorm"
213
+ bottom: "layer_64_1_sum"
214
+ top: "layer_128_1_bn1_h"
215
+ param {
216
+ lr_mult: 0.0
217
+ }
218
+ param {
219
+ lr_mult: 0.0
220
+ }
221
+ param {
222
+ lr_mult: 0.0
223
+ }
224
+ }
225
+ layer {
226
+ name: "layer_128_1_scale1_h"
227
+ type: "Scale"
228
+ bottom: "layer_128_1_bn1_h"
229
+ top: "layer_128_1_bn1_h"
230
+ param {
231
+ lr_mult: 1.0
232
+ decay_mult: 1.0
233
+ }
234
+ param {
235
+ lr_mult: 2.0
236
+ decay_mult: 1.0
237
+ }
238
+ scale_param {
239
+ bias_term: true
240
+ }
241
+ }
242
+ layer {
243
+ name: "layer_128_1_relu1"
244
+ type: "ReLU"
245
+ bottom: "layer_128_1_bn1_h"
246
+ top: "layer_128_1_bn1_h"
247
+ }
248
+ layer {
249
+ name: "layer_128_1_conv1_h"
250
+ type: "Convolution"
251
+ bottom: "layer_128_1_bn1_h"
252
+ top: "layer_128_1_conv1_h"
253
+ param {
254
+ lr_mult: 1.0
255
+ decay_mult: 1.0
256
+ }
257
+ convolution_param {
258
+ num_output: 128
259
+ bias_term: false
260
+ pad: 1
261
+ kernel_size: 3
262
+ stride: 2
263
+ weight_filler {
264
+ type: "msra"
265
+ }
266
+ bias_filler {
267
+ type: "constant"
268
+ value: 0.0
269
+ }
270
+ }
271
+ }
272
+ layer {
273
+ name: "layer_128_1_bn2"
274
+ type: "BatchNorm"
275
+ bottom: "layer_128_1_conv1_h"
276
+ top: "layer_128_1_conv1_h"
277
+ param {
278
+ lr_mult: 0.0
279
+ }
280
+ param {
281
+ lr_mult: 0.0
282
+ }
283
+ param {
284
+ lr_mult: 0.0
285
+ }
286
+ }
287
+ layer {
288
+ name: "layer_128_1_scale2"
289
+ type: "Scale"
290
+ bottom: "layer_128_1_conv1_h"
291
+ top: "layer_128_1_conv1_h"
292
+ param {
293
+ lr_mult: 1.0
294
+ decay_mult: 1.0
295
+ }
296
+ param {
297
+ lr_mult: 2.0
298
+ decay_mult: 1.0
299
+ }
300
+ scale_param {
301
+ bias_term: true
302
+ }
303
+ }
304
+ layer {
305
+ name: "layer_128_1_relu2"
306
+ type: "ReLU"
307
+ bottom: "layer_128_1_conv1_h"
308
+ top: "layer_128_1_conv1_h"
309
+ }
310
+ layer {
311
+ name: "layer_128_1_conv2"
312
+ type: "Convolution"
313
+ bottom: "layer_128_1_conv1_h"
314
+ top: "layer_128_1_conv2"
315
+ param {
316
+ lr_mult: 1.0
317
+ decay_mult: 1.0
318
+ }
319
+ convolution_param {
320
+ num_output: 128
321
+ bias_term: false
322
+ pad: 1
323
+ kernel_size: 3
324
+ stride: 1
325
+ weight_filler {
326
+ type: "msra"
327
+ }
328
+ bias_filler {
329
+ type: "constant"
330
+ value: 0.0
331
+ }
332
+ }
333
+ }
334
+ layer {
335
+ name: "layer_128_1_conv_expand_h"
336
+ type: "Convolution"
337
+ bottom: "layer_128_1_bn1_h"
338
+ top: "layer_128_1_conv_expand_h"
339
+ param {
340
+ lr_mult: 1.0
341
+ decay_mult: 1.0
342
+ }
343
+ convolution_param {
344
+ num_output: 128
345
+ bias_term: false
346
+ pad: 0
347
+ kernel_size: 1
348
+ stride: 2
349
+ weight_filler {
350
+ type: "msra"
351
+ }
352
+ bias_filler {
353
+ type: "constant"
354
+ value: 0.0
355
+ }
356
+ }
357
+ }
358
+ layer {
359
+ name: "layer_128_1_sum"
360
+ type: "Eltwise"
361
+ bottom: "layer_128_1_conv2"
362
+ bottom: "layer_128_1_conv_expand_h"
363
+ top: "layer_128_1_sum"
364
+ }
365
+ layer {
366
+ name: "layer_256_1_bn1"
367
+ type: "BatchNorm"
368
+ bottom: "layer_128_1_sum"
369
+ top: "layer_256_1_bn1"
370
+ param {
371
+ lr_mult: 0.0
372
+ }
373
+ param {
374
+ lr_mult: 0.0
375
+ }
376
+ param {
377
+ lr_mult: 0.0
378
+ }
379
+ }
380
+ layer {
381
+ name: "layer_256_1_scale1"
382
+ type: "Scale"
383
+ bottom: "layer_256_1_bn1"
384
+ top: "layer_256_1_bn1"
385
+ param {
386
+ lr_mult: 1.0
387
+ decay_mult: 1.0
388
+ }
389
+ param {
390
+ lr_mult: 2.0
391
+ decay_mult: 1.0
392
+ }
393
+ scale_param {
394
+ bias_term: true
395
+ }
396
+ }
397
+ layer {
398
+ name: "layer_256_1_relu1"
399
+ type: "ReLU"
400
+ bottom: "layer_256_1_bn1"
401
+ top: "layer_256_1_bn1"
402
+ }
403
+ layer {
404
+ name: "layer_256_1_conv1"
405
+ type: "Convolution"
406
+ bottom: "layer_256_1_bn1"
407
+ top: "layer_256_1_conv1"
408
+ param {
409
+ lr_mult: 1.0
410
+ decay_mult: 1.0
411
+ }
412
+ convolution_param {
413
+ num_output: 256
414
+ bias_term: false
415
+ pad: 1
416
+ kernel_size: 3
417
+ stride: 2
418
+ weight_filler {
419
+ type: "msra"
420
+ }
421
+ bias_filler {
422
+ type: "constant"
423
+ value: 0.0
424
+ }
425
+ }
426
+ }
427
+ layer {
428
+ name: "layer_256_1_bn2"
429
+ type: "BatchNorm"
430
+ bottom: "layer_256_1_conv1"
431
+ top: "layer_256_1_conv1"
432
+ param {
433
+ lr_mult: 0.0
434
+ }
435
+ param {
436
+ lr_mult: 0.0
437
+ }
438
+ param {
439
+ lr_mult: 0.0
440
+ }
441
+ }
442
+ layer {
443
+ name: "layer_256_1_scale2"
444
+ type: "Scale"
445
+ bottom: "layer_256_1_conv1"
446
+ top: "layer_256_1_conv1"
447
+ param {
448
+ lr_mult: 1.0
449
+ decay_mult: 1.0
450
+ }
451
+ param {
452
+ lr_mult: 2.0
453
+ decay_mult: 1.0
454
+ }
455
+ scale_param {
456
+ bias_term: true
457
+ }
458
+ }
459
+ layer {
460
+ name: "layer_256_1_relu2"
461
+ type: "ReLU"
462
+ bottom: "layer_256_1_conv1"
463
+ top: "layer_256_1_conv1"
464
+ }
465
+ layer {
466
+ name: "layer_256_1_conv2"
467
+ type: "Convolution"
468
+ bottom: "layer_256_1_conv1"
469
+ top: "layer_256_1_conv2"
470
+ param {
471
+ lr_mult: 1.0
472
+ decay_mult: 1.0
473
+ }
474
+ convolution_param {
475
+ num_output: 256
476
+ bias_term: false
477
+ pad: 1
478
+ kernel_size: 3
479
+ stride: 1
480
+ weight_filler {
481
+ type: "msra"
482
+ }
483
+ bias_filler {
484
+ type: "constant"
485
+ value: 0.0
486
+ }
487
+ }
488
+ }
489
+ layer {
490
+ name: "layer_256_1_conv_expand"
491
+ type: "Convolution"
492
+ bottom: "layer_256_1_bn1"
493
+ top: "layer_256_1_conv_expand"
494
+ param {
495
+ lr_mult: 1.0
496
+ decay_mult: 1.0
497
+ }
498
+ convolution_param {
499
+ num_output: 256
500
+ bias_term: false
501
+ pad: 0
502
+ kernel_size: 1
503
+ stride: 2
504
+ weight_filler {
505
+ type: "msra"
506
+ }
507
+ bias_filler {
508
+ type: "constant"
509
+ value: 0.0
510
+ }
511
+ }
512
+ }
513
+ layer {
514
+ name: "layer_256_1_sum"
515
+ type: "Eltwise"
516
+ bottom: "layer_256_1_conv2"
517
+ bottom: "layer_256_1_conv_expand"
518
+ top: "layer_256_1_sum"
519
+ }
520
+ layer {
521
+ name: "layer_512_1_bn1"
522
+ type: "BatchNorm"
523
+ bottom: "layer_256_1_sum"
524
+ top: "layer_512_1_bn1"
525
+ param {
526
+ lr_mult: 0.0
527
+ }
528
+ param {
529
+ lr_mult: 0.0
530
+ }
531
+ param {
532
+ lr_mult: 0.0
533
+ }
534
+ }
535
+ layer {
536
+ name: "layer_512_1_scale1"
537
+ type: "Scale"
538
+ bottom: "layer_512_1_bn1"
539
+ top: "layer_512_1_bn1"
540
+ param {
541
+ lr_mult: 1.0
542
+ decay_mult: 1.0
543
+ }
544
+ param {
545
+ lr_mult: 2.0
546
+ decay_mult: 1.0
547
+ }
548
+ scale_param {
549
+ bias_term: true
550
+ }
551
+ }
552
+ layer {
553
+ name: "layer_512_1_relu1"
554
+ type: "ReLU"
555
+ bottom: "layer_512_1_bn1"
556
+ top: "layer_512_1_bn1"
557
+ }
558
+ layer {
559
+ name: "layer_512_1_conv1_h"
560
+ type: "Convolution"
561
+ bottom: "layer_512_1_bn1"
562
+ top: "layer_512_1_conv1_h"
563
+ param {
564
+ lr_mult: 1.0
565
+ decay_mult: 1.0
566
+ }
567
+ convolution_param {
568
+ num_output: 128
569
+ bias_term: false
570
+ pad: 1
571
+ kernel_size: 3
572
+ stride: 1 # 2
573
+ weight_filler {
574
+ type: "msra"
575
+ }
576
+ bias_filler {
577
+ type: "constant"
578
+ value: 0.0
579
+ }
580
+ }
581
+ }
582
+ layer {
583
+ name: "layer_512_1_bn2_h"
584
+ type: "BatchNorm"
585
+ bottom: "layer_512_1_conv1_h"
586
+ top: "layer_512_1_conv1_h"
587
+ param {
588
+ lr_mult: 0.0
589
+ }
590
+ param {
591
+ lr_mult: 0.0
592
+ }
593
+ param {
594
+ lr_mult: 0.0
595
+ }
596
+ }
597
+ layer {
598
+ name: "layer_512_1_scale2_h"
599
+ type: "Scale"
600
+ bottom: "layer_512_1_conv1_h"
601
+ top: "layer_512_1_conv1_h"
602
+ param {
603
+ lr_mult: 1.0
604
+ decay_mult: 1.0
605
+ }
606
+ param {
607
+ lr_mult: 2.0
608
+ decay_mult: 1.0
609
+ }
610
+ scale_param {
611
+ bias_term: true
612
+ }
613
+ }
614
+ layer {
615
+ name: "layer_512_1_relu2"
616
+ type: "ReLU"
617
+ bottom: "layer_512_1_conv1_h"
618
+ top: "layer_512_1_conv1_h"
619
+ }
620
+ layer {
621
+ name: "layer_512_1_conv2_h"
622
+ type: "Convolution"
623
+ bottom: "layer_512_1_conv1_h"
624
+ top: "layer_512_1_conv2_h"
625
+ param {
626
+ lr_mult: 1.0
627
+ decay_mult: 1.0
628
+ }
629
+ convolution_param {
630
+ num_output: 256
631
+ bias_term: false
632
+ pad: 2 # 1
633
+ kernel_size: 3
634
+ stride: 1
635
+ dilation: 2
636
+ weight_filler {
637
+ type: "msra"
638
+ }
639
+ bias_filler {
640
+ type: "constant"
641
+ value: 0.0
642
+ }
643
+ }
644
+ }
645
+ layer {
646
+ name: "layer_512_1_conv_expand_h"
647
+ type: "Convolution"
648
+ bottom: "layer_512_1_bn1"
649
+ top: "layer_512_1_conv_expand_h"
650
+ param {
651
+ lr_mult: 1.0
652
+ decay_mult: 1.0
653
+ }
654
+ convolution_param {
655
+ num_output: 256
656
+ bias_term: false
657
+ pad: 0
658
+ kernel_size: 1
659
+ stride: 1 # 2
660
+ weight_filler {
661
+ type: "msra"
662
+ }
663
+ bias_filler {
664
+ type: "constant"
665
+ value: 0.0
666
+ }
667
+ }
668
+ }
669
+ layer {
670
+ name: "layer_512_1_sum"
671
+ type: "Eltwise"
672
+ bottom: "layer_512_1_conv2_h"
673
+ bottom: "layer_512_1_conv_expand_h"
674
+ top: "layer_512_1_sum"
675
+ }
676
+ layer {
677
+ name: "last_bn_h"
678
+ type: "BatchNorm"
679
+ bottom: "layer_512_1_sum"
680
+ top: "layer_512_1_sum"
681
+ param {
682
+ lr_mult: 0.0
683
+ }
684
+ param {
685
+ lr_mult: 0.0
686
+ }
687
+ param {
688
+ lr_mult: 0.0
689
+ }
690
+ }
691
+ layer {
692
+ name: "last_scale_h"
693
+ type: "Scale"
694
+ bottom: "layer_512_1_sum"
695
+ top: "layer_512_1_sum"
696
+ param {
697
+ lr_mult: 1.0
698
+ decay_mult: 1.0
699
+ }
700
+ param {
701
+ lr_mult: 2.0
702
+ decay_mult: 1.0
703
+ }
704
+ scale_param {
705
+ bias_term: true
706
+ }
707
+ }
708
+ layer {
709
+ name: "last_relu"
710
+ type: "ReLU"
711
+ bottom: "layer_512_1_sum"
712
+ top: "fc7"
713
+ }
714
+
715
+ layer {
716
+ name: "conv6_1_h"
717
+ type: "Convolution"
718
+ bottom: "fc7"
719
+ top: "conv6_1_h"
720
+ param {
721
+ lr_mult: 1
722
+ decay_mult: 1
723
+ }
724
+ param {
725
+ lr_mult: 2
726
+ decay_mult: 0
727
+ }
728
+ convolution_param {
729
+ num_output: 128
730
+ pad: 0
731
+ kernel_size: 1
732
+ stride: 1
733
+ weight_filler {
734
+ type: "xavier"
735
+ }
736
+ bias_filler {
737
+ type: "constant"
738
+ value: 0
739
+ }
740
+ }
741
+ }
742
+ layer {
743
+ name: "conv6_1_relu"
744
+ type: "ReLU"
745
+ bottom: "conv6_1_h"
746
+ top: "conv6_1_h"
747
+ }
748
+ layer {
749
+ name: "conv6_2_h"
750
+ type: "Convolution"
751
+ bottom: "conv6_1_h"
752
+ top: "conv6_2_h"
753
+ param {
754
+ lr_mult: 1
755
+ decay_mult: 1
756
+ }
757
+ param {
758
+ lr_mult: 2
759
+ decay_mult: 0
760
+ }
761
+ convolution_param {
762
+ num_output: 256
763
+ pad: 1
764
+ kernel_size: 3
765
+ stride: 2
766
+ weight_filler {
767
+ type: "xavier"
768
+ }
769
+ bias_filler {
770
+ type: "constant"
771
+ value: 0
772
+ }
773
+ }
774
+ }
775
+ layer {
776
+ name: "conv6_2_relu"
777
+ type: "ReLU"
778
+ bottom: "conv6_2_h"
779
+ top: "conv6_2_h"
780
+ }
781
+ layer {
782
+ name: "conv7_1_h"
783
+ type: "Convolution"
784
+ bottom: "conv6_2_h"
785
+ top: "conv7_1_h"
786
+ param {
787
+ lr_mult: 1
788
+ decay_mult: 1
789
+ }
790
+ param {
791
+ lr_mult: 2
792
+ decay_mult: 0
793
+ }
794
+ convolution_param {
795
+ num_output: 64
796
+ pad: 0
797
+ kernel_size: 1
798
+ stride: 1
799
+ weight_filler {
800
+ type: "xavier"
801
+ }
802
+ bias_filler {
803
+ type: "constant"
804
+ value: 0
805
+ }
806
+ }
807
+ }
808
+ layer {
809
+ name: "conv7_1_relu"
810
+ type: "ReLU"
811
+ bottom: "conv7_1_h"
812
+ top: "conv7_1_h"
813
+ }
814
+ layer {
815
+ name: "conv7_2_h"
816
+ type: "Convolution"
817
+ bottom: "conv7_1_h"
818
+ top: "conv7_2_h"
819
+ param {
820
+ lr_mult: 1
821
+ decay_mult: 1
822
+ }
823
+ param {
824
+ lr_mult: 2
825
+ decay_mult: 0
826
+ }
827
+ convolution_param {
828
+ num_output: 128
829
+ pad: 1
830
+ kernel_size: 3
831
+ stride: 2
832
+ weight_filler {
833
+ type: "xavier"
834
+ }
835
+ bias_filler {
836
+ type: "constant"
837
+ value: 0
838
+ }
839
+ }
840
+ }
841
+ layer {
842
+ name: "conv7_2_relu"
843
+ type: "ReLU"
844
+ bottom: "conv7_2_h"
845
+ top: "conv7_2_h"
846
+ }
847
+ layer {
848
+ name: "conv8_1_h"
849
+ type: "Convolution"
850
+ bottom: "conv7_2_h"
851
+ top: "conv8_1_h"
852
+ param {
853
+ lr_mult: 1
854
+ decay_mult: 1
855
+ }
856
+ param {
857
+ lr_mult: 2
858
+ decay_mult: 0
859
+ }
860
+ convolution_param {
861
+ num_output: 64
862
+ pad: 0
863
+ kernel_size: 1
864
+ stride: 1
865
+ weight_filler {
866
+ type: "xavier"
867
+ }
868
+ bias_filler {
869
+ type: "constant"
870
+ value: 0
871
+ }
872
+ }
873
+ }
874
+ layer {
875
+ name: "conv8_1_relu"
876
+ type: "ReLU"
877
+ bottom: "conv8_1_h"
878
+ top: "conv8_1_h"
879
+ }
880
+ layer {
881
+ name: "conv8_2_h"
882
+ type: "Convolution"
883
+ bottom: "conv8_1_h"
884
+ top: "conv8_2_h"
885
+ param {
886
+ lr_mult: 1
887
+ decay_mult: 1
888
+ }
889
+ param {
890
+ lr_mult: 2
891
+ decay_mult: 0
892
+ }
893
+ convolution_param {
894
+ num_output: 128
895
+ pad: 1
896
+ kernel_size: 3
897
+ stride: 1
898
+ weight_filler {
899
+ type: "xavier"
900
+ }
901
+ bias_filler {
902
+ type: "constant"
903
+ value: 0
904
+ }
905
+ }
906
+ }
907
+ layer {
908
+ name: "conv8_2_relu"
909
+ type: "ReLU"
910
+ bottom: "conv8_2_h"
911
+ top: "conv8_2_h"
912
+ }
913
+ layer {
914
+ name: "conv9_1_h"
915
+ type: "Convolution"
916
+ bottom: "conv8_2_h"
917
+ top: "conv9_1_h"
918
+ param {
919
+ lr_mult: 1
920
+ decay_mult: 1
921
+ }
922
+ param {
923
+ lr_mult: 2
924
+ decay_mult: 0
925
+ }
926
+ convolution_param {
927
+ num_output: 64
928
+ pad: 0
929
+ kernel_size: 1
930
+ stride: 1
931
+ weight_filler {
932
+ type: "xavier"
933
+ }
934
+ bias_filler {
935
+ type: "constant"
936
+ value: 0
937
+ }
938
+ }
939
+ }
940
+ layer {
941
+ name: "conv9_1_relu"
942
+ type: "ReLU"
943
+ bottom: "conv9_1_h"
944
+ top: "conv9_1_h"
945
+ }
946
+ layer {
947
+ name: "conv9_2_h"
948
+ type: "Convolution"
949
+ bottom: "conv9_1_h"
950
+ top: "conv9_2_h"
951
+ param {
952
+ lr_mult: 1
953
+ decay_mult: 1
954
+ }
955
+ param {
956
+ lr_mult: 2
957
+ decay_mult: 0
958
+ }
959
+ convolution_param {
960
+ num_output: 128
961
+ pad: 1
962
+ kernel_size: 3
963
+ stride: 1
964
+ weight_filler {
965
+ type: "xavier"
966
+ }
967
+ bias_filler {
968
+ type: "constant"
969
+ value: 0
970
+ }
971
+ }
972
+ }
973
+ layer {
974
+ name: "conv9_2_relu"
975
+ type: "ReLU"
976
+ bottom: "conv9_2_h"
977
+ top: "conv9_2_h"
978
+ }
979
+ layer {
980
+ name: "conv4_3_norm"
981
+ type: "Normalize"
982
+ bottom: "layer_256_1_bn1"
983
+ top: "conv4_3_norm"
984
+ norm_param {
985
+ across_spatial: false
986
+ scale_filler {
987
+ type: "constant"
988
+ value: 20
989
+ }
990
+ channel_shared: false
991
+ }
992
+ }
993
+ layer {
994
+ name: "conv4_3_norm_mbox_loc"
995
+ type: "Convolution"
996
+ bottom: "conv4_3_norm"
997
+ top: "conv4_3_norm_mbox_loc"
998
+ param {
999
+ lr_mult: 1
1000
+ decay_mult: 1
1001
+ }
1002
+ param {
1003
+ lr_mult: 2
1004
+ decay_mult: 0
1005
+ }
1006
+ convolution_param {
1007
+ num_output: 16
1008
+ pad: 1
1009
+ kernel_size: 3
1010
+ stride: 1
1011
+ weight_filler {
1012
+ type: "xavier"
1013
+ }
1014
+ bias_filler {
1015
+ type: "constant"
1016
+ value: 0
1017
+ }
1018
+ }
1019
+ }
1020
+ layer {
1021
+ name: "conv4_3_norm_mbox_loc_perm"
1022
+ type: "Permute"
1023
+ bottom: "conv4_3_norm_mbox_loc"
1024
+ top: "conv4_3_norm_mbox_loc_perm"
1025
+ permute_param {
1026
+ order: 0
1027
+ order: 2
1028
+ order: 3
1029
+ order: 1
1030
+ }
1031
+ }
1032
+ layer {
1033
+ name: "conv4_3_norm_mbox_loc_flat"
1034
+ type: "Flatten"
1035
+ bottom: "conv4_3_norm_mbox_loc_perm"
1036
+ top: "conv4_3_norm_mbox_loc_flat"
1037
+ flatten_param {
1038
+ axis: 1
1039
+ }
1040
+ }
1041
+ layer {
1042
+ name: "conv4_3_norm_mbox_conf"
1043
+ type: "Convolution"
1044
+ bottom: "conv4_3_norm"
1045
+ top: "conv4_3_norm_mbox_conf"
1046
+ param {
1047
+ lr_mult: 1
1048
+ decay_mult: 1
1049
+ }
1050
+ param {
1051
+ lr_mult: 2
1052
+ decay_mult: 0
1053
+ }
1054
+ convolution_param {
1055
+ num_output: 8 # 84
1056
+ pad: 1
1057
+ kernel_size: 3
1058
+ stride: 1
1059
+ weight_filler {
1060
+ type: "xavier"
1061
+ }
1062
+ bias_filler {
1063
+ type: "constant"
1064
+ value: 0
1065
+ }
1066
+ }
1067
+ }
1068
+ layer {
1069
+ name: "conv4_3_norm_mbox_conf_perm"
1070
+ type: "Permute"
1071
+ bottom: "conv4_3_norm_mbox_conf"
1072
+ top: "conv4_3_norm_mbox_conf_perm"
1073
+ permute_param {
1074
+ order: 0
1075
+ order: 2
1076
+ order: 3
1077
+ order: 1
1078
+ }
1079
+ }
1080
+ layer {
1081
+ name: "conv4_3_norm_mbox_conf_flat"
1082
+ type: "Flatten"
1083
+ bottom: "conv4_3_norm_mbox_conf_perm"
1084
+ top: "conv4_3_norm_mbox_conf_flat"
1085
+ flatten_param {
1086
+ axis: 1
1087
+ }
1088
+ }
1089
+ layer {
1090
+ name: "conv4_3_norm_mbox_priorbox"
1091
+ type: "PriorBox"
1092
+ bottom: "conv4_3_norm"
1093
+ bottom: "data"
1094
+ top: "conv4_3_norm_mbox_priorbox"
1095
+ prior_box_param {
1096
+ min_size: 30.0
1097
+ max_size: 60.0
1098
+ aspect_ratio: 2
1099
+ flip: true
1100
+ clip: false
1101
+ variance: 0.1
1102
+ variance: 0.1
1103
+ variance: 0.2
1104
+ variance: 0.2
1105
+ step: 8
1106
+ offset: 0.5
1107
+ }
1108
+ }
1109
+ layer {
1110
+ name: "fc7_mbox_loc"
1111
+ type: "Convolution"
1112
+ bottom: "fc7"
1113
+ top: "fc7_mbox_loc"
1114
+ param {
1115
+ lr_mult: 1
1116
+ decay_mult: 1
1117
+ }
1118
+ param {
1119
+ lr_mult: 2
1120
+ decay_mult: 0
1121
+ }
1122
+ convolution_param {
1123
+ num_output: 24
1124
+ pad: 1
1125
+ kernel_size: 3
1126
+ stride: 1
1127
+ weight_filler {
1128
+ type: "xavier"
1129
+ }
1130
+ bias_filler {
1131
+ type: "constant"
1132
+ value: 0
1133
+ }
1134
+ }
1135
+ }
1136
+ layer {
1137
+ name: "fc7_mbox_loc_perm"
1138
+ type: "Permute"
1139
+ bottom: "fc7_mbox_loc"
1140
+ top: "fc7_mbox_loc_perm"
1141
+ permute_param {
1142
+ order: 0
1143
+ order: 2
1144
+ order: 3
1145
+ order: 1
1146
+ }
1147
+ }
1148
+ layer {
1149
+ name: "fc7_mbox_loc_flat"
1150
+ type: "Flatten"
1151
+ bottom: "fc7_mbox_loc_perm"
1152
+ top: "fc7_mbox_loc_flat"
1153
+ flatten_param {
1154
+ axis: 1
1155
+ }
1156
+ }
1157
+ layer {
1158
+ name: "fc7_mbox_conf"
1159
+ type: "Convolution"
1160
+ bottom: "fc7"
1161
+ top: "fc7_mbox_conf"
1162
+ param {
1163
+ lr_mult: 1
1164
+ decay_mult: 1
1165
+ }
1166
+ param {
1167
+ lr_mult: 2
1168
+ decay_mult: 0
1169
+ }
1170
+ convolution_param {
1171
+ num_output: 12 # 126
1172
+ pad: 1
1173
+ kernel_size: 3
1174
+ stride: 1
1175
+ weight_filler {
1176
+ type: "xavier"
1177
+ }
1178
+ bias_filler {
1179
+ type: "constant"
1180
+ value: 0
1181
+ }
1182
+ }
1183
+ }
1184
+ layer {
1185
+ name: "fc7_mbox_conf_perm"
1186
+ type: "Permute"
1187
+ bottom: "fc7_mbox_conf"
1188
+ top: "fc7_mbox_conf_perm"
1189
+ permute_param {
1190
+ order: 0
1191
+ order: 2
1192
+ order: 3
1193
+ order: 1
1194
+ }
1195
+ }
1196
+ layer {
1197
+ name: "fc7_mbox_conf_flat"
1198
+ type: "Flatten"
1199
+ bottom: "fc7_mbox_conf_perm"
1200
+ top: "fc7_mbox_conf_flat"
1201
+ flatten_param {
1202
+ axis: 1
1203
+ }
1204
+ }
1205
+ layer {
1206
+ name: "fc7_mbox_priorbox"
1207
+ type: "PriorBox"
1208
+ bottom: "fc7"
1209
+ bottom: "data"
1210
+ top: "fc7_mbox_priorbox"
1211
+ prior_box_param {
1212
+ min_size: 60.0
1213
+ max_size: 111.0
1214
+ aspect_ratio: 2
1215
+ aspect_ratio: 3
1216
+ flip: true
1217
+ clip: false
1218
+ variance: 0.1
1219
+ variance: 0.1
1220
+ variance: 0.2
1221
+ variance: 0.2
1222
+ step: 16
1223
+ offset: 0.5
1224
+ }
1225
+ }
1226
+ layer {
1227
+ name: "conv6_2_mbox_loc"
1228
+ type: "Convolution"
1229
+ bottom: "conv6_2_h"
1230
+ top: "conv6_2_mbox_loc"
1231
+ param {
1232
+ lr_mult: 1
1233
+ decay_mult: 1
1234
+ }
1235
+ param {
1236
+ lr_mult: 2
1237
+ decay_mult: 0
1238
+ }
1239
+ convolution_param {
1240
+ num_output: 24
1241
+ pad: 1
1242
+ kernel_size: 3
1243
+ stride: 1
1244
+ weight_filler {
1245
+ type: "xavier"
1246
+ }
1247
+ bias_filler {
1248
+ type: "constant"
1249
+ value: 0
1250
+ }
1251
+ }
1252
+ }
1253
+ layer {
1254
+ name: "conv6_2_mbox_loc_perm"
1255
+ type: "Permute"
1256
+ bottom: "conv6_2_mbox_loc"
1257
+ top: "conv6_2_mbox_loc_perm"
1258
+ permute_param {
1259
+ order: 0
1260
+ order: 2
1261
+ order: 3
1262
+ order: 1
1263
+ }
1264
+ }
1265
+ layer {
1266
+ name: "conv6_2_mbox_loc_flat"
1267
+ type: "Flatten"
1268
+ bottom: "conv6_2_mbox_loc_perm"
1269
+ top: "conv6_2_mbox_loc_flat"
1270
+ flatten_param {
1271
+ axis: 1
1272
+ }
1273
+ }
1274
+ layer {
1275
+ name: "conv6_2_mbox_conf"
1276
+ type: "Convolution"
1277
+ bottom: "conv6_2_h"
1278
+ top: "conv6_2_mbox_conf"
1279
+ param {
1280
+ lr_mult: 1
1281
+ decay_mult: 1
1282
+ }
1283
+ param {
1284
+ lr_mult: 2
1285
+ decay_mult: 0
1286
+ }
1287
+ convolution_param {
1288
+ num_output: 12 # 126
1289
+ pad: 1
1290
+ kernel_size: 3
1291
+ stride: 1
1292
+ weight_filler {
1293
+ type: "xavier"
1294
+ }
1295
+ bias_filler {
1296
+ type: "constant"
1297
+ value: 0
1298
+ }
1299
+ }
1300
+ }
1301
+ layer {
1302
+ name: "conv6_2_mbox_conf_perm"
1303
+ type: "Permute"
1304
+ bottom: "conv6_2_mbox_conf"
1305
+ top: "conv6_2_mbox_conf_perm"
1306
+ permute_param {
1307
+ order: 0
1308
+ order: 2
1309
+ order: 3
1310
+ order: 1
1311
+ }
1312
+ }
1313
+ layer {
1314
+ name: "conv6_2_mbox_conf_flat"
1315
+ type: "Flatten"
1316
+ bottom: "conv6_2_mbox_conf_perm"
1317
+ top: "conv6_2_mbox_conf_flat"
1318
+ flatten_param {
1319
+ axis: 1
1320
+ }
1321
+ }
1322
+ layer {
1323
+ name: "conv6_2_mbox_priorbox"
1324
+ type: "PriorBox"
1325
+ bottom: "conv6_2_h"
1326
+ bottom: "data"
1327
+ top: "conv6_2_mbox_priorbox"
1328
+ prior_box_param {
1329
+ min_size: 111.0
1330
+ max_size: 162.0
1331
+ aspect_ratio: 2
1332
+ aspect_ratio: 3
1333
+ flip: true
1334
+ clip: false
1335
+ variance: 0.1
1336
+ variance: 0.1
1337
+ variance: 0.2
1338
+ variance: 0.2
1339
+ step: 32
1340
+ offset: 0.5
1341
+ }
1342
+ }
1343
+ layer {
1344
+ name: "conv7_2_mbox_loc"
1345
+ type: "Convolution"
1346
+ bottom: "conv7_2_h"
1347
+ top: "conv7_2_mbox_loc"
1348
+ param {
1349
+ lr_mult: 1
1350
+ decay_mult: 1
1351
+ }
1352
+ param {
1353
+ lr_mult: 2
1354
+ decay_mult: 0
1355
+ }
1356
+ convolution_param {
1357
+ num_output: 24
1358
+ pad: 1
1359
+ kernel_size: 3
1360
+ stride: 1
1361
+ weight_filler {
1362
+ type: "xavier"
1363
+ }
1364
+ bias_filler {
1365
+ type: "constant"
1366
+ value: 0
1367
+ }
1368
+ }
1369
+ }
1370
+ layer {
1371
+ name: "conv7_2_mbox_loc_perm"
1372
+ type: "Permute"
1373
+ bottom: "conv7_2_mbox_loc"
1374
+ top: "conv7_2_mbox_loc_perm"
1375
+ permute_param {
1376
+ order: 0
1377
+ order: 2
1378
+ order: 3
1379
+ order: 1
1380
+ }
1381
+ }
1382
+ layer {
1383
+ name: "conv7_2_mbox_loc_flat"
1384
+ type: "Flatten"
1385
+ bottom: "conv7_2_mbox_loc_perm"
1386
+ top: "conv7_2_mbox_loc_flat"
1387
+ flatten_param {
1388
+ axis: 1
1389
+ }
1390
+ }
1391
+ layer {
1392
+ name: "conv7_2_mbox_conf"
1393
+ type: "Convolution"
1394
+ bottom: "conv7_2_h"
1395
+ top: "conv7_2_mbox_conf"
1396
+ param {
1397
+ lr_mult: 1
1398
+ decay_mult: 1
1399
+ }
1400
+ param {
1401
+ lr_mult: 2
1402
+ decay_mult: 0
1403
+ }
1404
+ convolution_param {
1405
+ num_output: 12 # 126
1406
+ pad: 1
1407
+ kernel_size: 3
1408
+ stride: 1
1409
+ weight_filler {
1410
+ type: "xavier"
1411
+ }
1412
+ bias_filler {
1413
+ type: "constant"
1414
+ value: 0
1415
+ }
1416
+ }
1417
+ }
1418
+ layer {
1419
+ name: "conv7_2_mbox_conf_perm"
1420
+ type: "Permute"
1421
+ bottom: "conv7_2_mbox_conf"
1422
+ top: "conv7_2_mbox_conf_perm"
1423
+ permute_param {
1424
+ order: 0
1425
+ order: 2
1426
+ order: 3
1427
+ order: 1
1428
+ }
1429
+ }
1430
+ layer {
1431
+ name: "conv7_2_mbox_conf_flat"
1432
+ type: "Flatten"
1433
+ bottom: "conv7_2_mbox_conf_perm"
1434
+ top: "conv7_2_mbox_conf_flat"
1435
+ flatten_param {
1436
+ axis: 1
1437
+ }
1438
+ }
1439
+ layer {
1440
+ name: "conv7_2_mbox_priorbox"
1441
+ type: "PriorBox"
1442
+ bottom: "conv7_2_h"
1443
+ bottom: "data"
1444
+ top: "conv7_2_mbox_priorbox"
1445
+ prior_box_param {
1446
+ min_size: 162.0
1447
+ max_size: 213.0
1448
+ aspect_ratio: 2
1449
+ aspect_ratio: 3
1450
+ flip: true
1451
+ clip: false
1452
+ variance: 0.1
1453
+ variance: 0.1
1454
+ variance: 0.2
1455
+ variance: 0.2
1456
+ step: 64
1457
+ offset: 0.5
1458
+ }
1459
+ }
1460
+ layer {
1461
+ name: "conv8_2_mbox_loc"
1462
+ type: "Convolution"
1463
+ bottom: "conv8_2_h"
1464
+ top: "conv8_2_mbox_loc"
1465
+ param {
1466
+ lr_mult: 1
1467
+ decay_mult: 1
1468
+ }
1469
+ param {
1470
+ lr_mult: 2
1471
+ decay_mult: 0
1472
+ }
1473
+ convolution_param {
1474
+ num_output: 16
1475
+ pad: 1
1476
+ kernel_size: 3
1477
+ stride: 1
1478
+ weight_filler {
1479
+ type: "xavier"
1480
+ }
1481
+ bias_filler {
1482
+ type: "constant"
1483
+ value: 0
1484
+ }
1485
+ }
1486
+ }
1487
+ layer {
1488
+ name: "conv8_2_mbox_loc_perm"
1489
+ type: "Permute"
1490
+ bottom: "conv8_2_mbox_loc"
1491
+ top: "conv8_2_mbox_loc_perm"
1492
+ permute_param {
1493
+ order: 0
1494
+ order: 2
1495
+ order: 3
1496
+ order: 1
1497
+ }
1498
+ }
1499
+ layer {
1500
+ name: "conv8_2_mbox_loc_flat"
1501
+ type: "Flatten"
1502
+ bottom: "conv8_2_mbox_loc_perm"
1503
+ top: "conv8_2_mbox_loc_flat"
1504
+ flatten_param {
1505
+ axis: 1
1506
+ }
1507
+ }
1508
+ layer {
1509
+ name: "conv8_2_mbox_conf"
1510
+ type: "Convolution"
1511
+ bottom: "conv8_2_h"
1512
+ top: "conv8_2_mbox_conf"
1513
+ param {
1514
+ lr_mult: 1
1515
+ decay_mult: 1
1516
+ }
1517
+ param {
1518
+ lr_mult: 2
1519
+ decay_mult: 0
1520
+ }
1521
+ convolution_param {
1522
+ num_output: 8 # 84
1523
+ pad: 1
1524
+ kernel_size: 3
1525
+ stride: 1
1526
+ weight_filler {
1527
+ type: "xavier"
1528
+ }
1529
+ bias_filler {
1530
+ type: "constant"
1531
+ value: 0
1532
+ }
1533
+ }
1534
+ }
1535
+ layer {
1536
+ name: "conv8_2_mbox_conf_perm"
1537
+ type: "Permute"
1538
+ bottom: "conv8_2_mbox_conf"
1539
+ top: "conv8_2_mbox_conf_perm"
1540
+ permute_param {
1541
+ order: 0
1542
+ order: 2
1543
+ order: 3
1544
+ order: 1
1545
+ }
1546
+ }
1547
+ layer {
1548
+ name: "conv8_2_mbox_conf_flat"
1549
+ type: "Flatten"
1550
+ bottom: "conv8_2_mbox_conf_perm"
1551
+ top: "conv8_2_mbox_conf_flat"
1552
+ flatten_param {
1553
+ axis: 1
1554
+ }
1555
+ }
1556
+ layer {
1557
+ name: "conv8_2_mbox_priorbox"
1558
+ type: "PriorBox"
1559
+ bottom: "conv8_2_h"
1560
+ bottom: "data"
1561
+ top: "conv8_2_mbox_priorbox"
1562
+ prior_box_param {
1563
+ min_size: 213.0
1564
+ max_size: 264.0
1565
+ aspect_ratio: 2
1566
+ flip: true
1567
+ clip: false
1568
+ variance: 0.1
1569
+ variance: 0.1
1570
+ variance: 0.2
1571
+ variance: 0.2
1572
+ step: 100
1573
+ offset: 0.5
1574
+ }
1575
+ }
1576
+ layer {
1577
+ name: "conv9_2_mbox_loc"
1578
+ type: "Convolution"
1579
+ bottom: "conv9_2_h"
1580
+ top: "conv9_2_mbox_loc"
1581
+ param {
1582
+ lr_mult: 1
1583
+ decay_mult: 1
1584
+ }
1585
+ param {
1586
+ lr_mult: 2
1587
+ decay_mult: 0
1588
+ }
1589
+ convolution_param {
1590
+ num_output: 16
1591
+ pad: 1
1592
+ kernel_size: 3
1593
+ stride: 1
1594
+ weight_filler {
1595
+ type: "xavier"
1596
+ }
1597
+ bias_filler {
1598
+ type: "constant"
1599
+ value: 0
1600
+ }
1601
+ }
1602
+ }
1603
+ layer {
1604
+ name: "conv9_2_mbox_loc_perm"
1605
+ type: "Permute"
1606
+ bottom: "conv9_2_mbox_loc"
1607
+ top: "conv9_2_mbox_loc_perm"
1608
+ permute_param {
1609
+ order: 0
1610
+ order: 2
1611
+ order: 3
1612
+ order: 1
1613
+ }
1614
+ }
1615
+ layer {
1616
+ name: "conv9_2_mbox_loc_flat"
1617
+ type: "Flatten"
1618
+ bottom: "conv9_2_mbox_loc_perm"
1619
+ top: "conv9_2_mbox_loc_flat"
1620
+ flatten_param {
1621
+ axis: 1
1622
+ }
1623
+ }
1624
+ layer {
1625
+ name: "conv9_2_mbox_conf"
1626
+ type: "Convolution"
1627
+ bottom: "conv9_2_h"
1628
+ top: "conv9_2_mbox_conf"
1629
+ param {
1630
+ lr_mult: 1
1631
+ decay_mult: 1
1632
+ }
1633
+ param {
1634
+ lr_mult: 2
1635
+ decay_mult: 0
1636
+ }
1637
+ convolution_param {
1638
+ num_output: 8 # 84
1639
+ pad: 1
1640
+ kernel_size: 3
1641
+ stride: 1
1642
+ weight_filler {
1643
+ type: "xavier"
1644
+ }
1645
+ bias_filler {
1646
+ type: "constant"
1647
+ value: 0
1648
+ }
1649
+ }
1650
+ }
1651
+ layer {
1652
+ name: "conv9_2_mbox_conf_perm"
1653
+ type: "Permute"
1654
+ bottom: "conv9_2_mbox_conf"
1655
+ top: "conv9_2_mbox_conf_perm"
1656
+ permute_param {
1657
+ order: 0
1658
+ order: 2
1659
+ order: 3
1660
+ order: 1
1661
+ }
1662
+ }
1663
+ layer {
1664
+ name: "conv9_2_mbox_conf_flat"
1665
+ type: "Flatten"
1666
+ bottom: "conv9_2_mbox_conf_perm"
1667
+ top: "conv9_2_mbox_conf_flat"
1668
+ flatten_param {
1669
+ axis: 1
1670
+ }
1671
+ }
1672
+ layer {
1673
+ name: "conv9_2_mbox_priorbox"
1674
+ type: "PriorBox"
1675
+ bottom: "conv9_2_h"
1676
+ bottom: "data"
1677
+ top: "conv9_2_mbox_priorbox"
1678
+ prior_box_param {
1679
+ min_size: 264.0
1680
+ max_size: 315.0
1681
+ aspect_ratio: 2
1682
+ flip: true
1683
+ clip: false
1684
+ variance: 0.1
1685
+ variance: 0.1
1686
+ variance: 0.2
1687
+ variance: 0.2
1688
+ step: 300
1689
+ offset: 0.5
1690
+ }
1691
+ }
1692
+ layer {
1693
+ name: "mbox_loc"
1694
+ type: "Concat"
1695
+ bottom: "conv4_3_norm_mbox_loc_flat"
1696
+ bottom: "fc7_mbox_loc_flat"
1697
+ bottom: "conv6_2_mbox_loc_flat"
1698
+ bottom: "conv7_2_mbox_loc_flat"
1699
+ bottom: "conv8_2_mbox_loc_flat"
1700
+ bottom: "conv9_2_mbox_loc_flat"
1701
+ top: "mbox_loc"
1702
+ concat_param {
1703
+ axis: 1
1704
+ }
1705
+ }
1706
+ layer {
1707
+ name: "mbox_conf"
1708
+ type: "Concat"
1709
+ bottom: "conv4_3_norm_mbox_conf_flat"
1710
+ bottom: "fc7_mbox_conf_flat"
1711
+ bottom: "conv6_2_mbox_conf_flat"
1712
+ bottom: "conv7_2_mbox_conf_flat"
1713
+ bottom: "conv8_2_mbox_conf_flat"
1714
+ bottom: "conv9_2_mbox_conf_flat"
1715
+ top: "mbox_conf"
1716
+ concat_param {
1717
+ axis: 1
1718
+ }
1719
+ }
1720
+ layer {
1721
+ name: "mbox_priorbox"
1722
+ type: "Concat"
1723
+ bottom: "conv4_3_norm_mbox_priorbox"
1724
+ bottom: "fc7_mbox_priorbox"
1725
+ bottom: "conv6_2_mbox_priorbox"
1726
+ bottom: "conv7_2_mbox_priorbox"
1727
+ bottom: "conv8_2_mbox_priorbox"
1728
+ bottom: "conv9_2_mbox_priorbox"
1729
+ top: "mbox_priorbox"
1730
+ concat_param {
1731
+ axis: 2
1732
+ }
1733
+ }
1734
+
1735
+ layer {
1736
+ name: "mbox_conf_reshape"
1737
+ type: "Reshape"
1738
+ bottom: "mbox_conf"
1739
+ top: "mbox_conf_reshape"
1740
+ reshape_param {
1741
+ shape {
1742
+ dim: 0
1743
+ dim: -1
1744
+ dim: 2
1745
+ }
1746
+ }
1747
+ }
1748
+ layer {
1749
+ name: "mbox_conf_softmax"
1750
+ type: "Softmax"
1751
+ bottom: "mbox_conf_reshape"
1752
+ top: "mbox_conf_softmax"
1753
+ softmax_param {
1754
+ axis: 2
1755
+ }
1756
+ }
1757
+ layer {
1758
+ name: "mbox_conf_flatten"
1759
+ type: "Flatten"
1760
+ bottom: "mbox_conf_softmax"
1761
+ top: "mbox_conf_flatten"
1762
+ flatten_param {
1763
+ axis: 1
1764
+ }
1765
+ }
1766
+
1767
+ layer {
1768
+ name: "detection_out"
1769
+ type: "DetectionOutput"
1770
+ bottom: "mbox_loc"
1771
+ bottom: "mbox_conf_flatten"
1772
+ bottom: "mbox_priorbox"
1773
+ top: "detection_out"
1774
+ include {
1775
+ phase: TEST
1776
+ }
1777
+ detection_output_param {
1778
+ num_classes: 2
1779
+ share_location: true
1780
+ background_label_id: 0
1781
+ nms_param {
1782
+ nms_threshold: 0.45
1783
+ top_k: 400
1784
+ }
1785
+ code_type: CENTER_SIZE
1786
+ keep_top_k: 200
1787
+ confidence_threshold: 0.01
1788
+ }
1789
+ }
utils/face_detector.py ADDED
@@ -0,0 +1,105 @@
1
+ """Face detector, used only for the demo to crop faces, as datasets have already been face-cropped"""
2
+
3
+ import os
4
+ import cv2
5
+ import numpy as np
6
+
7
+
8
+ DEFAULT_FACE_DETECTOR = "utils/res10_300x300_ssd_iter_140000_fp16.caffemodel"
9
+ DEFAULT_DEPLOY = "utils/deploy.prototxt"
10
+
11
+ def enclosing_square(rect):
12
+ # Build a square that encloses the input rectangle
13
+ x, y, w, h = rect
14
+ side = max(w, h)
15
+ # Center the square on the original bounding box
16
+ cx = x + w // 2
17
+ cy = y + h // 2
18
+ x_new = cx - side // 2
19
+ y_new = cy - side // 2
20
+ return (x_new, y_new, side, side)
21
+
22
+
23
+ def cut(frame, roi):
24
+ pA = (int(roi[0]), int(roi[1]))
25
+ pB = (int(roi[0] + roi[2]), int(roi[1] + roi[3])) # pB will be an internal point
26
+ W, H = frame.shape[1], frame.shape[0]
27
+ A0 = pA[0] if pA[0] >= 0 else 0
28
+ A1 = pA[1] if pA[1] >= 0 else 0
29
+ data = frame[A1:pB[1], A0:pB[0]]
30
+ if pB[0] < W and pB[1] < H and pA[0] >= 0 and pA[1] >= 0:
31
+ return data
32
+ w, h = int(roi[2]), int(roi[3])
33
+ img = np.zeros((h, w, frame.shape[2]), dtype=np.uint8)
34
+ offX = int(-roi[0]) if roi[0] < 0 else 0
35
+ offY = int(-roi[1]) if roi[1] < 0 else 0
36
+ np.copyto(img[offY:offY + data.shape[0], offX:offX + data.shape[1]], data)
37
+ return img
38
+
39
+ class FaceDetector:
40
+ """Face detector to spot faces inside a picture."""
41
+ def __init__(self, face_detector = DEFAULT_FACE_DETECTOR, deploy=DEFAULT_DEPLOY, confidence_threshold=0.8):
42
+ self.detector = cv2.dnn.readNetFromCaffe(deploy, face_detector)
43
+ self.confidence_threshold = confidence_threshold
44
+
45
+ def detect(self, image, pad_rect=True):
46
+ blob = cv2.dnn.blobFromImage(image, 1.0, (300, 300), [104, 117, 123], False, False)
47
+ frameHeight, frameWidth, channels = image.shape
48
+ self.detector.setInput(blob)
49
+ detections = self.detector.forward()
50
+
51
+ faces_result = []
52
+ for i in range(detections.shape[2]):
53
+ confidence = detections[0, 0, i, 2]
54
+ if confidence > self.confidence_threshold:
55
+ x1 = int(detections[0, 0, i, 3] * frameWidth)
56
+ y1 = int(detections[0, 0, i, 4] * frameHeight)
57
+ x2 = int(detections[0, 0, i, 5] * frameWidth)
58
+ y2 = int(detections[0, 0, i, 6] * frameHeight)
59
+ f = (x1, y1, x2 - x1, y2 - y1) # bbox: (x, y, w, h)
60
+ if f[2] > 1 and f[3] > 1:
61
+ rect = enclosing_square(f) if pad_rect else f
62
+ img_crop = cut(image, rect)
63
+ if img_crop.shape[0] > 0 and img_crop.shape[1] > 0:
64
+ faces_result.append((img_crop, confidence, rect)) # use the (square) rect as the final bbox
65
+ if len(faces_result) == 0:
66
+ return None
67
+ return faces_result
68
+
69
+ if __name__ == "__main__":
70
+ input_folder = "src/demo_images"
71
+ output_crop_folder = "./test/detector/crop"
72
+ output_bbox_folder = "./test/detector/bbox"
73
+
74
+ os.makedirs(output_crop_folder, exist_ok=True)
75
+ os.makedirs(output_bbox_folder, exist_ok=True)
76
+
77
+ face_detector = FaceDetector(confidence_threshold=0.8)
78
+
79
+ image_files = sorted([
80
+ f for f in os.listdir(input_folder)
81
+ if os.path.isfile(os.path.join(input_folder, f))
82
+ ])
83
+
84
+ for img_file in image_files:
85
+ img_path = os.path.join(input_folder, img_file)
86
+ img = cv2.imread(img_path)
87
+ if img is None:
88
+ continue
89
+
90
+ faces = face_detector.detect(img, pad_rect=True)
91
+ base_name = os.path.splitext(os.path.basename(img_path))[0]
92
+
93
+ if faces is not None:
94
+ # Save the face crops
95
+ for idx, (crop, confidence, bbox) in enumerate(faces):
96
+ crop_path = os.path.join(output_crop_folder, f"{base_name}_face{idx}.jpg")
97
+ cv2.imwrite(crop_path, crop)
98
+
99
+ # Save the original image with the square bounding boxes drawn
100
+ img_bbox = img.copy()
101
+ for idx, (_, _, bbox) in enumerate(faces):
102
+ x, y, w, h = bbox # bbox is already square
103
+ cv2.rectangle(img_bbox, (x, y), (x + w, y + h), (0, 0, 255), 2) # red in BGR
104
+ bbox_path = os.path.join(output_bbox_folder, f"{base_name}_bbox.jpg")
105
+ cv2.imwrite(bbox_path, img_bbox)
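A small sketch of how a detected crop could be handed to one of the backbone transforms from utils/commons.py; the image path and the backbone checkpoint are illustrative assumptions:

import cv2
from PIL import Image
from utils.commons import get_backbone
from utils.face_detector import FaceDetector

backbone, transform, embed_dim = get_backbone('google/siglip2-base-patch16-224')  # checkpoint assumed
detector = FaceDetector(confidence_threshold=0.8)

img = cv2.imread('src/demo_images/example.jpg')  # path assumed
faces = detector.detect(img, pad_rect=True)
if faces is not None:
    crop, confidence, bbox = faces[0]
    # OpenCV loads images as BGR; convert to RGB before building the PIL image
    pil_crop = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
    batch = transform(pil_crop).unsqueeze(0)  # (1, 3, H, W) tensor for the backbone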
utils/task_config.py ADDED
@@ -0,0 +1,21 @@
1
+ """Task class definition."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import List, Type
5
+ import torch.nn as nn
6
+
7
+
8
+
9
+ @dataclass
10
+ class Task:
11
+ """Encapsulates all configuration for a single task."""
12
+ name: str
13
+ class_labels: List[str]
14
+ criterion: Type[nn.Module]
15
+ weight: float = 1.0
16
+ use_weighted_loss: bool = False
17
+
18
+ @property
19
+ def num_classes(self) -> int:
20
+ return len(self.class_labels)
21
+
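A quick illustration of how a Task could be declared; the labels and loss below are assumptions rather than the repository's actual configuration:

import torch.nn as nn
from utils.task_config import Task

gender_task = Task(
    name='gender',
    class_labels=['male', 'female'],
    criterion=nn.CrossEntropyLoss,  # the criterion class itself, not an instance
    weight=1.0,
)
print(gender_task.num_classes)  # -> 2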