|
|
""" |
|
|
Pipeline de ML Profissional com MLOps |
|
|
Sistema completo de treinamento, versionamento e deployment de modelos |
|
|
""" |
|
|
|
|
|
import asyncio
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

# Optional experiment-tracking backends (MLflow and Weights & Biases)
|
try:
    import mlflow
    import mlflow.pytorch
    MLFLOW_AVAILABLE = True
except ImportError:
    MLFLOW_AVAILABLE = False

try:
    import wandb
    WANDB_AVAILABLE = True
except ImportError:
    WANDB_AVAILABLE = False
|
|
|
|
|
from pydantic import BaseModel
import structlog

logger = structlog.get_logger(__name__)
|
|
|
|
|
|
|
|
class ModelType(Enum):
    """Model types supported by the pipeline."""
    ANOMALY_DETECTOR = "anomaly_detector"
    FINANCIAL_ANALYZER = "financial_analyzer"
    LEGAL_COMPLIANCE = "legal_compliance"
    ENSEMBLE = "ensemble"


class TrainingStatus(Enum):
    """Lifecycle status of a training run."""
    PENDING = "pending"
    PREPROCESSING = "preprocessing"
    TRAINING = "training"
    VALIDATING = "validating"
    COMPLETED = "completed"
    FAILED = "failed"
|
|
|
|
|
|
|
|
@dataclass
class ModelMetrics:
    """Evaluation metrics for a trained model."""
    accuracy: float = 0.0
    precision: float = 0.0
    recall: float = 0.0
    f1_score: float = 0.0
    auc_roc: float = 0.0
    loss: float = 0.0
    val_accuracy: float = 0.0
    val_loss: float = 0.0
    inference_time_ms: float = 0.0
    model_size_mb: float = 0.0
    timestamp: datetime = field(default_factory=datetime.utcnow)
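
# Usage sketch (illustrative only, not part of the pipeline API): serializing a
# run's metrics for an API response; dataclasses.asdict plus isoformat is one
# minimal approach.
#
#     from dataclasses import asdict
#     m = ModelMetrics(accuracy=0.91, f1_score=0.89)
#     payload = {**asdict(m), "timestamp": m.timestamp.isoformat()}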
|
|
|
|
|
|
|
|
@dataclass
class TrainingRun:
    """Record of a single training run."""
    id: str
    model_type: ModelType
    status: TrainingStatus
    config: Dict[str, Any]
    metrics: Optional[ModelMetrics] = None
    artifacts_path: Optional[str] = None
    error_message: Optional[str] = None
    created_at: datetime = field(default_factory=datetime.utcnow)
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    experiment_id: Optional[str] = None
|
|
|
|
|
|
|
|
class MLPipelineConfig(BaseModel):
    """ML pipeline configuration."""

    # Model
    model_name: str = "cidadao-transparency-model"
    model_version: str = "1.0.0"
    base_model: str = "neuralmind/bert-base-portuguese-cased"

    # Training hyperparameters
    learning_rate: float = 2e-5
    batch_size: int = 16
    num_epochs: int = 10
    warmup_steps: int = 500
    weight_decay: float = 0.01
    max_length: int = 512

    # Data splits
    train_split: float = 0.7
    val_split: float = 0.15
    test_split: float = 0.15
    min_samples_per_class: int = 100
    data_augmentation: bool = True

    # Hardware (device is resolved once, at import time)
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    num_workers: int = 4
    pin_memory: bool = True
    mixed_precision: bool = True

    # MLOps
    experiment_tracking: bool = True
    model_registry: bool = True
    auto_deployment: bool = False
    artifacts_dir: str = "./models/artifacts"
    models_dir: str = "./models/trained"

    # Training control
    early_stopping_patience: int = 3
    gradient_accumulation_steps: int = 1  # reserved; not yet applied by the training loop
    max_grad_norm: float = 1.0

    # Evaluation cadence
    eval_steps: int = 500
    save_steps: int = 1000
    logging_steps: int = 100
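
# Usage sketch (values illustrative, not tuned): overriding defaults for a
# quick CPU smoke test. mixed_precision is ignored off-GPU by train_model().
#
#     cfg = MLPipelineConfig(device="cpu", batch_size=4, num_epochs=1)
#     manager = MLPipelineManager(cfg)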
|
|
|
|
|
|
|
|
class TransparencyDataset(Dataset):
    """Dataset over transparency (public procurement) texts."""

    def __init__(self, texts: List[str], labels: List[int], tokenizer, max_length: int = 512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }
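
# Usage sketch (assumes the Portuguese BERT tokenizer can be downloaded):
#
#     tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
#     ds = TransparencyDataset(["Objeto: compra de insumos"], [0], tokenizer, max_length=128)
#     item = ds[0]  # dict of input_ids/attention_mask (length 128) and a scalar label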
|
|
|
|
|
|
|
|
class TransparencyClassifier(nn.Module):
    """Multi-task classifier specialized for transparency data."""

    def __init__(self, model_name: str, num_labels: int = 3, dropout: float = 0.3):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)

        hidden_size = self.bert.config.hidden_size

        # Anomaly head: num_labels classes (default 3: Normal/Suspicious/Anomalous)
        self.anomaly_classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, num_labels)
        )

        # Financial-risk head: 5 classes (Very Low .. Very High)
        self.financial_classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, 5)
        )

        # Legal-compliance head: binary (Non-Compliant/Compliant)
        self.legal_classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 4),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 4, 2)
        )

        # Scalar confidence estimate in [0, 1]
        self.confidence_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 4),
            nn.ReLU(),
            nn.Linear(hidden_size // 4, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask, labels=None, task="anomaly"):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)

        # All heads share the pooled [CLS] representation
        anomaly_logits = self.anomaly_classifier(pooled_output)
        financial_logits = self.financial_classifier(pooled_output)
        legal_logits = self.legal_classifier(pooled_output)
        confidence = self.confidence_head(pooled_output)

        outputs = {
            'anomaly_logits': anomaly_logits,
            'financial_logits': financial_logits,
            'legal_logits': legal_logits,
            'confidence': confidence
        }

        if labels is not None:
            if task == "anomaly":
                loss = F.cross_entropy(anomaly_logits, labels)
            elif task == "financial":
                loss = F.cross_entropy(financial_logits, labels)
            elif task == "legal":
                loss = F.cross_entropy(legal_logits, labels)
            else:
                # Joint multi-task training: here `labels` must be a dict
                # mapping task names to label tensors.
                loss = 0
                if 'anomaly' in labels:
                    loss += F.cross_entropy(anomaly_logits, labels['anomaly'])
                if 'financial' in labels:
                    loss += F.cross_entropy(financial_logits, labels['financial'])
                if 'legal' in labels:
                    loss += F.cross_entropy(legal_logits, labels['legal'])

            outputs['loss'] = loss

        return outputs
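
# Usage sketch for the joint-loss branch of forward() above: when `task` is
# none of the three known heads, `labels` must be a dict of per-task tensors.
#
#     labels = {"anomaly": torch.tensor([0]), "legal": torch.tensor([1])}
#     out = model(input_ids, attention_mask, labels=labels, task="joint")
#     out["loss"]  # sum of the per-task cross-entropy losses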
|
|
|
|
|
|
|
|
class MLPipelineManager:
    """High-level manager for the ML pipeline."""

    def __init__(self, config: MLPipelineConfig):
        self.config = config
        self.device = torch.device(config.device)

        Path(config.artifacts_dir).mkdir(parents=True, exist_ok=True)
        Path(config.models_dir).mkdir(parents=True, exist_ok=True)

        self.training_runs: Dict[str, TrainingRun] = {}
        self.models: Dict[str, Any] = {}
        self._tokenizer = None  # lazily loaded, shared by prepare_data/predict

        self._setup_experiment_tracking()

    def _get_tokenizer(self):
        """Lazily load and cache the tokenizer for the configured base model."""
        if self._tokenizer is None:
            self._tokenizer = AutoTokenizer.from_pretrained(self.config.base_model)
        return self._tokenizer

    def _setup_experiment_tracking(self):
        """Configure experiment-tracking backends."""
        if not self.config.experiment_tracking:
            return

        if MLFLOW_AVAILABLE:
            try:
                mlflow.set_experiment(f"cidadao-ai-{self.config.model_name}")
                logger.info("✅ MLflow experiment tracking configured")
            except Exception as e:
                logger.warning(f"⚠️ MLflow setup failed: {e}")

        if WANDB_AVAILABLE:
            try:
                # wandb.init() is deferred to train_model(); authentication is
                # expected out of band (e.g. the WANDB_API_KEY environment variable).
                logger.info("✅ W&B tracking available")
            except Exception as e:
                logger.warning(f"⚠️ W&B setup failed: {e}")
|
|
|
|
|
    async def prepare_data(self,
                           contracts_data: List[Dict[str, Any]],
                           model_type: ModelType = ModelType.ANOMALY_DETECTOR) -> Tuple[DataLoader, DataLoader, DataLoader]:
        """Prepare train/validation/test loaders for training."""
        logger.info(f"🔄 Preparing data for {model_type.value}...")

        texts = []
        labels = []

        for contract in contracts_data:
            text = self._create_contract_text(contract)
            texts.append(text)

            # Heuristic weak labels, one scheme per model type
            if model_type == ModelType.ANOMALY_DETECTOR:
                label = self._generate_anomaly_label(contract)
            elif model_type == ModelType.FINANCIAL_ANALYZER:
                label = self._generate_financial_label(contract)
            elif model_type == ModelType.LEGAL_COMPLIANCE:
                label = self._generate_legal_label(contract)
            else:
                label = 0

            labels.append(label)

        # First split off the training set, then divide the remainder between
        # validation and test according to the configured proportions.
        train_texts, temp_texts, train_labels, temp_labels = train_test_split(
            texts, labels,
            test_size=(1 - self.config.train_split),
            random_state=42,
            stratify=labels
        )

        val_size = self.config.val_split / (self.config.val_split + self.config.test_split)
        val_texts, test_texts, val_labels, test_labels = train_test_split(
            temp_texts, temp_labels,
            test_size=(1 - val_size),
            random_state=42,
            stratify=temp_labels
        )

        tokenizer = self._get_tokenizer()

        train_dataset = TransparencyDataset(train_texts, train_labels, tokenizer, self.config.max_length)
        val_dataset = TransparencyDataset(val_texts, val_labels, tokenizer, self.config.max_length)
        test_dataset = TransparencyDataset(test_texts, test_labels, tokenizer, self.config.max_length)

        train_loader = DataLoader(
            train_dataset,
            batch_size=self.config.batch_size,
            shuffle=True,
            num_workers=self.config.num_workers,
            pin_memory=self.config.pin_memory
        )
        val_loader = DataLoader(
            val_dataset,
            batch_size=self.config.batch_size,
            shuffle=False,
            num_workers=self.config.num_workers,
            pin_memory=self.config.pin_memory
        )
        test_loader = DataLoader(
            test_dataset,
            batch_size=self.config.batch_size,
            shuffle=False,
            num_workers=self.config.num_workers,
            pin_memory=self.config.pin_memory
        )

        logger.info(f"✅ Data prepared: {len(train_dataset)} train, {len(val_dataset)} validation, {len(test_dataset)} test")

        return train_loader, val_loader, test_loader
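
    # Usage sketch: building the loaders from raw contract dicts (field names
    # follow the Portal da Transparência naming used by _create_contract_text).
    #
    #     train_loader, val_loader, test_loader = await manager.prepare_data(
    #         contracts, ModelType.ANOMALY_DETECTOR)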
|
|
|
|
|
    def _create_contract_text(self, contract: Dict[str, Any]) -> str:
        """Build a descriptive text representation of a contract.

        Field names and values are kept in Portuguese: they mirror the Portal
        da Transparência API and feed a Portuguese BERT model.
        """
        parts = []

        if 'objeto' in contract:
            parts.append(f"Objeto: {contract['objeto']}")

        if 'valor' in contract or 'valorInicial' in contract:
            valor = contract.get('valor', contract.get('valorInicial', 0))
            parts.append(f"Valor: R$ {valor:,.2f}")

        if 'nomeRazaoSocialFornecedor' in contract:
            parts.append(f"Fornecedor: {contract['nomeRazaoSocialFornecedor']}")

        if 'modalidadeLicitacao' in contract:
            parts.append(f"Modalidade: {contract['modalidadeLicitacao']}")

        if 'situacao' in contract:
            parts.append(f"Situação: {contract['situacao']}")

        return ". ".join(parts)
|
|
|
|
|
    def _generate_anomaly_label(self, contract: Dict[str, Any]) -> int:
        """Generate an anomaly label (0=Normal, 1=Suspicious, 2=Anomalous)."""
        valor = contract.get('valor', contract.get('valorInicial', 0))
        modalidade = contract.get('modalidadeLicitacao', '').lower()

        # Simple additive heuristic; each red flag adds one point
        score = 0

        # Very high contract value
        if valor > 50_000_000:
            score += 1

        # Procurement modality that bypasses open bidding
        if any(word in modalidade for word in ['emergencial', 'dispensa', 'inexigibilidade']):
            score += 1

        # Missing or uninformative contract object
        if not contract.get('objeto') or len(contract.get('objeto', '')) < 10:
            score += 1

        return min(score, 2)
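
    # Worked example of the heuristic above: a R$ 60M contract awarded by
    # "dispensa" with an empty "objeto" scores 1 + 1 + 1 = 3, capped to 2
    # (Anomalous); the same value awarded via open bidding scores only 1.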
|
|
|
|
|
    def _generate_financial_label(self, contract: Dict[str, Any]) -> int:
        """Generate a financial-risk label (0=Very Low, 1=Low, 2=Medium, 3=High, 4=Very High)."""
        valor = contract.get('valor', contract.get('valorInicial', 0))

        if valor < 100_000:
            return 0
        elif valor < 1_000_000:
            return 1
        elif valor < 10_000_000:
            return 2
        elif valor < 50_000_000:
            return 3
        else:
            return 4
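
    # Worked example: valor = 5_000_000 falls in the [1M, 10M) band and maps
    # to label 2 (Medium risk).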
|
|
|
|
|
    def _generate_legal_label(self, contract: Dict[str, Any]) -> int:
        """Generate a legal-compliance label (0=Non-Compliant, 1=Compliant)."""
        modalidade = contract.get('modalidadeLicitacao', '').lower()

        # Competitive modalities are treated as compliant; match both accented
        # and unaccented spellings, since source data is inconsistent.
        if any(m in modalidade for m in ('pregão', 'pregao', 'concorrência', 'concorrencia')):
            return 1
        return 0
|
|
|
|
|
    async def train_model(self,
                          train_loader: DataLoader,
                          val_loader: DataLoader,
                          model_type: ModelType = ModelType.ANOMALY_DETECTOR) -> str:
        """Train a model and return the run id."""
        run_id = f"{model_type.value}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        training_run = TrainingRun(
            id=run_id,
            model_type=model_type,
            status=TrainingStatus.TRAINING,
            config=self.config.dict()
        )
        self.training_runs[run_id] = training_run

        try:
            logger.info(f"🚀 Starting training run {run_id}...")

            if WANDB_AVAILABLE and self.config.experiment_tracking:
                wandb.init(
                    project="cidadao-ai",
                    name=run_id,
                    config=self.config.dict()
                )

            if MLFLOW_AVAILABLE and self.config.experiment_tracking:
                mlflow.start_run(run_name=run_id)

            # Head sizes: 3 classes for anomaly, 5 for financial risk, 2 for legal
            num_labels = 3 if model_type == ModelType.ANOMALY_DETECTOR else (5 if model_type == ModelType.FINANCIAL_ANALYZER else 2)
            model = TransparencyClassifier(self.config.base_model, num_labels)
            model.to(self.device)

            optimizer = optim.AdamW(
                model.parameters(),
                lr=self.config.learning_rate,
                weight_decay=self.config.weight_decay
            )

            # Linear LR decay over the full schedule
            total_steps = len(train_loader) * self.config.num_epochs
            scheduler = optim.lr_scheduler.LinearLR(
                optimizer,
                start_factor=1.0,
                end_factor=0.1,
                total_iters=total_steps
            )

            # Mixed precision only makes sense on CUDA devices
            use_amp = self.config.mixed_precision and self.device.type == "cuda"
            scaler = torch.cuda.amp.GradScaler() if use_amp else None

            # The first token of the enum value ("anomaly", "financial",
            # "legal") selects the classification head inside the model.
            task = model_type.value.split('_')[0]
            task_key = f"{task}_logits"

            best_val_acc = 0.0
            patience_counter = 0
            global_step = 0

            training_run.started_at = datetime.utcnow()

            for epoch in range(self.config.num_epochs):
                logger.info(f"📚 Epoch {epoch + 1}/{self.config.num_epochs}")

                model.train()
                train_loss = 0.0
                train_correct = 0
                train_total = 0

                for batch_idx, batch in enumerate(train_loader):
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    labels = batch['label'].to(self.device)

                    optimizer.zero_grad()

                    if scaler is not None:
                        with torch.cuda.amp.autocast():
                            outputs = model(input_ids, attention_mask, labels, task=task)
                            loss = outputs['loss']
                        scaler.scale(loss).backward()
                        scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(model.parameters(), self.config.max_grad_norm)
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        outputs = model(input_ids, attention_mask, labels, task=task)
                        loss = outputs['loss']
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(), self.config.max_grad_norm)
                        optimizer.step()

                    scheduler.step()

                    train_loss += loss.item()

                    if task_key in outputs:
                        _, predicted = torch.max(outputs[task_key], 1)
                        train_total += labels.size(0)
                        train_correct += (predicted == labels).sum().item()

                    global_step += 1

                    if global_step % self.config.logging_steps == 0:
                        current_lr = scheduler.get_last_lr()[0]
                        logger.info(f"Step {global_step}, Loss: {loss.item():.4f}, LR: {current_lr:.2e}")

                        if WANDB_AVAILABLE and self.config.experiment_tracking:
                            wandb.log({
                                "train_loss": loss.item(),
                                "learning_rate": current_lr,
                                "step": global_step
                            })

                # Validate at the end of every epoch
                val_metrics = await self._validate_model(model, val_loader, model_type)
                train_acc = train_correct / train_total if train_total else 0.0
                logger.info(f"📊 Validation - Acc: {val_metrics.val_accuracy:.4f}, Loss: {val_metrics.val_loss:.4f} (train acc: {train_acc:.4f})")

                if val_metrics.val_accuracy > best_val_acc:
                    best_val_acc = val_metrics.val_accuracy
                    patience_counter = 0

                    # Checkpoint the best model seen so far
                    model_path = Path(self.config.models_dir) / f"{run_id}_best.pt"
                    torch.save({
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'config': self.config.dict(),
                        'metrics': val_metrics.__dict__,
                        'epoch': epoch
                    }, model_path)
                else:
                    patience_counter += 1

                if patience_counter >= self.config.early_stopping_patience:
                    logger.info(f"⏹️ Early stopping after {epoch + 1} epochs")
                    break

                if WANDB_AVAILABLE and self.config.experiment_tracking:
                    wandb.log({
                        "val_accuracy": val_metrics.val_accuracy,
                        "val_loss": val_metrics.val_loss,
                        "val_f1": val_metrics.f1_score,
                        "epoch": epoch
                    })

                if MLFLOW_AVAILABLE and self.config.experiment_tracking:
                    mlflow.log_metrics({
                        "val_accuracy": val_metrics.val_accuracy,
                        "val_loss": val_metrics.val_loss,
                        "val_f1": val_metrics.f1_score
                    }, step=epoch)

            # Final evaluation and bookkeeping
            final_metrics = await self._validate_model(model, val_loader, model_type)
            training_run.metrics = final_metrics
            training_run.status = TrainingStatus.COMPLETED
            training_run.completed_at = datetime.utcnow()

            final_model_path = Path(self.config.models_dir) / f"{run_id}_final.pt"
            torch.save({
                'model_state_dict': model.state_dict(),
                'config': self.config.dict(),
                'metrics': final_metrics.__dict__,
                'run_id': run_id
            }, final_model_path)

            training_run.artifacts_path = str(final_model_path)

            if self.config.model_registry:
                await self._register_model(run_id, final_model_path, final_metrics)

            logger.info(f"✅ Training run {run_id} completed successfully!")

            return run_id

        except Exception as e:
            training_run.status = TrainingStatus.FAILED
            training_run.error_message = str(e)
            training_run.completed_at = datetime.utcnow()
            logger.error(f"❌ Training run {run_id} failed: {e}")
            raise

        finally:
            # Close tracking sessions whether training succeeded or failed
            if WANDB_AVAILABLE and self.config.experiment_tracking:
                wandb.finish()
            if MLFLOW_AVAILABLE and self.config.experiment_tracking:
                mlflow.end_run()
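
    # Usage sketch for the full training flow (hedged: assumes the loaders
    # were built with prepare_data above):
    #
    #     run_id = await manager.train_model(train_loader, val_loader,
    #                                        ModelType.ANOMALY_DETECTOR)
    #     manager.get_training_status(run_id).status  # TrainingStatus.COMPLETED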
|
|
|
|
|
    async def _validate_model(self, model, val_loader: DataLoader, model_type: ModelType) -> ModelMetrics:
        """Evaluate the model on a validation loader."""
        model.eval()
        val_loss = 0.0
        all_predictions = []
        all_labels = []
        all_confidences = []

        task = model_type.value.split('_')[0]
        task_key = f"{task}_logits"

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                outputs = model(input_ids, attention_mask, labels, task=task)
                val_loss += outputs['loss'].item()

                if task_key in outputs:
                    _, predicted = torch.max(outputs[task_key], 1)
                    all_predictions.extend(predicted.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())
                    # Flatten (batch, 1) -> (batch,) so scores stack into a 1-D sequence
                    all_confidences.extend(outputs['confidence'].squeeze(-1).cpu().numpy())

        val_loss /= len(val_loader)

        accuracy = accuracy_score(all_labels, all_predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(
            all_labels, all_predictions, average='weighted', zero_division=0
        )

        # AUC only for binary tasks; the confidence head is used as the ranking score
        auc = 0.0
        if len(set(all_labels)) == 2:
            try:
                auc = roc_auc_score(all_labels, all_confidences)
            except ValueError:
                pass

        return ModelMetrics(
            accuracy=accuracy,
            precision=precision,
            recall=recall,
            f1_score=f1,
            auc_roc=auc,
            val_accuracy=accuracy,
            val_loss=val_loss,
            inference_time_ms=0.0
        )
|
|
|
|
|
    async def _register_model(self, run_id: str, model_path: Path, metrics: ModelMetrics):
        """Register the trained model in the model registry."""
        try:
            if MLFLOW_AVAILABLE:
                # mlflow.pytorch.log_model() expects a live nn.Module, not a
                # file path, so the serialized checkpoint is logged as a run
                # artifact along with its headline metrics.
                mlflow.log_artifact(str(model_path), artifact_path="model")
                mlflow.log_metrics({
                    "registered_val_accuracy": metrics.val_accuracy,
                    "registered_f1": metrics.f1_score
                })
                logger.info(f"✅ Model {run_id} registered in MLflow")

        except Exception as e:
            logger.error(f"❌ Failed to register model: {e}")
|
|
|
|
|
    async def load_model(self, run_id: str) -> Optional[TransparencyClassifier]:
        """Load a trained model, preferring the best checkpoint over the final one."""
        model_path = Path(self.config.models_dir) / f"{run_id}_best.pt"
        if not model_path.exists():
            model_path = Path(self.config.models_dir) / f"{run_id}_final.pt"

        if not model_path.exists():
            logger.error(f"❌ Model {run_id} not found")
            return None

        try:
            checkpoint = torch.load(model_path, map_location=self.device)
            state_dict = checkpoint['model_state_dict']

            # The anomaly head width depends on the model type it was trained
            # for, so recover it from the head's final linear layer.
            num_labels = state_dict['anomaly_classifier.3.weight'].shape[0]

            model = TransparencyClassifier(self.config.base_model, num_labels=num_labels)
            model.load_state_dict(state_dict)
            model.to(self.device)
            model.eval()

            self.models[run_id] = model

            logger.info(f"✅ Model {run_id} loaded")
            return model

        except Exception as e:
            logger.error(f"❌ Failed to load model {run_id}: {e}")
            return None
|
|
|
|
|
    async def predict(self, model: TransparencyClassifier, text: str, model_type: ModelType) -> Dict[str, Any]:
        """Run inference on a single text and return per-task results."""
        tokenizer = self._get_tokenizer()

        encoding = tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.config.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask)

        results = {}

        if 'anomaly_logits' in outputs:
            anomaly_probs = F.softmax(outputs['anomaly_logits'], dim=-1)
            anomaly_pred = torch.argmax(anomaly_probs, dim=-1)

            labels = ["Normal", "Suspicious", "Anomalous"]
            results["anomaly"] = {
                "label": labels[anomaly_pred.item()],
                "confidence": anomaly_probs.max().item(),
                "probabilities": anomaly_probs.squeeze().tolist()
            }

        if 'financial_logits' in outputs:
            financial_probs = F.softmax(outputs['financial_logits'], dim=-1)
            financial_pred = torch.argmax(financial_probs, dim=-1)

            labels = ["Very Low", "Low", "Medium", "High", "Very High"]
            results["financial"] = {
                "label": labels[financial_pred.item()],
                "confidence": financial_probs.max().item(),
                "probabilities": financial_probs.squeeze().tolist()
            }

        if 'legal_logits' in outputs:
            legal_probs = F.softmax(outputs['legal_logits'], dim=-1)
            legal_pred = torch.argmax(legal_probs, dim=-1)

            labels = ["Non-Compliant", "Compliant"]
            results["legal"] = {
                "label": labels[legal_pred.item()],
                "confidence": legal_probs.max().item(),
                "probabilities": legal_probs.squeeze().tolist()
            }

        if 'confidence' in outputs:
            results["overall_confidence"] = outputs['confidence'].item()

        return results
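
    # Usage sketch: load-then-predict round trip. The input text stays in
    # Portuguese because the base model is Portuguese BERT.
    #
    #     model = await manager.load_model(run_id)
    #     result = await manager.predict(model, "Contrato emergencial sem licitação",
    #                                    ModelType.ANOMALY_DETECTOR)
    #     result["anomaly"]["label"]  # e.g. "Suspicious"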
|
|
|
|
|
    def get_training_status(self, run_id: str) -> Optional[TrainingRun]:
        """Return the training run record, if any."""
        return self.training_runs.get(run_id)

    def list_models(self) -> List[Dict[str, Any]]:
        """List available model checkpoints on disk."""
        models = []
        models_dir = Path(self.config.models_dir)

        for model_file in models_dir.glob("*.pt"):
            try:
                checkpoint = torch.load(model_file, map_location='cpu')
                models.append({
                    "filename": model_file.name,
                    "run_id": checkpoint.get('run_id', 'unknown'),
                    "metrics": checkpoint.get('metrics', {}),
                    "created": datetime.fromtimestamp(model_file.stat().st_mtime)
                })
            except Exception:
                # Skip unreadable or incompatible checkpoints
                continue

        return models
|
|
|
|
|
|
|
|
|
|
|
# Module-level singleton
_ml_pipeline_manager: Optional[MLPipelineManager] = None


async def get_ml_pipeline_manager() -> MLPipelineManager:
    """Return the singleton ML pipeline manager, creating it on first use."""
    global _ml_pipeline_manager

    if _ml_pipeline_manager is None:
        config = MLPipelineConfig()
        _ml_pipeline_manager = MLPipelineManager(config)

    return _ml_pipeline_manager
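
# Usage sketch (an assumption: the surrounding app uses FastAPI-style
# dependency injection; `app` and `Depends` are not defined in this module):
#
#     @app.get("/ml/models")
#     async def list_models(manager: MLPipelineManager = Depends(get_ml_pipeline_manager)):
#         return manager.list_models()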
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
import asyncio |
|
|
|
|
|
async def test_ml_pipeline(): |
|
|
"""Teste do pipeline ML""" |
|
|
|
|
|
print("🧪 Testando pipeline ML...") |
|
|
|
|
|
|
|
|
pipeline = await get_ml_pipeline_manager() |
|
|
|
|
|
|
|
|
mock_contracts = [ |
|
|
{ |
|
|
"objeto": "Aquisição de equipamentos médicos", |
|
|
"valor": 5000000, |
|
|
"nomeRazaoSocialFornecedor": "Empresa XYZ", |
|
|
"modalidadeLicitacao": "Pregão Eletrônico" |
|
|
}, |
|
|
{ |
|
|
"objeto": "Obra de construção hospitalar", |
|
|
"valor": 100000000, |
|
|
"nomeRazaoSocialFornecedor": "Construtora ABC", |
|
|
"modalidadeLicitacao": "Dispensa de Licitação" |
|
|
} |
|
|
] * 50 |
|
|
|
|
|
try: |
|
|
|
|
|
train_loader, val_loader, test_loader = await pipeline.prepare_data( |
|
|
mock_contracts, |
|
|
ModelType.ANOMALY_DETECTOR |
|
|
) |
|
|
|
|
|
print(f"✅ Dados preparados: {len(train_loader)} batches de treino") |
|
|
|
|
|
|
|
|
pipeline.config.num_epochs = 1 |
|
|
|
|
|
run_id = await pipeline.train_model( |
|
|
train_loader, |
|
|
val_loader, |
|
|
ModelType.ANOMALY_DETECTOR |
|
|
) |
|
|
|
|
|
print(f"✅ Modelo treinado: {run_id}") |
|
|
|
|
|
|
|
|
model = await pipeline.load_model(run_id) |
|
|
if model: |
|
|
result = await pipeline.predict( |
|
|
model, |
|
|
"Contrato emergencial de R$ 50 milhões sem licitação", |
|
|
ModelType.ANOMALY_DETECTOR |
|
|
) |
|
|
print(f"✅ Predição: {result}") |
|
|
|
|
|
|
|
|
models = pipeline.list_models() |
|
|
print(f"✅ Modelos disponíveis: {len(models)}") |
|
|
|
|
|
except Exception as e: |
|
|
print(f"❌ Erro no teste: {e}") |
|
|
|
|
|
print("✅ Teste concluído!") |
|
|
|
|
|
asyncio.run(test_ml_pipeline()) |