Spaces:

Sigdev
/

comment_validator

Sleeping

aurelien

Edit script for GPU

845c5fd 3 months ago

4.21 kB

	import logging
	from typing import List

	import joblib
	import numpy as np
	import pandas as pd
	import torch
	from fastapi import FastAPI
	from pydantic import BaseModel
	from sentence_transformers import SentenceTransformer
	from transformers import pipeline

	# Application object; the @app.post routes in this file register on it.
	app = FastAPI(
	    title="Comment Validator API",
	)

	# =====================================
	# 🔹 Model loading
	# =====================================

	# Pick the torch device used by every model below: CUDA when available,
	# then the MPS backend, otherwise the CPU.
	# `torch.backends.mps` is looked up defensively so that a torch build
	# without that submodule falls through to the CPU instead of raising
	# AttributeError at import time.
	_mps_backend = getattr(torch.backends, "mps", None)

	if torch.cuda.is_available():
	    device = "cuda"
	elif _mps_backend is not None and _mps_backend.is_available():
	    device = "mps"  # for local runs on a Mac
	else:
	    device = "cpu"
	print(f"🧠 Using device: {device}")

	# Sentence-embedding model; analyze_comment uses it to vectorise the text.
	print("Loading model embedding")
	text_model = SentenceTransformer(
	    "paraphrase-multilingual-MiniLM-L12-v2",
	    device=device,
	)

	# NOTE(review): joblib.load unpickles the file it is given, so these two
	# artifacts must come from a trusted source.
	print("Loading model classifier")
	clf = joblib.load("models/classifier.joblib")

	print("Loading model encoder")
	encoder = joblib.load("models/encoder.joblib")

	print("Loading model sentiment-analysis")
	sentiment_analyzer = pipeline(
	    task="sentiment-analysis",
	    model="nlptown/bert-base-multilingual-uncased-sentiment",
	    device=device,
	)

	# return_all_scores=True is kept on purpose: analyze_comment takes `[0]`
	# of the result and iterates over one score entry per label.
	# NOTE(review): recent transformers releases reportedly deprecate this flag
	# in favour of top_k=None, which changes the output nesting - confirm
	# before migrating.
	print("Loading model toxicity")
	toxicity_analyzer = pipeline(
	    task="text-classification",
	    model="unitary/toxic-bert",
	    return_all_scores=True,
	    device=device,
	)

	# Both transformers pipelines receive at most this many leading characters.
	_MAX_ANALYZED_CHARS = 512
	# Probability (class index 1) at or above which a comment is reported valid.
	_VALIDITY_THRESHOLD = 0.5
	# Probability below which an explicit "low probability" reason is added.
	_LOW_PROBA_THRESHOLD = 0.4
	# Comments with fewer whitespace-separated words than this are flagged short.
	_MIN_WORDS = 3


	def analyze_comment(comment: str, category: str, country: str) -> dict:
	    """Score one comment for validity, sentiment and toxicity.

	    Args:
	        comment: Raw comment text. The full text is embedded; only the first
	            ``_MAX_ANALYZED_CHARS`` characters go to the sentiment and
	            toxicity pipelines.
	        category: Category value handed to the fitted ``encoder``.
	        country: Country value handed to the fitted ``encoder``.

	    Returns:
	        A flat dict with:
	        ``is_valid`` (bool, probability >= ``_VALIDITY_THRESHOLD``),
	        ``confidence`` (classifier probability at index 1, 3 decimals),
	        ``sentiment`` (label returned by the sentiment pipeline, or
	        ``"unknown"`` on failure),
	        ``sentiment_score`` (the pipeline's score for that label, not the
	        -1/0/1 polarity used internally),
	        ``reasons`` (``"; "``-joined messages, or a default sentence), and
	        one ``tag_<label>`` entry per toxicity label.

	    A failure of either pipeline is logged and replaced by neutral values
	    instead of propagating. Errors from the embedding model or the classifier
	    are not caught.
	    """
	    logger = logging.getLogger(__name__)
	    reasons = []
	    snippet = comment[:_MAX_ANALYZED_CHARS]

	    # --- Sentiment analysis ---
	    try:
	        sentiment = sentiment_analyzer(snippet)[0]
	        label = sentiment["label"]
	        score = sentiment["score"]
	    except Exception:
	        # Best effort: keep serving the request, but leave a trace.
	        logger.exception("Sentiment analysis failed; reporting 'unknown'")
	        label, score = "unknown", 0.0

	    # Map the label to a polarity by the digit it contains
	    # (presumably "1 star" .. "5 stars" - confirm against the model card).
	    if "1" in label or "2" in label:
	        sentiment_score = -1
	        reasons.append("Le ton semble négatif ou insatisfait.")
	    elif "4" in label or "5" in label:
	        sentiment_score = 1
	    else:
	        sentiment_score = 0

	    # --- Text embedding ---
	    X_text = text_model.encode([comment])

	    # --- Category / country encoding ---
	    df_cat = pd.DataFrame([[category, country]], columns=["category", "country"])
	    try:
	        X_cat = encoder.transform(df_cat)
	    except ValueError:
	        # Unknown value: substitute an all-zero row whose width is the total
	        # number of categories the encoder was fitted with.
	        reasons.append(f"Catégorie ou pays inconnus : {category}, {country}")
	        n_features = sum(len(cats) for cats in encoder.categories_)
	        X_cat = np.zeros((1, n_features))

	    # --- Feature concatenation ---
	    X = np.concatenate([X_text, X_cat], axis=1)

	    # --- Validity prediction ---
	    proba = clf.predict_proba(X)[0][1]
	    prediction = proba >= _VALIDITY_THRESHOLD

	    if len(comment.split()) < _MIN_WORDS:
	        reasons.append("Le commentaire est trop court.")
	    # NOTE(review): a negative sentiment already added a reason above, so
	    # two tone-related messages are emitted for the same signal.
	    if sentiment_score < 0:
	        reasons.append("Le ton global est négatif.")
	    # NOTE(review): for probabilities in [0.4, 0.5) the comment is reported
	    # invalid without this reason, so `reasons` may still say no anomaly.
	    if proba < _LOW_PROBA_THRESHOLD:
	        reasons.append("Le modèle estime une faible probabilité de validité.")

	    # --- Toxicity analysis ---
	    try:
	        tox_scores = toxicity_analyzer(snippet)[0]
	        tags = {
	            f"tag_{item['label']}": round(item["score"], 3)
	            for item in tox_scores
	        }
	    except Exception:
	        logger.exception("Toxicity analysis failed; reporting zero scores")
	        # Take the label names from the loaded model's own config so the
	        # fallback exposes the same tag_* keys as a successful run.
	        tox_labels = toxicity_analyzer.model.config.id2label.values()
	        tags = {f"tag_{name}": 0.0 for name in tox_labels}

	    # --- Final result ---
	    result = {
	        "is_valid": bool(prediction),
	        "confidence": round(float(proba), 3),
	        "sentiment": label,
	        "sentiment_score": round(float(score), 3),
	        "reasons": "; ".join(reasons) if reasons else "Aucune anomalie détectée.",
	    }
	    result.update(tags)
	    return result


	# =====================================
	# 🔸 Request/response models
	# =====================================

	# Request payload describing one comment to analyse.
	# Documented with comments, not a docstring, so the generated schema
	# for this model is unchanged.
	class CommentRequest(BaseModel):
	    # Free-text comment.
	    comment: str
	    # Category and country are forwarded together to the fitted encoder.
	    category: str
	    country: str

	# Request payload for /batch_predict: a list of individual comment payloads.
	class BatchRequest(BaseModel):
	    # Analysed one by one, in the order given.
	    items: List[CommentRequest]

	# =====================================
	# 🔹 Routes
	# =====================================

	@app.post("/predict")
	def predict(item: CommentRequest):
	    """Analyse un seul commentaire"""
	    # English: "Analyse a single comment". The docstring is kept verbatim
	    # because FastAPI publishes it as the endpoint description.
	    return analyze_comment(
	        comment=item.comment,
	        category=item.category,
	        country=item.country,
	    )


	@app.post("/batch_predict")
	def batch_predict(request: BatchRequest):
	    """Analyse plusieurs commentaires à la fois"""
	    # English: "Analyse several comments at once". The docstring is kept
	    # verbatim because FastAPI publishes it as the endpoint description.
	    # Each entry is analysed independently, preserving request order.
	    analyses = [
	        analyze_comment(entry.comment, entry.category, entry.country)
	        for entry in request.items
	    ]
	    return {"results": analyses}