anderson-ufrj committed on
Commit
340f8ae
·
1 Parent(s): 6930a0b

fix: resolve HuggingFace deployment errors with Prometheus metrics and OpenTelemetry

Browse files

- Fix Prometheus metrics duplication error by using custom registry
- Implement metrics caching to prevent duplicate registrations
- Make all OpenTelemetry dependencies fully optional for minimal deployment
- Add robust fallbacks for when OpenTelemetry modules are not available
- Separate OpenTelemetry imports into basic, exporters, instrumentors, and propagators
- Add defensive checks for all OpenTelemetry method calls
- Update monitoring_minimal.py to use CollectorRegistry for isolated metrics

src/core/monitoring_minimal.py CHANGED
@@ -13,7 +13,7 @@ from contextlib import asynccontextmanager
13
  import logging
14
  import functools
15
 
16
- from prometheus_client import Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST, REGISTRY
17
 
18
  from src.core.config import get_settings
19
  from src.core import get_logger
@@ -21,23 +21,29 @@ from src.core import get_logger
21
  logger = get_logger(__name__)
22
  settings = get_settings()
23
 
 
 
 
 
24
 
25
  def get_or_create_metric(metric_type, name, description, labels=None, **kwargs):
26
- """Get existing metric or create new one."""
27
- # Check if metric already exists in the default registry
28
- for collector in REGISTRY._collector_to_names:
29
- if hasattr(collector, '_name') and collector._name == name:
30
- return collector
31
 
32
- # Create new metric
33
  if metric_type == Counter:
34
- return Counter(name, description, labels or [], **kwargs)
35
  elif metric_type == Histogram:
36
- return Histogram(name, description, labels or [], **kwargs)
37
  elif metric_type == Gauge:
38
- return Gauge(name, description, labels or [], **kwargs)
39
  else:
40
  raise ValueError(f"Unknown metric type: {metric_type}")
 
 
 
41
 
42
 
43
  # Prometheus metrics - with duplicate checking
@@ -187,7 +193,7 @@ class MetricsCollector:
187
 
188
  def get_metrics(self) -> str:
189
  """Get Prometheus metrics."""
190
- return generate_latest()
191
 
192
 
193
  # Global metrics collector instance
 
13
  import logging
14
  import functools
15
 
16
+ from prometheus_client import Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST, REGISTRY, CollectorRegistry
17
 
18
  from src.core.config import get_settings
19
  from src.core import get_logger
 
21
  logger = get_logger(__name__)
22
  settings = get_settings()
23
 
24
+ # Create a custom registry to avoid conflicts
25
+ _metrics_registry = CollectorRegistry()
26
+ _metrics_cache = {}
27
+
28
 
29
  def get_or_create_metric(metric_type, name, description, labels=None, **kwargs):
30
+ """Get existing metric or create new one using custom registry."""
31
+ # Check if metric already exists in our cache
32
+ if name in _metrics_cache:
33
+ return _metrics_cache[name]
 
34
 
35
+ # Create new metric with custom registry
36
  if metric_type == Counter:
37
+ metric = Counter(name, description, labels or [], registry=_metrics_registry, **kwargs)
38
  elif metric_type == Histogram:
39
+ metric = Histogram(name, description, labels or [], registry=_metrics_registry, **kwargs)
40
  elif metric_type == Gauge:
41
+ metric = Gauge(name, description, labels or [], registry=_metrics_registry, **kwargs)
42
  else:
43
  raise ValueError(f"Unknown metric type: {metric_type}")
44
+
45
+ _metrics_cache[name] = metric
46
+ return metric
47
 
48
 
49
  # Prometheus metrics - with duplicate checking
 
193
 
194
  def get_metrics(self) -> str:
195
  """Get Prometheus metrics."""
196
+ return generate_latest(_metrics_registry)
197
 
198
 
199
  # Global metrics collector instance
src/infrastructure/observability/tracing.py CHANGED
@@ -13,39 +13,82 @@ import uuid
13
  from functools import wraps
14
 
15
  # Try to import OpenTelemetry, use stubs if not available
 
16
  try:
17
  from opentelemetry import trace, context, baggage
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  from opentelemetry.exporter.jaeger.thrift import JaegerExporter
19
  from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
 
 
 
 
 
 
 
 
 
 
20
  from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
21
  from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
22
  from opentelemetry.instrumentation.redis import RedisInstrumentor
23
  from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
24
  from opentelemetry.instrumentation.asyncpg import AsyncPGInstrumentor
25
- from opentelemetry.propagate import set_global_textmap
26
- from opentelemetry.propagators.b3 import B3MultiFormat
27
- from opentelemetry.propagators.jaeger import JaegerPropagator
28
- from opentelemetry.propagators.composite import CompositeHTTPPropagator
29
- from opentelemetry.sdk.trace import TracerProvider
30
- from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
31
- from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
32
- from opentelemetry.semconv.trace import SpanAttributes
33
- OPENTELEMETRY_AVAILABLE = True
34
  except ImportError:
35
- OPENTELEMETRY_AVAILABLE = False
36
- # Use minimal implementation
37
- from src.core.monitoring_minimal import MockTracer as trace
38
-
39
  class MockInstrumentor:
40
  @staticmethod
41
  def instrument(*args, **kwargs):
42
  pass
43
-
44
  FastAPIInstrumentor = HTTPXClientInstrumentor = RedisInstrumentor = SQLAlchemyInstrumentor = AsyncPGInstrumentor = MockInstrumentor
45
-
46
- # Mock constants
47
- SERVICE_NAME = "service_name"
48
- SERVICE_VERSION = "service_version"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  try:
50
  from opentelemetry.trace.status import Status, StatusCode
51
  except ImportError:
@@ -111,22 +154,35 @@ class TracingManager:
111
  logger.warning("Tracing already initialized")
112
  return
113
 
 
 
 
 
 
 
114
  # Create resource
115
- resource = Resource.create({
116
- SERVICE_NAME: self.config.service_name,
117
- SERVICE_VERSION: self.config.service_version,
118
- "service.instance.id": str(uuid.uuid4()),
119
- "deployment.environment": getattr(settings, 'app_env', 'development')
120
- })
 
 
 
121
 
122
  # Create tracer provider
123
- self.tracer_provider = TracerProvider(resource=resource)
124
-
125
- # Add span processors/exporters
126
- self._setup_exporters()
127
 
128
- # Set global tracer provider
129
- trace.set_tracer_provider(self.tracer_provider)
 
 
 
 
130
 
131
  # Create tracer
132
  self.tracer = trace.get_tracer(
@@ -145,18 +201,20 @@ class TracingManager:
145
 
146
  def _setup_exporters(self):
147
  """Setup span exporters."""
148
- if not self.tracer_provider:
 
 
149
  return
150
 
151
  # Console exporter for development
152
- if self.config.enable_console_export:
153
  console_exporter = ConsoleSpanExporter()
154
  console_processor = BatchSpanProcessor(console_exporter)
155
  self.tracer_provider.add_span_processor(console_processor)
156
  logger.info("Console span exporter enabled")
157
 
158
  # Jaeger exporter
159
- if self.config.jaeger_endpoint:
160
  jaeger_exporter = JaegerExporter(
161
  agent_host_name="localhost",
162
  agent_port=14268,
@@ -167,7 +225,7 @@ class TracingManager:
167
  logger.info(f"Jaeger exporter configured: {self.config.jaeger_endpoint}")
168
 
169
  # OTLP exporter (for generic OpenTelemetry collectors)
170
- if self.config.otlp_endpoint:
171
  otlp_exporter = OTLPSpanExporter(
172
  endpoint=self.config.otlp_endpoint,
173
  insecure=True
@@ -178,16 +236,23 @@ class TracingManager:
178
 
179
  def _setup_propagators(self):
180
  """Setup trace context propagators."""
 
 
 
 
181
  # Support multiple propagation formats
182
- propagators = [
183
- B3MultiFormat(),
184
- JaegerPropagator()
185
- ]
186
-
187
- composite_propagator = CompositeHTTPPropagator(propagators)
188
- set_global_textmap(composite_propagator)
189
 
190
- logger.info("Trace propagators configured")
 
 
 
 
 
191
 
192
  def _setup_auto_instrumentation(self):
193
  """Setup automatic instrumentation for common libraries."""
@@ -249,49 +314,60 @@ class TraceContext:
249
  @staticmethod
250
  def get_correlation_id() -> str:
251
  """Get correlation ID from current trace context."""
252
- span = trace.get_current_span()
253
- if span and span.get_span_context().is_valid:
254
- return f"{span.get_span_context().trace_id:032x}"
 
 
 
255
  return str(uuid.uuid4())
256
 
257
  @staticmethod
258
  def get_span_id() -> str:
259
  """Get current span ID."""
260
- span = trace.get_current_span()
261
- if span and span.get_span_context().is_valid:
262
- return f"{span.get_span_context().span_id:016x}"
 
 
 
263
  return ""
264
 
265
  @staticmethod
266
  def set_user_context(user_id: str, user_email: Optional[str] = None):
267
  """Set user context in current trace."""
268
- span = trace.get_current_span()
269
- if span:
270
- span.set_attribute("user.id", user_id)
271
- if user_email:
272
- span.set_attribute("user.email", user_email)
 
273
 
274
  # Also set in baggage for propagation
275
- ctx = baggage.set_baggage("user.id", user_id)
276
- context.attach(ctx)
 
277
 
278
  @staticmethod
279
  def set_investigation_context(investigation_id: str):
280
  """Set investigation context in current trace."""
281
- span = trace.get_current_span()
282
- if span:
283
- span.set_attribute("investigation.id", investigation_id)
 
284
 
285
  # Set in baggage
286
- ctx = baggage.set_baggage("investigation.id", investigation_id)
287
- context.attach(ctx)
 
288
 
289
  @staticmethod
290
  def add_event(name: str, attributes: Optional[Dict[str, Any]] = None):
291
  """Add event to current span."""
292
- span = trace.get_current_span()
293
- if span:
294
- span.add_event(name, attributes or {})
 
295
 
296
 
297
  def trace_function(
 
13
  from functools import wraps
14
 
15
  # Try to import OpenTelemetry, use stubs if not available
16
OPENTELEMETRY_AVAILABLE = False

# --- Core API and SDK --------------------------------------------------------
try:
    from opentelemetry import trace, context, baggage
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
    OPENTELEMETRY_BASIC = True
except ImportError:
    OPENTELEMETRY_BASIC = False
    # Use minimal implementation
    from src.core.monitoring_minimal import MockTracer

    class MockTrace:
        """Stand-in for the ``opentelemetry.trace`` module when it is missing."""

        Tracer = MockTracer

        def get_current_span(self):
            """Report that no span is active."""
            return None

        def set_tracer_provider(self, provider):
            """Accept the provider and ignore it."""
            pass

        def get_tracer(self, name, version=None):
            """Return a fresh ``MockTracer``; both arguments are ignored."""
            return MockTracer()

    trace = MockTrace()
    context = None
    baggage = None
    TracerProvider = None
    Resource = None
    SERVICE_NAME = "service_name"
    SERVICE_VERSION = "service_version"

# --- Span processor and exporters --------------------------------------------
# Each exporter is imported on its own so that one missing optional package
# (for example the Jaeger exporter) does not also disable the processor and
# console exporter that live under ``opentelemetry.sdk``.
try:
    from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
except ImportError:
    BatchSpanProcessor = None
    ConsoleSpanExporter = None

try:
    from opentelemetry.exporter.jaeger.thrift import JaegerExporter
except ImportError:
    JaegerExporter = None

try:
    from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
except ImportError:
    OTLPSpanExporter = None

# Exporter setup needs a span processor; individual exporters may still be
# None and are checked one by one where they are used.
OPENTELEMETRY_EXPORTERS = BatchSpanProcessor is not None


# --- Auto-instrumentation ----------------------------------------------------
class MockInstrumentor:
    """No-op replacement for any instrumentor that cannot be imported."""

    @staticmethod
    def instrument(*args, **kwargs):
        pass


# True only when every instrumentor below imported successfully; a missing
# one is replaced by MockInstrumentor without affecting the others.
OPENTELEMETRY_INSTRUMENTORS = True

try:
    from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
except ImportError:
    FastAPIInstrumentor = MockInstrumentor
    OPENTELEMETRY_INSTRUMENTORS = False

try:
    from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
except ImportError:
    HTTPXClientInstrumentor = MockInstrumentor
    OPENTELEMETRY_INSTRUMENTORS = False

try:
    from opentelemetry.instrumentation.redis import RedisInstrumentor
except ImportError:
    RedisInstrumentor = MockInstrumentor
    OPENTELEMETRY_INSTRUMENTORS = False

try:
    from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
except ImportError:
    SQLAlchemyInstrumentor = MockInstrumentor
    OPENTELEMETRY_INSTRUMENTORS = False

try:
    from opentelemetry.instrumentation.asyncpg import AsyncPGInstrumentor
except ImportError:
    AsyncPGInstrumentor = MockInstrumentor
    OPENTELEMETRY_INSTRUMENTORS = False

# --- Propagators -------------------------------------------------------------
# The composite propagator and the global setter are required for any
# propagation; the concrete B3/Jaeger formats are optional extras and are
# imported separately so that one can be used without the other.
try:
    from opentelemetry.propagate import set_global_textmap
    from opentelemetry.propagators.composite import CompositeHTTPPropagator
    OPENTELEMETRY_PROPAGATORS = True
except ImportError:
    OPENTELEMETRY_PROPAGATORS = False
    CompositeHTTPPropagator = None

    def set_global_textmap(x):
        """No-op replacement used when ``opentelemetry.propagate`` is missing."""
        return None

try:
    from opentelemetry.propagators.b3 import B3MultiFormat
except ImportError:
    B3MultiFormat = None

try:
    from opentelemetry.propagators.jaeger import JaegerPropagator
except ImportError:
    JaegerPropagator = None

# --- Semantic conventions ----------------------------------------------------
try:
    from opentelemetry.semconv.trace import SpanAttributes
except ImportError:
    SpanAttributes = None

# Set availability flag based on basic functionality
OPENTELEMETRY_AVAILABLE = OPENTELEMETRY_BASIC
92
  try:
93
  from opentelemetry.trace.status import Status, StatusCode
94
  except ImportError:
 
154
  logger.warning("Tracing already initialized")
155
  return
156
 
157
+ if not OPENTELEMETRY_BASIC:
158
+ logger.warning("OpenTelemetry not available - using mock tracer")
159
+ self.tracer = trace.get_tracer(__name__)
160
+ self._initialized = True
161
+ return
162
+
163
  # Create resource
164
+ if Resource:
165
+ resource = Resource.create({
166
+ SERVICE_NAME: self.config.service_name,
167
+ SERVICE_VERSION: self.config.service_version,
168
+ "service.instance.id": str(uuid.uuid4()),
169
+ "deployment.environment": getattr(settings, 'app_env', 'development')
170
+ })
171
+ else:
172
+ resource = None
173
 
174
  # Create tracer provider
175
+ if TracerProvider and resource:
176
+ self.tracer_provider = TracerProvider(resource=resource)
177
+ else:
178
+ self.tracer_provider = None
179
 
180
+ if self.tracer_provider:
181
+ # Add span processors/exporters
182
+ self._setup_exporters()
183
+
184
+ # Set global tracer provider
185
+ trace.set_tracer_provider(self.tracer_provider)
186
 
187
  # Create tracer
188
  self.tracer = trace.get_tracer(
 
201
 
202
  def _setup_exporters(self):
203
  """Setup span exporters."""
204
+ if not self.tracer_provider or not OPENTELEMETRY_EXPORTERS:
205
+ if not OPENTELEMETRY_EXPORTERS:
206
+ logger.warning("OpenTelemetry exporters not available - skipping exporter setup")
207
  return
208
 
209
  # Console exporter for development
210
+ if self.config.enable_console_export and ConsoleSpanExporter:
211
  console_exporter = ConsoleSpanExporter()
212
  console_processor = BatchSpanProcessor(console_exporter)
213
  self.tracer_provider.add_span_processor(console_processor)
214
  logger.info("Console span exporter enabled")
215
 
216
  # Jaeger exporter
217
+ if self.config.jaeger_endpoint and JaegerExporter:
218
  jaeger_exporter = JaegerExporter(
219
  agent_host_name="localhost",
220
  agent_port=14268,
 
225
  logger.info(f"Jaeger exporter configured: {self.config.jaeger_endpoint}")
226
 
227
  # OTLP exporter (for generic OpenTelemetry collectors)
228
+ if self.config.otlp_endpoint and OTLPSpanExporter:
229
  otlp_exporter = OTLPSpanExporter(
230
  endpoint=self.config.otlp_endpoint,
231
  insecure=True
 
236
 
237
  def _setup_propagators(self):
238
  """Setup trace context propagators."""
239
+ if not OPENTELEMETRY_PROPAGATORS:
240
+ logger.warning("OpenTelemetry propagators not available - skipping propagator setup")
241
+ return
242
+
243
  # Support multiple propagation formats
244
+ propagators = []
245
+ if B3MultiFormat:
246
+ propagators.append(B3MultiFormat())
247
+ if JaegerPropagator:
248
+ propagators.append(JaegerPropagator())
 
 
249
 
250
+ if propagators and CompositeHTTPPropagator:
251
+ composite_propagator = CompositeHTTPPropagator(propagators)
252
+ set_global_textmap(composite_propagator)
253
+ logger.info("Trace propagators configured")
254
+ else:
255
+ logger.warning("No propagators available")
256
 
257
  def _setup_auto_instrumentation(self):
258
  """Setup automatic instrumentation for common libraries."""
 
314
  @staticmethod
315
  def get_correlation_id() -> str:
316
  """Get correlation ID from current trace context."""
317
+ if hasattr(trace, 'get_current_span'):
318
+ span = trace.get_current_span()
319
+ if span and hasattr(span, 'get_span_context'):
320
+ span_context = span.get_span_context()
321
+ if hasattr(span_context, 'is_valid') and span_context.is_valid:
322
+ return f"{span_context.trace_id:032x}"
323
  return str(uuid.uuid4())
324
 
325
  @staticmethod
326
  def get_span_id() -> str:
327
  """Get current span ID."""
328
+ if hasattr(trace, 'get_current_span'):
329
+ span = trace.get_current_span()
330
+ if span and hasattr(span, 'get_span_context'):
331
+ span_context = span.get_span_context()
332
+ if hasattr(span_context, 'is_valid') and span_context.is_valid:
333
+ return f"{span_context.span_id:016x}"
334
  return ""
335
 
336
  @staticmethod
337
  def set_user_context(user_id: str, user_email: Optional[str] = None):
338
  """Set user context in current trace."""
339
+ if hasattr(trace, 'get_current_span'):
340
+ span = trace.get_current_span()
341
+ if span and hasattr(span, 'set_attribute'):
342
+ span.set_attribute("user.id", user_id)
343
+ if user_email:
344
+ span.set_attribute("user.email", user_email)
345
 
346
  # Also set in baggage for propagation
347
+ if baggage and context:
348
+ ctx = baggage.set_baggage("user.id", user_id)
349
+ context.attach(ctx)
350
 
351
  @staticmethod
352
  def set_investigation_context(investigation_id: str):
353
  """Set investigation context in current trace."""
354
+ if hasattr(trace, 'get_current_span'):
355
+ span = trace.get_current_span()
356
+ if span and hasattr(span, 'set_attribute'):
357
+ span.set_attribute("investigation.id", investigation_id)
358
 
359
  # Set in baggage
360
+ if baggage and context:
361
+ ctx = baggage.set_baggage("investigation.id", investigation_id)
362
+ context.attach(ctx)
363
 
364
  @staticmethod
365
  def add_event(name: str, attributes: Optional[Dict[str, Any]] = None):
366
  """Add event to current span."""
367
+ if hasattr(trace, 'get_current_span'):
368
+ span = trace.get_current_span()
369
+ if span and hasattr(span, 'add_event'):
370
+ span.add_event(name, attributes or {})
371
 
372
 
373
  def trace_function(