
Advanced Observability & Monitoring Strategies - Part B

This guide covers advanced Prometheus configurations, cost optimization strategies, and comprehensive monitoring architectures for high-scale production systems.

Table of Contents

  1. Prometheus Scaling Strategies
  2. Long-term Storage with Thanos
  3. Query Performance Optimization
  4. Cardinality Management
  5. Cost Optimization

Prometheus Scaling Strategies

Horizontal Sharding Implementation

class PrometheusScalingStrategy:
    def __init__(self):
        self.current_metrics = {
            "samples_per_second": 10_000_000,  # 10M samples/sec
            "storage_growth": "100GB/day",
            "query_latency_p99": "15s",
            "cardinality": 50_000_000  # 50M active series
        }

    def implement_horizontal_sharding(self):
        """Shard Prometheus by service or region"""
        return {
            "sharding_strategy": {
                "shard_by_service": {
                    "prometheus_frontend": {
                        "services": ["api-gateway", "frontend-app", "cdn"],
                        "expected_samples": "2M/sec",
                        "retention": "15d"
                    },
                    "prometheus_backend": {
                        "services": ["user-service", "order-service", "payment-service"],
                        "expected_samples": "5M/sec",
                        "retention": "15d"
                    },
                    "prometheus_infrastructure": {
                        "services": ["kubernetes", "node-exporter", "network"],
                        "expected_samples": "3M/sec",
                        "retention": "30d"  # Longer for infrastructure
                    }
                },
                "coordination": {
                    "service_discovery": "Consul/etcd for shard registration",
                    "query_routing": "Thanos Query for federated queries",
                    "alerting": "Global Alertmanager with routing rules"  # see the routing sketch below
                }
            },
            "implementation": self.create_sharded_prometheus_config()
        }
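
    # A hedged sketch of the "global Alertmanager with routing rules" named
    # above: each shard stamps a `shard` external label (see the config in the
    # next method), so one Alertmanager can route on it. Receiver names are
    # illustrative, not from the original; the matchers syntax requires
    # Alertmanager >= 0.22.
    def create_global_alertmanager_routes(self):
        return """
route:
  receiver: default-oncall
  group_by: ['alertname', 'shard']
  routes:
    - matchers: ['shard="frontend"']
      receiver: frontend-oncall
    - matchers: ['shard="backend"']
      receiver: backend-oncall
    - matchers: ['shard="infrastructure"']
      receiver: platform-oncall
"""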

    def create_sharded_prometheus_config(self):
        """Configuration for sharded Prometheus setup"""
        return """
# Prometheus Frontend Shard
global:
  scrape_interval: 15s
  external_labels:
    shard: frontend
    replica: A

scrape_configs:
  - job_name: 'api-gateway'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names: ['frontend']
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app]
        regex: 'api-gateway'
        action: keep

  - job_name: 'frontend-app'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names: ['frontend']
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app]
        regex: 'frontend-app'
        action: keep

# Storage and query tuning are command-line flags, not prometheus.yml
# settings; pass them on the server invocation:
#   --storage.tsdb.retention.time=15d
#   --storage.tsdb.retention.size=100GB
#   --storage.tsdb.min-block-duration=2h   (optimize for write performance)
#   --storage.tsdb.max-block-duration=25h  (balance query performance)
#   --storage.tsdb.wal-compression         (reduce WAL size)
#   --query.timeout=2m
#   --query.max-concurrency=20
#   --query.max-samples=50000000
"""

    def implement_federation_architecture(self):
        """Multi-level federation for scaling"""
        return {
            "federation_levels": {
                "leaf_prometheus": {
                    "level": 1,
                    "scope": "Single service or small group of services",
                    "retention": "7d",
                    "sample_rate": "15s",
                    "storage": "Local SSD"
                },
                "regional_prometheus": {
                    "level": 2,
                    "scope": "Aggregate from multiple leaf nodes in region",
                    "retention": "30d",
                    "sample_rate": "1m",  # Downsampled
                    "storage": "Network SSD"
                },
                "global_prometheus": {
                    "level": 3,
                    "scope": "Global view across all regions",
                    "retention": "90d",
                    "sample_rate": "5m",  # Heavily downsampled
                    "storage": "Object storage (S3/GCS)"
                }
            },
            "federation_config": self.create_federation_config()
        }

    def create_federation_config(self):
        """Federation configuration examples"""
        return """
# Regional Prometheus federating from leaf nodes
scrape_configs:
  - job_name: 'federate-leaf-nodes'
    scrape_interval: 60s
    honor_labels: true
    metrics_path: '/federate'
    params:
      'match[]':
        # Only federate aggregated metrics and SLIs
        - '{__name__=~".*:.*"}'            # Recording rules
        - '{__name__=~"sli_.*"}'           # SLI metrics
        - '{__name__=~"business_.*"}'      # Business metrics
        - '{job="kubernetes-apiservers"}'  # Critical infrastructure
    static_configs:
      - targets:
          - 'prometheus-leaf-1:9090'
          - 'prometheus-leaf-2:9090'
          - 'prometheus-leaf-3:9090'

# Recording rules for federation
rule_files:
  - federation_rules.yml
"""

    def implement_long_term_storage(self):
        """Long-term storage with Thanos or Cortex"""
        return {
            "thanos_architecture": {
                "components": {
                    "thanos_sidecar": {
                        "purpose": "Upload blocks to object storage",
                        "deployment": "Alongside each Prometheus instance",
                        "config": self.create_thanos_sidecar_config()
                    },
                    "thanos_store": {
                        "purpose": "Query historical data from object storage",
                        "deployment": "Separate service",
                        "replicas": 3
                    },
                    "thanos_query": {
                        "purpose": "Federated query API",
                        "deployment": "Load balanced service",
                        "replicas": 2
                    },
                    "thanos_compactor": {
                        "purpose": "Compact and downsample historical data",
                        "deployment": "Single instance with leader election",
                        "schedule": "Daily compaction"
                    }
                }
            },
            "storage_tiers": {
                "hot_tier": {
                    "duration": "7d",
                    "storage": "Local NVMe SSD",
                    "resolution": "15s",
                    "cost_per_gb_month": 0.30
                },
                "warm_tier": {
                    "duration": "30d",
                    "storage": "Network SSD",
                    "resolution": "1m",
                    "cost_per_gb_month": 0.10
                },
                "cold_tier": {
                    "duration": "2y",
                    "storage": "Object storage (S3 IA)",
                    "resolution": "5m",
                    "cost_per_gb_month": 0.025
                },
                "archive_tier": {
                    "duration": "7y",
                    "storage": "S3 Glacier",
                    "resolution": "1h",
                    "cost_per_gb_month": 0.004
                }
            }
        }
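
    # A hedged sketch of how the tiers above could map onto Thanos compactor
    # retention flags. Thanos downsamples at fixed resolutions (raw, 5m, 1h),
    # so the 15s/1m tiers approximate to raw while the 5m/1h tiers map
    # directly; flag names per Thanos v0.28.
    def create_thanos_compactor_args(self):
        return [
            "compact",
            "--data-dir=/var/thanos/compact",
            "--objstore.config-file=/etc/thanos/objstore.yml",
            "--retention.resolution-raw=30d",  # hot + warm tiers at full resolution
            "--retention.resolution-5m=2y",    # cold tier: 5m downsamples
            "--retention.resolution-1h=7y",    # archive tier: 1h downsamples
            "--wait",                          # run continuously instead of one-shot
        ]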

    def create_thanos_sidecar_config(self):
        """Thanos sidecar configuration"""
        return """
# Thanos sidecar deployment
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: prometheus-with-thanos
spec:
  serviceName: prometheus-headless
  replicas: 2
  template:
    spec:
      containers:
        - name: prometheus
          image: prom/prometheus:v2.40.0
          ports:
            - containerPort: 9090
          args:
            - --config.file=/etc/prometheus/prometheus.yml
            - --storage.tsdb.path=/prometheus
            - --storage.tsdb.retention.time=6h  # Short retention with Thanos
            - --storage.tsdb.min-block-duration=2h
            - --storage.tsdb.max-block-duration=2h
            - --web.enable-lifecycle
          volumeMounts:
            - name: prometheus-storage
              mountPath: /prometheus

        - name: thanos-sidecar
          image: thanosio/thanos:v0.28.0
          ports:
            - containerPort: 10901
            - containerPort: 10902
          args:
            - sidecar
            - --tsdb.path=/prometheus
            - --prometheus.url=http://localhost:9090
            - --grpc-address=0.0.0.0:10901
            - --http-address=0.0.0.0:10902
            - --objstore.config-file=/etc/thanos/objstore.yml
            - --shipper.upload-compacted
          volumeMounts:
            - name: prometheus-storage
              mountPath: /prometheus
            - name: thanos-objstore-config
              mountPath: /etc/thanos

      volumes:
        - name: thanos-objstore-config
          secret:
            secretName: thanos-objstore-config

  volumeClaimTemplates:
    - metadata:
        name: prometheus-storage
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 50Gi
        storageClassName: fast-ssd

---
# Object storage configuration secret
apiVersion: v1
kind: Secret
metadata:
  name: thanos-objstore-config
stringData:
  objstore.yml: |
    type: S3
    config:
      bucket: "prometheus-long-term-storage"
      endpoint: "s3.amazonaws.com"
      access_key: "ACCESS_KEY"
      secret_key: "SECRET_KEY"
      insecure: false
      signature_version2: false
      encrypt_sse: true
      put_user_metadata:
        retention: "2y"
"""

    def optimize_query_performance(self):
        """Query optimization strategies"""
        return {
            "recording_rules_optimization": {
                "purpose": "Pre-compute expensive queries",
                "examples": self.create_optimized_recording_rules(),
                "benefits": [
                    "Reduced query latency from 15s to <1s",
                    "Lower CPU usage on Prometheus",
                    "Consistent performance for dashboards"
                ]
            },
            "query_optimization_techniques": {
                "use_recording_rules": "Pre-aggregate complex calculations",
                "limit_time_ranges": "Keep range selectors short, e.g. [5m] rather than [1h]",
                "optimize_selectors": "Use specific label selectors",
                "avoid_high_cardinality": "Don't group by user_id or request_id",
                "use_subqueries_carefully": "Subqueries can be expensive"
            },
            "caching_strategies": {
                "query_result_caching": {
                    "tool": "Trickster or Thanos Query Frontend",
                    "ttl": "5m for real-time queries, 1h for historical",
                    "cache_size": "10GB per cache instance"
                },
                "metadata_caching": {
                    "purpose": "Cache label names and values",
                    "implementation": "Redis cluster",
                    "ttl": "1h"
                }
            }
        }
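
    # A hedged sketch of response caching via Thanos Query Frontend, one way
    # to realize the query_result_caching strategy above (config format per
    # Thanos v0.28, passed with --query-range.response-cache-config-file; an
    # in-memory cache is shown for simplicity).
    def create_query_frontend_cache_config(self):
        return """
type: IN-MEMORY
config:
  max_size: "2GB"  # per-instance bound; the plan above allows up to 10GB
  validity: 5m     # matches the 5m TTL suggested for real-time queries
"""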

    def create_optimized_recording_rules(self):
        """High-performance recording rules"""
        return """
groups:
  - name: sli_recording_rules
    interval: 30s
    rules:
      # Pre-compute service availability (used in many dashboards)
      - record: service:availability:rate5m
        expr: |
          (
            sum(rate(http_requests_total{code!~"5.."}[5m])) by (service)
            /
            sum(rate(http_requests_total[5m])) by (service)
          )

      # Pre-compute P99 latency by service
      - record: service:latency:p99:rate5m
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le)
          )

      # Pre-compute error rate by service and status code
      - record: service:error_rate:rate5m
        expr: |
          (
            sum(rate(http_requests_total{code=~"5.."}[5m])) by (service, code)
            /
            sum(rate(http_requests_total[5m])) by (service, code)
          )

  - name: business_recording_rules
    interval: 60s  # Less frequent for business metrics
    rules:
      # Revenue per minute (expensive calculation)
      - record: business:revenue:rate1m
        expr: |
          sum(increase(order_total_dollars[1m])) by (region)

      # Orders per minute by category
      - record: business:orders:rate1m
        expr: |
          sum(increase(orders_completed_total[1m])) by (category, region)

  - name: infrastructure_recording_rules
    interval: 30s
    rules:
      # CPU utilization by node (complex calculation simplified)
      - record: node:cpu_utilization:rate5m
        expr: |
          (
            1 - (
              sum(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)
              /
              sum(rate(node_cpu_seconds_total[5m])) by (instance)
            )
          ) * 100

      # Memory utilization percentage
      - record: node:memory_utilization:ratio
        expr: |
          (
            (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
            /
            node_memory_MemTotal_bytes
          ) * 100
"""

    def implement_cardinality_management(self):
        """Manage metric cardinality to prevent explosion"""
        return {
            "cardinality_monitoring": {
                "query": 'prometheus_tsdb_head_series',
                "alert_threshold": "50M active series",
                "investigation_query": """
# Find highest cardinality metrics
topk(20, count by (__name__)({__name__=~".+"}))

# Find highest cardinality metrics per job
topk(20, count by (__name__, job)({__name__=~".+"}))
"""
            },
            "cardinality_reduction_strategies": {
                # Note: rules that act on metric labels belong under
                # metric_relabel_configs, which runs after the scrape.
                "relabeling_rules": {
                    "drop_high_cardinality_labels": """
# Drop the user_id label on http_requests_total (too high cardinality);
# relabeling to an empty replacement removes the label
- source_labels: [__name__]
  regex: 'http_requests_total'
  target_label: user_id
  replacement: ''
""",
                    "group_similar_values": """
# Group HTTP status codes into classes
- source_labels: [status_code]
  regex: '2..'
  target_label: status_class
  replacement: '2xx'
- source_labels: [status_code]
  regex: '4..'
  target_label: status_class
  replacement: '4xx'
- source_labels: [status_code]
  regex: '5..'
  target_label: status_class
  replacement: '5xx'
"""
                },
                "metric_filtering": {
                    "drop_unused_metrics": """
# Drop debug metrics in production
- source_labels: [__name__]
  regex: 'debug_.*|test_.*'
  action: drop

# Drop high-frequency, low-value metrics
- source_labels: [__name__]
  regex: 'go_gc_.*|process_.*'
  action: drop
"""
                }
            }
        }
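
    # A sketch of automating the cardinality investigation above via the
    # TSDB stats API (assumption: Prometheus >= 2.24, which reports
    # headStats; endpoint and field names otherwise per the documented
    # /api/v1/status/tsdb response).
    def fetch_cardinality_report(self, base_url="http://localhost:9090"):
        """Fetch the top cardinality offenders from a Prometheus server."""
        import json
        from urllib.request import urlopen

        with urlopen(f"{base_url}/api/v1/status/tsdb") as resp:
            stats = json.load(resp)["data"]
        return {
            "active_series": stats["headStats"]["numSeries"],
            "top_metrics": stats["seriesCountByMetricName"],
            "top_label_pairs": stats["seriesCountByLabelValuePair"],
        }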

    def create_cost_optimization_plan(self):
        """Comprehensive cost optimization"""
        return {
            "current_costs": {
                "compute": "$5,000/month (10 c5.2xlarge instances)",
                "storage": "$3,000/month (50TB network SSD)",
                "network": "$500/month (cross-AZ traffic)",
                "total": "$8,500/month"
            },
            "optimization_targets": {
                "compute_reduction": {
                    "strategy": "Right-size instances based on actual usage",
                    "expected_savings": "30% ($1,500/month)",
                    "implementation": [
                        "Use spot instances for non-critical Prometheus",
                        "Implement auto-scaling for query load",
                        "Use ARM instances where supported"
                    ]
                },
                "storage_optimization": {
                    "strategy": "Implement proper data lifecycle management",
                    "expected_savings": "50% ($1,500/month)",
                    "implementation": [
                        "Move to object storage after 7 days",
                        "Implement aggressive downsampling",
                        "Use compression for long-term storage"
                    ]
                },
                "query_optimization": {
                    "strategy": "Reduce unnecessary queries and improve efficiency",
                    "expected_savings": "25% compute ($1,250/month)",
                    "implementation": [
                        "Implement query result caching",
                        "Optimize dashboard queries",
                        "Use recording rules for expensive calculations"
                    ]
                }
            },
            "total_projected_savings": "$4,250/month (50% reduction)"
        }
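
A quick check of the plan's arithmetic (a sketch using only the figures stated above):

savings = {"compute": 1_500, "storage": 1_500, "query": 1_250}  # $/month
current = 5_000 + 3_000 + 500                                   # $8,500/month today
total = sum(savings.values())                                   # $4,250/month
assert total / current == 0.50                                  # the claimed 50% reduction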

Implementation Timeline

cost_optimization_timeline:
  phase_1_immediate:
    duration: "2 weeks"
    actions:
      - "Implement recording rules for expensive queries"
      - "Add query result caching layer"
      - "Optimize high-frequency dashboard queries"
    expected_savings: "$800/month"

  phase_2_infrastructure:
    duration: "1 month"
    actions:
      - "Deploy Thanos for long-term storage"
      - "Migrate to object storage tiers"
      - "Implement data lifecycle policies"
    expected_savings: "$2,200/month"

  phase_3_advanced:
    duration: "6 weeks"
    actions:
      - "Implement horizontal sharding"
      - "Deploy spot instances where appropriate"
      - "Optimize cardinality management"
    expected_savings: "$1,250/month"

Summary

This advanced observability guide provides:

  • Prometheus scaling strategies with horizontal sharding and federation
  • Long-term storage solutions using Thanos with multi-tier storage
  • Query performance optimization through recording rules and caching
  • Cardinality management to prevent metric explosion
  • Cost optimization strategies with 50% potential savings

These techniques enable monitoring systems to scale efficiently while maintaining performance and controlling costs in high-volume production environments.