|
| 1 | +// Grafana Alloy Configuration for GenLayer Node Telemetry |
| 2 | +// Handles both log collection and metrics forwarding |
| 3 | + |
| 4 | +// ========================================== |
| 5 | +// Log Collection and Forwarding |
| 6 | +// ========================================== |
| 7 | + |
// Log file discovery.
// The glob comes from LOG_FILE_PATTERN when that variable is set and non-empty;
// otherwise the default "/var/log/genlayer/node*.log" is used.
// Example patterns:
//   - Single node: "/var/log/genlayer/node.log"
//   - Multi-node:  "/var/log/genlayer/*/logs/node.log" (each node in its own subdirectory)
local.file_match "genlayer_logs" {
  path_targets = [
    {__path__ = coalesce(sys.env("LOG_FILE_PATTERN"), "/var/log/genlayer/node*.log")},
  ]
}
| 18 | + |
// Attach static metadata labels to every discovered log target.
// None of the rules use source_labels, so every matched file receives the
// same four label values.
//
// The instance and validator_name fallbacks ("local" / "default") are the same
// ones the genlayer_node metrics scrape applies, so logs and metrics from one
// node carry matching labels even when NODE_ID / VALIDATOR_NAME are unset.
discovery.relabel "add_labels" {
  targets = local.file_match.genlayer_logs.targets

  // instance: NODE_ID, falling back to "local" when unset or empty
  rule {
    target_label = "instance"
    replacement  = coalesce(sys.env("NODE_ID"), "local")
  }

  // validator_name: VALIDATOR_NAME, falling back to "default" when unset or empty
  rule {
    target_label = "validator_name"
    replacement  = coalesce(sys.env("VALIDATOR_NAME"), "default")
  }

  // component: fixed value identifying the collector
  rule {
    target_label = "component"
    replacement  = "alloy"
  }

  // job: fixed value shared by all GenLayer node logs
  rule {
    target_label = "job"
    replacement  = "genlayer-node"
  }
}
| 47 | + |
// Tail each relabeled log target and hand the entries to the central Loki writer.
loki.source.file "genlayer" {
  // Start at the end of each file instead of replaying the existing
  // log history when Alloy starts.
  tail_from_end = true

  forward_to = [loki.write.central.receiver]
  targets    = discovery.relabel.add_labels.output
}
| 56 | + |
// Ship collected logs to the central Loki instance.
// The endpoint URL and credentials are read from CENTRAL_LOKI_* environment variables.
loki.write "central" {
  endpoint {
    url = sys.env("CENTRAL_LOKI_URL")

    // Batching: a batch is sent when it reaches batch_size, or when
    // batch_wait has elapsed for a partial batch. Both are overridable
    // through LOKI_BATCH_SIZE / LOKI_BATCH_WAIT.
    batch_wait = coalesce(sys.env("LOKI_BATCH_WAIT"), "60s")
    batch_size = coalesce(sys.env("LOKI_BATCH_SIZE"), "1MiB")

    // No retry block is configured here: loki.write has none, and retries
    // with exponential backoff happen automatically when the endpoint is
    // unreachable.

    // HTTP Basic Authentication
    basic_auth {
      password = sys.env("CENTRAL_LOKI_PASSWORD")
      username = sys.env("CENTRAL_LOKI_USERNAME")
    }
  }
}
| 77 | + |
| 78 | +// ========================================== |
| 79 | +// Prometheus Metrics Collection and Forwarding |
| 80 | +// ========================================== |
| 81 | + |
// Scrape metrics from one or more GenLayer nodes.
//
// Multi-node mode:
//   Set SCRAPE_TARGETS_JSON to a JSON array of target objects, e.g.
//   [{"__address__":"host.docker.internal:9250","instance":"0x...","validator_name":"node-1"}]
//
// Single-node mode (SCRAPE_TARGETS_JSON unset or empty):
//   One target is assembled from NODE_METRICS_ENDPOINT, NODE_ID and
//   VALIDATOR_NAME, which fall back to "host.docker.internal:9153",
//   "local" and "default" respectively.
//
// The "network" label is emitted by the node itself (auto-detected from the
// consensus address), so it is not configured here.
prometheus.scrape "genlayer_node" {
  // Scrape cadence, overridable through the environment
  scrape_timeout  = coalesce(sys.env("METRICS_SCRAPE_TIMEOUT"), "10s")
  scrape_interval = coalesce(sys.env("METRICS_SCRAPE_INTERVAL"), "60s")

  forward_to = [prometheus.relabel.metrics.receiver]

  // Whichever mode is active, the result is a JSON string that is decoded
  // into the target list.
  targets = encoding.from_json(coalesce(
    sys.env("SCRAPE_TARGETS_JSON"),
    string.format(
      "[{\"__address__\":\"%s\",\"instance\":\"%s\",\"validator_name\":\"%s\"}]",
      coalesce(sys.env("NODE_METRICS_ENDPOINT"), "host.docker.internal:9153"),
      coalesce(sys.env("NODE_ID"), "local"),
      coalesce(sys.env("VALIDATOR_NAME"), "default"))))
}
| 106 | + |
// Filtering stage between the node scrape and remote write.
// With no active rule, every scraped metric is forwarded unchanged.
prometheus.relabel "metrics" {
  forward_to = [prometheus.remote_write.central.receiver]

  // Option 1 (active): forward all metrics from the node.

  // Option 2 (recommended to reduce bandwidth): keep only genlayer_node_*
  // metrics. Uncomment the rule below to enable it:
  //
  // rule {
  //   source_labels = ["__name__"]
  //   regex         = "genlayer_node_.*"
  //   action        = "keep"
  // }
}
| 124 | + |
// Send metrics to the central Prometheus-compatible endpoint via remote write.
// The endpoint URL and credentials are read from CENTRAL_MONITORING_* environment variables.
prometheus.remote_write "central" {
  endpoint {
    url = sys.env("CENTRAL_MONITORING_URL")

    // Queue tuning for reliability; only the send deadline is
    // overridable (METRICS_BATCH_SEND_DEADLINE, default "60s").
    queue_config {
      batch_send_deadline  = coalesce(sys.env("METRICS_BATCH_SEND_DEADLINE"), "60s")
      max_samples_per_send = 500
      max_shards           = 5
      capacity             = 10000
    }

    // HTTP Basic Authentication
    basic_auth {
      password = sys.env("CENTRAL_MONITORING_PASSWORD")
      username = sys.env("CENTRAL_MONITORING_USERNAME")
    }
  }
}
| 145 | + |
| 146 | +// ========================================== |
| 147 | +// Alloy Self-Monitoring |
| 148 | +// ========================================== |
| 149 | + |
// Built-in exporter that provides Alloy's own metrics as scrape targets
// (consumed by the "alloy" scrape component).
prometheus.exporter.self "alloy" {
}
| 152 | + |
// Scrape Alloy's own metrics for local health monitoring.
// forward_to is intentionally empty: the scraped samples are not sent to any
// receiver, which keeps Alloy's internal metrics out of the central store.
prometheus.scrape "alloy" {
  // Interval is overridable through ALLOY_SELF_MONITORING_INTERVAL
  scrape_interval = coalesce(sys.env("ALLOY_SELF_MONITORING_INTERVAL"), "60s")

  forward_to = []
  targets    = prometheus.exporter.self.alloy.targets
}
0 commit comments