@@ -53,7 +53,7 @@ func (r *Reconciler) prometheusRules() (runtime.Object, reconciler.DesiredState,
5353 },
5454 {
5555 Alert : "SyslogNGQueueLength" ,
56- Expr : intstr .FromString (fmt .Sprintf ("rate(syslog_ng_status_buffer_queue_length {%s}[5m]) > 0.3" , nsJobLabel )),
56+ Expr : intstr .FromString (fmt .Sprintf ("max(syslogng_memory_queue_events {%s}) / max(syslogng_memory_queue_capacity{%s}) > 0.3" , nsJobLabel , nsJobLabel )),
5757 For : prometheus_operator .Duration ("1m" ),
5858 Labels : map [string ]string {
5959 "rulegroup" : ruleGroupName ,
@@ -62,12 +62,12 @@ func (r *Reconciler) prometheusRules() (runtime.Object, reconciler.DesiredState,
6262 },
6363 Annotations : map [string ]string {
6464 "summary" : `syslog-ng node are failing` ,
65- "description" : `In the last 5 minutes, syslog -ng queues increased 30%. Current value is "{{ $value }}".` ,
65+ "description" : `Syslog -ng queue usage is above 30%. Current value is "{{ $value }}".` ,
6666 },
6767 },
6868 {
6969 Alert : "SyslogNGQueueLength" ,
70- Expr : intstr .FromString (fmt .Sprintf ("rate(syslog_ng_status_buffer_queue_length {%s}[5m]) > 0.5" , nsJobLabel )),
70+ Expr : intstr .FromString (fmt .Sprintf ("max(syslogng_memory_queue_events {%s}) / max(syslogng_memory_queue_capacity{%s}) > 0.5" , nsJobLabel , nsJobLabel )),
7171 For : prometheus_operator .Duration ("1m" ),
7272 Labels : map [string ]string {
7373 "rulegroup" : ruleGroupName ,
@@ -76,12 +76,12 @@ func (r *Reconciler) prometheusRules() (runtime.Object, reconciler.DesiredState,
7676 },
7777 Annotations : map [string ]string {
7878 "summary" : `Syslog-NG nodes buffer queue length are critical` ,
79- "description" : `In the last 5 minutes, Syslog-NG queues increased 50%. Current value is "{{ $value }}".` ,
79+ "description" : `Syslog-ng queue usage is above 50%. Current value is "{{ $value }}".` ,
8080 },
8181 },
8282 {
8383 Alert : "SyslogNGRecordsCountsHigh" ,
84- Expr : intstr .FromString (fmt .Sprintf ("sum(rate(syslog_ng_output_status_emit_records {%[1]s}[5m])) by (job,pod,namespace) > (3 * sum(rate(syslog_ng_output_status_emit_records {%[1]s}[15m])) by (job,pod,namespace))" , nsJobLabel )),
84+ Expr : intstr .FromString (fmt .Sprintf ("sum(rate(syslogng_output_events_total {%[1]s}[5m])) by (job,pod,namespace) > (3 * sum(rate(syslogng_output_events_total {%[1]s}[15m])) by (job,pod,namespace))" , nsJobLabel )),
8585 For : prometheus_operator .Duration ("1m" ),
8686 Labels : map [string ]string {
8787 "rulegroup" : ruleGroupName ,
@@ -95,7 +95,7 @@ func (r *Reconciler) prometheusRules() (runtime.Object, reconciler.DesiredState,
9595 },
9696 {
9797 Alert : "SyslogNGRetry" ,
98- Expr : intstr .FromString (fmt .Sprintf ("increase(syslog_ng_status_retry_count {%s}[10m] ) > 0" , nsJobLabel )),
98+ Expr : intstr .FromString (fmt .Sprintf ("max(syslogng_output_event_retries_total {%s}) > 0" , nsJobLabel )),
9999 For : prometheus_operator .Duration ("20m" ),
100100 Labels : map [string ]string {
101101 "rulegroup" : ruleGroupName ,
@@ -109,7 +109,7 @@ func (r *Reconciler) prometheusRules() (runtime.Object, reconciler.DesiredState,
109109 },
110110 {
111111 Alert : "SyslogNGOutputError" ,
112- Expr : intstr .FromString (fmt .Sprintf ("increase(syslog_ng_output_status_num_errors {%s}[10m]) > 0" , nsJobLabel )),
112+ Expr : intstr .FromString (fmt .Sprintf ("increase(syslogng_output_events_total {%s,result= \" dropped \" }[10m]) > 0" , nsJobLabel )),
113113 For : prometheus_operator .Duration ("1s" ),
114114 Labels : map [string ]string {
115115 "rulegroup" : ruleGroupName ,
@@ -123,7 +123,7 @@ func (r *Reconciler) prometheusRules() (runtime.Object, reconciler.DesiredState,
123123 },
124124 {
125125 Alert : "SyslogNGPredictedBufferGrowth" ,
126- Expr : intstr .FromString (fmt .Sprintf ("predict_linear(syslog_ng_output_status_buffer_total_bytes {%[1]s}[10m], 600) > syslog_ng_output_status_buffer_total_bytes {%[1]s}" , nsJobLabel )),
126+ Expr : intstr .FromString (fmt .Sprintf ("predict_linear(syslogng_memory_queue_memory_usage_bytes {%[1]s}[10m], 600) > syslogng_memory_queue_memory_usage_bytes {%[1]s}" , nsJobLabel )),
127127 For : prometheus_operator .Duration ("10m" ),
128128 Labels : map [string ]string {
129129 "rulegroup" : ruleGroupName ,
0 commit comments