Hello,
I was wondering the same thing. Using a small task template like this:
var application string
var message = '{{ index .Tags "node" }} of {{ index .Tags "application" }} is {{ index .Fields "up" }}!'
batch
|query('SELECT "up" FROM "<DB>"."<RP>"."status" WHERE "application"= \'' + application + '\'')
.groupBy('application','node')
.period(10s)
.every(10s)
|log()
|alert()
.id('Status {{ index .Tags "application" }}-{{ index .Tags "node" }}')
.crit(lambda: "up" == FALSE)
.message(message)
.noRecoveries()
.teams()
.log('/dev/stdout')
I emulated a scenario like this:
CRITICAL -> OK -> CRITICAL
and got the following log
ts=2022-04-05T05:53:27.570Z lvl=debug msg="starting next batch query" service=kapacitor task_master=main task=acme-up-alert-task node=query1 query="SELECT up FROM \"<DB>\".<RP>.status WHERE application = 'acme' AND time >= '2022-04-05T05:53:17.570443921Z' AND time < '2022-04-05T05:53:27.570443921Z' GROUP BY application, node"
ts=2022-04-05T05:53:27.574Z lvl=info msg="begin batch" service=kapacitor task_master=main task=acme-up-alert-task node=log2 prefix= name=status group=application=acme,node=node1 tag_application=acme tag_node=node1 time=2022-04-05T05:53:27.570443921Z
ts=2022-04-05T05:53:27.574Z lvl=info msg="batch point" service=kapacitor task_master=main task=acme-up-alert-task node=log2 prefix= name=status group=application=acme,node=node1 tag_application=acme tag_node=node1 field_up=false time=2022-04-05T05:53:24.0323641Z
ts=2022-04-05T05:53:27.574Z lvl=info msg="end batch" service=kapacitor task_master=main task=acme-up-alert-task node=log2 prefix= name=status group=application=acme,node=node1 tag_application=acme tag_node=node1 time=2022-04-05T05:53:27.570443921Z
ts=2022-04-05T05:53:27.575Z lvl=debug msg="alert triggered" service=kapacitor task_master=main task=acme-up-alert-task node=alert3 level=CRITICAL id="Status acme-node1" event_message="node1 of acme is false!" data="&{status map[application:acme node:node1] [time up] [[2022-04-05 05:53:24.0323641 +0000 UTC false]]}"
{"id":"Status acme-node1","message":"node1 of acme is false!","details":"{\u0026#34;Name\u0026#34;:\u0026#34;status\u0026#34;,\u0026#34;TaskName\u0026#34;:\u0026#34;acme-up-alert-task\u0026#34;,\u0026#34;Group\u0026#34;:\u0026#34;application=acme,node=node1\u0026#34;,\u0026#34;Tags\u0026#34;:{\u0026#34;application\u0026#34;:\u0026#34;acme\u0026#34;,\u0026#34;node\u0026#34;:\u0026#34;node1\u0026#34;},\u0026#34;ServerInfo\u0026#34;:{\u0026#34;Hostname\u0026#34;:\u0026#34;kapacitor.mlqr-test\u0026#34;,\u0026#34;ClusterID\u0026#34;:\u0026#34;8f56376a-c975-4ea0-83c5-16ba71f40fce\u0026#34;,\u0026#34;ServerID\u0026#34;:\u0026#34;ab3e86de-78d9-452e-8a97-38a98a6b2833\u0026#34;},\u0026#34;ID\u0026#34;:\u0026#34;Status acme-node1\u0026#34;,\u0026#34;Fields\u0026#34;:{\u0026#34;up\u0026#34;:false},\u0026#34;Level\u0026#34;:\u0026#34;CRITICAL\u0026#34;,\u0026#34;Time\u0026#34;:\u0026#34;2022-04-05T05:53:24.0323641Z\u0026#34;,\u0026#34;Duration\u0026#34;:0,\u0026#34;Message\u0026#34;:\u0026#34;node1 of acme is false!\u0026#34;}\n","time":"2022-04-05T05:53:24.0323641Z","duration":0,"level":"CRITICAL","data":{"series":[{"name":"status","tags":{"application":"acme","node":"node1"},"columns":["time","up"],"values":[["2022-04-05T05:53:24.0323641Z",false]]}]},"previousLevel":"OK","recoverable":false}
ts=2022-04-05T05:53:37.570Z lvl=debug msg="starting next batch query" service=kapacitor task_master=main task=acme-up-alert-task node=query1 query="SELECT up FROM \"<DB>\".<RP>.status WHERE application = 'acme' AND time >= '2022-04-05T05:53:27.570212103Z' AND time < '2022-04-05T05:53:37.570212103Z' GROUP BY application, node"
ts=2022-04-05T05:53:47.570Z lvl=debug msg="starting next batch query" service=kapacitor task_master=main task=acme-up-alert-task node=query1 query="SELECT up FROM \"<DB>\".<RP>.status WHERE application = 'acme' AND time >= '2022-04-05T05:53:37.570425599Z' AND time < '2022-04-05T05:53:47.570425599Z' GROUP BY application, node"
ts=2022-04-05T05:53:47.573Z lvl=info msg="begin batch" service=kapacitor task_master=main task=acme-up-alert-task node=log2 prefix= name=status group=application=acme,node=node1 tag_application=acme tag_node=node1 time=2022-04-05T05:53:47.570425599Z
ts=2022-04-05T05:53:47.573Z lvl=info msg="batch point" service=kapacitor task_master=main task=acme-up-alert-task node=log2 prefix= name=status group=application=acme,node=node1 tag_application=acme tag_node=node1 field_up=true time=2022-04-05T05:53:39.3161006Z
ts=2022-04-05T05:53:47.573Z lvl=info msg="end batch" service=kapacitor task_master=main task=acme-up-alert-task node=log2 prefix= name=status group=application=acme,node=node1 tag_application=acme tag_node=node1 time=2022-04-05T05:53:47.570425599Z
ts=2022-04-05T05:53:57.570Z lvl=debug msg="starting next batch query" service=kapacitor task_master=main task=acme-up-alert-task node=query1 query="SELECT up FROM \"<DB>\".<RP>.status WHERE application = 'acme' AND time >= '2022-04-05T05:53:47.57017437Z' AND time < '2022-04-05T05:53:57.57017437Z' GROUP BY application, node"
ts=2022-04-05T05:53:57.574Z lvl=info msg="begin batch" service=kapacitor task_master=main task=acme-up-alert-task node=log2 prefix= name=status group=application=acme,node=node1 tag_application=acme tag_node=node1 time=2022-04-05T05:53:57.57017437Z
ts=2022-04-05T05:53:57.574Z lvl=info msg="batch point" service=kapacitor task_master=main task=acme-up-alert-task node=log2 prefix= name=status group=application=acme,node=node1 tag_application=acme tag_node=node1 field_up=false time=2022-04-05T05:53:54.7846331Z
ts=2022-04-05T05:53:57.574Z lvl=info msg="end batch" service=kapacitor task_master=main task=acme-up-alert-task node=log2 prefix= name=status group=application=acme,node=node1 tag_application=acme tag_node=node1 time=2022-04-05T05:53:57.57017437Z
ts=2022-04-05T05:53:57.574Z lvl=debug msg="alert triggered" service=kapacitor task_master=main task=acme-up-alert-task node=alert3 level=CRITICAL id="Status acme-node1" event_message="node1 of acme is false!" data="&{status map[application:acme node:node1] [time up] [[2022-04-05 05:53:54.7846331 +0000 UTC false]]}"
{"id":"Status acme-node1","message":"node1 of acme is false!","details":"{\u0026#34;Name\u0026#34;:\u0026#34;status\u0026#34;,\u0026#34;TaskName\u0026#34;:\u0026#34;acme-up-alert-task\u0026#34;,\u0026#34;Group\u0026#34;:\u0026#34;application=acme,node=node1\u0026#34;,\u0026#34;Tags\u0026#34;:{\u0026#34;application\u0026#34;:\u0026#34;acme\u0026#34;,\u0026#34;node\u0026#34;:\u0026#34;node1\u0026#34;},\u0026#34;ServerInfo\u0026#34;:{\u0026#34;Hostname\u0026#34;:\u0026#34;kapacitor.mlqr-test\u0026#34;,\u0026#34;ClusterID\u0026#34;:\u0026#34;8f56376a-c975-4ea0-83c5-16ba71f40fce\u0026#34;,\u0026#34;ServerID\u0026#34;:\u0026#34;ab3e86de-78d9-452e-8a97-38a98a6b2833\u0026#34;},\u0026#34;ID\u0026#34;:\u0026#34;Status acme-node1\u0026#34;,\u0026#34;Fields\u0026#34;:{\u0026#34;up\u0026#34;:false},\u0026#34;Level\u0026#34;:\u0026#34;CRITICAL\u0026#34;,\u0026#34;Time\u0026#34;:\u0026#34;2022-04-05T05:53:54.7846331Z\u0026#34;,\u0026#34;Duration\u0026#34;:0,\u0026#34;Message\u0026#34;:\u0026#34;node1 of acme is false!\u0026#34;}\n","time":"2022-04-05T05:53:54.7846331Z","duration":0,"level":"CRITICAL","data":{"series":[{"name":"status","tags":{"application":"acme","node":"node1"},"columns":["time","up"],"values":[["2022-04-05T05:53:54.7846331Z",false]]}]},"previousLevel":"CRITICAL","recoverable":false}
ts=2022-04-05T05:54:07.570Z lvl=debug msg="starting next batch query" service=kapacitor task_master=main task=acme-up-alert-task node=query1 query="SELECT up FROM \"<DB>\".<RP>.status WHERE application = 'acme' AND time >= '2022-04-05T05:53:57.57023309Z' AND time < '2022-04-05T05:54:07.57023309Z' GROUP BY application, node"
The first CRITICAL state comes in and, as expected, the alert data contains:
"previousLevel":"OK"
Afterwards the recovery (OK state) is received, as shown by the output of the |log()
node:
ts=2022-04-05T05:53:47.573Z lvl=info msg="batch point" service=kapacitor task_master=main task=acme-up-alert-task node=log2 prefix= name=status group=application=acme,node=node1 tag_application=acme tag_node=node1 field_up=true time=2022-04-05T05:53:39.3161006Z
When the second CRITICAL state comes in, however, the alert data contains:
"previousLevel":"CRITICAL"
So it seems that .noRecoveries()
does not reset the status of the alert. At this point, I don’t know whether it is a bug or a feature.