AWS Lambda Error Rate Alerts

Hi all,

I’m trying to create a Kapacitor task that alerts on AWS Lambda error rates.

I haven’t been able to join the requests and errors streams.

Please let me know.

Current script:

// Points from the 'cloudwatch_aws_lambda' measurement, grouped by the
// 'function_name' tag and passed through a window node (period 3h, every 1m).
// The default node supplies 0.0 for 'invocations_sum' when that field is absent.
var requests = stream
    |from()
        .measurement('cloudwatch_aws_lambda')
        .groupBy('function_name')
    |window()
        .period(3h)
        .every(1m)
    |default()
        .field('invocations_sum', 0.0)

// Same measurement, grouping and window as `requests`; the only difference is
// that the defaulted field here is 'errors_sum'.
var errors = stream
    |from()
        .measurement('cloudwatch_aws_lambda')
        .groupBy('function_name')
    |window()
        .period(3h)
        .every(1m)
    |default()
        .field('errors_sum', 0.0)

// Join the two pipelines (fields become "errors.<name>" / "requests.<name>"),
// compute errors as a percentage of invocations, and alert to Slack.
errors|join(requests)
            .as('errors', 'requests')
            .tolerance(1m)
        // NOTE(review): this filter passes a point with errors_sum > 0 and
        // invocations_sum == 0; the eval below then divides by that zero - confirm
        // whether that case can occur in the data.
        |where(lambda: "errors.errors_sum" > 0 OR "requests.invocations_sum" > 0)
        |eval(lambda: "errors.errors_sum" * 100.0 / "requests.invocations_sum")
            // NOTE(review): the result is stored as 'errors_rate', but the message
            // template and every threshold lambda below read "error_rate"
            // (no 's') - the names do not match.
            .as('errors_rate')
        |alert()
            .id('Identity {{ index .Tags "function_name" }}')
            // NOTE(review): the template indexes .Fields with "errors_sum" and
            // "invocations_sum", whereas the expressions above address the joined
            // fields as "errors.errors_sum" / "requests.invocations_sum" - verify
            // which names exist on the point at this stage.
            .message('{{ .Level }} {{ .ID }} error rate at {{index .Fields "error_rate"}}% ({{index .Fields "errors_sum"}}/{{index .Fields "invocations_sum"}} requests)')
            // Thresholds: info above 2, warn above 10, crit above 25; each level
            // has a matching reset expression at the same boundary.
            .info(lambda: "error_rate" > 2)
            .infoReset(lambda: "error_rate" < 2)
            .warn(lambda: "error_rate" > 10)
            .warnReset(lambda: "error_rate" < 10)
            .crit(lambda: "error_rate" > 25)
            .critReset(lambda: "error_rate" < 25)
            .slack()

@jrxfernandes Can you share a couple of example points in Line Protocol?

Hi @jackzampolin,

Attached is a sample metric point (I hope that’s what you were looking for).

Please do let me know if that makes sense and if you find what I’m doing wrong.

Thanks,

Joao

{
    "results": [
        {
            "statement_id": 0,
            "series": [
                {
                    "name": "cloudwatch_aws_lambda",
                    "columns": [
                        "time",
                        "application",
                        "availabilityZone",
                        "duration_average",
                        "duration_maximum",
                        "duration_minimum",
                        "duration_sample_count",
                        "duration_sum",
                        "errors_average",
                        "errors_maximum",
                        "errors_minimum",
                        "errors_sample_count",
                        "errors_sum",
                        "function_name",
                        "host",
                        "instance",
                        "invocations_average",
                        "invocations_maximum",
                        "invocations_minimum",
                        "invocations_sample_count",
                        "invocations_sum",
                        "iterator_age_average",
                        "iterator_age_maximum",
                        "iterator_age_minimum",
                        "iterator_age_sample_count",
                        "iterator_age_sum",
                        "region",
                        "resource",
                        "throttles_average",
                        "throttles_maximum",
                        "throttles_minimum",
                        "throttles_sample_count",
                        "throttles_sum",
                        "unit"
                    ],
                    "values": [
                        [
                            "2017-07-12T03:04:00Z",
                            "monitoring",
                            "us-east-1a",
                            null,
                            null,
                            null,
                            null,
                            null,
                            1,
                            1,
                            1,
                            1,
                            1,
                            "the function name here",
                            "ip-10-135-168-47",
                            "i-00b88a156de170d55",
                            1,
                            1,
                            1,
                            1,
                            1,
                            null,
                            null,
                            null,
                            null,
                            null,
                            "us-east-1",
                            "the function name here",
                            0,
                            0,
                            0,
                            1,
                            0,
                            "count"
                        ]
                    ]
                }
            ]
        }
    ]
  }

@jrxfernandes That looks like it should work. When you run `kapacitor show <task_name>`, does it show data streaming through the task? Are you seeing any errors?

Hi @jackzampolin,

There was a typo there, but even after fixing that I had to keep tinkering with the script until I got it into a shape that now works.

I’ll leave it here for future reference, in the hope that it will save someone else some pain.

Cheers,

Joao


// Base pipeline: 'cloudwatch_aws_lambda' points grouped by 'function_name'.
// Defaults are applied first (empty tag, 0.0 for both counters), then only
// functions whose name starts with the given prefix are kept, and finally the
// points go through a window node (period 1h, every 1m).
var lambdaPoints = stream
    |from()
        .measurement('cloudwatch_aws_lambda')
        .groupBy('function_name')
    |default()
        .tag('function_name', '')
        .field('invocations_sum', 0.0)
        .field('errors_sum', 0.0)
    |where(lambda: strHasPrefix("function_name", 'name prefix for logical group of functions'))
    |window()
        .period(1h)
        .every(1m)

// Points that recorded at least one invocation.
var invoked = lambdaPoints
    |where(lambda: "invocations_sum" > 0.0)

// Points that recorded at least one error.
var failed = lambdaPoints
    |where(lambda: "errors_sum" > 0.0)

// Join both branches under the 'errors' / 'successes' prefixes and derive the
// error percentage, keeping only the rate and the two counters it came from.
var errorRates = failed
    |join(invoked)
        .as('errors', 'successes')
        .streamName('errorRates')
    |eval(lambda: "errors.errors_sum" * 100.0 / "successes.invocations_sum")
        .as('errors_rate')
        .keep('errors_rate', 'errors.errors_sum', 'successes.invocations_sum')

// Alert per function: info above 0, warn above 10, crit above 25; sent to Slack.
errorRates
    |alert()
        .id('Logical group {{ index .Tags "function_name" }}')
        .message('{{ .Level }} {{ .ID }} error rate at {{index .Fields "errors_rate" | printf "%0.2f"}}% ({{index .Fields "errors.errors_sum"}}/{{index .Fields "successes.invocations_sum"}} requests)')
        .info(lambda: "errors_rate" > 0)
        .warn(lambda: "errors_rate" > 10)
        .crit(lambda: "errors_rate" > 25)
        .slack()
1 Like

@jrxfernandes Glad you were able to get that working!