Get the error rate for an AWS Step Functions state machine through CloudWatch

Step Functions emits the following execution metrics for each state machine (in the AWS/States namespace, with the StateMachineArn dimension):

  • ExecutionsSucceeded
  • ExecutionsAborted
  • ExecutionsFailed
  • ExecutionsTimedOut

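We can compute the error rate for a given state machine (the fraction of finished executions that did not succeed, in each 5-minute period) with the following command, replacing <state-machine-ARN> with the state machine's ARN: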

# Per-period error rate of one Step Functions state machine:
#   1 - succeeded / (succeeded + timedOut + failed + aborted)
# Replace <state-machine-ARN> (four occurrences) with the state machine's ARN.
# The four input queries sum each metric over 300-second periods and set
# "ReturnData": false, so only the errorRate expression is returned.
# Each operand is wrapped in FILL(..., 0): metric math produces no datapoint
# for a period in which any operand is missing, so without FILL a period that
# has successes but no recorded failures/aborts/timeouts would be dropped.
# NOTE(review): periods with no finished executions at all give a zero
# denominator; expect no errorRate datapoint for those -- confirm against the
# CloudWatch metric math documentation.
aws cloudwatch get-metric-data \
    --region us-east-1 \
    --start-time 2020-06-01T00:00:00Z \
    --end-time 2020-08-01T01:00:00Z \
    --scan-by TimestampAscending \
    --metric-data-queries '
[
    {
        "Id": "executionsAborted1",
        "MetricStat": {
            "Metric": {
                "Namespace": "AWS/States",
                "MetricName": "ExecutionsAborted",
                "Dimensions": [
                    {
                        "Name": "StateMachineArn",
                        "Value": "<state-machine-ARN>"
                    }
                ]
            },
            "Period": 300,
            "Stat": "Sum"
        },
        "ReturnData": false
    },
    {
        "Id": "executionsFailed1",
        "MetricStat": {
            "Metric": {
                "Namespace": "AWS/States",
                "MetricName": "ExecutionsFailed",
                "Dimensions": [
                    {
                        "Name": "StateMachineArn",
                        "Value": "<state-machine-ARN>"
                    }
                ]
            },
            "Period": 300,
            "Stat": "Sum"
        },
        "ReturnData": false
    },
    {
        "Id": "executionsSucceeded1",
        "MetricStat": {
            "Metric": {
                "Namespace": "AWS/States",
                "MetricName": "ExecutionsSucceeded",
                "Dimensions": [
                    {
                        "Name": "StateMachineArn",
                        "Value": "<state-machine-ARN>"
                    }
                ]
            },
            "Period": 300,
            "Stat": "Sum"
        },
        "ReturnData": false
    },
    {
        "Id": "executionsTimedOut1",
        "MetricStat": {
            "Metric": {
                "Namespace": "AWS/States",
                "MetricName": "ExecutionsTimedOut",
                "Dimensions": [
                    {
                        "Name": "StateMachineArn",
                        "Value": "<state-machine-ARN>"
                    }
                ]
            },
            "Period": 300,
            "Stat": "Sum"
        },
        "ReturnData": false
    },
    {
        "Id": "errorRate",
        "Expression": "1 - FILL(executionsSucceeded1, 0)/(FILL(executionsSucceeded1, 0) + FILL(executionsTimedOut1, 0) + FILL(executionsFailed1, 0) + FILL(executionsAborted1, 0))"
    }
]
    '