AWS Step Functions emits the following execution metrics for each state machine:
ExecutionsSucceeded, ExecutionsAborted, ExecutionsFailed, and ExecutionsTimedOut. We can get the error rate for a given state machine (one value per 5-minute period) by running the following command:
# Computes the execution error rate of one Step Functions state machine.
#
# Four helper queries sum ExecutionsAborted, ExecutionsFailed,
# ExecutionsSucceeded and ExecutionsTimedOut over 300-second periods. They are
# marked "ReturnData": false, so only the "errorRate" expression is returned:
#   1 - succeeded / (succeeded + timedOut + failed + aborted)
#
# Replace every <state-machine-ARN> placeholder with the ARN of the state
# machine, and adjust --region / --start-time / --end-time as needed.
#
# NOTE(review): a period in which any of the four metrics has no datapoint, or
# in which the denominator is 0, presumably yields no errorRate value. Wrapping
# each metric in FILL(<id>, 0) may be needed; confirm against the CloudWatch
# metric math documentation.
aws cloudwatch get-metric-data \
  --region us-east-1 \
  --start-time 2020-06-01T00:00:00Z \
  --end-time 2020-08-01T01:00:00Z \
  --scan-by TimestampAscending \
  --metric-data-queries '
[
  {
    "Id": "executionsAborted1",
    "MetricStat": {
      "Metric": {
        "Namespace": "AWS/States",
        "MetricName": "ExecutionsAborted",
        "Dimensions": [
          {
            "Name": "StateMachineArn",
            "Value": "<state-machine-ARN>"
          }
        ]
      },
      "Period": 300,
      "Stat": "Sum"
    },
    "ReturnData": false
  },
  {
    "Id": "executionsFailed1",
    "MetricStat": {
      "Metric": {
        "Namespace": "AWS/States",
        "MetricName": "ExecutionsFailed",
        "Dimensions": [
          {
            "Name": "StateMachineArn",
            "Value": "<state-machine-ARN>"
          }
        ]
      },
      "Period": 300,
      "Stat": "Sum"
    },
    "ReturnData": false
  },
  {
    "Id": "executionsSucceeded1",
    "MetricStat": {
      "Metric": {
        "Namespace": "AWS/States",
        "MetricName": "ExecutionsSucceeded",
        "Dimensions": [
          {
            "Name": "StateMachineArn",
            "Value": "<state-machine-ARN>"
          }
        ]
      },
      "Period": 300,
      "Stat": "Sum"
    },
    "ReturnData": false
  },
  {
    "Id": "executionsTimedOut1",
    "MetricStat": {
      "Metric": {
        "Namespace": "AWS/States",
        "MetricName": "ExecutionsTimedOut",
        "Dimensions": [
          {
            "Name": "StateMachineArn",
            "Value": "<state-machine-ARN>"
          }
        ]
      },
      "Period": 300,
      "Stat": "Sum"
    },
    "ReturnData": false
  },
  {
    "Id": "errorRate",
    "Expression": "1 - (executionsSucceeded1)/(executionsSucceeded1 + executionsTimedOut1 + executionsFailed1 + executionsAborted1)"
  }
]
'