-
Notifications
You must be signed in to change notification settings - Fork 96
/
Copy pathcluster-observability.yaml
67 lines (61 loc) · 2.11 KB
/
cluster-observability.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# CloudFormation template that provisions the observability backend for
# SageMaker HyperPod clusters:
#   * an IAM role (plus inline policy) assumed by Amazon Managed Grafana,
#   * an Amazon Managed Grafana workspace,
#   * an Amazon Managed Service for Prometheus (APS) workspace.
# Outputs expose the region, the Prometheus remote-write URL and the Grafana URL.
AWSTemplateFormatVersion: '2010-09-09'
Description: >-
  Setup to monitor sagemaker hyperpod clusters on AWS. Amazon Managed
  Prometheus and Amazon Managed Grafana workspaces with associated IAM roles
  are deployed in the AWS Account. Prometheus and exporter services are set up
  on Cluster Nodes. Author: Matt Nightingale - nghtm@

Resources:
  # Role assumed by the Grafana service principal. Because RoleName is set
  # explicitly, the stack must be deployed with CAPABILITY_NAMED_IAM.
  AmazonGrafanaWorkspaceIAMRole:
    Type: 'AWS::IAM::Role'
    Properties:
      AssumeRolePolicyDocument:
        # Quoted so YAML 1.1 parsers do not turn the policy version into a date.
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service:
                - grafana.amazonaws.com
            Action:
              - 'sts:AssumeRole'
      RoleName: !Sub '${AWS::StackName}-Grafana-Role'

  # Inline policy attached to the Grafana role: read/query access to
  # Amazon Managed Service for Prometheus on all resources.
  AmazonGrafanaPrometheusPolicy:
    Type: 'AWS::IAM::Policy'
    Properties:
      PolicyName: AmazonGrafana_Prometheus_policy
      PolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Action:
              - 'aps:ListWorkspaces'
              - 'aps:DescribeWorkspace'
              - 'aps:QueryMetrics'
              - 'aps:GetLabels'
              - 'aps:GetSeries'
              - 'aps:GetMetricMetadata'
            Resource: '*'
      Roles:
        - !Ref AmazonGrafanaWorkspaceIAMRole

  # Grafana workspace for the current account, authenticated through AWS SSO.
  # NOTE(review): CLOUDWATCH is listed as a data source, but the only policy
  # attached to the role in this template grants aps:* actions - confirm that
  # CloudWatch read permissions are provisioned elsewhere.
  AmazonGrafanaWorkspace:
    Type: 'AWS::Grafana::Workspace'
    Properties:
      AccountAccessType: CURRENT_ACCOUNT
      Name: !Sub '${AWS::StackName}-Dashboard'
      Description: Amazon Grafana Workspace to monitor SageMaker Cluster
      AuthenticationProviders:
        - AWS_SSO
      PermissionType: SERVICE_MANAGED
      RoleArn: !GetAtt AmazonGrafanaWorkspaceIAMRole.Arn
      DataSources:
        - CLOUDWATCH
        - PROMETHEUS
      OrganizationRoleName: 'ADMIN'

  # Prometheus-compatible metrics workspace that cluster nodes remote-write to.
  APSWorkspace:
    Type: 'AWS::APS::Workspace'
    Properties:
      Alias: !Sub '${AWS::StackName}-Hyperpod-WorkSpace'
      Tags:
        - Key: Name
          Value: SageMaker Hyperpod PrometheusMetrics

Outputs:
  Region:
    Description: AWS Region the stack is deployed in.
    Value: !Ref 'AWS::Region'

  # The endpoint and the path are concatenated with no separator, exactly as in
  # the original !Join; this relies on PrometheusEndpoint ending with '/'
  # (see the AWS::APS::Workspace return-value documentation).
  AMPRemoteWriteURL:
    Description: Remote-write URL of the Amazon Managed Prometheus workspace.
    Value: !Sub '${APSWorkspace.PrometheusEndpoint}api/v1/remote_write'

  # NOTE: the output key is spelled "GrafanWorkspaceURL" (sic). It is kept
  # unchanged because renaming an output would break anything that reads it.
  GrafanWorkspaceURL:
    Description: HTTPS URL of the Amazon Managed Grafana workspace.
    Value: !Sub 'https://${AmazonGrafanaWorkspace.Endpoint}'