diff --git a/README.md b/README.md index 38e0da78..f0fe81c8 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,8 @@ The stack allowa various combination of OS. Here is a list of what has been test | OL7 | OL7 | | OL7 | OL8 | | OL7 | CentOS7 | +| OL8 | OL8 | +| OL8 | OL7 | | Ubuntu 20.04 | Ubuntu 20.04 | When switching to Ubuntu, make sure the username is changed from opc to Ubuntu in the ORM for both the bastion and compute nodes. @@ -358,3 +360,53 @@ You can combine all the options together such as: validate -n y -p y -g y -e y -cn +## /opt/oci-hpc/scripts/collect_logs.py +This is a script to collect nvidia bug report, sosreport, console history logs. + +The script needs to be run from the bastion. In the case where the host is not ssh-able, it will get only console history logs for the same. + +It requires the below argument. +--hostname + +And --compartment-id is optional (i.e. assumption is the host is in the same compartment as the bastion). + +Where HOSTNAME is the node name for which you need the above logs and COMPARTMENT_ID is the OCID of the compartment where the node is. + +The script will get all the above logs and put them in a folder specific to each node in /home/{user}. It will give the folder name as the output. + +Assumption: For getting the console history logs, the script expects to have the node name in /etc/hosts file. 
+ +Examples: + +python3 collect_logs.py --hostname compute-permanent-node-467 +The nvidia bug report, sosreport, and console history logs for compute-permanent-node-467 are at /home/ubuntu/compute-permanent-node-467_06132023191024 + +python3 collect_logs.py --hostname inst-jxwf6-keen-drake +The nvidia bug report, sosreport, and console history logs for inst-jxwf6-keen-drake are at /home/ubuntu/inst-jxwf6-keen-drake_11112022001138 + +for x in `less /home/opc/hostlist` ; do echo $x ; python3 collect_logs.py --hostname $x; done ; +compute-permanent-node-467 +The nvidia bug report, sosreport, and console history logs for compute-permanent-node-467 are at /home/ubuntu/compute-permanent-node-467_11112022011318 +compute-permanent-node-787 +The nvidia bug report, sosreport, and console history logs for compute-permanent-node-787 are at /home/ubuntu/compute-permanent-node-787_11112022011835 + +Where hostlist had the below contents +compute-permanent-node-467 +compute-permanent-node-787 + + +## Collect RDMA NIC Metrics and Upload to Object Storage + +OCI-HPC is deployed in the customer's tenancy, so OCI service teams cannot access metrics from these OCI-HPC stack clusters. To overcome this issue, this release +introduces a feature to collect RDMA NIC Metrics and upload those metrics to Object Storage. The Object Storage URL can then be shared with OCI service +teams. With that URL, OCI service teams can access the metrics and use them for debugging purposes. + +To collect RDMA NIC Metrics and upload those to Object Storage, the user needs to follow these steps: + +Step 1: Create a PAR (PreAuthenticated Request) +For creating a PAR, the user needs to select the check-box "Create Object Storage PAR" during Resource Manager's stack creation. +By default, this check-box is enabled. When this check-box is selected, a PAR is created. + +Step 2: Use shell script: upload_rdma_nic_metrics.sh to collect metrics and upload to object storage. 
+User needs to use shell script: upload_rdma_nic_metrics.sh to collect metrics and upload to object storage. User could configure metrics +collection limit and interval through config file: rdma_metrics_collection_config.conf. diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index df579eb0..3205d2d2 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -16,7 +16,7 @@ resource "local_file" "hosts" { } resource "local_file" "inventory" { - depends_on = [oci_core_cluster_network.cluster_network] + depends_on = [oci_core_cluster_network.cluster_network, oci_core_cluster_network.cluster_network] content = templatefile("${local.bastion_path}/inventory.tpl", { bastion_name = var.bastion_name, bastion_ip = var.bastion_ip, diff --git a/autoscaling/tf_init/cluster-network-configuration.tf b/autoscaling/tf_init/cluster-network-configuration.tf index ddfdab38..e4f75bd7 100755 --- a/autoscaling/tf_init/cluster-network-configuration.tf +++ b/autoscaling/tf_init/cluster-network-configuration.tf @@ -1,5 +1,5 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configuration" { - count = var.cluster_network ? 1 : 0 + count = ( ! var.compute_cluster ) && var.cluster_network ? 1 : 0 depends_on = [oci_core_app_catalog_subscription.mp_image_subscription] compartment_id = var.targetCompartment display_name = local.cluster_name diff --git a/autoscaling/tf_init/cluster-network.tf b/autoscaling/tf_init/cluster-network.tf index 457d8da4..d7b0b4f0 100755 --- a/autoscaling/tf_init/cluster-network.tf +++ b/autoscaling/tf_init/cluster-network.tf @@ -1,5 +1,5 @@ resource "oci_core_volume" "nfs-cluster-network-volume" { - count = var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 + count = ( ! var.compute_cluster ) && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 
1 : 0 availability_domain = var.ad compartment_id = var.targetCompartment display_name = "${local.cluster_name}-nfs-volume" @@ -9,7 +9,7 @@ resource "oci_core_volume" "nfs-cluster-network-volume" { } resource "oci_core_volume_attachment" "cluster_network_volume_attachment" { - count = var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 + count = ( ! var.compute_cluster ) && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 attachment_type = "iscsi" volume_id = oci_core_volume.nfs-cluster-network-volume[0].id instance_id = local.cluster_instances_ids[0] @@ -18,7 +18,7 @@ resource "oci_core_volume_attachment" "cluster_network_volume_attachment" { } resource "oci_core_cluster_network" "cluster_network" { - count = var.cluster_network && var.node_count > 0 ? 1 : 0 + count = ( ! var.compute_cluster ) && var.cluster_network && var.node_count > 0 ? 1 : 0 depends_on = [oci_core_app_catalog_subscription.mp_image_subscription, oci_core_subnet.private-subnet, oci_core_subnet.public-subnet] compartment_id = var.targetCompartment instance_pools { diff --git a/autoscaling/tf_init/compute-cluster.tf b/autoscaling/tf_init/compute-cluster.tf new file mode 100755 index 00000000..ef9067b8 --- /dev/null +++ b/autoscaling/tf_init/compute-cluster.tf @@ -0,0 +1,13 @@ +resource "oci_core_compute_cluster" "compute_cluster" { + count = var.compute_cluster && var.cluster_network && var.node_count > 0 ? 
1 : 0 + #Required + availability_domain = var.ad + compartment_id = var.targetCompartment + + #Optional + display_name = local.cluster_name + freeform_tags = { + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name + } +} \ No newline at end of file diff --git a/autoscaling/tf_init/compute-nodes.tf b/autoscaling/tf_init/compute-nodes.tf new file mode 100755 index 00000000..eb8a0c22 --- /dev/null +++ b/autoscaling/tf_init/compute-nodes.tf @@ -0,0 +1,53 @@ +resource "oci_core_volume" "nfs-compute-cluster-volume" { + count = var.compute_cluster && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 + availability_domain = var.ad + compartment_id = var.targetCompartment + display_name = "${local.cluster_name}-nfs-volume" + + size_in_gbs = var.cluster_block_volume_size + vpus_per_gb = split(".", var.cluster_block_volume_performance)[0] +} + +resource "oci_core_volume_attachment" "compute_cluster_volume_attachment" { + count = var.compute_cluster && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 + attachment_type = "iscsi" + volume_id = oci_core_volume.nfs-compute-cluster-volume[0].id + instance_id = oci_core_instance.compute_cluster_instances[0].id + display_name = "${local.cluster_name}-compute-cluster-volume-attachment" + device = "/dev/oracleoci/oraclevdb" +} + +resource "oci_core_instance" "compute_cluster_instances" { + count = var.compute_cluster ? 
var.node_count : 0 + depends_on = [oci_core_compute_cluster.compute_cluster] + availability_domain = var.ad + compartment_id = var.targetCompartment + shape = var.cluster_network_shape + + agent_config { + is_management_disabled = true + } + + display_name = "${local.cluster_name}-node-${var.compute_cluster_start_index+count.index}" + + freeform_tags = { + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name + "user" = var.tags + } + + metadata = { + ssh_authorized_keys = file("/home/${var.bastion_username}/.ssh/id_rsa.pub") + user_data = base64encode(data.template_file.config.rendered) + } + source_details { + source_id = local.cluster_network_image + source_type = "image" + boot_volume_size_in_gbs = var.boot_volume_size + } + compute_cluster_id=length(var.compute_cluster_id) > 2 ? var.compute_cluster_id : oci_core_compute_cluster.compute_cluster[0].id + create_vnic_details { + subnet_id = local.subnet_id + assign_public_ip = false + } +} \ No newline at end of file diff --git a/autoscaling/tf_init/data.tf b/autoscaling/tf_init/data.tf index 52dfba55..f9b04337 100755 --- a/autoscaling/tf_init/data.tf +++ b/autoscaling/tf_init/data.tf @@ -10,7 +10,7 @@ data "oci_core_services" "services" { } data "oci_core_cluster_network_instances" "cluster_network_instances" { - count = var.cluster_network && var.node_count > 0 ? 1 : 0 + count = (! var.compute_cluster) && var.cluster_network && var.node_count > 0 ? 1 : 0 cluster_network_id = oci_core_cluster_network.cluster_network[0].id compartment_id = var.targetCompartment } @@ -22,7 +22,7 @@ data "oci_core_instance_pool_instances" "instance_pool_instances" { } data "oci_core_instance" "cluster_network_instances" { - count = var.cluster_network && var.node_count > 0 ? var.node_count : 0 + count = (! var.compute_cluster) && var.cluster_network && var.node_count > 0 ? 
var.node_count : 0 instance_id = data.oci_core_cluster_network_instances.cluster_network_instances[0].instances[count.index]["id"] } diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 9ae1cd02..261c1e17 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -1,5 +1,5 @@ [bastion] -${bastion_name} ansible_host=${bastion_ip} ansible_user=${bastion_username} role=bastion +${bastion_name} ansible_host=${bastion_ip} ansible_user=${bastion_username} role=bastion ansible_python_interpreter=/usr/bin/python [slurm_backup] %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${bastion_username} role=bastion%{ endif } [login] diff --git a/autoscaling/tf_init/locals.tf b/autoscaling/tf_init/locals.tf index 283f3245..02fd1b0e 100755 --- a/autoscaling/tf_init/locals.tf +++ b/autoscaling/tf_init/locals.tf @@ -1,13 +1,13 @@ locals { // display names of instances - cluster_instances_ids = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id - cluster_instances_names = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.display_name : data.oci_core_instance.instance_pool_instances.*.display_name + cluster_instances_ids = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.id : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id + cluster_instances_names = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.display_name :var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.display_name : data.oci_core_instance.instance_pool_instances.*.display_name image_ocid = var.unsupported ? var.image_ocid : var.image shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape instance_pool_ocpus = local.shape == "VM.DenseIO.E4.Flex" ? 
var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus // ips of the instances - cluster_instances_ips = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip + cluster_instances_ips = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.private_ip : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip // subnet id derived either from created subnet or existing if specified subnet_id = var.private_deployment ? var.use_existing_vcn ? var.private_subnet_id : element(concat(oci_core_subnet.private-subnet.*.id, [""]), 1) : var.use_existing_vcn ? var.private_subnet_id : element(concat(oci_core_subnet.private-subnet.*.id, [""]), 0) diff --git a/autoscaling/tf_init/outputs.tf b/autoscaling/tf_init/outputs.tf index 7d2f362d..edd4e8a2 100755 --- a/autoscaling/tf_init/outputs.tf +++ b/autoscaling/tf_init/outputs.tf @@ -8,5 +8,5 @@ output "ocids" { value = join(",", local.cluster_instances_ids) } output "cluster_ocid" { - value = var.cluster_network ? oci_core_cluster_network.cluster_network[0].id : oci_core_instance_pool.instance_pool[0].id + value = var.compute_cluster ? oci_core_compute_cluster.compute_cluster[0].id : var.cluster_network ? oci_core_cluster_network.cluster_network[0].id : oci_core_instance_pool.instance_pool[0].id } diff --git a/bastion.tf b/bastion.tf index 6483e03f..fbfed806 100644 --- a/bastion.tf +++ b/bastion.tf @@ -17,6 +17,25 @@ resource "oci_core_volume_attachment" "bastion_volume_attachment" { device = "/dev/oracleoci/oraclevdb" } +resource "oci_core_volume_backup_policy" "bastion_boot_volume_backup_policy" { + count = var.bastion_boot_volume_backup ? 
1 : 0 + compartment_id = var.targetCompartment + display_name = "${local.cluster_name}-bastion_boot_volume_daily" + schedules { + backup_type = var.bastion_boot_volume_backup_type + period = var.bastion_boot_volume_backup_period + retention_seconds = var.bastion_boot_volume_backup_retention_seconds + time_zone = var.bastion_boot_volume_backup_time_zone + } +} + +resource "oci_core_volume_backup_policy_assignment" "boot_volume_backup_policy" { + count = var.bastion_boot_volume_backup ? 1 : 0 + depends_on = [oci_core_volume_backup_policy.bastion_boot_volume_backup_policy] + asset_id = oci_core_instance.bastion.boot_volume_id + policy_id = oci_core_volume_backup_policy.bastion_boot_volume_backup_policy[0].id +} + resource "oci_resourcemanager_private_endpoint" "rms_private_endpoint" { count = var.private_deployment ? 1 : 0 compartment_id = var.targetCompartment @@ -26,6 +45,13 @@ resource "oci_resourcemanager_private_endpoint" "rms_private_endpoint" { subnet_id = local.subnet_id } +resource "null_resource" "boot_volume_backup_policy" { + depends_on = [oci_core_instance.bastion, oci_core_volume_backup_policy.bastion_boot_volume_backup_policy, oci_core_volume_backup_policy_assignment.boot_volume_backup_policy] + triggers = { + bastion = oci_core_instance.bastion.id + } +} + resource "oci_core_instance" "bastion" { depends_on = [local.bastion_subnet] availability_domain = var.bastion_ad @@ -150,6 +176,16 @@ resource "null_resource" "bastion" { private_key = tls_private_key.ssh.private_key_pem } } + provisioner "file" { + source = "scripts" + destination = "/opt/oci-hpc/" + connection { + host = local.host + type = "ssh" + user = var.bastion_username + private_key = tls_private_key.ssh.private_key_pem + } + } provisioner "file" { content = templatefile("${path.module}/configure.tpl", { configure = var.configure @@ -175,7 +211,7 @@ resource "null_resource" "bastion" { } } resource "null_resource" "cluster" { - depends_on = [null_resource.bastion, null_resource.backup, 
oci_core_cluster_network.cluster_network, oci_core_instance.bastion, oci_core_volume_attachment.bastion_volume_attachment ] + depends_on = [null_resource.bastion, null_resource.backup, oci_core_compute_cluster.compute_cluster, oci_core_cluster_network.cluster_network, oci_core_instance.bastion, oci_core_volume_attachment.bastion_volume_attachment ] triggers = { cluster_instances = join(", ", local.cluster_instances_names) } @@ -288,6 +324,7 @@ resource "null_resource" "cluster" { provisioner "file" { content = templatefile("${path.module}/queues.conf", { cluster_network = var.cluster_network, + compute_cluster = var.compute_cluster, marketplace_listing = var.use_old_marketplace_image ? var.old_marketplace_listing : var.marketplace_listing, image = local.image_ocid, use_marketplace_image = var.use_marketplace_image, @@ -444,3 +481,55 @@ provisioner "file" { } } } + +data "oci_objectstorage_namespace" "compartment_namespace" { + compartment_id = var.targetCompartment +} + +locals { + rdma_nic_metric_bucket_name = "RDMA_NIC_metrics" + par_path = ".." +} +/* +saving the PAR into file: ../PAR_file_for_metrics. +this PAR is used by the scripts to upload NIC metrics to object storage (i.e. script: upload_rdma_nic_metrics.sh) +*/ + +data "oci_objectstorage_bucket" "RDMA_NIC_Metrics_bucket_check" { + name = local.rdma_nic_metric_bucket_name + namespace = data.oci_objectstorage_namespace.compartment_namespace.namespace +} + + +resource "oci_objectstorage_bucket" "RDMA_NIC_metrics_bucket" { + count = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? 
1 : 0 + compartment_id = var.targetCompartment + name = local.rdma_nic_metric_bucket_name + namespace = data.oci_objectstorage_namespace.compartment_namespace.namespace + versioning = "Enabled" +} + +resource "oci_objectstorage_preauthrequest" "RDMA_NIC_metrics_par" { + count = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? 1 : 0 + depends_on = [oci_objectstorage_bucket.RDMA_NIC_metrics_bucket] + access_type = "AnyObjectWrite" + bucket = local.rdma_nic_metric_bucket_name + name = format("%s-%s", "RDMA_NIC_metrics_bucket", var.tenancy_ocid) + namespace = data.oci_objectstorage_namespace.compartment_namespace.namespace + time_expires = "2030-08-01T00:00:00+00:00" +} + + +output "RDMA_NIC_metrics_url" { + depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par] + value = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" : "" +} + + +resource "local_file" "PAR" { + count = (var.bastion_object_storage_par && data.oci_objectstorage_bucket.RDMA_NIC_Metrics_bucket_check.bucket_id == null) ? 1 : 0 + depends_on = [oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par] + content = "https://objectstorage.${var.region}.oraclecloud.com${oci_objectstorage_preauthrequest.RDMA_NIC_metrics_par[0].access_uri}" + filename = "${local.par_path}/PAR_file_for_metrics" + } + diff --git a/bin/bastion.sh b/bin/bastion.sh index e8cf6966..0cfc7d31 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -13,9 +13,14 @@ sudo cloud-init status --wait source /etc/os-release +vid=`echo $VERSION|awk -F. 
'{print $1}'` if [ $ID == "ol" ] ; then - repo="ol7_developer_EPEL" -elif [ $ID == "centos" ] ; then + if [ $vid == 7 ] ; then + repo="ol7_developer_EPEL" + elif [ $vid == 8 ] ; then + repo="ol8_developer_EPEL" + fi +elif [ $ID == "centos" ] ; then repo="epel" fi @@ -27,10 +32,27 @@ fi # Install ansible and other required packages if [ $ID == "ol" ] || [ $ID == "centos" ] ; then - sudo yum makecache --enablerepo=$repo - sudo yum install --enablerepo=$repo -y ansible python-netaddr + if [ $vid == 7 ]; then + sudo yum-config-manager --save --setopt=ol7_oci_included.skip_if_unavailable=true + sudo yum makecache --enablerepo=$repo + sudo yum install --enablerepo=$repo -y ansible python-netaddr + elif [ $vid == 8 ] ; then + sudo yum makecache --enablerepo=$repo + sudo yum install --enablerepo=$repo -y python38.x86_64 + sudo python3.8 -m pip install ansible cryptography netaddr + sudo mkdir /etc/ansible + sudo ln -s /usr/local/bin/ansible-playbook /bin/ansible-playbook + sudo ln -s /usr/local/bin/ansible /bin/ansible + sudo python3 -m pip install -U pip + sudo python3 -m pip install netaddr --upgrade + sudo python3 -m pip install setuptools_rust --upgrade + sudo python3 -m pip install requests --upgrade + sudo python3 -m pip install urllib3 --upgrade + fi sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo sudo yum install -y terraform + sudo python3 -m pip install oci-cli --upgrade + elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then # checking here as well to be sure that the lock file is not being held @@ -53,7 +75,6 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 93C4A3FD7BB9C367 fi - sudo sed -i 's/"1"/"0"/g' /etc/apt/apt.conf.d/20auto-upgrades sudo apt purge -y --auto-remove unattended-upgrades sudo systemctl disable apt-daily-upgrade.timer @@ -61,8 +82,6 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then sudo systemctl disable apt-daily.timer 
sudo systemctl mask apt-daily.service - - sleep 10s sudo apt-mark hold linux-oracle linux-headers-oracle linux-image-oracle @@ -128,7 +147,8 @@ fi ansible-galaxy collection install ansible.netcommon:=2.5.1 --force > /dev/null ansible-galaxy collection install community.general:=4.8.1 --force > /dev/null -ansible-galaxy collection install ansible.posix > /dev/null +ansible-galaxy collection install ansible.posix --force > /dev/null +ansible-galaxy collection install community.crypto --force > /dev/null threads=$(nproc) forks=$(($threads * 8)) @@ -140,4 +160,4 @@ sudo sed -i "s/^#bin_ansible_callbacks.*/bin_ansible_callbacks=True/" /etc/ansib sudo sed -i "s/^#stdout_callback.*/stdout_callback=yaml/" /etc/ansible/ansible.cfg sudo sed -i "s/^#retries.*/retries=5/" /etc/ansible/ansible.cfg sudo sed -i "s/^#connect_timeout.*/connect_timeout=300/" /etc/ansible/ansible.cfg -sudo sed -i "s/^#command_timeout.*/command_timeout=120/" /etc/ansible/ansible.cfg +sudo sed -i "s/^#command_timeout.*/command_timeout=120/" /etc/ansible/ansible.cfg \ No newline at end of file diff --git a/bin/create_cluster.sh b/bin/create_cluster.sh index 486dcdb8..2529a124 100755 --- a/bin/create_cluster.sh +++ b/bin/create_cluster.sh @@ -37,6 +37,7 @@ cp -r $autoscaling_folder/tf_init $autoscaling_folder/clusters/$2 cd $autoscaling_folder/clusters/$2 shape=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.shape " $queues_conf` cluster_network=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.cluster_network " $queues_conf` +compute_cluster=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.compute_cluster " $queues_conf` targetCompartment=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.targetCompartment " $queues_conf` ADNames=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == 
\"$3\") |.ad " $queues_conf` boot_volume_size=`yq eval ".queues.[] | select(.name == \"$4\") | .instance_types.[] | select(.name == \"$3\") |.boot_volume_size " $queues_conf` @@ -67,7 +68,7 @@ do echo $1 $3 $4 >> currently_building echo $3 $4 > cluster_options - sed "s~##NODES##~$1~g;s~##NAME##~$2~g;s~##SHAPE##~$shape~g;s~##CN##~$cluster_network~g;s~##QUEUE##~${4}~g;s~##COMP##~${targetCompartment}~g;s~##AD##~${ADName}~g;s~##BOOT##~${boot_volume_size}~g;s~##USEMP##~${use_marketplace_image}~g;s~##USEOLDMP##~${use_old_marketplace_image}~g;s~##IMAGE##~${image}~g;s~##OCPU##~${instance_pool_ocpus}~g;s~##MEM##~${instance_pool_memory}~g;s~##CUSTOM_MEM##~${instance_pool_custom_memory}~g;s~##MP_LIST##~${marketplace_listing}~g;s~##HT##~${hyperthreading}~g;s~##INST_TYPE##~$3~g;s~##TAGS##~$tags~g;s~##REGION##~${region}~g;s~##PRIVATE_SUBNET_ID##~${private_subnet_id}~g;s~##PRIVATE_SUBNET##~${private_subnet}~g" $conf_folder/variables.tf > variables.tf + sed "s~##NODES##~$1~g;s~##NAME##~$2~g;s~##SHAPE##~$shape~g;s~##CN##~$cluster_network~g;s~##QUEUE##~${4}~g;s~##COMP##~${targetCompartment}~g;s~##AD##~${ADName}~g;s~##BOOT##~${boot_volume_size}~g;s~##USEMP##~${use_marketplace_image}~g;s~##USEOLDMP##~${use_old_marketplace_image}~g;s~##IMAGE##~${image}~g;s~##OCPU##~${instance_pool_ocpus}~g;s~##MEM##~${instance_pool_memory}~g;s~##CUSTOM_MEM##~${instance_pool_custom_memory}~g;s~##MP_LIST##~${marketplace_listing}~g;s~##HT##~${hyperthreading}~g;s~##INST_TYPE##~$3~g;s~##TAGS##~$tags~g;s~##REGION##~${region}~g;s~##PRIVATE_SUBNET_ID##~${private_subnet_id}~g;s~##PRIVATE_SUBNET##~${private_subnet}~g;s~##CC##~$compute_cluster~g" $conf_folder/variables.tf > variables.tf echo "Started to build $2" start=`date -u +%s` diff --git a/bin/pcie_el.sh b/bin/pcie_el.sh deleted file mode 100644 index f15061ff..00000000 --- a/bin/pcie_el.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -for dev in `/usr/sbin/lspci | grep ConnectX-5 | awk '{print $1}'` -do - echo ${dev} - sudo lspci -vvv -s ${dev} | grep 
LnkSta: -done - diff --git a/bin/pcie_ubuntu.sh b/bin/pcie_ubuntu.sh deleted file mode 100644 index 95c5c456..00000000 --- a/bin/pcie_ubuntu.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -for dev in `/usr/bin/lspci | grep ConnectX-5 | awk '{print $1}'` -do - echo ${dev} - sudo lspci -vvv -s ${dev} | grep LnkSta: -done - diff --git a/bin/rdma_metrics_collection_config.conf b/bin/rdma_metrics_collection_config.conf new file mode 100644 index 00000000..1d99ca00 --- /dev/null +++ b/bin/rdma_metrics_collection_config.conf @@ -0,0 +1,3 @@ +hoursAgoFromNow=24 +metricsCollectionIntervalInMinute=5 +parFileName=/opt/oci-hpc/PAR_file_for_metrics \ No newline at end of file diff --git a/bin/resize.py b/bin/resize.py index acc7b43a..48008763 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -20,7 +20,9 @@ def get_metadata(): def wait_for_running_status(cluster_name,comp_ocid,cn_ocid,CN,expected_size=None): while True: - if CN: + if CN == "CC": + break + elif CN == "CN": state = computeManagementClient.get_cluster_network(cn_ocid).data.lifecycle_state instances=computeManagementClient.list_cluster_network_instances(comp_ocid,cn_ocid).data else: @@ -42,20 +44,34 @@ def wait_for_running_status(cluster_name,comp_ocid,cn_ocid,CN,expected_size=None def get_instances(comp_ocid,cn_ocid,CN): cn_instances=[] - if CN: - instance_summaries = oci.pagination.list_call_get_all_results(computeManagementClient.list_cluster_network_instances,comp_ocid,cn_ocid).data + if CN == "CC": + instances = computeClient.list_instances(comp_ocid,compute_cluster_id=cn_ocid).data + for instance in instances: + if instance.lifecycle_state == "TERMINATED": + continue + try: + for potential_vnic_attachment in oci.pagination.list_call_get_all_results(computeClient.list_vnic_attachments,compartment_id=comp_ocid,instance_id=instance.id).data: + if potential_vnic_attachment.display_name is None: + vnic_attachment = potential_vnic_attachment + vnic = virtualNetworkClient.get_vnic(vnic_attachment.vnic_id).data + 
except: + continue + cn_instances.append({'display_name':instance.display_name,'ip':vnic.private_ip,'ocid':instance.id}) else: - instance_summaries = oci.pagination.list_call_get_all_results(computeManagementClient.list_instance_pool_instances,comp_ocid,cn_ocid).data - for instance_summary in instance_summaries: - try: - instance=computeClient.get_instance(instance_summary.id).data - for potential_vnic_attachment in oci.pagination.list_call_get_all_results(computeClient.list_vnic_attachments,compartment_id=comp_ocid,instance_id=instance.id).data: - if potential_vnic_attachment.display_name is None: - vnic_attachment = potential_vnic_attachment - vnic = virtualNetworkClient.get_vnic(vnic_attachment.vnic_id).data - except: - continue - cn_instances.append({'display_name':instance_summary.display_name,'ip':vnic.private_ip,'ocid':instance_summary.id}) + if CN == "CN": + instance_summaries = oci.pagination.list_call_get_all_results(computeManagementClient.list_cluster_network_instances,comp_ocid,cn_ocid).data + else: + instance_summaries = oci.pagination.list_call_get_all_results(computeManagementClient.list_instance_pool_instances,comp_ocid,cn_ocid).data + for instance_summary in instance_summaries: + try: + instance=computeClient.get_instance(instance_summary.id).data + for potential_vnic_attachment in oci.pagination.list_call_get_all_results(computeClient.list_vnic_attachments,compartment_id=comp_ocid,instance_id=instance.id).data: + if potential_vnic_attachment.display_name is None: + vnic_attachment = potential_vnic_attachment + vnic = virtualNetworkClient.get_vnic(vnic_attachment.vnic_id).data + except: + continue + cn_instances.append({'display_name':instance_summary.display_name,'ip':vnic.private_ip,'ocid':instance_summary.id}) return cn_instances def parse_inventory(inventory): @@ -433,7 +449,7 @@ def getNFSnode(inventory): return dict['nfs'][0].split()[0] def get_summary(comp_ocid,cluster_name): - CN = True + CN = "CN" cn_summaries = 
computeManagementClient.list_cluster_networks(comp_ocid,display_name=cluster_name).data running_clusters = 0 scaling_clusters = 0 @@ -445,25 +461,35 @@ def get_summary(comp_ocid,cluster_name): elif cn_summary_tmp.lifecycle_state == "SCALING": scaling_clusters = scaling_clusters + 1 if running_clusters == 0: - cn_summaries = computeManagementClient.list_instance_pools(comp_ocid,display_name=cluster_name).data + cn_summaries = computeClient.list_compute_clusters(comp_ocid,display_name=cluster_name).data.items if len(cn_summaries) > 0: - CN = False + CN = "CC" for cn_summary_tmp in cn_summaries: - if cn_summary_tmp.lifecycle_state == "RUNNING": + if cn_summary_tmp.lifecycle_state == "ACTIVE" and cn_summary_tmp.display_name == cluster_name : cn_summary = cn_summary_tmp running_clusters = running_clusters + 1 - elif cn_summary_tmp.lifecycle_state == "SCALING": - scaling_clusters = scaling_clusters + 1 - if running_clusters == 0: - if scaling_clusters: - print("No running cluster was found but there is a cluster in SCALING mode, try rerunning in a moment") - else: - print("The cluster was not found") - return None,None,True + if running_clusters == 0: + cn_summaries = computeManagementClient.list_instance_pools(comp_ocid,display_name=cluster_name).data + if len(cn_summaries) > 0: + CN = "IP" + for cn_summary_tmp in cn_summaries: + if cn_summary_tmp.lifecycle_state == "RUNNING": + cn_summary = cn_summary_tmp + running_clusters = running_clusters + 1 + elif cn_summary_tmp.lifecycle_state == "SCALING": + scaling_clusters = scaling_clusters + 1 + if running_clusters == 0: + if scaling_clusters: + print("No running cluster was found but there is a cluster in SCALING mode, try rerunning in a moment") + else: + print("The cluster was not found") + return None,None,True if running_clusters > 1: print("There were multiple running clusters with this name, we selected the one with OCID:"+cn_summary.id) - if CN: + if CN == "CN": ip_summary=cn_summary.instance_pools[0] + elif CN == 
"CC": + ip_summary=None else: ip_summary=cn_summary return cn_summary,ip_summary,CN @@ -508,6 +534,28 @@ def updateTFState(inventory,cluster_name,size): except: return 0 +def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index): + + agent_config=instance.agent_config + agent_config.__class__ = oci.core.models.LaunchInstanceAgentConfigDetails + + for potential_vnic_attachment in oci.pagination.list_call_get_all_results(computeClient.list_vnic_attachments,compartment_id=comp_ocid,instance_id=instance.id).data: + if potential_vnic_attachment.display_name is None: + vnic_attachment = potential_vnic_attachment + splitted_name=instance.display_name.split('-') + create_vnic_details=oci.core.models.CreateVnicDetails(assign_public_ip=False,subnet_id=vnic_attachment.subnet_id) + + shape_config=instance.shape_config + try: + nvmes=shape_config.local_disks + launchInstanceShapeConfigDetails = oci.core.models.LaunchInstanceShapeConfigDetails(baseline_ocpu_utilization=shape_config.baseline_ocpu_utilization,memory_in_gbs=shape_config.memory_in_gbs,nvmes=nvmes,ocpus=shape_config.ocpus) + except: + launchInstanceShapeConfigDetails = oci.core.models.LaunchInstanceShapeConfigDetails(baseline_ocpu_utilization=shape_config.baseline_ocpu_utilization,memory_in_gbs=shape_config.memory_in_gbs,ocpus=shape_config.ocpus) + + splitted_name[-1]=str(max_previous_index+1+index) + new_display_name = '-'.join(splitted_name) + launch_instance_details=oci.core.models.LaunchInstanceDetails(agent_config=agent_config,availability_domain=instance.availability_domain, compartment_id=comp_ocid,compute_cluster_id=cn_ocid,shape=instance.shape,shape_config=launchInstanceShapeConfigDetails,source_details=instance.source_details,metadata=instance.metadata,display_name=new_display_name,freeform_tags=instance.freeform_tags,create_vnic_details=create_vnic_details) + return launch_instance_details batchsize=12 inventory="/etc/ansible/hosts" @@ -594,12 +642,14 @@ def 
updateTFState(inventory,cluster_name,size): if user_logging: config_oci = oci.config.from_file() computeClient = oci.core.ComputeClient(config_oci) + ComputeClientCompositeOperations = oci.core.ComputeClientCompositeOperations(computeClient) computeManagementClient = oci.core.ComputeManagementClient(config_oci) ComputeManagementClientCompositeOperations = oci.core.ComputeManagementClientCompositeOperations(computeManagementClient) virtualNetworkClient = oci.core.VirtualNetworkClient(config_oci) else: signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner() computeClient = oci.core.ComputeClient(config={}, signer=signer) + ComputeClientCompositeOperations= oci.core.ComputeClientCompositeOperations(computeClient) computeManagementClient = oci.core.ComputeManagementClient(config={}, signer=signer) ComputeManagementClientCompositeOperations = oci.core.ComputeManagementClientCompositeOperations(computeManagementClient) virtualNetworkClient = oci.core.VirtualNetworkClient(config={}, signer=signer) @@ -608,11 +658,13 @@ def updateTFState(inventory,cluster_name,size): if cn_summary is None: exit() cn_ocid =cn_summary.id -current_size=ip_summary.size -if CN: - ipa_ocid = cn_summary.instance_pools[0].id -else: - ipa_ocid = cn_ocid + +if CN != "CC": + current_size=ip_summary.size + if CN == "CN": + ipa_ocid = cn_summary.instance_pools[0].id + else: + ipa_ocid = cn_ocid if args.mode == 'list': state = cn_summary.lifecycle_state @@ -699,36 +751,74 @@ def updateTFState(inventory,cluster_name,size): exit(1) else: print("STDOUT: Force deleting the nodes") - while len(hostnames_to_remove) > 0: - terminated_instances=0 - if len(hostnames_to_remove) >batchsize: - batch = hostnames_to_remove[:batchsize] - else: - batch = hostnames_to_remove - cn_summary,ip_summary,CN = get_summary(comp_ocid,cluster_name) + terminated_instances=0 + cn_summary,ip_summary,CN = get_summary(comp_ocid,cluster_name) + if CN != "CC": current_size = ip_summary.size - for instanceName in batch: - try: 
- instance_id = computeClient.list_instances(comp_ocid,display_name=instanceName).data[0].id + for instanceName in hostnames_to_remove: + try: + instance_id = computeClient.list_instances(comp_ocid,display_name=instanceName).data[0].id + if CN == "CC": + ComputeClientCompositeOperations.terminate_instance_and_wait_for_state(instance_id,wait_for_states=["TERMINATING","TERMINATED"]) + else: instance_details = oci.core.models.DetachInstancePoolInstanceDetails(instance_id=instance_id,is_auto_terminate=True,is_decrement_size=True) - print("STDOUT: The instance "+instanceName+" is terminating") ComputeManagementClientCompositeOperations.detach_instance_pool_instance_and_wait_for_work_request(ipa_ocid,instance_details) - terminated_instances = terminated_instances + 1 - except: - print("The instance "+instanceName+" does not exist") - hostnames_to_remove=hostnames_to_remove[batchsize:] + terminated_instances = terminated_instances + 1 + print("STDOUT: The instance "+instanceName+" is terminating") + except: + print("The instance "+instanceName+" does not exist") cn_summary,ip_summary,CN = get_summary(comp_ocid,cluster_name) - newsize=ip_summary.size - updateTFState(inventory,cluster_name,newsize) + if CN == "CC": + instance_id = computeClient.list_instances(comp_ocid,display_name=hostnames_to_remove[-1]).data[0].id + for i in range(10): + try: + instance_state = computeClient.get_instance(instance_id).data.lifecycle_state + if instance_state == "TERMINATED": + break + else: + time.sleep(10) + except: + break + cn_instances = get_instances(comp_ocid,cn_ocid,CN) + newsize=len(cn_instances) + else: + newsize=ip_summary.size + updateTFState(inventory,cluster_name,newsize) print("STDOUT: Resized to "+str(newsize)+" instances") - if error_code != 0 and force: - print("STDOUT: The nodes were forced deleted, trying to reconfigure the left over nodes") - reconfigure(comp_ocid,cn_ocid,inventory,CN) +# if error_code != 0 and force: +# print("STDOUT: The nodes were forced deleted, 
if args.mode == 'add':
    if CN == "CC":
        # A compute cluster has no instance pool to grow: every new node is
        # launched individually, cloned from an existing node of the cluster.
        cn_instances = get_instances(comp_ocid,cn_ocid,CN)
        current_size = len(cn_instances)
        if not cn_instances:
            print("The resize script cannot add nodes to a compute cluster that has no node in it")
        else:
            # Highest numeric suffix over ALL existing nodes. The previous code
            # reset the running maximum on every iteration and therefore kept
            # only the suffix of the last listed node, which could produce
            # display names that already exist.
            max_index = max(
                (int(cn_instance['display_name'].split('-')[-1]) for cn_instance in cn_instances),
                default=-1,
            )
            # The first listed node serves as the template for the new ones.
            instance = computeClient.get_instance(cn_instances[0]['ocid']).data

            for i in range(args.number):
                launch_instance_details = getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_index,i)
                ComputeClientCompositeOperations.launch_instance_and_wait_for_state(launch_instance_details,wait_for_states=["RUNNING"])
    else:
        # Cluster network / instance pool: resize the pool and wait until it is RUNNING again.
        size = current_size - hostnames_to_remove_len + args.number
        update_size = oci.core.models.UpdateInstancePoolDetails(size=size)
        ComputeManagementClientCompositeOperations.update_instance_pool_and_wait_for_state(ipa_ocid,update_size,['RUNNING'],waiter_kwargs={'max_wait_seconds':3600})

    # Re-read the cluster to learn the size that was actually reached.
    cn_summary,ip_summary,CN = get_summary(comp_ocid,cluster_name)
    if CN == "CC":
        cn_instances = get_instances(comp_ocid,cn_ocid,CN)
        newsize = len(cn_instances)
    else:
        newsize = ip_summary.size
    updateTFState(inventory,cluster_name,newsize)
    if newsize == current_size:
        print("No node was added, please check the work requests of the Cluster Network and Instance Pool to see why")
        exit(1)
    if not no_reconfigure:
        add_reconfigure(comp_ocid,cn_ocid,inventory,CN)
#!/bin/bash
#
# Down-sample the RDMA NIC metrics stored in the local InfluxDB 'telegraf'
# database, export them as zipped CSV files and upload the archives to Object
# Storage through the PAR URL stored in the file named by parFileName.
#
# Defaults for the time window and the aggregation interval come from
# rdma_metrics_collection_config.conf (next to this script) and can be
# overridden with -l and -i.

scripts=$(realpath "$0")
folder=$(dirname "$scripts")

source "${folder}/rdma_metrics_collection_config.conf"
hours="$hoursAgoFromNow"
interval="$metricsCollectionIntervalInMinute"
par_filename="$parFileName"

# Print the usage message.
dis_help()
{
   echo
   echo "Usage:"
   echo
   echo "./upload_rdma_nic_metrics.sh -l <hours> -i <minutes>"
   echo
   echo "Options:"
   echo "l     Hours Ago From Now (optional)"
   echo "i     Metrics Collection Interval In Minute (optional)"
   echo "h     Print this help."
   echo
   echo "RDMA metrics are uploaded to Object Storage using PAR"
   echo
   echo "e.g.,  sh ./upload_rdma_nic_metrics.sh -l 24 -i 5 "
   echo
   echo "Supported releases: 2.10.3+"
   echo
}

# Down-sample measurement $1 into measurement $2 over the last ${hours} hours,
# grouped by ${interval} minutes, and print the count the script uses to decide
# whether there is anything to export. Anything that is not a plain
# non-negative integer (e.g. "null" printed by jq when the query returned no
# series) is reported as 0.
backup_rows()
{
  local query rows
  query="SELECT MEAN(*) INTO $2 FROM $1 WHERE time < now() AND time > now() - ${hours}h GROUP BY time(${interval}m)"
  rows=$(influx -database 'telegraf' -execute "${query}" -format json | jq '.results[0].series[0].values[0][1]')
  case "$rows" in
    ''|*[!0-9]*) rows=0 ;;
  esac
  echo "$rows"
}

# Export measurement $1 to $2.csv, zip it into $2.zip and remove the CSV.
# Must be called from ${folder}. Returns 1 after printing an error when either
# file could not be created.
export_to_zip()
{
  local filename_csv="$2.csv"
  local filename_zip="$2.zip"
  local query="SELECT * FROM $1"

  influx -database 'telegraf' -execute "${query}" -format csv > "${filename_csv}"
  if [ ! -f "${folder}/${filename_csv}" ]
  then
    echo "ERROR:${folder}/${filename_csv} was not created."
    return 1
  fi

  zip "${filename_zip}" "${filename_csv}"
  rm "${filename_csv}"
  if [ ! -f "${folder}/${filename_zip}" ]
  then
    echo "ERROR:${folder}/${filename_zip} was not created."
    return 1
  fi
}

# Upload file $1 to "${par}$1". Returns curl's exit status; --fail turns HTTP
# error responses into a non-zero status.
upload_zip()
{
  curl --fail -X PUT --data-binary "@$1" "${par}$1"
}

# Remove the temporary down-sampled measurement $1.
drop_backup()
{
  influx -database 'telegraf' -execute "DELETE FROM $1"
}

# Options are parsed before the PAR checks so that -h works on an
# unconfigured installation.
while getopts "l:i:h" option
do
  case $option in
    l) hours=${OPTARG};;
    i) interval=${OPTARG};;
    h) dis_help
       exit 0;;
    \?) # Invalid option
       echo "Error: Invalid option"
       exit 1;;
  esac
done

if [ -z "$par_filename" ]
then
  echo "Please create a PAR and save into a file. Then, in config file, set the path of PAR-file to parFileName"
  exit 1
fi

if [ ! -f "${par_filename}" ]
then
  echo "PAR file:${par_filename} does not exist. Please create PAR file and update the config file"
  exit 1
fi

# Read the PAR once, before changing directory and before any device can be
# skipped, so that it is always set for every upload below.
par=$(cat "${par_filename}")

monitoring_folder=$folder/../monitoring

if [ -f "$monitoring_folder/activated" ]
then
  # The existence checks look for the generated files under ${folder}, so
  # create them there regardless of where the script was started from.
  cd "${folder}" || exit 1

  timestamp=$(date +%s)
  for i in {0..16}
  do
    measurementname="infiniband_mlx5_${i}_hw_counters"
    measurementnameBackup="infiniband_mlx5_${i}_hw_counters_backup"
    echo "Checking device mlx5_${i} for RDMA HW metrics...."
    rows=$(backup_rows "${measurementname}" "${measurementnameBackup}")

    if [ "$rows" -eq 0 ]; then
      echo "Device mlx5_${i} does not have metrics to collect"
      echo "......................................................"
      continue
    fi

    filename="infiniband_mlx5_${i}_${timestamp}"
    filename_zip="${filename}.zip"

    echo "Collecting RDMA HW metrics of device mlx5_${i}...."
    if ! export_to_zip "${measurementnameBackup}" "${filename}"
    then
      continue
    fi

    echo "Uploading RDMA HW Metrics to Object Storage for device mlx5_${i}"
    if upload_zip "${filename_zip}"
    then
      echo "Uploaded RDMA HW metrics to Object Storage for device mlx5_${i}"
      echo "Object storage URL for device mlx5_${i}: ${par}${filename_zip}"
    else
      echo "ERROR:${filename_zip} could not be uploaded to Object Storage"
    fi

    drop_backup "${measurementnameBackup}"
    echo "......................................................"
  done

  measurementname="infiniband"
  measurementnameBackup="infiniband_backup"

  echo "Checking for Infiniband counter metrics...."
  rows=$(backup_rows "${measurementname}" "${measurementnameBackup}")

  if [ "$rows" -eq 0 ]; then
    echo "It does not have Infiniband counter metrics to collect"
  else
    filename="infiniband_${timestamp}"
    filename_zip="${filename}.zip"

    echo "Collecting Infiniband counter metrics...."
    # This section is not inside a loop, so failures are handled with a plain
    # conditional instead of 'continue'.
    if export_to_zip "${measurementnameBackup}" "${filename}"
    then
      echo "Uploading Infiniband counter metrics to Object Storage"
      if upload_zip "${filename_zip}"
      then
        echo "Uploaded Infiniband counter metrics to Object Storage"
        echo "Object storage URL for Infiniband counter metrics: ${par}${filename_zip}"
      else
        echo "ERROR:${filename_zip} could not be uploaded to Object Storage"
      fi

      drop_backup "${measurementnameBackup}"
    fi
  fi

fi
1 : 0 depends_on = [oci_core_app_catalog_subscription.mp_image_subscription] compartment_id = var.targetCompartment display_name = local.cluster_name diff --git a/cluster-network.tf b/cluster-network.tf index 98689d50..859daf41 100755 --- a/cluster-network.tf +++ b/cluster-network.tf @@ -1,5 +1,5 @@ resource "oci_core_volume" "nfs-cluster-network-volume" { - count = var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 + count = ( ! var.compute_cluster ) && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 availability_domain = var.ad compartment_id = var.targetCompartment display_name = "${local.cluster_name}-nfs-volume" @@ -9,7 +9,7 @@ resource "oci_core_volume" "nfs-cluster-network-volume" { } resource "oci_core_volume_attachment" "cluster_network_volume_attachment" { - count = var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 + count = ( ! var.compute_cluster ) && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 attachment_type = "iscsi" volume_id = oci_core_volume.nfs-cluster-network-volume[0].id instance_id = local.cluster_instances_ids[0] @@ -18,7 +18,7 @@ resource "oci_core_volume_attachment" "cluster_network_volume_attachment" { } resource "oci_core_cluster_network" "cluster_network" { - count = var.cluster_network && var.node_count > 0 ? 1 : 0 + count = ( ! var.compute_cluster ) && var.cluster_network && var.node_count > 0 ? 1 : 0 depends_on = [oci_core_app_catalog_subscription.mp_image_subscription, oci_core_subnet.private-subnet, oci_core_subnet.public-subnet, oci_core_instance.bastion] compartment_id = var.targetCompartment instance_pools { diff --git a/compute-cluster.tf b/compute-cluster.tf new file mode 100755 index 00000000..ef9067b8 --- /dev/null +++ b/compute-cluster.tf @@ -0,0 +1,13 @@ +resource "oci_core_compute_cluster" "compute_cluster" { + count = var.compute_cluster && var.cluster_network && var.node_count > 0 ? 
1 : 0 + #Required + availability_domain = var.ad + compartment_id = var.targetCompartment + + #Optional + display_name = local.cluster_name + freeform_tags = { + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name + } +} \ No newline at end of file diff --git a/compute-nodes.tf b/compute-nodes.tf new file mode 100755 index 00000000..c7e21c99 --- /dev/null +++ b/compute-nodes.tf @@ -0,0 +1,52 @@ +resource "oci_core_volume" "nfs-compute-cluster-volume" { + count = var.compute_cluster && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 + availability_domain = var.ad + compartment_id = var.targetCompartment + display_name = "${local.cluster_name}-nfs-volume" + + size_in_gbs = var.cluster_block_volume_size + vpus_per_gb = split(".", var.cluster_block_volume_performance)[0] +} + +resource "oci_core_volume_attachment" "compute_cluster_volume_attachment" { + count = var.compute_cluster && var.scratch_nfs_type_cluster == "block" && var.node_count > 0 ? 1 : 0 + attachment_type = "iscsi" + volume_id = oci_core_volume.nfs-compute-cluster-volume[0].id + instance_id = oci_core_instance.compute_cluster_instances[0].id + display_name = "${local.cluster_name}-compute-cluster-volume-attachment" + device = "/dev/oracleoci/oraclevdb" +} + +resource "oci_core_instance" "compute_cluster_instances" { + count = var.compute_cluster ? 
var.node_count : 0 + depends_on = [oci_core_compute_cluster.compute_cluster] + availability_domain = var.ad + compartment_id = var.targetCompartment + shape = var.cluster_network_shape + + agent_config { + is_management_disabled = true + } + + display_name = "${local.cluster_name}-node-${var.compute_cluster_start_index+count.index}" + + freeform_tags = { + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name + } + + metadata = { + ssh_authorized_keys = "${var.ssh_key}\n${tls_private_key.ssh.public_key_openssh}" + user_data = base64encode(data.template_file.bastion_config.rendered) + } + source_details { + source_id = local.cluster_network_image + source_type = "image" + boot_volume_size_in_gbs = var.boot_volume_size + } + compute_cluster_id=length(var.compute_cluster_id) > 2 ? var.compute_cluster_id : oci_core_compute_cluster.compute_cluster[0].id + create_vnic_details { + subnet_id = local.subnet_id + assign_public_ip = false + } +} \ No newline at end of file diff --git a/conf/queues.conf.example b/conf/queues.conf.example index 5013e935..d75dc088 100644 --- a/conf/queues.conf.example +++ b/conf/queues.conf.example @@ -9,6 +9,7 @@ instance_keyword: hpc permanent: False cluster_network: true + compute_cluster: true max_number_nodes: 100 max_cluster_size: 20 max_cluster_count: 10 @@ -32,6 +33,7 @@ instance_keyword: permanent permanent: true cluster_network: true + compute_cluster: true max_number_nodes: 100 max_cluster_size: 50 max_cluster_count: 10 diff --git a/conf/variables.tpl b/conf/variables.tpl index 1072dbaa..71ffd5cb 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -6,6 +6,10 @@ variable "cluster_network" { default = ##CN##} variable "use_custom_name" {default = true} variable "cluster_name" {default = "##NAME##" } variable "cluster_network_shape" { default = "##SHAPE##" } +variable "compute_cluster" { default = ##CC## } +variable "compute_cluster_exists" { default = false } +variable "compute_cluster_id" { default = "" } 
+variable "compute_cluster_start_index" { default = 0 } variable "instance_pool_shape" { default = "##SHAPE##" } variable "instance_type" {default = "##INST_TYPE##" } variable "node_count" { default="##NODES##" } diff --git a/data.tf b/data.tf index c90b6ced..39195858 100755 --- a/data.tf +++ b/data.tf @@ -15,7 +15,7 @@ data "oci_core_services" "services" { } } data "oci_core_cluster_network_instances" "cluster_network_instances" { - count = var.cluster_network && var.node_count > 0 ? 1 : 0 + count = (! var.compute_cluster) && var.cluster_network && var.node_count > 0 ? 1 : 0 cluster_network_id = oci_core_cluster_network.cluster_network[0].id compartment_id = var.targetCompartment } @@ -27,7 +27,7 @@ data "oci_core_instance_pool_instances" "instance_pool_instances" { } data "oci_core_instance" "cluster_network_instances" { - count = var.cluster_network && var.node_count > 0 ? var.node_count : 0 + count = (! var.compute_cluster) && var.cluster_network && var.node_count > 0 ? var.node_count : 0 instance_id = data.oci_core_cluster_network_instances.cluster_network_instances[0].instances[count.index]["id"] } diff --git a/inventory.tpl b/inventory.tpl index f60a0aef..735e41b3 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -1,5 +1,5 @@ [bastion] -${bastion_name} ansible_host=${bastion_ip} ansible_user=${bastion_username} role=bastion +${bastion_name} ansible_host=${bastion_ip} ansible_user=${bastion_username} role=bastion ansible_python_interpreter=/usr/bin/python [slurm_backup] %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${compute_username} role=bastion%{ endif } [login] @@ -73,5 +73,4 @@ tenancy_ocid = ${tenancy_ocid} inst_prin = ${inst_prin} api_fingerprint = ${api_fingerprint} api_user_ocid = ${api_user_ocid} -sacct_limits=${sacct_limits} - +sacct_limits=${sacct_limits} \ No newline at end of file diff --git a/locals.tf b/locals.tf index 103b2d11..38bac32c 100755 --- a/locals.tf +++ b/locals.tf @@ -1,7 +1,7 @@ locals { // display 
names of instances - cluster_instances_ids = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id - cluster_instances_names = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.display_name : data.oci_core_instance.instance_pool_instances.*.display_name + cluster_instances_ids = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.id : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id + cluster_instances_names = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.display_name : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.display_name : data.oci_core_instance.instance_pool_instances.*.display_name image_ocid = var.unsupported ? var.image_ocid : var.image custom_bastion_image_ocid = var.unsupported_bastion ? var.unsupported_bastion_image : var.custom_bastion_image @@ -12,7 +12,7 @@ locals { bastion_ocpus = var.bastion_shape == "VM.DenseIO.E4.Flex" ? var.bastion_ocpus_denseIO_flex : var.bastion_ocpus login_ocpus = var.login_shape == "VM.DenseIO.E4.Flex" ? var.login_ocpus_denseIO_flex : var.login_ocpus // ips of the instances - cluster_instances_ips = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip + cluster_instances_ips = var.compute_cluster ? oci_core_instance.compute_cluster_instances.*.private_ip : var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip // vcn id derived either from created vcn or existing if specified vcn_id = var.use_existing_vcn ? var.vcn_id : element(concat(oci_core_vcn.vcn.*.id, [""]), 0) @@ -54,11 +54,11 @@ locals { // Cluster OCID - cluster_ocid = var.node_count > 0 ? var.cluster_network ? 
oci_core_cluster_network.cluster_network[0].id : oci_core_instance_pool.instance_pool[0].id : "" + cluster_ocid = var.node_count > 0 ? var.compute_cluster ? oci_core_compute_cluster.compute_cluster[0].id : var.cluster_network ? oci_core_cluster_network.cluster_network[0].id : oci_core_instance_pool.instance_pool[0].id : "" host = var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip[0].ip_address : oci_core_instance.bastion.public_ip bastion_bool_ip = var.private_deployment ? false : true login_bool_ip = var.private_deployment ? false : true - bastion_subnet = var.private_deployment ? oci_core_subnet.private-subnet : oci_core_subnet.private-subnet + bastion_subnet = var.private_deployment ? oci_core_subnet.private-subnet : oci_core_subnet.public-subnet private_subnet_cidr = var.private_deployment ? [var.public_subnet, var.private_subnet] : [var.private_subnet] host_backup = var.slurm_ha ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_backup[0].ip_address : oci_core_instance.backup[0].public_ip : "none" host_login = var.login_node ? var.private_deployment ? 
data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_login[0].ip_address : oci_core_instance.login[0].public_ip : "none" diff --git a/playbooks/group_vars/all.yml b/playbooks/group_vars/all.yml index de0076a9..67b1d24d 100644 --- a/playbooks/group_vars/all.yml +++ b/playbooks/group_vars/all.yml @@ -1,3 +1,3 @@ ssl_cert_path: '/etc/ssl/certs' ssl_ca_cert: '{{ ssl_cert_path }}/cluster-ca.crt' -ssl_cert_group: ssl +ssl_cert_group: ssl \ No newline at end of file diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index 4ddab322..83e57b9c 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -89,7 +89,7 @@ local_path: "{{ cluster_nfs_path }}" export_host: "{{ hostvars[groups['bastion'][0]]['ansible_default_ipv4']['address'] }}" export_path: "/export/cluster" - options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,cto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" + options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,nocto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" lock: "all" when: cluster_nfs|default(true)|bool - include_role: @@ -98,7 +98,7 @@ local_path: "{{ scratch_nfs_path }}" export_host: "{{ hostvars[groups['nfs'][0]]['ansible_default_ipv4']['address'] }}" export_path: "/mnt/localdisk/nfs" - options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,cto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" + options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,nocto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" lock: "none" when: scratch_nfs|default(true)|bool @@ -133,7 +133,7 @@ local_path: "/home" export_host: "{{ hostvars[groups['bastion'][0]]['ansible_default_ipv4']['address'] }}" export_path: "/home" - options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,cto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock 
}},mountproto=tcp,sec=sys,_netdev" + options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,nocto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" lock: "all" when: home_nfs|bool and (not home_fss|bool) - include_role: diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 2f9149fb..5703a1cf 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -88,7 +88,7 @@ local_path: "{{ cluster_nfs_path }}" export_host: "{{ hostvars[groups['bastion'][0]]['ansible_default_ipv4']['address'] }}" export_path: "/export/cluster" - options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,cto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" + options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,nocto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" lock: "all" when: cluster_nfs|default(true)|bool - include_role: @@ -97,7 +97,7 @@ local_path: "{{ scratch_nfs_path }}" export_host: "{{ hostvars[groups['nfs'][0]]['ansible_default_ipv4']['address'] }}" export_path: "/mnt/localdisk/nfs" - options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,cto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" + options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,nocto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" lock: "none" when: scratch_nfs|default(true)|bool @@ -135,7 +135,7 @@ local_path: "/home" export_host: "{{ hostvars[groups['bastion'][0]]['ansible_default_ipv4']['address'] }}" export_path: "/home" - options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,cto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" + options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,nocto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" lock: "all" when: home_nfs|bool and (not home_fss|bool) - 
include_role: diff --git a/playbooks/roles/cluster-cli/tasks/el.yml b/playbooks/roles/cluster-cli/tasks/el7.yml similarity index 94% rename from playbooks/roles/cluster-cli/tasks/el.yml rename to playbooks/roles/cluster-cli/tasks/el7.yml index 598b263c..68fd77fd 100755 --- a/playbooks/roles/cluster-cli/tasks/el.yml +++ b/playbooks/roles/cluster-cli/tasks/el7.yml @@ -15,4 +15,4 @@ dest: /usr/bin/ owner: root group: root - mode: '0755' + mode: '0755' \ No newline at end of file diff --git a/playbooks/roles/cluster-cli/tasks/el8.yml b/playbooks/roles/cluster-cli/tasks/el8.yml new file mode 100755 index 00000000..783e1c92 --- /dev/null +++ b/playbooks/roles/cluster-cli/tasks/el8.yml @@ -0,0 +1,18 @@ +--- + +- name: install required packages + vars: + package_name: + - python3-click + - python3-ldap + package_repo: "ol8_developer_EPEL,ol8_appstream" + include_role: + name: safe_yum + +- name: copy cluster cli + copy: + src: cluster + dest: /usr/bin/ + owner: root + group: root + mode: '0755' diff --git a/playbooks/roles/cluster-cli/tasks/main.yml b/playbooks/roles/cluster-cli/tasks/main.yml index 150216f8..0ef20964 100755 --- a/playbooks/roles/cluster-cli/tasks/main.yml +++ b/playbooks/roles/cluster-cli/tasks/main.yml @@ -1,5 +1,8 @@ -- include: el.yml - when: ansible_os_family == 'RedHat' +- include: el7.yml + when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' + +- include: el8.yml + when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '8' - include: debian.yml when: ansible_distribution == 'Ubuntu' \ No newline at end of file diff --git a/playbooks/roles/docker/tasks/main.yml b/playbooks/roles/docker/tasks/main.yml index 64c69959..62c22a6b 100644 --- a/playbooks/roles/docker/tasks/main.yml +++ b/playbooks/roles/docker/tasks/main.yml @@ -1,7 +1,7 @@ --- -- include: oraclelinux-7.yml - when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' and ansible_distribution_major_version == '7' +- 
include: oraclelinux.yml + when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' #- include: centos-7.yml # when: ansible_os_family == 'RedHat' and ansible_distribution == 'CentOS' and ansible_distribution_major_version == '7' diff --git a/playbooks/roles/docker/tasks/oraclelinux-7.yml b/playbooks/roles/docker/tasks/oraclelinux.yml similarity index 96% rename from playbooks/roles/docker/tasks/oraclelinux-7.yml rename to playbooks/roles/docker/tasks/oraclelinux.yml index 302c1b35..65133d5a 100644 --- a/playbooks/roles/docker/tasks/oraclelinux-7.yml +++ b/playbooks/roles/docker/tasks/oraclelinux.yml @@ -17,6 +17,7 @@ baseurl: http://mirror.centos.org/centos/{{ ansible_distribution_major_version }}/extras/x86_64 enabled: 1 gpgcheck: 0 + when: ansible_distribution_major_version == '7' - name: Add docker-ce repository become: true @@ -50,7 +51,6 @@ name: containerd enabled: yes - - name: create docker group group: name: docker @@ -60,5 +60,3 @@ name: opc groups: docker append: yes - - diff --git a/playbooks/roles/grafana/files/cluster.json b/playbooks/roles/grafana/files/cluster.json index 022e955f..66a38132 100755 --- a/playbooks/roles/grafana/files/cluster.json +++ b/playbooks/roles/grafana/files/cluster.json @@ -3,1590 +3,1704 @@ "list": [ { "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] }, "description": "HPC Cluster dashboard", "editable": true, + "fiscalYearStartMonth": 0, "gnetId": 928, "graphTooltip": 1, "id": null, - "iteration": 1613083475434, + "iteration": 1693307124814, "links": [], + "liveNow": false, "panels": [ { - "collapsed": false, - "datasource": null, + "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, - "id": 62045, - "panels": [], 
- "repeat": null, - "title": "CPU", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 1 - }, - "height": "300", - "hiddenSeries": false, - "id": 28239, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": true, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.4.1", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ + "id": 64974, + "panels": [ { - "alias": "$tag_host: $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "displayMode": "color-text", + "filterable": false, + "inspect": false + }, + "links": [ + { + "title": "Details", + "url": "/d/00000012722/cluster-dashboard-3?orgId=1&var-datasource=InfluxDB&var-inter=10s&var-ncores=All&var-server=${__data.fields.Node}&var-mountpoint=All&var-cpu=All&var-disk=All&var-netif=All&var-gpu=All&from=1692806920934&to=1692817720934&viewPanel=62657 " + } ], - "type": "time" + "mappings": [], + "noValue": "No GPU Detected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { 
+ "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = 'cpu-total' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ + "overrides": [ { - "params": [ - "value" - ], - "type": "field" + "matcher": { + "id": "byName", + "options": "GPU" + }, + "properties": [ + { + "id": "custom.width" + } + ] }, { - "params": [], - "type": "mean" + "matcher": { + "id": "byName", + "options": "Node" + }, + "properties": [ + { + "id": "custom.width", + "value": 306 + } + ] } ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "grid": {}, - "gridPos": { - 
"h": 9, - "w": 12, - "x": 0, - "y": 9 - }, - "height": "350", - "hiddenSeries": false, - "id": 54694, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.4.1", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 65247, + "options": { + "footer": { + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" ], - "type": "time" + "show": false }, + "frameIndex": 0, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "8.5.21", + "targets": [ { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "system_load1", - "orderByTime": "ASC", - "policy": "default", - "query": "SELECT mean(load1) as short,mean(load5) as medium,mean(load15) as long FROM \"system\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), * ORDER BY asc", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" + "alias": "", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Load averages", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": 
"time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 9 - }, - "height": "350", - "hiddenSeries": false, - "id": 61852, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.4.1", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $col", - "dsType": "influxdb", - "groupBy": [ + "groupBy": [], + "hide": false, + "measurement": "nvidia_smi", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT uuid,host,max(\"temperature_gpu\") FROM \"nvidia_smi\" WHERE host =~/$server/ AND $timeFilter", + "rawQuery": true, + "refId": "A", + "resultFormat": "table", + "select": [ + [ + { + "params": [ + "temperature_gpu" + ], + "type": "field" + }, + { + "params": [], + "type": "max" + } + ] + ] + }, { - "params": [ - "$interval" - ], - "type": "time" + "alias": "", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "groupBy": [], + "hide": false, + "measurement": "nvidia_smi", + "orderByTime": 
"ASC", + "policy": "default", + "query": "SELECT uuid,host,min(\"temperature_gpu\") FROM \"nvidia_smi\" WHERE host =~/$server/ AND $timeFilter;", + "rawQuery": true, + "refId": "B", + "resultFormat": "table", + "select": [ + [ + { + "params": [ + "temperature_gpu" + ], + "type": "field" + }, + { + "params": [], + "type": "max" + } + ] + ] + } + ], + "title": "Temperature: Highest/Lowest", + "transformations": [ + { + "id": "merge", + "options": {} }, { - "params": [ - "null" - ], - "type": "fill" + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "indexByName": {}, + "renameByName": { + "host": "Node", + "max": "Highest", + "min": "Lowest", + "uuid": "GPU" + } + } } ], - "measurement": "processes", - "policy": "default", - "query": "SELECT mean(running) as running, mean(blocked) as blocked, mean(sleeping) as sleeping, mean(stopped) as stopped, mean(zombies) as zombies, mean(paging) as paging, mean(unknown) as unknown FROM \"processes\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ + "type": "table" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "color-text", + "filterable": false, + "inspect": false + }, + "links": [ + { + "title": "utilization", + "url": "/d/00000012722/cluster-dashboard-3?orgId=1&var-datasource=InfluxDB&var-inter=10s&var-ncores=All&var-server=${__data.fields.Node}&var-mountpoint=All&var-cpu=All&var-disk=All&var-netif=All&var-gpu=GPU-04865380-bc86-dfbd-4d46-5d550037acd4&from=1692806920934&to=1692817720934&viewPanel=62521" + } + ], + "mappings": [], + "noValue": "No GPU Detected", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": 
"percent" + }, + "overrides": [ { - "params": [ - "blocked" - ], - "type": "field" + "matcher": { + "id": "byName", + "options": "GPU" + }, + "properties": [ + { + "id": "custom.width", + "value": 195 + } + ] }, { - "params": [], - "type": "mean" + "matcher": { + "id": "byName", + "options": "Node" + }, + "properties": [ + { + "id": "custom.width", + "value": 248 + } + ] } ] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 65383, + "links": [], + "options": { + "footer": { + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 2, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "alias": "", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "groupBy": [], + "hide": false, + "measurement": "nvidia_smi", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT uuid,host,max(\"utilization_gpu\") FROM \"nvidia_smi\" WHERE host =~/$server/ AND $timeFilter;", + "rawQuery": true, + "refId": "A", + "resultFormat": "table", + "select": [ + [ + { + "params": [ + "temperature_gpu" + ], + "type": "field" + }, + { + "params": [], + "type": "max" + } + ] + ] + }, + { + "alias": "", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "groupBy": [], + "hide": false, + "measurement": "nvidia_smi", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT uuid,host,min(\"utilization_gpu\") FROM \"nvidia_smi\" WHERE host =~/$server/ AND $timeFilter;", + "rawQuery": true, + "refId": "B", + "resultFormat": "table", + "select": [ + [ + { + "params": [ + "temperature_gpu" + ], + "type": "field" + }, + { + "params": [], + "type": "max" + } + ] + ] + } ], - "tags": [ + "title": "Utilization: Highest/Lowest", + "transformations": [ { - "key": "host", - "operator": "=~", - "value": "/^$server$/" + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + 
"indexByName": {}, + "renameByName": { + "host": "Node", + "max": "Highest", + "min": "Lowest", + "uuid": "GPU" + } + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "(.*)-:*", + "renamePattern": "" + } } - ] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Processes", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + ], + "type": "table" }, { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "collapsed": false, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 18 - }, - "id": 62046, - "panels": [], - "repeat": null, - "title": "Memory", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 0, + "y": 5 + }, + "id": 64291, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", 
+ "uid": "${datasource}" + }, + "query": "SHOW TAG VALUES CARDINALITY FROM nvidia_smi with KEY=\"uuid\" where uuid =~ /$gpu/ ", + "rawQuery": true, + "refId": "A", + "resultFormat": "table" + } + ], + "title": "# GPU", + "type": "stat" }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 10, - "w": 24, - "x": 0, - "y": 19 - }, - "height": "400", - "hiddenSeries": false, - "id": 12054, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.4.1", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/total/", - "color": "#BF1B00", - "fill": 0, - "linewidth": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ { - "alias": "$tag_host: $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "displayMode": "color-text", + "filterable": true, + "inspect": false + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 2, + "y": 5 + }, + "id": 63202, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" ], - "type": "time" + "show": false }, + "showHeader": false + }, + 
"pluginVersion": "8.5.21", + "targets": [ { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SHOW TAG VALUES FROM nvidia_smi with KEY=\"name\" where uuid =~ /$gpu/ ", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" } ], - "measurement": "mem_inactive", - "policy": "default", - "query": "SELECT mean(total) as total, mean(used) as used, mean(cached) as cached, mean(free) as free, mean(buffered) as buffered FROM \"mem\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" + "title": "GPU", + "type": "table" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "displayMode": "color-text", + "filterable": true, + "inspect": false }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ { - "params": [], - "type": "mean" + "matcher": { + "id": "byType", + "options": "time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] } ] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 6, + "y": 5 + }, + "id": 63610, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": false + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SHOW TAG VALUES FROM nvidia_smi with KEY=\"pstate\" WHERE (\"uuid\" =~ /^$gpu$/) ", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": 
"Memory usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "max": null, - "min": 0, - "show": true + "title": "P-State", + "type": "table" }, { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 29 - }, - "id": 62047, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "fieldConfig": { "defaults": { - "custom": {}, - "links": [] + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + } }, "overrides": [] }, - "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 8 - }, - "height": "", - "hiddenSeries": false, - "id": 61855, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, - "rightSide": false, - "show": true, - "total": false, - "values": true + "h": 4, + "w": 3, + "x": 9, + "y": 5 }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pluginVersion": "7.1.5", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "id": 65110, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + 
"values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "8.5.21", "targets": [ { - "alias": "$tag_host: $col", - "dsType": "influxdb", - "groupBy": [ - { - "params": [ - "$interval" - ], - "type": "time" - }, - { - "params": [ - "null" - ], - "type": "fill" - } - ], - "measurement": "processes", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(context_switches),1s)as \"context switches\" FROM \"kernel\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"utilization_gpu\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "GPU Utilization %", + "type": "gauge" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ { - "params": [ - "blocked" - ], - "type": "field" + "color": "green", + "value": null }, { - "params": [], - "type": "mean" + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 } ] - ], - "tags": [ - { - "key": "host", - "operator": "=~", - "value": "/^$server$/" - } - ] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Context switches", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" + } + }, + "overrides": [] }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 5 }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, 
- "max": null, - "min": null, - "show": true + "id": 63747, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "8.5.21", + "targets": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"utilization_memory\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" } ], - "yaxis": { - "align": false, - "alignLevel": null - } + "title": "Memory Utilization %", + "type": "gauge" }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "fieldConfig": { "defaults": { - "custom": {}, - "links": [] + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 85 + }, + { + "color": "red", + "value": 95 + } + ] + } }, "overrides": [] }, - "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 7, - "w": 8, - "x": 8, - "y": 8 + "h": 4, + "w": 3, + "x": 15, + "y": 5 }, - "height": "", - "hiddenSeries": false, - "id": 61960, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, - "rightSide": false, - "show": true, - "total": false, - "values": true + "id": 64019, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" }, - "lines": 
true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pluginVersion": "7.1.5", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "pluginVersion": "8.5.21", "targets": [ { - "alias": "$tag_host: $col", - "dsType": "influxdb", - "groupBy": [ - { - "params": [ - "$interval" - ], - "type": "time" - }, - { - "params": [ - "null" - ], - "type": "fill" - } - ], - "measurement": "kernel", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(processes_forked),1s) as forks FROM \"kernel\" WHERE \"host\" =~ /^$server$/ AND $timeFilter GROUP BY time($interval), host fill(null)", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"temperature_gpu\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "processes_forked" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [ - { - "key": "host", - "operator": "=~", - "value": "/^$server$/" - } - ] + "refId": "A", + "resultFormat": "time_series" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Forks", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" + "title": "Temperature", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] }, - "yaxes": [ - { - 
"format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "gridPos": { + "h": 4, + "w": 3, + "x": 18, + "y": 5 + }, + "id": 63883, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"fan_speed\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" } ], - "yaxis": { - "align": false, - "alignLevel": null - } + "title": "Fan Speed", + "type": "stat" }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "fieldConfig": { "defaults": { - "custom": {}, - "links": [] + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + } }, "overrides": [] }, - "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 7, - "w": 8, - "x": 16, - "y": 8 + "h": 4, + "w": 3, + "x": 21, + "y": 5 }, - "height": "", - "hiddenSeries": false, - "id": 62042, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pluginVersion": "7.1.5", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - 
{ - "alias": "/max/", - "color": "#890F02", - "fill": 0 + "id": 64155, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ { - "alias": "/opened/", - "color": "#0A437C" + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"power_draw\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" } ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $col", - "dsType": "influxdb", - "groupBy": [ - { - "params": [ - "$interval" - ], - "type": "time" + "title": "Power draw", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "displayMode": "color-text", + "filterable": true, + "inspect": false + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "time" }, - { - "params": [ - "null" - ], - "type": "fill" - } - ], - "measurement": "kernel", - "orderByTime": "ASC", - "policy": "default", - "query": "SELECT mean(\"file-max\") as max FROM \"linux_sysctl_fs\" WHERE \"host\" =~ /^$server$/ AND $timeFilter GROUP BY time($interval), host fill(null)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "processes_forked" - ], - "type": "field" - }, + "properties": [ { - "params": [], - "type": "mean" + "id": "custom.hidden", + "value": true } ] + } + ] + }, + "gridPos": { + "h": 2, + 
"w": 4, + "x": 2, + "y": 7 + }, + "id": 63338, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" ], - "tags": [ - { - "key": "host", - "operator": "=~", - "value": "/^$server$/" - } - ] + "show": false }, + "showHeader": false + }, + "pluginVersion": "8.5.21", + "targets": [ { - "alias": "$tag_host: $col", - "dsType": "influxdb", - "groupBy": [ - { - "params": [ - "$interval" - ], - "type": "time" - }, - { - "params": [ - "null" - ], - "type": "fill" - } - ], - "measurement": "kernel", - "orderByTime": "ASC", - "policy": "default", - "query": "SELECT mean(\"file-nr\") as opened FROM \"linux_sysctl_fs\" WHERE \"host\" =~ /^$server$/ AND $timeFilter GROUP BY time($interval), host fill(null)", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT last(\"driver_version\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"index\" SLIMIT 1", "rawQuery": true, "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "processes_forked" - ], - "type": "field" - }, + "resultFormat": "time_series" + } + ], + "title": "Driver Version", + "type": "table" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "displayMode": "color-text", + "filterable": true, + "inspect": false + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byType", + "options": "time" + }, + "properties": [ { - "params": [], - "type": "mean" + "id": "custom.hidden", + "value": true } ] - ], - "tags": [ - { - "key": "host", - "operator": "=~", - "value": "/^$server$/" - } - ] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "File descriptors", - "tooltip": { - 
"msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" + } + ] }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] + "gridPos": { + "h": 2, + "w": 3, + "x": 6, + "y": 7 }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "id": 63474, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false }, + "showHeader": false + }, + "pluginVersion": "8.5.21", + "targets": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT last(\"cuda_version\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"index\" SLIMIT 1", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" } ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "repeat": null, - "title": "Kernel", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 30 - }, - "id": 62048, - "panels": [ + "title": "CUDA Version", + "type": "table" + }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "fieldConfig": { "defaults": { - "custom": {}, - "links": [] + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + } }, "overrides": [] }, - "fill": 1, - "fillGradient": 0, "gridPos": { - "h": 7, - "w": 24, + "h": 5, + "w": 3, "x": 0, "y": 9 }, - "hiddenSeries": false, - "id": 62043, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - 
"rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true + "id": 64428, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pluginVersion": "7.1.5", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "pluginVersion": "8.5.21", "targets": [ { - "alias": "$tag_host: $tag_irq", - "dsType": "influxdb", - "groupBy": [ - { - "params": [ - "$__interval" - ], - "type": "time" - }, - { - "params": [ - "irq" - ], - "type": "tag" - }, - { - "params": [ - "host" - ], - "type": "tag" - }, - { - "params": [ - "null" - ], - "type": "fill" - } - ], - "measurement": "interrupts", - "orderByTime": "ASC", - "policy": "default", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"pcie_link_width_current\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "total" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - }, - { - "params": [ - "10s" - ], - "type": "non_negative_derivative" + "resultFormat": "time_series" + } + ], + "title": "Current link width", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null } ] - ], - "tags": [ - { - "key": "host", - "operator": "=~", - "value": "/^$server$/" 
- } - ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 9 + }, + "id": 64427, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"pcie_link_gen_current\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Interrupts", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" + "title": "Current link generation", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "rothz" + }, + "overrides": [] }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 9 + }, + "id": 64837, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "datasource": { + "type": "influxdb", + "uid": 
"${datasource}" + }, + "query": "SELECT mean(\"clocks_current_graphics\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" } ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "repeat": null, - "title": "Interrupts", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 31 - }, - "id": 62049, - "panels": [ + "title": "Graphics clock frequency", + "type": "stat" + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 7, - "w": 6, - "x": 0, - "y": 6 + "h": 10, + "w": 15, + "x": 9, + "y": 9 }, - "height": "", "hiddenSeries": false, - "id": 61868, - "interval": "$inter", + "id": 62521, "legend": { "alignAsTable": true, - "avg": true, + "avg": false, "current": true, - "hideEmpty": true, - "hideZero": true, "max": true, - "min": false, - "rightSide": false, + "min": true, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", + "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, - "pointradius": 5, + "pluginVersion": "8.5.21", + "pointradius": 2, "points": false, "renderer": "flot", - "repeat": "cpu", - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu0", - "value": "cpu0" - } - }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - 
"interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT \"utilization_gpu\" FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"uuid\"", "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] + "refId": "A", + "resultFormat": "time_series" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", + "title": "GPU Utilization", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "(:*).* ", + "renamePattern": "$1" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "(.*)-:*", + "renamePattern": "GPU-" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "}", + "renamePattern": "" + } + } + ], "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { + "$$hashKey": "object:58", "format": "percent", + "label": "", "logBase": 1, - "max": 100, - "min": 0, + "min": "0", "show": true }, { + "$$hashKey": "object:59", "format": "short", "logBase": 1, - 
"max": null, - "min": null, + "min": "0", "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "rothz" + }, + "overrides": [] + }, "gridPos": { - "h": 7, - "w": 6, - "x": 6, - "y": 6 + "h": 5, + "w": 3, + "x": 0, + "y": 14 }, - "height": "", - "hiddenSeries": false, - "id": 62070, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, + "id": 64701, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"clocks_current_video\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Video clock frequency", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "rothz" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 5, + "w": 3, + "x": 3, + "y": 14 + }, + "id": 64565, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"clocks_current_sm\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "SM clock frequency", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "rothz" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 14 + }, + "id": 64564, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.5.21", + "targets": [ + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT mean(\"clocks_current_memory\") FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + } + ], + "title": "Memory clock frequency", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "unit": "decmbytes" + }, + "overrides": [] + }, 
+ "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 19 + }, + "hiddenSeries": false, + "id": 62793, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", + "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, - "pointradius": 5, + "pluginVersion": "8.5.21", + "pointradius": 2, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu1", - "value": "cpu1" - } - }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT \"memory_total\" FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"uuid\"", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series" + }, + { + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", + "query": 
"SELECT \"memory_used\" FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"uuid\"", "rawQuery": true, "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] + "resultFormat": "time_series" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", + "title": "Memory (Total & Used)", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "(:*).* ", + "renamePattern": "$1" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "(.*)-:*", + "renamePattern": "GPU-" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "}", + "renamePattern": "" + } + } + ], "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "percent", + "$$hashKey": "object:58", + "format": "decmbytes", + "label": "", "logBase": 1, - "max": 100, - "min": 0, + "min": "0", "show": true }, { + "$$hashKey": "object:59", "format": "short", "logBase": 1, - "max": null, - "min": null, + "min": "0", "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -1594,146 +1708,119 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "unit": "celsius" + }, + "overrides": [] + }, + "fill": 0, "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 7, - "w": 6, - "x": 12, - "y": 6 + "h": 8, + "w": 8, + "x": 8, + "y": 19 }, - "height": "", "hiddenSeries": false, - "id": 62071, - "interval": "$inter", + "id": 62657, "legend": { "alignAsTable": true, - 
"avg": true, + "avg": false, "current": true, - "hideEmpty": true, - "hideZero": true, "max": true, - "min": false, - "rightSide": false, + "min": true, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", + "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, - "pointradius": 5, + "pluginVersion": "8.5.21", + "pointradius": 2, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu10", - "value": "cpu10" - } - }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT \"temperature_gpu\" FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"uuid\"", "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] + "refId": "A", + "resultFormat": 
"time_series" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", + "title": "Temperature", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "(:*).* ", + "renamePattern": "$1" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "(.*)-:*", + "renamePattern": "GPU-" + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "}", + "renamePattern": "" + } + } + ], "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "percent", + "$$hashKey": "object:58", + "format": "celsius", + "label": "", "logBase": 1, - "max": 100, - "min": 0, + "min": "0", "show": true }, { + "$$hashKey": "object:59", "format": "short", "logBase": 1, - "max": null, - "min": null, + "min": "0", "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -1741,6733 +1828,588 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "fill": 0, "fillGradient": 0, - "grid": {}, "gridPos": { - "h": 7, - "w": 6, - "x": 18, - "y": 6 + "h": 8, + "w": 8, + "x": 16, + "y": 19 }, - "height": "", "hiddenSeries": false, - "id": 62072, - "interval": "$inter", + "id": 62929, "legend": { "alignAsTable": true, - "avg": true, + "avg": false, "current": true, - "hideEmpty": true, - "hideZero": true, "max": true, - "min": false, - "rightSide": false, + "min": true, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", + "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true 
}, "percentage": false, - "pointradius": 5, + "pluginVersion": "8.5.21", + "pointradius": 2, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu11", - "value": "cpu11" - } - }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "query": "SELECT \"power_draw\" FROM \"nvidia_smi\" WHERE (\"uuid\" =~ /^$gpu$/) AND $timeFilter GROUP BY \"uuid\"", "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] + "refId": "A", + "resultFormat": "time_series" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", + "title": "Power Usage", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] + "value_type": "individual" }, - "yaxes": [ + 
"transformations": [ { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true + "id": "renameByRegex", + "options": { + "regex": "(:*).* ", + "renamePattern": "$1" + } }, { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 6, - "x": 0, - "y": 13 - }, - "height": "", - "hiddenSeries": false, - "id": 62073, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu12", - "value": "cpu12" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ + "id": "renameByRegex", + "options": { + "regex": "(.*)-:*", + "renamePattern": "GPU-" + } + }, { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", 
mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] + "id": "renameByRegex", + "options": { + "regex": "}", + "renamePattern": "" + } } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "percent", + "$$hashKey": "object:58", + "format": "kwatt", + "label": "", "logBase": 1, - "max": 100, - "min": 0, + "min": "0", "show": true }, { + "$$hashKey": "object:59", "format": "short", "logBase": 1, - "max": null, - "min": null, + "min": "0", "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } + } + ], + "title": "GPU", + "type": "row" + }, + { + "collapsed": false, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 62045, + "panels": [], + "title": "CPU", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - 
"gridPos": { - "h": 7, - "w": 6, - "x": 6, - "y": 13 - }, - "height": "", - "hiddenSeries": false, - "id": 62074, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu13", - "value": "cpu13" - } + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 2 + }, + "height": "300", + "hiddenSeries": false, + "id": 28239, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ + "dsType": "influxdb", + "function": "mean", + "groupBy": [ { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": 
"auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] + "interval": "auto", + "params": [ + "auto" ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true + "type": "time" }, { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true + "key": "host", + "params": [ + "tag" + ], + "type": "tag" } ], - "yaxis": { - "align": false, - "alignLevel": null - } + "hide": false, + "measurement": "cpu_percentageBusy", + "policy": "default", + "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = 
'cpu-total' AND $timeFilter GROUP BY time($interval), *", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "CPU usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:506", + "format": "percent", + "logBase": 1, + "max": 100, + "min": 0, + "show": true }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 6, - "x": 12, - "y": 13 - }, - "height": "", - "hiddenSeries": false, - "id": 62075, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu14", - "value": "cpu14" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - 
} - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "$$hashKey": "object:507", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 10 + }, + "height": "350", + "hiddenSeries": false, + "id": 54694, + "interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": 
true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 6, - "x": 18, - "y": 13 - }, - "height": "", - "hiddenSeries": false, - "id": 62076, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu15", - "value": "cpu15" - } + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ + "dsType": "influxdb", + "function": "mean", + "groupBy": [ { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": 
"SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] + "interval": "auto", + "params": [ + "auto" ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true + "type": "time" }, { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true + "key": "host", + "params": [ + "tag" + ], + "type": "tag" } ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 6, - "x": 0, - "y": 20 - }, - "height": "", - "hiddenSeries": false, - "id": 62077, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 
4, - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu2", - "value": "cpu2" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - 
{ - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 6, - "x": 6, - "y": 20 - }, - "height": "", - "hiddenSeries": false, - "id": 62078, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu3", - "value": "cpu3" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - 
"params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 6, - "x": 12, - "y": 20 - }, - "height": "", - "hiddenSeries": false, - "id": 62079, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu4", - "value": "cpu4" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": 
"tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 6, - "x": 18, - "y": 20 - }, - "height": "", - "hiddenSeries": false, - "id": 62080, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 4, - 
"nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu5", - "value": "cpu5" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - 
"aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 6, - "x": 0, - "y": 27 - }, - "height": "", - "hiddenSeries": false, - "id": 62081, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu6", - "value": "cpu6" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": 
[], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 6, - "x": 6, - "y": 27 - }, - "height": "", - "hiddenSeries": false, - "id": 62082, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu7", - "value": "cpu7" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - 
], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 6, - "x": 12, - "y": 27 - }, - "height": "", - "hiddenSeries": false, - "id": 62083, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 4, - 
"nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu8", - "value": "cpu8" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - 
"aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 6, - "x": 18, - "y": 27 - }, - "height": "", - "hiddenSeries": false, - "id": 62084, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 4, - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatIteration": 1575627978415, - "repeatPanelId": 61868, - "scopedVars": { - "cpu": { - "selected": false, - "text": "cpu9", - "value": "cpu9" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $cpu $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "hide": false, - "measurement": "cpu_percentageBusy", - "policy": "default", - "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - 
"params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPU usage for $cpu", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "repeat": null, - "title": "Per-cpu usage", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 32 - }, - "id": 62053, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 10 - }, - "hiddenSeries": false, - "id": 42026, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "scopedVars": { - "netif": { - "selected": false, - "text": "eno2", - "value": "eno2" - } - }, - "seriesOverrides": [ - { - "alias": "/ in$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - 
"auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(bytes_recv),1s)*8 as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(bytes_sent),1s)*8 as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network Usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bps", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": 
{ - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 10 - }, - "hiddenSeries": false, - "id": 28572, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "scopedVars": { - "netif": { - "selected": false, - "text": "eno2", - "value": "eno2" - } - }, - "seriesOverrides": [ - { - "alias": "/ in$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(packets_recv), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - 
"type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(packets_sent), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network Packets", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "pps", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "decimals": 1, - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 17 - }, - "id": 58901, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "scopedVars": { - "netif": { - "selected": false, - 
"text": "eno2", - "value": "eno2" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(drop_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(drop_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network drops", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - 
"type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "pps", - "label": "Drops per second", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 17 - }, - "id": 50643, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "scopedVars": { - "netif": { - "selected": false, - "text": "eno2", - "value": "eno2" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(err_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - 
"value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(err_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network errors", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "Errors per second", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "repeat": "netif", - "scopedVars": { - "netif": { - "selected": false, - "text": "eno2", - "value": "eno2" - } - }, - "title": "Network interface stats for $netif", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 33 - }, - "id": 62085, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - 
"editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 10 - }, - "hiddenSeries": false, - "id": 62086, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 42026, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "eno3d1", - "value": "eno3d1" - } - }, - "seriesOverrides": [ - { - "alias": "/ in$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(bytes_recv),1s)*8 as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - 
"type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(bytes_sent),1s)*8 as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network Usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bps", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 10 - }, - "hiddenSeries": false, - "id": 62087, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 28572, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": 
false, - "text": "eno3d1", - "value": "eno3d1" - } - }, - "seriesOverrides": [ - { - "alias": "/ in$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(packets_recv), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(packets_sent), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network Packets", - "tooltip": { - "msResolution": false, - "shared": 
true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "pps", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "decimals": 1, - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 17 - }, - "id": 62088, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 58901, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "eno3d1", - "value": "eno3d1" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(drop_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY 
time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(drop_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network drops", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "pps", - "label": "Drops per second", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 17 - }, - "id": 62089, - "interval": "$inter", - 
"legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 50643, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "eno3d1", - "value": "eno3d1" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(err_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(err_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND 
interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network errors", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "Errors per second", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "repeatIteration": 1613083475434, - "repeatPanelId": 62053, - "scopedVars": { - "netif": { - "selected": false, - "text": "eno3d1", - "value": "eno3d1" - } - }, - "title": "Network interface stats for $netif", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 62090, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 10 - }, - "hiddenSeries": false, - "id": 62091, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - 
"renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 42026, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "enp94s0f0", - "value": "enp94s0f0" - } - }, - "seriesOverrides": [ - { - "alias": "/ in$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(bytes_recv),1s)*8 as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(bytes_sent),1s)*8 as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - 
"thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network Usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bps", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 10 - }, - "hiddenSeries": false, - "id": 62092, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 28572, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "enp94s0f0", - "value": "enp94s0f0" - } - }, - "seriesOverrides": [ - { - "alias": "/ in$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - 
"measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(packets_recv), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(packets_sent), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network Packets", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "pps", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - 
"decimals": 1, - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 17 - }, - "id": 62093, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 58901, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "enp94s0f0", - "value": "enp94s0f0" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(drop_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ 
- "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(drop_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network drops", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "pps", - "label": "Drops per second", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 17 - }, - "id": 62094, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 50643, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "enp94s0f0", - "value": 
"enp94s0f0" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(err_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(err_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network errors", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - 
"xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "Errors per second", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "repeatIteration": 1613083475434, - "repeatPanelId": 62053, - "scopedVars": { - "netif": { - "selected": false, - "text": "enp94s0f0", - "value": "enp94s0f0" - } - }, - "title": "Network interface stats for $netif", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 35 - }, - "id": 62095, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 10 - }, - "hiddenSeries": false, - "id": 62096, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 42026, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "enp94s0f1", - "value": "enp94s0f1" - } - }, - "seriesOverrides": [ - { - "alias": "/ in$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - 
], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(bytes_recv),1s)*8 as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(bytes_sent),1s)*8 as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network Usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bps", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - 
"align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 10 - }, - "hiddenSeries": false, - "id": 62097, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 28572, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "enp94s0f1", - "value": "enp94s0f1" - } - }, - "seriesOverrides": [ - { - "alias": "/ in$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(packets_recv), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - 
"function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(packets_sent), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network Packets", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "pps", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "decimals": 1, - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 17 - }, - "id": 62098, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 
5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 58901, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "enp94s0f1", - "value": "enp94s0f1" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(drop_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(drop_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - 
"timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network drops", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "pps", - "label": "Drops per second", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 17 - }, - "id": 62099, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 50643, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "enp94s0f1", - "value": "enp94s0f1" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT 
non_negative_derivative(mean(err_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(err_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network errors", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "Errors per second", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "repeatIteration": 1613083475434, - "repeatPanelId": 62053, - "scopedVars": { - "netif": { - "selected": false, - "text": "enp94s0f1", - "value": 
"enp94s0f1" - } - }, - "title": "Network interface stats for $netif", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 36 - }, - "id": 62100, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 10 - }, - "hiddenSeries": false, - "id": 62101, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 42026, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "ens3", - "value": "ens3" - } - }, - "seriesOverrides": [ - { - "alias": "/ in$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(bytes_recv),1s)*8 as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - 
"type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(bytes_sent),1s)*8 as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network Usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bps", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 10 - }, - "hiddenSeries": false, - "id": 62102, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - 
"lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 28572, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "ens3", - "value": "ens3" - } - }, - "seriesOverrides": [ - { - "alias": "/ in$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(packets_recv), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(packets_sent), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", - "rawQuery": true, - "refId": "B", - 
"resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network Packets", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "pps", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "decimals": 1, - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 17 - }, - "id": 62103, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 58901, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "ens3", - "value": "ens3" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ 
- "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(drop_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(drop_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network drops", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "pps", - "label": "Drops per second", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - 
"alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 17 - }, - "id": 62104, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeatIteration": 1613083475434, - "repeatPanelId": 50643, - "repeatedByRow": true, - "scopedVars": { - "netif": { - "selected": false, - "text": "ens3", - "value": "ens3" - } - }, - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(err_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_interface: $col", - "dsType": "influxdb", - "function": "derivative", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - 
}, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "interface", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "net_bytes_recv", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(err_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Network errors", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "Errors per second", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "repeatIteration": 1613083475434, - "repeatPanelId": 62053, - "scopedVars": { - "netif": { - "selected": false, - "text": "ens3", - "value": "ens3" - } - }, - "title": "Network interface stats for $netif", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 37 - }, - "id": 62054, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 12 - }, - "hiddenSeries": false, - "id": 26024, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - 
"current": true, - "max": true, - "min": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/in/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "swap_in", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(\"in\")) as \"in\", non_negative_derivative(mean(\"out\")) as \"out\" FROM \"swap\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Swap I/O bytes", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fill": 1, - 
"fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 12 - }, - "hiddenSeries": false, - "id": 61850, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/total/", - "color": "#890F02", - "fill": 0 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": true, - "targets": [ - { - "alias": "$tag_host: $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "swap_in", - "policy": "default", - "query": "SELECT mean(used) as \"used\", mean(total) as \"total\" FROM \"swap\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Swap usage (bytes)", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "max": null, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null 
- } - } - ], - "repeat": null, - "title": "Swap", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 38 - }, - "id": 62055, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 15 - }, - "hiddenSeries": false, - "id": 13782, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 6, - "nullPointMode": "connected", - "percentage": false, - "pluginVersion": "7.1.5", - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "scopedVars": { - "disk": { - "selected": false, - "text": "nvme0n1", - "value": "nvme0n1" - } - }, - "seriesOverrides": [ - { - "alias": "/.*write$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_name: $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "path", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "io_reads", - "orderByTime": "ASC", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(reads),1s) as \"read\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - 
"resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_name: $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "path", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "io_reads", - "orderByTime": "ASC", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(writes),1s) as \"write\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "C", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Disk I/O requests for /dev/$disk", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "iops", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } + "measurement": "system_load1", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(load1) as short,mean(load5) as medium,mean(load15) as long FROM \"system\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), * ORDER BY asc", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] ], - "yaxis": { - 
"align": false, - "alignLevel": null - } + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Load averages", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:581", + "format": "short", + "logBase": 1, + "min": 0, + "show": true }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 8, - "y": 15 - }, - "hiddenSeries": false, - "id": 60200, - "interval": "$inter", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "maxPerRow": 6, - "nullPointMode": "connected", - "percentage": false, - "pluginVersion": "7.1.5", - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "scopedVars": { - "disk": { - "selected": false, - "text": "nvme0n1", - "value": "nvme0n1" - } - }, - "seriesOverrides": [ - { - "alias": "/.*write$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "alias": "$tag_host: $tag_name: $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "path", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "io_reads", - "policy": "default", - "query": "SELECT 
non_negative_derivative(mean(read_bytes),1s) as \"read\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] + "$$hashKey": "object:582", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 10 + }, + "height": "350", + "hiddenSeries": false, + "id": 61852, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$interval" ], - "tags": [] + "type": "time" }, { - "alias": "$tag_host: $tag_name: $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "path", - "params": [ - "tag" - ], - "type": "tag" 
- } - ], - "measurement": "io_reads", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(write_bytes),1s) as \"write\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "C", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] + "params": [ + "null" ], - "tags": [] + "type": "fill" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Disk I/O bytes for /dev/$disk", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, + "measurement": "processes", + "policy": "default", + "query": "SELECT mean(running) as running, mean(blocked) as blocked, mean(sleeping) as sleeping, mean(stopped) as stopped, mean(zombies) as zombies, mean(paging) as paging, mean(unknown) as unknown FROM \"processes\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "blocked" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true + "key": "host", + "operator": "=~", + "value": "/^$server$/" } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + ] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Processes", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] 
+ }, + "yaxes": [ + { + "$$hashKey": "object:657", + "format": "short", + "logBase": 1, + "show": true }, + { + "$$hashKey": "object:658", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 62046, + "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -8476,22 +2418,21 @@ "fillGradient": 0, "grid": {}, "gridPos": { - "h": 7, - "w": 8, - "x": 16, - "y": 15 + "h": 10, + "w": 24, + "x": 0, + "y": 3 }, + "height": "400", "hiddenSeries": false, - "id": 56720, + "id": 12054, "interval": "$inter", "legend": { "alignAsTable": true, "avg": true, "current": true, - "hideEmpty": true, "max": true, - "min": false, - "rightSide": false, + "min": true, "show": true, "sort": "current", "sortDesc": true, @@ -8501,25 +2442,21 @@ "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 6, "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.5", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "scopedVars": { - "disk": { - "selected": false, - "text": "nvme0n1", - "value": "nvme0n1" - } - }, "seriesOverrides": [ { - "alias": "/.*write$/", - "transform": "negative-Y" + "alias": "/total/", + "color": "#BF1B00", + "fill": 0, + "linewidth": 2 } ], "spaceLength": 10, @@ -8527,56 +2464,11 @@ "steppedLine": false, "targets": [ { - "alias": "$tag_host: $tag_name: $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - 
"params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "path", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "io_reads", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(read_time),1s) as \"read\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_name: $col", + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", "function": "mean", "groupBy": [ @@ -8593,18 +2485,11 @@ "tag" ], "type": "tag" - }, - { - "key": "path", - "params": [ - "tag" - ], - "type": "tag" } ], - "measurement": "io_reads", + "measurement": "mem_inactive", "policy": "default", - "query": "SELECT non_negative_derivative(mean(write_time),1s) as \"write\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", + "query": "SELECT mean(total) as total, mean(used) as used, mean(cached) as cached, mean(free) as free, mean(buffered) as buffered FROM \"mem\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -8626,10 +2511,8 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Disk I/O time for /dev/$disk", + "title": "Memory usage", "tooltip": { "msResolution": false, "shared": true, @@ -8638,159 +2521,127 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "ms", + "$$hashKey": "object:737", + "format": "bytes", "logBase": 1, - "max": null, - "min": null, + "min": 0, "show": true }, { + 
"$$hashKey": "object:738", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "repeat": "disk", - "scopedVars": { - "disk": { - "selected": false, - "text": "nvme0n1", - "value": "nvme0n1" + "align": false + } } - }, - "title": "Disk IOPS for /dev/$disk", + ], + "title": "Memory", "type": "row" }, { "collapsed": true, - "datasource": null, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 39 + "y": 20 }, - "id": 62105, + "id": 62047, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, - "y": 15 + "y": 14 }, + "height": "", "hiddenSeries": false, - "id": 62106, - "interval": "$inter", + "id": 61855, "legend": { "alignAsTable": true, "avg": true, "current": true, - "hideEmpty": true, "max": true, - "min": false, + "min": true, "rightSide": false, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 6, "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.5", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 13782, - "repeatedByRow": true, - "scopedVars": { - "disk": { - "selected": false, - "text": "sda", - "value": "sda" - } - }, - "seriesOverrides": [ - { - "alias": "/.*write$/", - "transform": "negative-Y" - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "alias": 
"$tag_host: $tag_name: $col", + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", - "function": "mean", "groupBy": [ { - "interval": "auto", "params": [ - "auto" + "$interval" ], "type": "time" }, { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "path", "params": [ - "tag" + "null" ], - "type": "tag" + "type": "fill" } ], - "measurement": "io_reads", - "orderByTime": "ASC", + "measurement": "processes", "policy": "default", - "query": "SELECT non_negative_derivative(mean(reads),1s) as \"read\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", + "query": "SELECT non_negative_derivative(mean(context_switches),1s)as \"context switches\" FROM \"kernel\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -8798,7 +2649,7 @@ [ { "params": [ - "value" + "blocked" ], "type": "field" }, @@ -8808,97 +2659,46 @@ } ] ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_name: $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, + "tags": [ { "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "path", - "params": [ - "tag" - ], - "type": "tag" + "operator": "=~", + "value": "/^$server$/" } - ], - "measurement": "io_reads", - "orderByTime": "ASC", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(writes),1s) as \"write\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "C", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] + ] } ], "thresholds": [], - "timeFrom": null, 
"timeRegions": [], - "timeShift": null, - "title": "Disk I/O requests for /dev/$disk", + "title": "Context switches", "tooltip": { "msResolution": false, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "iops", + "$$hashKey": "object:837", + "format": "ops", "logBase": 1, - "max": null, - "min": null, "show": true }, { + "$$hashKey": "object:838", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -8906,103 +2706,81 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, - "y": 15 + "y": 14 }, + "height": "", "hiddenSeries": false, - "id": 62107, - "interval": "$inter", + "id": 61960, "legend": { "alignAsTable": true, "avg": true, "current": true, - "hideEmpty": true, "max": true, - "min": false, + "min": true, "rightSide": false, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 6, "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.5", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 60200, - "repeatedByRow": true, - "scopedVars": { - "disk": { - "selected": false, - "text": "sda", - "value": "sda" - } - }, - "seriesOverrides": [ - { - "alias": "/.*write$/", - "transform": "negative-Y" - } - ], + "seriesOverrides": [], "spaceLength": 
10, "stack": false, "steppedLine": false, "targets": [ { - "alias": "$tag_host: $tag_name: $col", + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", - "function": "mean", "groupBy": [ { - "interval": "auto", "params": [ - "auto" + "$interval" ], "type": "time" }, { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "path", "params": [ - "tag" + "null" ], - "type": "tag" + "type": "fill" } ], - "measurement": "io_reads", + "measurement": "kernel", "policy": "default", - "query": "SELECT non_negative_derivative(mean(read_bytes),1s) as \"read\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", + "query": "SELECT non_negative_derivative(mean(processes_forked),1s) as forks FROM \"kernel\" WHERE \"host\" =~ /^$server$/ AND $timeFilter GROUP BY time($interval), host fill(null)", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -9010,7 +2788,7 @@ [ { "params": [ - "value" + "processes_forked" ], "type": "field" }, @@ -9020,96 +2798,46 @@ } ] ], - "tags": [] - }, - { - "alias": "$tag_host: $tag_name: $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, + "tags": [ { "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "path", - "params": [ - "tag" - ], - "type": "tag" + "operator": "=~", + "value": "/^$server$/" } - ], - "measurement": "io_reads", - "policy": "default", - "query": "SELECT non_negative_derivative(mean(write_bytes),1s) as \"write\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", - "rawQuery": true, - "refId": "C", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] + ] } ], "thresholds": 
[], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Disk I/O bytes for /dev/$disk", + "title": "Forks", "tooltip": { "msResolution": false, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "bytes", + "$$hashKey": "object:913", + "format": "ops", "logBase": 1, - "max": null, - "min": null, "show": true }, { + "$$hashKey": "object:914", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -9117,67 +2845,61 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] }, "fill": 1, "fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, - "y": 15 + "y": 14 }, + "height": "", "hiddenSeries": false, - "id": 62108, - "interval": "$inter", + "id": 62042, "legend": { "alignAsTable": true, "avg": true, "current": true, - "hideEmpty": true, "max": true, - "min": false, + "min": true, "rightSide": false, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 6, "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, "percentage": false, - "pluginVersion": "7.1.5", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 56720, - "repeatedByRow": true, - "scopedVars": { - "disk": { - "selected": false, - "text": "sda", - "value": "sda" - } - }, "seriesOverrides": [ { - "alias": "/.*write$/", - "transform": "negative-Y" + "alias": "/max/", + "color": "#890F02", 
+ "fill": 0 + }, + { + "alias": "/opened/", + "color": "#0A437C" } ], "spaceLength": 10, @@ -9185,35 +2907,30 @@ "steppedLine": false, "targets": [ { - "alias": "$tag_host: $tag_name: $col", + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", - "function": "mean", "groupBy": [ { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", "params": [ - "tag" + "$interval" ], - "type": "tag" + "type": "time" }, { - "key": "path", "params": [ - "tag" + "null" ], - "type": "tag" + "type": "fill" } ], - "measurement": "io_reads", + "measurement": "kernel", + "orderByTime": "ASC", "policy": "default", - "query": "SELECT non_negative_derivative(mean(read_time),1s) as \"read\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", + "query": "SELECT mean(\"file-max\") as max FROM \"linux_sysctl_fs\" WHERE \"host\" =~ /^$server$/ AND $timeFilter GROUP BY time($interval), host fill(null)", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -9221,7 +2938,7 @@ [ { "params": [ - "value" + "processes_forked" ], "type": "field" }, @@ -9231,38 +2948,39 @@ } ] ], - "tags": [] + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$server$/" + } + ] }, { - "alias": "$tag_host: $tag_name: $col", + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", - "function": "mean", "groupBy": [ { - "interval": "auto", "params": [ - "auto" + "$interval" ], "type": "time" }, { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "path", "params": [ - "tag" + "null" ], - "type": "tag" + "type": "fill" } ], - "measurement": "io_reads", + "measurement": "kernel", + "orderByTime": "ASC", "policy": "default", - "query": "SELECT non_negative_derivative(mean(write_time),1s) as \"write\" FROM \"diskio\" WHERE \"host\" =~ 
/$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", + "query": "SELECT mean(\"file-nr\") as opened FROM \"linux_sysctl_fs\" WHERE \"host\" =~ /^$server$/ AND $timeFilter GROUP BY time($interval), host fill(null)", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -9270,7 +2988,7 @@ [ { "params": [ - "value" + "processes_forked" ], "type": "field" }, @@ -9280,106 +2998,100 @@ } ] ], - "tags": [] + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$server$/" + } + ] } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Disk I/O time for /dev/$disk", + "title": "File descriptors", "tooltip": { "msResolution": false, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "ms", + "$$hashKey": "object:989", + "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true }, { + "$$hashKey": "object:990", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } } ], - "repeatIteration": 1613083475434, - "repeatPanelId": 62055, - "scopedVars": { - "disk": { - "selected": false, - "text": "sda", - "value": "sda" - } - }, - "title": "Disk IOPS for /dev/$disk", + "title": "Kernel", "type": "row" }, { "collapsed": true, - "datasource": null, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 40 + "y": 21 }, - "id": 62056, + "id": 62048, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", - "editable": true, - "error": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] }, "fill": 1, 
"fillGradient": 0, - "grid": {}, "gridPos": { "h": 7, - "w": 12, + "w": 24, "x": 0, - "y": 55 + "y": 22 }, "hiddenSeries": false, - "id": 52240, - "interval": "$inter", + "id": 62043, "legend": { "alignAsTable": true, - "avg": true, + "avg": false, "current": true, - "max": true, + "hideEmpty": true, + "hideZero": true, + "max": false, "min": false, + "rightSide": true, "show": true, "sort": "current", "sortDesc": true, @@ -9389,138 +3101,150 @@ "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 4, "nullPointMode": "connected", "options": { "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.1", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/", - "value": "/" - } - }, - "seriesOverrides": [ - { - "alias": "/total/", - "color": "#BF1B00", - "fill": 0, - "linewidth": 2, - "zindex": 3 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, - "steppedLine": true, + "steppedLine": false, "targets": [ { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $tag_irq", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", - "function": "mean", "groupBy": [ { - "interval": "auto", "params": [ - "auto" + "$__interval" ], "type": "time" }, { - "key": "host", "params": [ - "tag" + "irq" ], "type": "tag" }, { - "key": "path", "params": [ - "tag" + "host" ], "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" } ], - "measurement": "disk_total", + "measurement": "interrupts", + "orderByTime": "ASC", "policy": "default", - "query": "SELECT mean(total) AS \"total\", mean(used) as \"used\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", - "rawQuery": true, - "refId": "B", + "refId": "A", "resultFormat": "time_series", "select": [ [ { "params": [ - "value" + 
"total" ], "type": "field" }, { "params": [], "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" } ] ], - "tags": [] + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$server$/" + } + ] } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Disk usage for $mountpoint", + "title": "Interrupts", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "bytes", + "$$hashKey": "object:1069", + "format": "ops", "logBase": 1, - "max": null, - "min": 0, "show": true }, { + "$$hashKey": "object:1070", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } - }, + } + ], + "title": "Interrupts", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 62049, + "panels": [ { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -9530,26 +3254,30 @@ "grid": {}, "gridPos": { "h": 7, - "w": 12, - "x": 12, - "y": 55 + "w": 6, + "x": 0, + "y": 30 }, + "height": "", "hiddenSeries": false, - "id": 33458, + "id": 61868, "interval": "$inter", "legend": { "alignAsTable": true, "avg": true, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, + "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, - "lines": false, + "lines": true, "linewidth": 1, "links": 
[], "maxPerRow": 4, @@ -9558,88 +3286,22 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.1", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/", - "value": "/" - } - }, - "seriesOverrides": [ - { - "alias": "/used/", - "color": "#447EBC", - "zindex": 3 - }, - { - "alias": "/total/", - "bars": false, - "color": "#BF1B00", - "fill": 0, - "lines": true, - "linewidth": 1 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": true, - "targets": [ - { - "alias": "$tag_host: mountpoint $tag_path - $col", - "dsType": "influxdb", - "function": "mean", - "groupBy": [ - { - "interval": "auto", - "params": [ - "auto" - ], - "type": "time" - }, - { - "key": "host", - "params": [ - "tag" - ], - "type": "tag" - }, - { - "key": "path", - "params": [ - "tag" - ], - "type": "tag" - } - ], - "measurement": "disk_inodes_free", - "policy": "default", - "query": "SELECT mean(inodes_used) as \"used\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/ AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, + "repeat": "cpu", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $cpu $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", "function": "mean", "groupBy": [ @@ -9656,20 +3318,14 @@ "tag" ], "type": "tag" - }, - { - "key": "path", - "params": [ - "tag" - ], - "type": "tag" } ], - "measurement": "disk_inodes_free", + "hide": false, + "measurement": "cpu_percentageBusy", "policy": "default", - "query": "SELECT 
mean(inodes_free) + mean(inodes_used) as \"total\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/ AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT mean(usage_user) as \"user\", mean(usage_system) as \"system\", mean(usage_softirq) as \"softirq\", mean(usage_steal) as \"steal\", mean(usage_nice) as \"nice\", mean(usage_irq) as \"irq\", mean(usage_iowait) as \"iowait\", mean(usage_guest) as \"guest\", mean(usage_guest_nice) as \"guest_nice\" FROM \"cpu\" WHERE \"host\" =~ /$server$/ and cpu = '$cpu' AND $timeFilter GROUP BY time($interval), *", "rawQuery": true, - "refId": "A", + "refId": "B", "resultFormat": "time_series", "select": [ [ @@ -9689,10 +3345,8 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Disk inodes for $mountpoint", + "title": "CPU usage for $cpu", "tooltip": { "msResolution": false, "shared": true, @@ -9701,67 +3355,61 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "short", + "$$hashKey": "object:1795", + "format": "percent", "logBase": 1, - "max": null, + "max": 100, "min": 0, "show": true }, { + "$$hashKey": "object:1796", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } } ], - "repeat": "mountpoint", - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/", - "value": "/" - } - }, - "title": "Disk space usage for $mountpoint", + "title": "Per-cpu usage", "type": "row" }, { "collapsed": true, - "datasource": null, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 41 + "y": 23 }, - "id": 62109, + "id": 62053, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": 
"${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -9773,10 +3421,10 @@ "h": 7, "w": 12, "x": 0, - "y": 55 + "y": 31 }, "hiddenSeries": false, - "id": 62110, + "id": 42026, "interval": "$inter", "legend": { "alignAsTable": true, @@ -9784,53 +3432,41 @@ "current": true, "max": true, "min": false, + "rightSide": false, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 4, "nullPointMode": "connected", "options": { "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.1", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 52240, - "repeatedByRow": true, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/boot/efi", - "value": "/boot/efi" - } - }, "seriesOverrides": [ { - "alias": "/total/", - "color": "#BF1B00", - "fill": 0, - "linewidth": 2, - "zindex": 3 + "alias": "/ in$/", + "transform": "negative-Y" } ], "spaceLength": 10, "stack": false, - "steppedLine": true, + "steppedLine": false, "targets": [ { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", - "function": "mean", + "function": "derivative", "groupBy": [ { "interval": "auto", @@ -9847,16 +3483,69 @@ "type": "tag" }, { - "key": "path", + "key": "interface", "params": [ "tag" ], "type": "tag" } ], - "measurement": "disk_total", + "measurement": "net_bytes_recv", "policy": "default", - "query": "SELECT mean(total) AS \"total\", mean(used) as \"used\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT non_negative_derivative(mean(bytes_recv),1s)*8 as 
\"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "derivative", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "interface", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "net_bytes_recv", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(bytes_sent),1s)*8 as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -9878,10 +3567,8 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Disk usage for $mountpoint", + "title": "Network Usage", "tooltip": { "msResolution": false, "shared": true, @@ -9890,31 +3577,26 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "bytes", + "$$hashKey": "object:250", + "format": "bps", "logBase": 1, - "max": null, - "min": 0, "show": true }, { + "$$hashKey": "object:251", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -9922,12 +3604,14 @@ "bars": true, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - 
"custom": {}, "links": [] }, "overrides": [] @@ -9939,10 +3623,10 @@ "h": 7, "w": 12, "x": 12, - "y": 55 + "y": 31 }, "hiddenSeries": false, - "id": 62111, + "id": 28572, "interval": "$inter", "legend": { "alignAsTable": true, @@ -9950,59 +3634,41 @@ "current": true, "max": true, "min": false, + "rightSide": false, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, "lines": false, "linewidth": 1, "links": [], - "maxPerRow": 4, "nullPointMode": "connected", "options": { "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.1", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 33458, - "repeatedByRow": true, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/boot/efi", - "value": "/boot/efi" - } - }, "seriesOverrides": [ { - "alias": "/used/", - "color": "#447EBC", - "zindex": 3 - }, - { - "alias": "/total/", - "bars": false, - "color": "#BF1B00", - "fill": 0, - "lines": true, - "linewidth": 1 + "alias": "/ in$/", + "transform": "negative-Y" } ], "spaceLength": 10, "stack": false, - "steppedLine": true, + "steppedLine": false, "targets": [ { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", - "function": "mean", + "function": "derivative", "groupBy": [ { "interval": "auto", @@ -10019,18 +3685,18 @@ "type": "tag" }, { - "key": "path", + "key": "interface", "params": [ "tag" ], "type": "tag" } ], - "measurement": "disk_inodes_free", + "measurement": "net_bytes_recv", "policy": "default", - "query": "SELECT mean(inodes_used) as \"used\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/ AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT non_negative_derivative(mean(packets_recv), 1s) as \"in\" FROM 
\"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", "rawQuery": true, - "refId": "B", + "refId": "A", "resultFormat": "time_series", "select": [ [ @@ -10049,9 +3715,13 @@ "tags": [] }, { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", - "function": "mean", + "function": "derivative", "groupBy": [ { "interval": "auto", @@ -10068,18 +3738,18 @@ "type": "tag" }, { - "key": "path", + "key": "interface", "params": [ "tag" ], "type": "tag" } ], - "measurement": "disk_inodes_free", + "measurement": "net_bytes_recv", "policy": "default", - "query": "SELECT mean(inodes_free) + mean(inodes_used) as \"total\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/ AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT non_negative_derivative(mean(packets_sent), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), * fill(none)", "rawQuery": true, - "refId": "A", + "refId": "B", "resultFormat": "time_series", "select": [ [ @@ -10099,10 +3769,8 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Disk inodes for $mountpoint", + "title": "Network Packets", "tooltip": { "msResolution": false, "shared": true, @@ -10111,68 +3779,43 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "short", + "$$hashKey": "object:2032", + "format": "pps", + "label": "", "logBase": 1, - "max": null, - "min": 0, "show": true }, { + "$$hashKey": "object:2033", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } - } - ], - "repeatIteration": 1613083475434, - 
"repeatPanelId": 62056, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/boot/efi", - "value": "/boot/efi" - } - }, - "title": "Disk space usage for $mountpoint", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 42 - }, - "id": 62112, - "panels": [ + }, { "aliasColors": {}, - "bars": false, + "bars": true, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "decimals": 1, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -10184,10 +3827,10 @@ "h": 7, "w": 12, "x": 0, - "y": 55 + "y": 38 }, "hiddenSeries": false, - "id": 62113, + "id": 58901, "interval": "$inter", "legend": { "alignAsTable": true, @@ -10195,53 +3838,89 @@ "current": true, "max": true, "min": false, + "rightSide": false, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, - "lines": true, + "lines": false, "linewidth": 1, "links": [], - "maxPerRow": 4, "nullPointMode": "connected", "options": { "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.1", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 52240, - "repeatedByRow": true, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/home", - "value": "/home" - } - }, - "seriesOverrides": [ - { - "alias": "/total/", - "color": "#BF1B00", - "fill": 0, - "linewidth": 2, - "zindex": 3 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, - "steppedLine": true, + "steppedLine": false, "targets": [ { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "derivative", + "groupBy": [ 
+ { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "interface", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "net_bytes_recv", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(drop_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", - "function": "mean", + "function": "derivative", "groupBy": [ { "interval": "auto", @@ -10258,16 +3937,16 @@ "type": "tag" }, { - "key": "path", + "key": "interface", "params": [ "tag" ], "type": "tag" } ], - "measurement": "disk_total", + "measurement": "net_bytes_recv", "policy": "default", - "query": "SELECT mean(total) AS \"total\", mean(used) as \"used\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT non_negative_derivative(mean(drop_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -10289,43 +3968,38 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Disk usage for $mountpoint", + "title": "Network drops", "tooltip": { "msResolution": false, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": 
true, "values": [] }, "yaxes": [ { - "format": "bytes", + "$$hashKey": "object:2105", + "format": "pps", + "label": "Drops per second", "logBase": 1, - "max": null, "min": 0, "show": true }, { + "$$hashKey": "object:2106", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -10333,12 +4007,14 @@ "bars": true, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -10350,10 +4026,10 @@ "h": 7, "w": 12, "x": 12, - "y": 55 + "y": 38 }, "hiddenSeries": false, - "id": 62114, + "id": 50643, "interval": "$inter", "legend": { "alignAsTable": true, @@ -10361,59 +4037,36 @@ "current": true, "max": true, "min": false, + "rightSide": false, "show": true, - "sort": "current", - "sortDesc": true, "total": false, "values": true }, "lines": false, - "linewidth": 1, + "linewidth": 2, "links": [], - "maxPerRow": 4, "nullPointMode": "connected", "options": { "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.1", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 33458, - "repeatedByRow": true, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/home", - "value": "/home" - } - }, - "seriesOverrides": [ - { - "alias": "/used/", - "color": "#447EBC", - "zindex": 3 - }, - { - "alias": "/total/", - "bars": false, - "color": "#BF1B00", - "fill": 0, - "lines": true, - "linewidth": 1 - } - ], + "seriesOverrides": [], "spaceLength": 10, "stack": false, - "steppedLine": true, + "steppedLine": false, "targets": [ { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": 
"${datasource}" + }, "dsType": "influxdb", - "function": "mean", + "function": "derivative", "groupBy": [ { "interval": "auto", @@ -10430,18 +4083,18 @@ "type": "tag" }, { - "key": "path", + "key": "interface", "params": [ "tag" ], "type": "tag" } ], - "measurement": "disk_inodes_free", + "measurement": "net_bytes_recv", "policy": "default", - "query": "SELECT mean(inodes_used) as \"used\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/ AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT non_negative_derivative(mean(err_in), 1s) as \"in\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", "rawQuery": true, - "refId": "B", + "refId": "A", "resultFormat": "time_series", "select": [ [ @@ -10460,9 +4113,13 @@ "tags": [] }, { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $tag_interface: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", - "function": "mean", + "function": "derivative", "groupBy": [ { "interval": "auto", @@ -10479,18 +4136,18 @@ "type": "tag" }, { - "key": "path", + "key": "interface", "params": [ "tag" ], "type": "tag" } ], - "measurement": "disk_inodes_free", + "measurement": "net_bytes_recv", "policy": "default", - "query": "SELECT mean(inodes_free) + mean(inodes_used) as \"total\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/ AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT non_negative_derivative(mean(err_out), 1s) as \"out\" FROM \"net\" WHERE host =~ /$server/ AND interface =~ /$netif/ AND $timeFilter GROUP BY time($interval), host,interface fill(none)", "rawQuery": true, - "refId": "A", + "refId": "B", "resultFormat": "time_series", "select": [ [ @@ -10510,10 +4167,8 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Disk 
inodes for $mountpoint", + "title": "Network errors", "tooltip": { "msResolution": false, "shared": true, @@ -10522,68 +4177,62 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { + "$$hashKey": "object:2178", "format": "short", + "label": "Errors per second", "logBase": 1, - "max": null, "min": 0, "show": true }, { + "$$hashKey": "object:2179", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } } ], - "repeatIteration": 1613083475434, - "repeatPanelId": 62056, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/home", - "value": "/home" - } - }, - "title": "Disk space usage for $mountpoint", + "repeat": "netif", + "title": "Network interface stats for $netif", "type": "row" }, { "collapsed": true, - "datasource": null, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 }, - "id": 62115, + "id": 62054, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -10595,10 +4244,10 @@ "h": 7, "w": 12, "x": 0, - "y": 55 + "y": 51 }, "hiddenSeries": false, - "id": 62116, + "id": 26024, "interval": "$inter", "legend": { "alignAsTable": true, @@ -10615,42 +4264,31 @@ "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 4, "nullPointMode": "connected", "options": { "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.1", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 52240, - "repeatedByRow": true, - "scopedVars": { - "mountpoint": { - "selected": false, - 
"text": "/mnt/localdisk", - "value": "/mnt/localdisk" - } - }, "seriesOverrides": [ { - "alias": "/total/", - "color": "#BF1B00", - "fill": 0, - "linewidth": 2, - "zindex": 3 + "alias": "/in/", + "transform": "negative-Y" } ], "spaceLength": 10, "stack": false, - "steppedLine": true, + "steppedLine": false, "targets": [ { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", "function": "mean", "groupBy": [ @@ -10667,18 +4305,155 @@ "tag" ], "type": "tag" + } + ], + "measurement": "swap_in", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(\"in\")) as \"in\", non_negative_derivative(mean(\"out\")) as \"out\" FROM \"swap\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Swap I/O bytes", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2454", + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "$$hashKey": "object:2455", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 51 + }, + "hiddenSeries": false, + "id": 61850, + 
"interval": "$inter", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.5.21", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/total/", + "color": "#890F02", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "alias": "$tag_host: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" }, { - "key": "path", + "key": "host", "params": [ "tag" ], "type": "tag" } ], - "measurement": "disk_total", + "measurement": "swap_in", "policy": "default", - "query": "SELECT mean(total) AS \"total\", mean(used) as \"used\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT mean(used) as \"used\", mean(total) as \"total\" FROM \"swap\" WHERE host =~ /$server$/ AND $timeFilter GROUP BY time($interval), host ORDER BY asc", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -10700,10 +4475,8 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Disk usage for $mountpoint", + "title": "Swap usage (bytes)", "tooltip": { "msResolution": false, "shared": true, @@ -10712,44 +4485,60 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { + "$$hashKey": "object:2530", "format": "bytes", "logBase": 1, - "max": null, "min": 0, "show": true }, { + "$$hashKey": "object:2531", "format": 
"short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } - }, + } + ], + "title": "Swap", + "type": "row" + }, + { + "collapsed": true, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 62055, + "panels": [ { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -10759,70 +4548,56 @@ "grid": {}, "gridPos": { "h": 7, - "w": 12, - "x": 12, - "y": 55 + "w": 8, + "x": 0, + "y": 59 }, "hiddenSeries": false, - "id": 62117, + "id": 13782, "interval": "$inter", "legend": { "alignAsTable": true, "avg": true, "current": true, + "hideEmpty": true, "max": true, "min": false, + "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 4, + "maxPerRow": 6, "nullPointMode": "connected", "options": { "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.1", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 33458, - "repeatedByRow": true, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/mnt/localdisk", - "value": "/mnt/localdisk" - } - }, "seriesOverrides": [ { - "alias": "/used/", - "color": "#447EBC", - "zindex": 3 - }, - { - "alias": "/total/", - "bars": false, - "color": "#BF1B00", - "fill": 0, - "lines": true, - "linewidth": 1 + "alias": "/.*write$/", + "transform": "negative-Y" } ], "spaceLength": 10, "stack": false, - "steppedLine": true, + "steppedLine": false, "targets": [ { - "alias": "$tag_host: 
mountpoint $tag_path - $col", + "alias": "$tag_host: $tag_name: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", "function": "mean", "groupBy": [ @@ -10848,9 +4623,10 @@ "type": "tag" } ], - "measurement": "disk_inodes_free", + "measurement": "io_reads", + "orderByTime": "ASC", "policy": "default", - "query": "SELECT mean(inodes_used) as \"used\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/ AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT non_negative_derivative(mean(reads),1s) as \"read\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -10871,7 +4647,11 @@ "tags": [] }, { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $tag_name: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", "function": "mean", "groupBy": [ @@ -10897,11 +4677,12 @@ "type": "tag" } ], - "measurement": "disk_inodes_free", + "measurement": "io_reads", + "orderByTime": "ASC", "policy": "default", - "query": "SELECT mean(inodes_free) + mean(inodes_used) as \"total\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/ AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT non_negative_derivative(mean(writes),1s) as \"write\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", "rawQuery": true, - "refId": "A", + "refId": "C", "resultFormat": "time_series", "select": [ [ @@ -10921,10 +4702,8 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Disk inodes for $mountpoint", + "title": "Disk I/O requests for /dev/$disk", "tooltip": { "msResolution": false, "shared": true, @@ -10933,68 +4712,41 @@ }, "type": "graph", "xaxis": { - 
"buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "short", + "$$hashKey": "object:2618", + "format": "iops", "logBase": 1, - "max": null, - "min": 0, "show": true }, { + "$$hashKey": "object:2619", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } - } - ], - "repeatIteration": 1613083475434, - "repeatPanelId": 62056, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/mnt/localdisk", - "value": "/mnt/localdisk" - } - }, - "title": "Disk space usage for $mountpoint", - "type": "row" - }, - { - "collapsed": true, - "datasource": null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 44 - }, - "id": 62118, - "panels": [ + }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -11004,19 +4756,21 @@ "grid": {}, "gridPos": { "h": 7, - "w": 12, - "x": 0, - "y": 55 + "w": 8, + "x": 8, + "y": 59 }, "hiddenSeries": false, - "id": 62119, + "id": 60200, "interval": "$inter", "legend": { "alignAsTable": true, "avg": true, "current": true, + "hideEmpty": true, "max": true, "min": false, + "rightSide": false, "show": true, "sort": "current", "sortDesc": true, @@ -11026,42 +4780,32 @@ "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 4, + "maxPerRow": 6, "nullPointMode": "connected", "options": { "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.1", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 52240, - "repeatedByRow": true, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/nfs/cluster", - "value": "/nfs/cluster" - } - }, 
"seriesOverrides": [ { - "alias": "/total/", - "color": "#BF1B00", - "fill": 0, - "linewidth": 2, - "zindex": 3 + "alias": "/.*write$/", + "transform": "negative-Y" } ], "spaceLength": 10, "stack": false, - "steppedLine": true, + "steppedLine": false, "targets": [ { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $tag_name: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", "function": "mean", "groupBy": [ @@ -11087,9 +4831,9 @@ "type": "tag" } ], - "measurement": "disk_total", + "measurement": "io_reads", "policy": "default", - "query": "SELECT mean(total) AS \"total\", mean(used) as \"used\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT non_negative_derivative(mean(read_bytes),1s) as \"read\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -11108,13 +4852,64 @@ ] ], "tags": [] + }, + { + "alias": "$tag_host: $tag_name: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "dsType": "influxdb", + "function": "mean", + "groupBy": [ + { + "interval": "auto", + "params": [ + "auto" + ], + "type": "time" + }, + { + "key": "host", + "params": [ + "tag" + ], + "type": "tag" + }, + { + "key": "path", + "params": [ + "tag" + ], + "type": "tag" + } + ], + "measurement": "io_reads", + "policy": "default", + "query": "SELECT non_negative_derivative(mean(write_bytes),1s) as \"write\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] } ], "thresholds": [], - "timeFrom": 
null, "timeRegions": [], - "timeShift": null, - "title": "Disk usage for $mountpoint", + "title": "Disk I/O bytes for /dev/$disk", "tooltip": { "msResolution": false, "shared": true, @@ -11123,44 +4918,41 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { + "$$hashKey": "object:2694", "format": "bytes", "logBase": 1, - "max": null, - "min": 0, "show": true }, { + "$$hashKey": "object:2695", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -11170,70 +4962,56 @@ "grid": {}, "gridPos": { "h": 7, - "w": 12, - "x": 12, - "y": 55 + "w": 8, + "x": 16, + "y": 59 }, "hiddenSeries": false, - "id": 62120, + "id": 56720, "interval": "$inter", "legend": { "alignAsTable": true, "avg": true, "current": true, + "hideEmpty": true, "max": true, "min": false, + "rightSide": false, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, - "lines": false, + "lines": true, "linewidth": 1, "links": [], - "maxPerRow": 4, + "maxPerRow": 6, "nullPointMode": "connected", "options": { "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.1", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 33458, - "repeatedByRow": true, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/nfs/cluster", - "value": "/nfs/cluster" - } - }, "seriesOverrides": [ { - "alias": "/used/", - "color": "#447EBC", - "zindex": 3 - }, - { - "alias": "/total/", - "bars": false, - 
"color": "#BF1B00", - "fill": 0, - "lines": true, - "linewidth": 1 + "alias": "/.*write$/", + "transform": "negative-Y" } ], "spaceLength": 10, "stack": false, - "steppedLine": true, + "steppedLine": false, "targets": [ { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $tag_name: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", "function": "mean", "groupBy": [ @@ -11259,9 +5037,9 @@ "type": "tag" } ], - "measurement": "disk_inodes_free", + "measurement": "io_reads", "policy": "default", - "query": "SELECT mean(inodes_used) as \"used\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/ AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT non_negative_derivative(mean(read_time),1s) as \"read\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -11282,7 +5060,11 @@ "tags": [] }, { - "alias": "$tag_host: mountpoint $tag_path - $col", + "alias": "$tag_host: $tag_name: $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", "function": "mean", "groupBy": [ @@ -11308,9 +5090,9 @@ "type": "tag" } ], - "measurement": "disk_inodes_free", + "measurement": "io_reads", "policy": "default", - "query": "SELECT mean(inodes_free) + mean(inodes_used) as \"total\" FROM \"disk\" WHERE \"host\" =~ /$server$/ AND \"path\" =~ /^$mountpoint$/ AND $timeFilter GROUP BY time($interval), \"host\", \"path\"", + "query": "SELECT non_negative_derivative(mean(write_time),1s) as \"write\" FROM \"diskio\" WHERE \"host\" =~ /$server$/ AND \"name\" =~ /$disk$/ AND $timeFilter GROUP BY time($interval), *", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -11332,10 +5114,8 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Disk inodes for 
$mountpoint", + "title": "Disk I/O time for /dev/$disk", "tooltip": { "msResolution": false, "shared": true, @@ -11344,68 +5124,60 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { - "format": "short", + "$$hashKey": "object:2770", + "format": "ms", "logBase": 1, - "max": null, - "min": 0, "show": true }, { + "$$hashKey": "object:2771", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } } ], - "repeatIteration": 1613083475434, - "repeatPanelId": 62056, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/nfs/cluster", - "value": "/nfs/cluster" - } - }, - "title": "Disk space usage for $mountpoint", + "repeat": "disk", + "title": "Disk IOPS for /dev/$disk", "type": "row" }, { "collapsed": true, - "datasource": null, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 45 + "y": 49 }, - "id": 62121, + "id": 62056, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -11417,10 +5189,10 @@ "h": 7, "w": 12, "x": 0, - "y": 55 + "y": 85 }, "hiddenSeries": false, - "id": 62122, + "id": 52240, "interval": "$inter", "legend": { "alignAsTable": true, @@ -11443,21 +5215,10 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.1", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 52240, - "repeatedByRow": true, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/nfs/scratch", - "value": "/nfs/scratch" - } - }, "seriesOverrides": [ { "alias": "/total/", 
@@ -11473,6 +5234,10 @@ "targets": [ { "alias": "$tag_host: mountpoint $tag_path - $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", "function": "mean", "groupBy": [ @@ -11522,9 +5287,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Disk usage for $mountpoint", "tooltip": { "msResolution": false, @@ -11534,31 +5297,27 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { + "$$hashKey": "object:2952", "format": "bytes", "logBase": 1, - "max": null, "min": 0, "show": true }, { + "$$hashKey": "object:2953", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -11566,12 +5325,14 @@ "bars": true, "dashLength": 10, "dashes": false, - "datasource": "InfluxDB", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "editable": true, "error": false, "fieldConfig": { "defaults": { - "custom": {}, "links": [] }, "overrides": [] @@ -11583,10 +5344,10 @@ "h": 7, "w": 12, "x": 12, - "y": 55 + "y": 85 }, "hiddenSeries": false, - "id": 62123, + "id": 33458, "interval": "$inter", "legend": { "alignAsTable": true, @@ -11609,21 +5370,10 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.4.1", + "pluginVersion": "8.5.21", "pointradius": 5, "points": false, "renderer": "flot", - "repeat": null, - "repeatIteration": 1613083475434, - "repeatPanelId": 33458, - "repeatedByRow": true, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/nfs/scratch", - "value": "/nfs/scratch" - } - }, "seriesOverrides": [ { "alias": "/used/", @@ -11645,6 +5395,10 @@ "targets": [ { "alias": "$tag_host: mountpoint $tag_path - $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", "function": "mean", "groupBy": [ @@ -11694,6 +5448,10 @@ }, { 
"alias": "$tag_host: mountpoint $tag_path - $col", + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, "dsType": "influxdb", "function": "mean", "groupBy": [ @@ -11743,9 +5501,7 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, "title": "Disk inodes for $mountpoint", "tooltip": { "msResolution": false, @@ -11755,49 +5511,37 @@ }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { + "$$hashKey": "object:3025", "format": "short", "logBase": 1, - "max": null, "min": 0, "show": true }, { + "$$hashKey": "object:3026", "format": "short", "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } } ], - "repeatIteration": 1613083475434, - "repeatPanelId": 62056, - "scopedVars": { - "mountpoint": { - "selected": false, - "text": "/nfs/scratch", - "value": "/nfs/scratch" - } - }, + "repeat": "mountpoint", "title": "Disk space usage for $mountpoint", "type": "row" } ], - "refresh": "10s", - "schemaVersion": 27, + "refresh": false, + "schemaVersion": 36, "style": "dark", "tags": [], "templating": { @@ -11810,8 +5554,6 @@ "value": "InfluxDB" }, "datasource": "InfluxDB telegraf", - "description": null, - "error": null, "hide": 0, "includeAll": false, "label": "", @@ -11833,9 +5575,6 @@ "text": "10s", "value": "10s" }, - "datasource": null, - "description": null, - "error": null, "hide": 0, "includeAll": false, "label": "Sampling", @@ -11894,20 +5633,16 @@ "type": "interval" }, { - "allValue": null, "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "influxdb", + "uid": "$datasource" }, - "datasource": "$datasource", "definition": "select n_cpus from system", - "description": null, - "error": null, "hide": 0, "includeAll": true, "label": "Clusters (ncores)", @@ 
-11920,22 +5655,21 @@ "skipUrlSync": false, "sort": 3, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { - "allValue": null, "current": { "selected": false, "text": "All", "value": "$__all" }, - "datasource": "$datasource", + "datasource": { + "type": "influxdb", + "uid": "$datasource" + }, "definition": "select host, n_cpus from system where n_cpus = $ncores group by \"host\"", - "description": null, - "error": null, "hide": 0, "includeAll": true, "label": "Cluster nodes", @@ -11948,22 +5682,21 @@ "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { - "allValue": null, "current": { "selected": false, "text": "All", "value": "$__all" }, - "datasource": "$datasource", + "datasource": { + "type": "influxdb", + "uid": "$datasource" + }, "definition": "", - "description": null, - "error": null, "hide": 0, "includeAll": true, "label": "Mountpoint", @@ -11975,23 +5708,20 @@ "regex": "", "skipUrlSync": false, "sort": 0, - "tagValuesQuery": null, - "tags": [], - "tagsQuery": null, "type": "query", "useTags": false }, { - "allValue": null, "current": { "selected": false, "text": "All", "value": "$__all" }, - "datasource": "$datasource", + "datasource": { + "type": "influxdb", + "uid": "$datasource" + }, "definition": "", - "description": null, - "error": null, "hide": 0, "includeAll": true, "label": "CPU", @@ -12003,23 +5733,20 @@ "regex": "/cpu[0-9]/", "skipUrlSync": false, "sort": 1, - "tagValuesQuery": null, - "tags": [], - "tagsQuery": null, "type": "query", "useTags": false }, { - "allValue": null, "current": { "selected": false, "text": "All", "value": "$__all" }, - "datasource": "$datasource", + "datasource": { + "type": "influxdb", + "uid": "$datasource" + }, "definition": "", - "description": null, - "error": null, "hide": 0, "includeAll": true, "label": "Disk", @@ -12031,23 +5758,20 @@ "regex": "[a-z]d[\\D]$|nvme[\\d]n[\\d]$", "skipUrlSync": false, 
"sort": 0, - "tagValuesQuery": null, - "tags": [], - "tagsQuery": null, "type": "query", "useTags": false }, { - "allValue": null, "current": { "selected": false, "text": "All", "value": "$__all" }, - "datasource": "$datasource", + "datasource": { + "type": "influxdb", + "uid": "$datasource" + }, "definition": "", - "description": null, - "error": null, "hide": 0, "includeAll": true, "label": "Network interface", @@ -12059,11 +5783,32 @@ "regex": "^(?!.*veth|all|tap).*$", "skipUrlSync": false, "sort": 1, - "tagValuesQuery": null, - "tags": [], - "tagsQuery": null, "type": "query", "useTags": false + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "influxdb", + "uid": "${datasource}" + }, + "definition": "SHOW TAG VALUES FROM nvidia_smi with KEY=\"uuid\" where host =~ /$server/", + "hide": 0, + "includeAll": true, + "label": "GPU", + "multi": true, + "name": "gpu", + "options": [], + "query": "SHOW TAG VALUES FROM nvidia_smi with KEY=\"uuid\" where host =~ /$server/", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" } ] }, @@ -12098,6 +5843,7 @@ }, "timezone": "browser", "title": "Cluster Dashboard", - "uid": "000000127", - "version": 3 -} + "uid": "00000012722", + "version": 50, + "weekStart": "" +} \ No newline at end of file diff --git a/playbooks/roles/grafana/tasks/el.yml b/playbooks/roles/grafana/tasks/el.yml index d90ef113..ce937288 100755 --- a/playbooks/roles/grafana/tasks/el.yml +++ b/playbooks/roles/grafana/tasks/el.yml @@ -15,7 +15,7 @@ - name: install grafana vars: package_name: - - https://dl.grafana.com/oss/release/grafana-8.5.21-1.x86_64.rpm + - grafana package_state: present include_role: name: safe_yum diff --git a/playbooks/roles/localdisk/tasks/common.yml b/playbooks/roles/localdisk/tasks/common.yml index 414aae73..558ae7af 100755 --- a/playbooks/roles/localdisk/tasks/common.yml +++ b/playbooks/roles/localdisk/tasks/common.yml @@ -53,42 +53,45 @@ 
- "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list }}" when: not ( one_lv | bool ) -- name: Create volume group - lvg: - vg: "vg_nvmes" - pvs: "{{['/dev/']|product(hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list)|map('join', '') | join(',')}}" - when: one_lv | bool +- name: Check for lvm devices + shell: fdisk -l|grep vg_nvmes|wc -l + register: lv_count -- name: Create Logical volume - lvol: - vg: "vg_nvmes" - lv: "lv_nvmes" - size: 100%FREE - opts: "{% if redundancy | bool %}--type raid10{% else%}-i4{% endif %}" - when: one_lv | bool +- block: + - name: Create volume group + lvg: + vg: "vg_nvmes" + pvs: "{{['/dev/']|product(hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list)|map('join', '') | join(',')}}" -- name: Create file system - filesystem: - fstype: xfs - dev: "/dev/vg_nvmes/lv_nvmes" - when: one_lv | bool + - name: Create Logical volume + lvol: + vg: "vg_nvmes" + lv: "lv_nvmes" + size: 100%FREE + opts: "{% if redundancy | bool %}--type raid10{% else%}{% if ( nvme_count | int ) > 3 %}-i4{% else%}-i2{% endif %}{% endif %}" -- name: Mount local volume - mount: - path: "{{ nvme_path_edited}}" - src: "/dev/vg_nvmes/lv_nvmes" - fstype: xfs - opts: defaults,noatime - state: mounted - when: one_lv | bool + - name: Create file system + filesystem: + fstype: xfs + dev: "/dev/vg_nvmes/lv_nvmes" -- name: "set permissions on {{ nvme_path_edited }}" - become: true - file: - path: "{{ nvme_path_edited}}" - state: directory - owner: "{{ ansible_user }}" - mode: 0775 - group: "{{privilege_group_name}}" - recurse: no - when: one_lv | bool \ No newline at end of file + - name: Mount local volume + mount: + path: "{{ nvme_path_edited}}" + src: "/dev/vg_nvmes/lv_nvmes" + fstype: xfs + opts: defaults,noatime + state: mounted + + - name: "set permissions on {{ nvme_path_edited }}" + become: true + file: + path: "{{ nvme_path_edited}}" + state: directory + owner: 
"{{ ansible_user }}" + mode: 0775 + group: "{{privilege_group_name}}" + recurse: no + when: + - one_lv | bool + - lv_count.stdout == '0' \ No newline at end of file diff --git a/playbooks/roles/mysql/defaults/main.yml b/playbooks/roles/mysql/defaults/main.yml index 7d4e4104..af3189aa 100644 --- a/playbooks/roles/mysql/defaults/main.yml +++ b/playbooks/roles/mysql/defaults/main.yml @@ -9,6 +9,11 @@ mysql_packages: - mysql-community-client - MySQL-python +mysql_packages_ol8: + - mysql-server + - mysql + - mysql-connector-python + deb_mariadb_packages: - mariadb-server - mariadb-common diff --git a/playbooks/roles/mysql/tasks/el.yml b/playbooks/roles/mysql/tasks/el.yml index d893abb8..e76fbd6e 100644 --- a/playbooks/roles/mysql/tasks/el.yml +++ b/playbooks/roles/mysql/tasks/el.yml @@ -22,20 +22,40 @@ mysql_root_pwd: "{{ lookup('password', '/etc/opt/oci-hpc/passwords/mysql/root.txt chars=ascii_letters,digits') }}" +- block: + - name: Install MySQL packages ol7 + vars: + package_name: + - mysql-release-el7 + include_role: + name: safe_yum + + - name: Install MySQL packages + vars: + package_name: '{{ mysql_packages }}' + package_repo: ol7_MySQL80 + include_role: + name: safe_yum + when: ansible_distribution_major_version == '7' -- name: Install MySQL packages - vars: - package_name: - - mysql-release-el7 - include_role: - name: safe_yum - -- name: Install MySQL packages - vars: - package_name: '{{ mysql_packages }}' - package_repo: ol7_MySQL80 - include_role: - name: safe_yum +- block: + - name: Select mysql package ol8 + set_fact: + package_name: mysql-release-el8 + + - name: Install MySQL packages ol8 + vars: + package_name: "{{ mysql_packages_ol8 }}" + package_repo: ol8_MySQL80 + include_role: + name: safe_yum + + - name: Make sure pymysql is present + become: true + pip: + name: pymysql + state: present + when: ansible_distribution_major_version == '8' - name: Update SELinux context for {{ mysql_db_path }} become: true @@ -85,7 +105,6 @@ enabled: yes - block: - - 
name: Start MySQL service become: true service: @@ -97,33 +116,55 @@ - name: get root password shell: "grep 'A temporary password is generated for root@localhost' /var/log/mysqld.log | awk -F ' ' '{print $(NF)}'" register: m_root_password - -# - name: Set MySQL root pasword -# become: true -# mysql_user: -# name: root -# password: '{{ mysql_root_pwd }}' -# host_all: yes -# check_implicit_admin: yes -# state: present -# + - name: check if DB exists shell: mysql --user root -p'{{ mysql_root_pwd }}' -e 'SHOW DATABASES;' register: dbstatus ignore_errors: true no_log: yes - - - name: fix user password - command: mysql --user root -p'{{ m_root_password.stdout }}' --connect-expired-password --execute="ALTER USER 'root'@'localhost' IDENTIFIED BY '{{ m_root_password.stdout }}';" - when: dbstatus.rc != 0 - - - name: fix password policy - command: mysql --user root --password={{ m_root_password.stdout }} --connect-expired-password --execute="SET GLOBAL validate_password.policy=LOW;" - when: dbstatus.rc != 0 - - - name: update expired root user password - command: mysql --user root --password={{ m_root_password.stdout }} --connect-expired-password --execute="ALTER USER 'root'@'localhost' IDENTIFIED BY '{{ mysql_root_pwd }}';" - when: dbstatus.rc != 0 + + - block: + - name: fix user password + command: mysql --user root -p'{{ m_root_password.stdout }}' --connect-expired-password --execute="ALTER USER 'root'@'localhost' IDENTIFIED BY '{{ m_root_password.stdout }}';" + when: dbstatus.rc != 0 + + - name: fix password policy + command: mysql --user root --password={{ m_root_password.stdout }} --connect-expired-password --execute="SET GLOBAL validate_password.policy=LOW;" + when: dbstatus.rc != 0 + + - name: update expired root user password + command: mysql --user root --password={{ m_root_password.stdout }} --connect-expired-password --execute="ALTER USER 'root'@'localhost' IDENTIFIED BY '{{ mysql_root_pwd }}';" + when: dbstatus.rc != 0 + when: ansible_distribution_major_version 
== '7' + + - block: + - name: Set MySQL root pasword ol8 + become: true + mysql_user: + name: root + password: '{{ mysql_root_pwd }}' + host_all: yes + check_implicit_admin: yes + state: present + + - name: check if DB exists + shell: mysql --user root -p'{{ mysql_root_pwd }}' -e 'SHOW DATABASES;' + register: dbstatus + ignore_errors: true + no_log: yes + + - name: fix user password + command: mysql --user root -p'{{ mysql_root_pwd }}' --connect-expired-password --execute="ALTER USER 'root'@'localhost' IDENTIFIED BY '{{ mysql_root_pwd }}';" + when: dbstatus.rc != 0 + + - name: fix password policy + command: mysql --user root --password={{ mysql_root_pwd }} --connect-expired-password --execute="SET GLOBAL validate_password.policy=LOW;" + when: dbstatus.rc != 0 + + - name: update expired root user password + command: mysql --user root --password={{ mysql_root_pwd }} --connect-expired-password --execute="ALTER USER 'root'@'localhost' IDENTIFIED BY '{{ mysql_root_pwd }}';" + when: dbstatus.rc != 0 + when: ansible_distribution_major_version == '8' - name: Render /root/.my.cnf become: true @@ -141,5 +182,4 @@ name: test state: absent login_password: "{{ mysql_root_pwd }}" - login_user: root - + login_user: root \ No newline at end of file diff --git a/playbooks/roles/nvidia-enroot/tasks/main.yml b/playbooks/roles/nvidia-enroot/tasks/main.yml index d5bb692e..7243e27b 100644 --- a/playbooks/roles/nvidia-enroot/tasks/main.yml +++ b/playbooks/roles/nvidia-enroot/tasks/main.yml @@ -1,11 +1,10 @@ --- -- include: oraclelinux-7.yml - when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' and ansible_distribution_major_version == '7' +- include: oraclelinux.yml + when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' #- include: centos-7.yml # when: ansible_os_family == 'RedHat' and ansible_distribution == 'CentOS' and ansible_distribution_major_version == '7' - include: ubuntu.yml - when: ansible_os_family == 'Debian' and 
ansible_distribution == 'Ubuntu' - + when: ansible_os_family == 'Debian' and ansible_distribution == 'Ubuntu' \ No newline at end of file diff --git a/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml b/playbooks/roles/nvidia-enroot/tasks/oraclelinux.yml similarity index 79% rename from playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml rename to playbooks/roles/nvidia-enroot/tasks/oraclelinux.yml index 6d9ec324..3b2c1832 100644 --- a/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml +++ b/playbooks/roles/nvidia-enroot/tasks/oraclelinux.yml @@ -1,7 +1,5 @@ --- - block: - - - name: install required packages vars: package_name: @@ -11,6 +9,20 @@ package_repo: "epel,ol7_developer_EPEL" include_role: name: safe_yum + when: ansible_distribution_major_version == '7' + + - name: install required packages + vars: + package_name: + - zstd + - "https://github.com/NVIDIA/enroot/releases/download/v3.4.1/enroot-3.4.1-1.el8.{{ ansible_architecture }}.rpm" + - "https://github.com/NVIDIA/enroot/releases/download/v3.4.1/enroot+caps-3.4.1-1.el8.{{ ansible_architecture }}.rpm" + package_repo: "ol8_developer_EPEL" + disable_gpg_check_var: True + include_role: + name: safe_yum + when: ansible_distribution_major_version == '8' + # needs reboot - name: update namespace.unpriv_enable=1 kernel parameters command: bash -c "sudo grubby --args="namespace.unpriv_enable=1" --update-kernel=`sudo grubby --default-kernel`" @@ -32,17 +44,25 @@ mode: '0755' owner: opc group: opc + when: ansible_distribution_major_version == '7' + + - name: Download enroot-check + get_url: + url: "https://github.com/NVIDIA/enroot/releases/download/v3.4.1/enroot-check_3.4.1_{{ ansible_architecture }}.run" + dest: "/tmp" + mode: '0755' + owner: opc + group: opc + when: ansible_distribution_major_version == '8' - name: execute enroot-check_*.run command: bash -c "/tmp/enroot-check_*.run --verify" - - name: - set_fact: + - set_fact: enroot_top_path_checked: "/etc/enroot/" when: " not 'nvme0n1' in 
hostvars[inventory_hostname].ansible_devices" - - name: - set_fact: + - set_fact: enroot_top_path_checked: "{{enroot_top_path}}" when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" @@ -94,7 +114,6 @@ line: 'ENROOT_ROOTFS_WRITABLE yes' backup: yes - - name: set permissions on {{enroot_top_path_checked}} become: true file: @@ -105,7 +124,6 @@ group: "{{privilege_group_name}}" recurse: no - - name: Make sure all {{enroot_top_path_checked}} directories exist file: path: "{{enroot_top_path_checked}}/{{item}}" diff --git a/playbooks/roles/openldap/files/el_memberof.ldif b/playbooks/roles/openldap/files/el_memberof.ldif index b4e70ac7..55d72569 100644 --- a/playbooks/roles/openldap/files/el_memberof.ldif +++ b/playbooks/roles/openldap/files/el_memberof.ldif @@ -28,5 +28,4 @@ objectClass: olcOverlayConfig objectClass: olcRefintConfig objectClass: top olcOverlay: {1}refint -olcRefintAttribute: memberof member - +olcRefintAttribute: memberof member \ No newline at end of file diff --git a/playbooks/roles/openldap/files/el_memberof_ol8.ldif b/playbooks/roles/openldap/files/el_memberof_ol8.ldif new file mode 100644 index 00000000..34f32fca --- /dev/null +++ b/playbooks/roles/openldap/files/el_memberof_ol8.ldif @@ -0,0 +1,32 @@ +dn: cn=module{0},cn=config +changetype: modify +add: olcModuleLoad +olcModuleLoad: memberof + +dn: olcOverlay={0}memberof,olcDatabase={2}mdb,cn=config +changetype: add +objectClass: olcConfig +objectClass: olcMemberOf +objectClass: olcOverlayConfig +objectClass: top +olcOverlay: {0}memberof +olcMemberOfDangling: ignore +olcMemberOfRefInt: TRUE +olcMemberOfGroupOC: groupOfMembers +olcMemberOfMemberAD: member +olcMemberOfMemberOfAD: memberOf + +dn: cn=module{0},cn=config +changetype: modify +add: olcModuleLoad +olcModuleLoad: refint + +dn: olcOverlay={1}refint,olcDatabase={2}mdb,cn=config +changetype: add +objectClass: olcConfig +objectClass: olcOverlayConfig +objectClass: olcRefintConfig +objectClass: top +olcOverlay: {1}refint 
+olcRefintAttribute: memberof member + diff --git a/playbooks/roles/openldap/files/el_ppolicy.ldif b/playbooks/roles/openldap/files/el_ppolicy.ldif index a1ea033f..21505aca 100644 --- a/playbooks/roles/openldap/files/el_ppolicy.ldif +++ b/playbooks/roles/openldap/files/el_ppolicy.ldif @@ -14,5 +14,4 @@ olcOverlay: ppolicy olcPPolicyDefault: cn=pwdDefaultPolicy,ou=Policies,dc=local olcPPolicyHashCleartext: FALSE olcPPolicyUseLockout: FALSE -olcPPolicyForwardUpdates: FALSE - +olcPPolicyForwardUpdates: FALSE \ No newline at end of file diff --git a/playbooks/roles/openldap/files/el_ppolicy_ol8.ldif b/playbooks/roles/openldap/files/el_ppolicy_ol8.ldif new file mode 100644 index 00000000..bfb085c8 --- /dev/null +++ b/playbooks/roles/openldap/files/el_ppolicy_ol8.ldif @@ -0,0 +1,17 @@ +dn: cn=module{0},cn=config +changetype: add +objectClass: olcModuleList +cn: module{0} +olcModuleLoad: ppolicy + +dn: olcOverlay={0}ppolicy,olcDatabase={2}mdb,cn=config +changetype: add +objectClass: olcPPolicyConfig +objectClass: olcOverlayConfig +objectClass: top +olcOverlay: ppolicy +olcPPolicyDefault: cn=pwdDefaultPolicy,ou=Policies,dc=local +olcPPolicyHashCleartext: FALSE +olcPPolicyUseLockout: FALSE +olcPPolicyForwardUpdates: FALSE + diff --git a/playbooks/roles/openldap/tasks/el-7.yml b/playbooks/roles/openldap/tasks/el.yml similarity index 51% rename from playbooks/roles/openldap/tasks/el-7.yml rename to playbooks/roles/openldap/tasks/el.yml index 3f55faac..94493978 100644 --- a/playbooks/roles/openldap/tasks/el-7.yml +++ b/playbooks/roles/openldap/tasks/el.yml @@ -1,6 +1,5 @@ --- # tasks file for openldap - - name: Create /etc/opt/oci-hpc/passwords/openldap become: true file: @@ -11,13 +10,30 @@ group: opc recurse: yes -- name: Install dependencies +- name: Install dependencies ol7 vars: package_name: - policycoreutils-python package_state: present include_role: name: safe_yum + when: ansible_distribution_major_version == '7' + +- block: + - name: Install dependencies ol8 + vars: 
+ package_name: + - policycoreutils-python-utils + package_state: present + include_role: + name: safe_yum + + - name: Install openldap-servers + ansible.builtin.yum: + name: https://vault.centos.org/centos/8/PowerTools/x86_64/os/Packages/openldap-servers-2.4.46-18.el8.x86_64.rpm + state: present + disable_gpg_check: true + when: ansible_distribution_major_version == '8' - name: Generate openldap root password set_fact: @@ -31,18 +47,32 @@ '/etc/opt/oci-hpc//passwords/openldap/root.txt chars=ascii_letters,digits,hexdigits') }}" -- name: Install OpenLDAP packages +- name: Install OpenLDAP packages ol7 vars: package_name: "{{openldap_packages}}" include_role: name: safe_yum + when: ansible_distribution_major_version == '7' - block: - - name: Selinux fcontext on files - sefcontext: - target: "{{ openldap_server_conf_path }}(/.*)?" - setype: slapd_db_t - when: ansible_selinux.status == "enabled" + - name: Install OpenLDAP packages ol8 + vars: + package_name: "{{openldap_packages_ol8}}" + include_role: + name: safe_yum + + - name: Create sysconfig-slapd + ansible.builtin.copy: + content: "" + dest: /etc/sysconfig/slapd + mode: 0644 + when: ansible_distribution_major_version == '8' + +- name: Selinux fcontext on files + sefcontext: + target: "{{ openldap_server_conf_path }}(/.*)?" 
+ setype: slapd_db_t + when: ansible_selinux.status == "enabled" - name: Hash OpenLDAP root password command: slappasswd -h {SSHA} -s {{ openldap_root_pwd }} @@ -81,7 +111,24 @@ groups: '{{ ssl_cert_group }}' - block: + - name: Install and upgrade pip + pip: + name: pip + extra_args: --upgrade + executable: pip3 + + - name: Install ladap3 and click + pip: + name: ['ldap3', 'click'] + executable: pip3 + - name: update /etc/openldap/ldap.conf + lineinfile: + dest: /etc/openldap/ldap.conf + line: TLS_REQSAN allow + when: ansible_distribution_major_version == '8' + +- block: - name: Ensure OpenLDAP data directory exists file: path: '{{ openldap_server_dir_path }}' @@ -115,29 +162,57 @@ creates: '/etc/openldap/slapd.d/cn=config/cn=schema/cn={?}{{ item }}.ldif' with_items: '{{ openldap_schemas }}' - - name: Render OpenLDAP configuration - config - template: - src: '{{ item }}.j2' - dest: '/tmp/{{ item }}' - with_items: - - config.ldif - changed_when: false + - block: + - name: Render OpenLDAP configuration - config ol7 + template: + src: '{{ item }}.j2' + dest: '/tmp/{{ item }}' + with_items: + - config.ldif + changed_when: false - - name: Render OpenLDAP configuration - local - template: - src: 'el_local.ldif.j2' - dest: '/tmp/local.ldif' - changed_when: false + - name: Render OpenLDAP configuration - local + template: + src: 'el_local.ldif.j2' + dest: '/tmp/local.ldif' + changed_when: false - - name: Copy OpenLDAP overlay configurations - ppolicy - copy: - src: 'el_ppolicy.ldif' - dest: '/tmp/ppolicy.ldif' - - - name: Copy OpenLDAP overlay configurations - memberof - copy: - src: 'el_memberof.ldif' - dest: '/tmp/memberof.ldif' + - name: Copy OpenLDAP overlay configurations - ppolicy + copy: + src: 'el_ppolicy.ldif' + dest: '/tmp/ppolicy.ldif' + + - name: Copy OpenLDAP overlay configurations - memberof + copy: + src: 'el_memberof.ldif' + dest: '/tmp/memberof.ldif' + when: ansible_distribution_major_version == '7' + + - block: + - name: Render OpenLDAP configuration - 
config ol8 + template: + src: '{{ item }}.j2' + dest: '/tmp/{{ item }}' + with_items: + - config.ldif + changed_when: false + + - name: Render OpenLDAP configuration - local + template: + src: 'el_local_ol8.ldif.j2' + dest: /tmp/local.ldif + changed_when: false + + - name: Copy OpenLDAP overlay configurations - ppolicy + copy: + src: 'el_ppolicy_ol8.ldif' + dest: /tmp/ppolicy.ldif + + - name: Copy OpenLDAP overlay configurations - memberof + copy: + src: 'el_memberof_ol8.ldif' + dest: /tmp/memberof.ldif + when: ansible_distribution_major_version == '8' - name: Load OpenLDAP global configuration command: ldapmodify -c -Y EXTERNAL -H ldapi:/// -Q -f /tmp/config.ldif @@ -145,7 +220,7 @@ creates: '/etc/openldap/slapd.d/cn=config.ldif' notify: restart openldap - - name: Load OpenLDAP local configuration + - name: Load OpenLDAP local configuration ol7 command: ldapmodify -c -Y EXTERNAL -H ldapi:/// -Q -f /tmp/{{ item }} args: creates: '/etc/openldap/slapd.d/cn=config/olcDatabase={?}hdb/olcOverlay={0}memberof.ldif' @@ -155,6 +230,25 @@ - ppolicy.ldif - memberof.ldif notify: restart openldap + when: ansible_distribution_major_version == '7' + + - name: restart openldap + service: + name: slapd + state: restarted + enabled: yes + when: ansible_distribution_major_version == '8' + + - name: Load OpenLDAP local configuration ol8 + command: ldapmodify -c -Y EXTERNAL -H ldapi:/// -Q -f /tmp/{{ item }} + args: + creates: '/etc/openldap/slapd.d/cn=config/olcDatabase={?}mdb/olcOverlay={0}memberof.ldif' + with_items: + - config.ldif + - local.ldif + - memberof.ldif + notify: restart openldap + when: ansible_distribution_major_version == '8' - name: Check local schemas command: 'ldapsearch -H ldapi:// -Y EXTERNAL -b "dc=local" "cn=pwdDefaultPolicy,ou=Policies,dc=local"' diff --git a/playbooks/roles/openldap/tasks/main.yml b/playbooks/roles/openldap/tasks/main.yml index 52bb5e32..860b1077 100644 --- a/playbooks/roles/openldap/tasks/main.yml +++ 
b/playbooks/roles/openldap/tasks/main.yml @@ -4,8 +4,11 @@ - include_vars: debian_vars.yml when: ansible_distribution == 'Ubuntu' -- include: el-7.yml - when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' +- include: el.yml + when: ansible_os_family == 'RedHat' + +#- include: el-8.yml +# when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '8' - include: debian.yml when: ansible_distribution == 'Ubuntu' \ No newline at end of file diff --git a/playbooks/roles/openldap/templates/el_local.ldif.j2 b/playbooks/roles/openldap/templates/el_local.ldif.j2 index 120363d3..5eb0ffb8 100644 --- a/playbooks/roles/openldap/templates/el_local.ldif.j2 +++ b/playbooks/roles/openldap/templates/el_local.ldif.j2 @@ -70,4 +70,3 @@ olcAccess: {3}to * by dn.base="cn=manager,dc=local" read by * none #olcDbIndex: givenName eq,sub #olcDbIndex: sn eq,sub #olcDbIndex: ou eq - diff --git a/playbooks/roles/openldap/templates/el_local_ol8.ldif.j2 b/playbooks/roles/openldap/templates/el_local_ol8.ldif.j2 new file mode 100644 index 00000000..c6aed4fa --- /dev/null +++ b/playbooks/roles/openldap/templates/el_local_ol8.ldif.j2 @@ -0,0 +1,73 @@ +# local database configuration +dn: cn=module{0},cn=config +changetype: add +objectClass: olcModuleList +cn: module{0} +olcModuleLoad: back_mdb + +dn: olcDatabase={2}mdb,cn=config +changetype: modify +replace: olcSuffix +olcSuffix: dc=local + +dn: olcDatabase={2}mdb,cn=config +changetype: modify +replace: olcRootDN +olcRootDN: cn=Manager,dc=local + +dn: olcDatabase={2}mdb,cn=config +changetype: modify +replace: olcRootPW +olcRootPW: {{ openldap_root_pwd_hash.stdout }} + + +dn: olcDatabase={2}mdb,cn=config +changetype: modify +replace: olcDbIndex +olcDbIndex: default eq,sub +olcDbIndex: cn eq,sub +olcDbIndex: uid eq +olcDbIndex: mail eq,sub +olcDbIndex: givenName eq,sub +olcDbIndex: sn eq,sub +olcDbIndex: ou eq + +dn: olcDatabase={2}mdb,cn=config +changetype: modify +replace: olcAccess +olcAccess: {1}to 
attrs=userPassword,givenName,sn by self write by anonymous auth by dn.base="cn=manager,dc=local" write by * none +olcAccess: {2}to * by self read by dn.base="cn=manager,dc=local" write by * read +olcAccess: {3}to * by dn.base="cn=manager,dc=local" read by * none + +#dn: olcDatabase={2}hdb,cn=config +#changetype: modify +##objectClass: olcDatabaseConfig +#objectClass: olcHdbConfig +#olcDatabase: {2}hdb +#olcSuffix: dc=local +#olcAccess: to attrs=userPassword,givenName,sn +# by self write +# by anonymous auth +# by dn.base="cn=manager,dc=local" write +# by * none +#olcAccess: to * +# by self read +# by dn.base="cn=manager,dc=local" write +# by * read +#olcAccess: to * +# by dn.base="cn=manager,dc=local" read +# by * none +#olcRootDN: cn=Manager,dc=local +#olcRootPW: {{ openldap_root_pwd_hash.stdout }} +#olcMonitoring: TRUE +#olcDbDirectory: {{ openldap_server_dir_path }} +#olcDbCacheSize: 10000 +#olcDbCheckpoint: 128 15 +#olcDbIndex: default eq,sub +#olcDbIndex: cn eq,sub +#olcDbIndex: uid eq +#olcDbIndex: mail eq,sub +#olcDbIndex: givenName eq,sub +#olcDbIndex: sn eq,sub +#olcDbIndex: ou eq + diff --git a/playbooks/roles/openldap/vars/el_vars.yml b/playbooks/roles/openldap/vars/el_vars.yml index 40e1d5c4..5a335226 100644 --- a/playbooks/roles/openldap/vars/el_vars.yml +++ b/playbooks/roles/openldap/vars/el_vars.yml @@ -6,6 +6,10 @@ openldap_packages: - openldap-clients - rsync +openldap_packages_ol8: + - openldap-clients + - rsync + openldap_default_user: ldap openldap_default_group: ldap diff --git a/playbooks/roles/packages/tasks/el-7.yml b/playbooks/roles/packages/tasks/el-7.yml index fb61f6a8..793cd912 100755 --- a/playbooks/roles/packages/tasks/el-7.yml +++ b/playbooks/roles/packages/tasks/el-7.yml @@ -12,3 +12,12 @@ include_role: name: safe_yum ignore_errors: true + +- name: install oci-cli latest version + become: true + pip: + name: [oci-cli] + state: latest + executable: pip3 + ignore_errors: yes + when: ('bastion' in group_names) diff --git 
a/playbooks/roles/packages/tasks/main.yml b/playbooks/roles/packages/tasks/main.yml index 275cffe6..cdc1f36a 100755 --- a/playbooks/roles/packages/tasks/main.yml +++ b/playbooks/roles/packages/tasks/main.yml @@ -11,4 +11,4 @@ when: ansible_distribution == 'Ubuntu' - include: debian.yml - when: ansible_distribution == 'Debian' + when: ansible_distribution == 'Debian' \ No newline at end of file diff --git a/playbooks/roles/packages/tasks/ol-7.yml b/playbooks/roles/packages/tasks/ol-7.yml index f0d58a2a..f3380d49 100644 --- a/playbooks/roles/packages/tasks/ol-7.yml +++ b/playbooks/roles/packages/tasks/ol-7.yml @@ -1,15 +1,26 @@ --- +- name: Enable ol7_developer_EPEL repo + shell: yum-config-manager --enable ol7_developer_EPEL + - name: Make sure python OpenSSL and parallel ssh is installed vars: package_name: - pyOpenSSL - python2-cryptography - - python36-oci-cli + - python3-oci-cli - pssh - pdsh - python3-pip package_state: latest - package_repo: "epel,ol7_developer_EPEL" + package_repo: "ol7_developer_EPEL" include_role: name: safe_yum - ignore_errors: true + +- name: install oci-cli latest version + become: true + pip: + name: [oci-cli] + state: latest + executable: pip3 + ignore_errors: yes + when: ('bastion' in group_names) \ No newline at end of file diff --git a/playbooks/roles/packages/tasks/ol-8.yml b/playbooks/roles/packages/tasks/ol-8.yml index 61607e48..b3733379 100644 --- a/playbooks/roles/packages/tasks/ol-8.yml +++ b/playbooks/roles/packages/tasks/ol-8.yml @@ -1,15 +1,27 @@ --- +- name: Enable ol8_developer_EPEL repo + shell: yum-config-manager --enable ol8_developer_EPEL + - name: Make sure python OpenSSL and parallel ssh is installed vars: package_name: #- pyOpenSSL #- python2-cryptography - - python36-oci-cli + - python3-oci-cli - pssh - pdsh - python3-pip package_state: latest - package_repo: "epel,ol8_developer_EPEL" + package_repo: "ol8_developer_EPEL" include_role: name: safe_yum ignore_errors: true + +- name: install oci-cli latest version + 
become: true + pip: + name: [oci-cli] + state: latest + executable: pip3 + ignore_errors: yes + when: ('bastion' in group_names) \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/bastion.yml b/playbooks/roles/slurm/tasks/bastion.yml new file mode 100755 index 00000000..53febd7c --- /dev/null +++ b/playbooks/roles/slurm/tasks/bastion.yml @@ -0,0 +1,28 @@ +--- +- block: + - name: include common tasks + include_tasks: common.yml + vars: + slurm_repos: "epel,ol7_developer_EPEL" + when: (not destroy|bool) and ((initial|bool) or (not initial|bool and ('compute' in group_names))) + + - name: run server directives ol7 bastion + include_tasks: server.yml + vars: + slurm_repos: "epel,ol7_developer_EPEL" + when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool) + when: ansible_distribution_major_version == '7' + +- block: + - name: include common tasks + include_tasks: common.yml + vars: + slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" + when: (not destroy|bool) and ((initial|bool) or (not initial|bool and ('compute' in group_names))) + + - name: run server directives ol8 bastion + include_tasks: server.yml + vars: + slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" + when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool) + when: ansible_distribution_major_version == '8' \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/common_pyxis.yml b/playbooks/roles/slurm/tasks/common_pyxis.yml index 596b1286..ccd3fe8e 100644 --- a/playbooks/roles/slurm/tasks/common_pyxis.yml +++ b/playbooks/roles/slurm/tasks/common_pyxis.yml @@ -1,12 +1,10 @@ --- -- name: - set_fact: +- set_fact: enroot_top_path_checked: "/etc/enroot/" when: " not 'nvme0n1' in hostvars[inventory_hostname].ansible_devices" -- name: - set_fact: +- set_fact: enroot_top_path_checked: "{{enroot_top_path}}" when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" diff --git 
a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml index ac62ccea..2d43c724 100755 --- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml @@ -57,7 +57,7 @@ - name: set permissions become: true shell: - cmd: chown {{ ansible_user }}:{{ ansible_user }} /tmp/munge.key + cmd: chown {{ bastion_username }}:{{ bastion_username }} /tmp/munge.key warn: false delegate_to: 127.0.0.1 run_once: true diff --git a/playbooks/roles/slurm/tasks/compute.yml b/playbooks/roles/slurm/tasks/compute.yml index a994c174..66ca5ed4 100755 --- a/playbooks/roles/slurm/tasks/compute.yml +++ b/playbooks/roles/slurm/tasks/compute.yml @@ -1,5 +1,4 @@ --- - - name: Run Pam settings include: compute_pam.yml when: pam|bool @@ -61,7 +60,7 @@ - name: set permissions become: true shell: - cmd: chown {{ ansible_user }}:{{ ansible_user }} /tmp/munge.key + cmd: chown {{ bastion_username }}:{{ bastion_username }} /tmp/munge.key warn: false delegate_to: 127.0.0.1 run_once: true @@ -83,7 +82,6 @@ state: restarted enabled: true - - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" diff --git a/playbooks/roles/slurm/tasks/el7.yml b/playbooks/roles/slurm/tasks/el7.yml index b2be275f..d9c15214 100755 --- a/playbooks/roles/slurm/tasks/el7.yml +++ b/playbooks/roles/slurm/tasks/el7.yml @@ -1,24 +1,4 @@ --- -#- name: download slurm Packages -# include_tasks: download.yml -# when: ('bastion' in group_names) and (not destroy|bool) and (initial|bool) and (cluster_nfs|bool) - -#- name: download slurm Packages -# include_tasks: download.yml -# when: (not destroy|bool) and (not cluster_nfs|bool) - -- name: include common tasks - vars: - slurm_repos: "epel,ol7_developer_EPEL" - include_tasks: common.yml - when: (not destroy|bool) and ((initial|bool) or (not initial|bool and ('compute' in group_names))) - -- name: run server directives - vars: - slurm_repos: 
"epel,ol7_developer_EPEL" - include_tasks: server.yml - when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool) - - name: run compute directives vars: slurm_repos: "epel,ol7_developer_EPEL" diff --git a/playbooks/roles/slurm/tasks/el8.yml b/playbooks/roles/slurm/tasks/el8.yml index d4b2cbbb..1f5a2482 100755 --- a/playbooks/roles/slurm/tasks/el8.yml +++ b/playbooks/roles/slurm/tasks/el8.yml @@ -1,24 +1,4 @@ --- -#- name: download slurm Packages -# include_tasks: download.yml -# when: ('bastion' in group_names) and (not destroy|bool) and (initial|bool) and (cluster_nfs|bool) - -#- name: download slurm Packages -# include_tasks: download.yml -# when: (not destroy|bool) and (not cluster_nfs|bool) - -- name: include common tasks - vars: - slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" - include_tasks: common.yml - when: (not destroy|bool) and ((initial|bool) or (not initial|bool and ('compute' in group_names))) - -- name: run server directives - vars: - slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" - include_tasks: server.yml - when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool) - - name: run compute directives vars: slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" diff --git a/playbooks/roles/slurm/tasks/main.yml b/playbooks/roles/slurm/tasks/main.yml index b8f56252..89944752 100755 --- a/playbooks/roles/slurm/tasks/main.yml +++ b/playbooks/roles/slurm/tasks/main.yml @@ -7,6 +7,9 @@ - include_vars: ubuntu_vars.yml when: ansible_distribution == 'Ubuntu' +- include: bastion.yml + when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' + - include: el7.yml when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' and ansible_distribution_major_version == '7' diff --git a/playbooks/roles/slurm/tasks/server.yml b/playbooks/roles/slurm/tasks/server.yml index 33678c6a..433152fe 100755 --- a/playbooks/roles/slurm/tasks/server.yml +++ 
b/playbooks/roles/slurm/tasks/server.yml @@ -163,13 +163,13 @@ - name: add alias for max nodes distributed evenly lineinfile: path: '/home/{{ ansible_user }}/.bashrc' - line: alias max_nodes="python3 /opt/oci-hpc/bin/max_nodes_partition.py" + line: alias max_nodes="python3 /opt/oci-hpc/scripts/max_nodes_partition.py" state: present - name: add alias for validation of number of nodes, pcie, and gpu throttle check lineinfile: path: '/home/{{ ansible_user }}/.bashrc' - line: alias validate="python3 /opt/oci-hpc/bin/validation.py" + line: alias validate="python3 /opt/oci-hpc/scripts/validation.py" state: present - name: Generate gres.conf diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index 10561d69..de6734ab 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -81,10 +81,14 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=26 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:V100:8 {% elif instance.shape == "BM.GPU4.8" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A100:8 -{% elif instance.shape == "BM.GPU.B4.8" %} +{% elif instance.shape == "BM.GPU.B4.8" and threadspercore == 1 %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=8 CoresPerSocket=16 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A100:8 -{% elif instance.shape 
== "BM.GPU.A100-v2.8" %} +{% elif instance.shape == "BM.GPU.B4.8" and threadspercore == 2 %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=255 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A100:8 +{% elif instance.shape == "BM.GPU.A100-v2.8" and threadspercore == 1 %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=8 CoresPerSocket=16 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A100:8 +{% elif instance.shape == "BM.GPU.A100-v2.8" and threadspercore == 2 %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=255 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A100:8 {% elif instance.shape == "BM.GPU.T1.2" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:2 {% elif instance.shape == "BM.GPU.A10.4" %} @@ -105,16 +109,22 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "VM.Standard.A1.Flex" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 
CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.Standard.E3.128" %} +{% elif instance.shape == "BM.Standard.E3.128" and threadspercore == 1%} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.Standard.E4.128" %} +{% elif instance.shape == "BM.Standard.E3.128" and threadspercore == 2%} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=255 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +{% elif instance.shape == "BM.Standard.E4.128" and threadspercore == 1 %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.DenseIO.E4.128" %} +{% elif instance.shape == "BM.Standard.E4.128" and threadspercore == 2 %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=255 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +{% elif instance.shape == "BM.DenseIO.E4.128" and threadspercore == 1 %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ 
instance.name }} +{% elif instance.shape == "BM.DenseIO.E4.128" and threadspercore == 2 %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=255 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.HPC2.36" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} -{% elif instance.shape == "BM.HPC.E5.128" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +{% elif instance.shape == "BM.HPC.E5.144" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=72 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.Optimized3.36" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif "VM.Standard2." 
in instance.shape %} diff --git a/playbooks/roles/ssh/tasks/common.yml b/playbooks/roles/ssh/tasks/common.yml index 51693fe0..496a8dc4 100644 --- a/playbooks/roles/ssh/tasks/common.yml +++ b/playbooks/roles/ssh/tasks/common.yml @@ -10,7 +10,7 @@ - name: Install ssh keys on all nodes copy: dest: "/home/{{ ansible_user }}/.ssh/id_rsa" - src: "/home/{{ ansible_user }}/.ssh/{{ item }}" + src: "/home/{{ bastion_username }}/.ssh/{{ item }}" owner: "{{ ansible_user }}" group: "{{ ansible_user }}" mode: '0600' @@ -18,7 +18,7 @@ - cluster.key - name: Generate an OpenSSL public key in OpenSSH v2 format - openssl_publickey: + community.crypto.openssl_publickey: path: "/home/{{ ansible_user }}/.ssh/id_rsa.pub" privatekey_path: "/home/{{ ansible_user }}/.ssh/id_rsa" format: OpenSSH diff --git a/playbooks/roles/sssd/tasks/debian.yml b/playbooks/roles/sssd/tasks/debian.yml index afac72c7..6e45779a 100644 --- a/playbooks/roles/sssd/tasks/debian.yml +++ b/playbooks/roles/sssd/tasks/debian.yml @@ -16,6 +16,17 @@ group: 'root' mode: '0600' notify: restart sssd + when: not pam | bool + +- name: Add configuration file to /etc/sssd/sssd.conf + template: + src: 'sssd_ubuntu.conf.j2' + dest: '/etc/sssd/sssd.conf' + owner: 'root' + group: 'root' + mode: '0600' + notify: restart sssd + when: pam | bool - name: Copy CA certificate copy: diff --git a/playbooks/roles/sssd/tasks/el-8.yml b/playbooks/roles/sssd/tasks/el-8.yml index ecfc4255..97dd12a8 100644 --- a/playbooks/roles/sssd/tasks/el-8.yml +++ b/playbooks/roles/sssd/tasks/el-8.yml @@ -16,6 +16,13 @@ mode: '0600' notify: restart sssd +- name: Update /etc/sssd/sssd.conf + lineinfile: + path: /etc/sssd/sssd.conf + line: "ldap_tls_reqcert = allow" + state: present + notify: restart sssd + - name: Copy CA certificate copy: src: "{{ ssl_ca_cert }}" diff --git a/playbooks/roles/sssd/templates/sssd.conf.j2 b/playbooks/roles/sssd/templates/sssd.conf.j2 index 6aca43ae..928b9986 100644 --- a/playbooks/roles/sssd/templates/sssd.conf.j2 +++ 
b/playbooks/roles/sssd/templates/sssd.conf.j2 @@ -5,33 +5,25 @@ services = nss, pam domains = cluster [nss] - filter_users = root entry_negative_timeout = 5 [pam] - pam_verbosity = 2 pam_account_expired_message = 'Your account has expired. Please contact a system administrator' [domain/cluster] - ldap_schema = rfc2307bis - id_provider = ldap auth_provider = ldap access_provider = ldap chpass_provider = ldap - cache_credentials = true entry_cache_timeout = 600 - ldap_uri = ldaps://{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }} - ldap_search_base = dc=local ldap_network_timeout = 30 - ldap_access_order = expire ldap_access_filter = (&(objectclass=inetOrgPerson)) ldap_account_expire_policy = shadow -enumerate = true +enumerate = true \ No newline at end of file diff --git a/playbooks/roles/sssd/templates/sssd_ubuntu.conf.j2 b/playbooks/roles/sssd/templates/sssd_ubuntu.conf.j2 new file mode 100644 index 00000000..10a81eb7 --- /dev/null +++ b/playbooks/roles/sssd/templates/sssd_ubuntu.conf.j2 @@ -0,0 +1,19 @@ +[sssd] +config_file_version = 2 +domains = cluster + +[domain/cluster] +ldap_schema = rfc2307bis +id_provider = ldap +auth_provider = ldap +access_provider = ldap +chpass_provider = ldap +cache_credentials = true +entry_cache_timeout = 600 +ldap_uri = ldaps://{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }} +ldap_search_base = dc=local +ldap_network_timeout = 30 +ldap_access_order = expire +ldap_access_filter = (&(objectclass=inetOrgPerson)) +ldap_account_expire_policy = shadow +enumerate = true diff --git a/playbooks/roles/telegraf/files/telegraf.conf b/playbooks/roles/telegraf/files/telegraf.conf index fc5a940b..5e4074b2 100755 --- a/playbooks/roles/telegraf/files/telegraf.conf +++ b/playbooks/roles/telegraf/files/telegraf.conf @@ -2777,7 +2777,7 @@ # # Returns ethtool statistics for given interfaces -# [[inputs.ethtool]] +[[inputs.ethtool]] # ## List of interfaces to pull metrics for # # interface_include = ["eth0"] # @@ -4070,7 +4070,7 @@ # # Read 
metrics about network interface usage -# [[inputs.net]] +[[inputs.net]] # ## By default, telegraf gathers stats from any up interface (excluding loopback) # ## Setting interfaces will tell it to gather these explicit interfaces, # ## regardless of status. @@ -4261,7 +4261,7 @@ # # Pulls statistics from nvidia GPUs attached to the host -# [[inputs.nvidia_smi]] +##[[inputs.nvidia_smi]] # ## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath # # bin_path = "/usr/bin/nvidia-smi" # diff --git a/playbooks/roles/telegraf/files/telegraf_gpu.conf b/playbooks/roles/telegraf/files/telegraf_gpu.conf new file mode 100755 index 00000000..2cbe5c82 --- /dev/null +++ b/playbooks/roles/telegraf/files/telegraf_gpu.conf @@ -0,0 +1,7022 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply surround +# them with ${}. For strings the variable must be within quotes (ie, "${STR_VAR}"), +# for numbers and booleans they should be plain (ie, ${INT_VAR}, ${BOOL_VAR}) + + +# Global tags can be specified here in key="value" format. +[global_tags] + # dc = "us-east-1" # will tag all metrics with dc=us-east-1 + # rack = "1a" + ## Environment variables can be used as tags, and throughout the config file + # user = "$USER" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "10s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. 
+ ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## Maximum number of unwritten metrics per output. Increasing this value + ## allows for longer periods of output downtime without dropping metrics at the + ## cost of higher maximum memory usage. + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. Maximum flush_interval will be + ## flush_interval + flush_jitter + flush_interval = "10s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "" + + ## Log at debug level. + # debug = false + ## Log only error level messages. + # quiet = false + + ## Log target controls the destination for logs and can be one of "file", + ## "stderr" or, on Windows, "eventlog". When set to "file", the output file + ## is determined by the "logfile" setting. + # logtarget = "file" + + ## Name of the file to be logged to when using the "file" logtarget. 
If set to + ## the empty string then logs are written to stderr. + # logfile = "" + + ## The logfile will be rotated after the time interval specified. When set + ## to 0 no time based rotation is performed. Logs are rotated only when + ## written to, if there is no log activity rotation may be delayed. + # logfile_rotation_interval = "0d" + + ## The logfile will be rotated when it becomes larger than the specified + ## size. When set to 0 no size based rotation is performed. + # logfile_rotation_max_size = "0MB" + + ## Maximum number of rotated archives to keep, any older logs are deleted. + ## If set to -1, no archives are removed. + # logfile_rotation_max_archives = 5 + + ## Override default hostname, if empty use os.Hostname() + hostname = "" + ## If set to true, do not set the "host" tag in the telegraf agent. + omit_hostname = false + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + + +# Configuration for sending metrics to InfluxDB +#[[outputs.influxdb]] + ## The full HTTP or UDP URL for your InfluxDB instance. + ## + ## Multiple URLs can be specified for a single cluster, only ONE of the + ## urls will be written to each interval. + # urls = ["unix:///var/run/influxdb.sock"] + # urls = ["udp://127.0.0.1:8089"] + # urls = ["http://127.0.0.1:8086"] + + ## The target database for metrics; will be created as needed. + ## For UDP url endpoint database needs to be configured on server side. + # database = "telegraf" + + ## The value of this tag will be used to determine the database. If this + ## tag is not set the 'database' option is used as the default. + # database_tag = "" + + ## If true, the 'database_tag' will not be included in the written metric. + # exclude_database_tag = false + + ## If true, no CREATE DATABASE queries will be sent. 
Set to true when using + ## Telegraf with a user without permissions to create databases or when the + ## database already exists. + # skip_database_creation = false + + ## Name of existing retention policy to write to. Empty string writes to + ## the default retention policy. Only takes effect when using HTTP. + # retention_policy = "" + + ## The value of this tag will be used to determine the retention policy. If this + ## tag is not set the 'retention_policy' option is used as the default. + # retention_policy_tag = "" + + ## If true, the 'retention_policy_tag' will not be included in the written metric. + # exclude_retention_policy_tag = false + + ## Write consistency (clusters only), can be: "any", "one", "quorum", "all". + ## Only takes effect when using HTTP. + # write_consistency = "any" + + ## Timeout for HTTP messages. + # timeout = "5s" + + ## HTTP Basic Auth + # username = "telegraf" + # password = "metricsmetricsmetricsmetrics" + + ## HTTP User-Agent + # user_agent = "telegraf" + + ## UDP payload size is the maximum packet size to send. + # udp_payload = "512B" + + ## Optional TLS Config for use on HTTP connections. + # tls_ca = "/etc/telegraf/ca.pem" + # tls_cert = "/etc/telegraf/cert.pem" + # tls_key = "/etc/telegraf/key.pem" + ## Use TLS but skip chain & host verification + # insecure_skip_verify = false + + ## HTTP Proxy override; if unset, the standard proxy environment + ## variables are consulted to determine which proxy, if any, should be used. + # http_proxy = "http://corporate.proxy:3128" + + ## Additional HTTP headers + # http_headers = {"X-Special-Header" = "Special-Value"} + + ## HTTP Content-Encoding for write request body, can be set to "gzip" to + ## compress body or "identity" to apply no encoding. + # content_encoding = "identity" + + ## When true, Telegraf will output unsigned integers as unsigned values, + ## i.e.: "42u". You will need a version of InfluxDB supporting unsigned + ## integer values. 
Enabling this option will result in field type errors if + ## existing data has been written. + # influx_uint_support = false + + +# # Configuration for Amon Server to send metrics to. +# [[outputs.amon]] +# ## Amon Server Key +# server_key = "my-server-key" # required. +# +# ## Amon Instance URL +# amon_instance = "https://youramoninstance" # required +# +# ## Connection timeout. +# # timeout = "5s" + + +# # Publishes metrics to an AMQP broker +# [[outputs.amqp]] +# ## Broker to publish to. +# ## deprecated in 1.7; use the brokers option +# # url = "amqp://localhost:5672/influxdb" +# +# ## Brokers to publish to. If multiple brokers are specified a random broker +# ## will be selected anytime a connection is established. This can be +# ## helpful for load balancing when not using a dedicated load balancer. +# brokers = ["amqp://localhost:5672/influxdb"] +# +# ## Maximum messages to send over a connection. Once this is reached, the +# ## connection is closed and a new connection is made. This can be helpful for +# ## load balancing when not using a dedicated load balancer. +# # max_messages = 0 +# +# ## Exchange to declare and publish to. +# exchange = "telegraf" +# +# ## Exchange type; common types are "direct", "fanout", "topic", "header", "x-consistent-hash". +# # exchange_type = "topic" +# +# ## If true, exchange will be passively declared. +# # exchange_passive = false +# +# ## Exchange durability can be either "transient" or "durable". +# # exchange_durability = "durable" +# +# ## Additional exchange arguments. +# # exchange_arguments = { } +# # exchange_arguments = {"hash_property" = "timestamp"} +# +# ## Authentication credentials for the PLAIN auth_method. +# # username = "" +# # password = "" +# +# ## Auth method. PLAIN and EXTERNAL are supported +# ## Using EXTERNAL requires enabling the rabbitmq_auth_mechanism_ssl plugin as +# ## described here: https://www.rabbitmq.com/plugins.html +# # auth_method = "PLAIN" +# +# ## Metric tag to use as a routing key. 
+# ## ie, if this tag exists, its value will be used as the routing key +# # routing_tag = "host" +# +# ## Static routing key. Used when no routing_tag is set or as a fallback +# ## when the tag specified in routing tag is not found. +# # routing_key = "" +# # routing_key = "telegraf" +# +# ## Delivery Mode controls if a published message is persistent. +# ## One of "transient" or "persistent". +# # delivery_mode = "transient" +# +# ## InfluxDB database added as a message header. +# ## deprecated in 1.7; use the headers option +# # database = "telegraf" +# +# ## InfluxDB retention policy added as a message header +# ## deprecated in 1.7; use the headers option +# # retention_policy = "default" +# +# ## Static headers added to each published message. +# # headers = { } +# # headers = {"database" = "telegraf", "retention_policy" = "default"} +# +# ## Connection timeout. If not provided, will default to 5s. 0s means no +# ## timeout (not recommended). +# # timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## If true use batch serialization format instead of line based delimiting. +# ## Only applies to data formats which are not line based such as JSON. +# ## Recommended to set to true. +# # use_batch_format = false +# +# ## Content encoding for message payloads, can be set to "gzip" or +# ## "identity" to apply no encoding. +# ## +# ## Please note that when use_batch_format = false each amqp message contains only +# ## a single metric, it is recommended to use compression with batch format +# ## for best results. +# # content_encoding = "identity" +# +# ## Data format to output. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# # data_format = "influx" + + +# # Send metrics to Azure Application Insights +# [[outputs.application_insights]] +# ## Instrumentation key of the Application Insights resource. +# instrumentation_key = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxx" +# +# ## Regions that require endpoint modification https://docs.microsoft.com/en-us/azure/azure-monitor/app/custom-endpoints +# # endpoint_url = "https://dc.services.visualstudio.com/v2/track" +# +# ## Timeout for closing (default: 5s). +# # timeout = "5s" +# +# ## Enable additional diagnostic logging. +# # enable_diagnostic_logging = false +# +# ## Context Tag Sources add Application Insights context tags to a tag value. +# ## +# ## For list of allowed context tag keys see: +# ## https://github.com/Microsoft/ApplicationInsights-Go/blob/master/appinsights/contracts/contexttagkeys.go +# # [outputs.application_insights.context_tag_sources] +# # "ai.cloud.role" = "kubernetes_container_name" +# # "ai.cloud.roleInstance" = "kubernetes_pod_name" + + +# # Send aggregate metrics to Azure Monitor +# [[outputs.azure_monitor]] +# ## Timeout for HTTP writes. +# # timeout = "20s" +# +# ## Set the namespace prefix, defaults to "Telegraf/". +# # namespace_prefix = "Telegraf/" +# +# ## Azure Monitor doesn't have a string value type, so convert string +# ## fields to dimensions (a.k.a. tags) if enabled. Azure Monitor allows +# ## a maximum of 10 dimensions so Telegraf will only send the first 10 +# ## alphanumeric dimensions. +# # strings_as_dimensions = false +# +# ## Both region and resource_id must be set or be available via the +# ## Instance Metadata service on Azure Virtual Machines. +# # +# ## Azure Region to publish metrics against. 
+# ## ex: region = "southcentralus" +# # region = "" +# # +# ## The Azure Resource ID against which metric will be logged, e.g. +# ## ex: resource_id = "/subscriptions//resourceGroups//providers/Microsoft.Compute/virtualMachines/" +# # resource_id = "" +# +# ## Optionally, if in Azure US Government, China or other sovereign +# ## cloud environment, set appropriate REST endpoint for receiving +# ## metrics. (Note: region may be unused in this context) +# # endpoint_url = "https://monitoring.core.usgovcloudapi.net" + + +# # Publish Telegraf metrics to a Google Cloud PubSub topic +# [[outputs.cloud_pubsub]] +# ## Required. Name of Google Cloud Platform (GCP) Project that owns +# ## the given PubSub topic. +# project = "my-project" +# +# ## Required. Name of PubSub topic to publish metrics to. +# topic = "my-topic" +# +# ## Required. Data format to consume. +# ## Each data format has its own unique set of configuration options. +# ## Read more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" +# +# ## Optional. Filepath for GCP credentials JSON file to authorize calls to +# ## PubSub APIs. If not set explicitly, Telegraf will attempt to use +# ## Application Default Credentials, which is preferred. +# # credentials_file = "path/to/my/creds.json" +# +# ## Optional. If true, will send all metrics per write in one PubSub message. +# # send_batched = true +# +# ## The following publish_* parameters specifically configures batching +# ## requests made to the GCP Cloud PubSub API via the PubSub Golang library. Read +# ## more here: https://godoc.org/cloud.google.com/go/pubsub#PublishSettings +# +# ## Optional. Send a request to PubSub (i.e. actually publish a batch) +# ## when it has this many PubSub messages. If send_batched is true, +# ## this is ignored and treated as if it were 1. +# # publish_count_threshold = 1000 +# +# ## Optional. Send a request to PubSub (i.e. 
actually publish a batch) +# ## when it has this many bytes of PubSub message data. If send_batched is true, +# ## this is ignored and treated as if it were 1 +# # publish_byte_threshold = 1000000 +# +# ## Optional. Specifically configures requests made to the PubSub API. +# # publish_num_go_routines = 2 +# +# ## Optional. Specifies a timeout for requests to the PubSub API. +# # publish_timeout = "30s" +# +# ## Optional. If true, published PubSub message data will be base64-encoded. +# # base64_data = false +# +# ## Optional. PubSub attributes to add to metrics. +# # [outputs.cloud_pubsub.attributes] +# # my_attr = "tag_value" + + +# # Configuration for AWS CloudWatch output. +# [[outputs.cloudwatch]] +# ## Amazon REGION +# region = "us-east-1" +# +# ## Amazon Credentials +# ## Credentials are loaded in the following order +# ## 1) Assumed credentials via STS if role_arn is specified +# ## 2) explicit credentials from 'access_key' and 'secret_key' +# ## 3) shared profile from 'profile' +# ## 4) environment variables +# ## 5) shared credentials file +# ## 6) EC2 Instance Profile +# #access_key = "" +# #secret_key = "" +# #token = "" +# #role_arn = "" +# #profile = "" +# #shared_credential_file = "" +# +# ## Endpoint to make request against, the correct endpoint is automatically +# ## determined and this option should only be set if you wish to override the +# ## default. +# ## ex: endpoint_url = "http://localhost:8000" +# # endpoint_url = "" +# +# ## Namespace for the CloudWatch MetricDatums +# namespace = "InfluxData/Telegraf" +# +# ## If you have a large amount of metrics, you should consider sending statistic +# ## values instead of raw metrics, which could not only improve performance but +# ## also save AWS API cost. If this flag is enabled, this plugin will parse the required +# ## CloudWatch statistic fields (count, min, max, and sum) and send them to CloudWatch. +# ## You could use basicstats aggregator to calculate those fields.
If not all statistic +# ## fields are available, all fields would still be sent as raw metrics. +# # write_statistics = false +# +# ## Enable high resolution metrics of 1 second (if not enabled, standard resolution are of 60 seconds precision) +# # high_resolution_metrics = false + + +# # Configuration for CrateDB to send metrics to. +# [[outputs.cratedb]] +# # A github.com/jackc/pgx connection string. +# # See https://godoc.org/github.com/jackc/pgx#ParseDSN +# url = "postgres://user:password@localhost/schema?sslmode=disable" +# # Timeout for all CrateDB queries. +# timeout = "5s" +# # Name of the table to store metrics in. +# table = "metrics" +# # If true, and the metrics table does not exist, create it automatically. +# table_create = true + + +# # Configuration for DataDog API to send metrics to. +# [[outputs.datadog]] +# ## Datadog API key +# apikey = "my-secret-key" +# +# ## Connection timeout. +# # timeout = "5s" +# +# ## Write URL override; useful for debugging. +# # url = "https://app.datadoghq.com/api/v1/series" + + +# # Send metrics to nowhere at all +# [[outputs.discard]] +# # no configuration + + +# # Configuration for Elasticsearch to send metrics to. +# [[outputs.elasticsearch]] +# ## The full HTTP endpoint URL for your Elasticsearch instance +# ## Multiple urls can be specified as part of the same cluster, +# ## this means that only ONE of the urls will be written to each interval. +# urls = [ "http://node1.es.example.com:9200" ] # required. +# ## Elasticsearch client timeout, defaults to "5s" if not set. +# timeout = "5s" +# ## Set to true to ask Elasticsearch a list of all cluster nodes, +# ## thus it is not necessary to list all nodes in the urls config option. 
+# enable_sniffer = false +# ## Set the interval to check if the Elasticsearch nodes are available +# ## Setting to "0s" will disable the health check (not recommended in production) +# health_check_interval = "10s" +# ## HTTP basic authentication details +# # username = "telegraf" +# # password = "mypassword" +# +# ## Index Config +# ## The target index for metrics (Elasticsearch will create it if it does not exist). +# ## You can use the date specifiers below to create indexes per time frame. +# ## The metric timestamp will be used to decide the destination index name +# # %Y - year (2016) +# # %y - last two digits of year (00..99) +# # %m - month (01..12) +# # %d - day of month (e.g., 01) +# # %H - hour (00..23) +# # %V - week of the year (ISO week) (01..53) +# ## Additionally, you can specify a tag name using the notation {{tag_name}} +# ## which will be used as part of the index name. If the tag does not exist, +# ## the default tag value will be used. +# # index_name = "telegraf-{{host}}-%Y.%m.%d" +# # default_tag_value = "none" +# index_name = "telegraf-%Y.%m.%d" # required. +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Template Config +# ## Set to true if you want telegraf to manage its index template. +# ## If enabled it will create a recommended index template for telegraf indexes +# manage_template = true +# ## The template name used for telegraf indexes +# template_name = "telegraf" +# ## Set to true if you want telegraf to overwrite an existing template +# overwrite_template = false + + +# # Send metrics to command as input over stdin +# [[outputs.exec]] +# ## Command to ingest metrics via stdin. +# command = ["tee", "-a", "/dev/null"] +# +# ## Timeout for command to complete. +# # timeout = "5s" +# +# ## Data format to output.
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# # data_format = "influx" + + +# # Run executable as long-running output plugin +# [[outputs.execd]] +# ## Program to run as daemon +# command = ["my-telegraf-output", "--some-flag", "value"] +# +# ## Delay before the process is restarted after an unexpected termination +# restart_delay = "10s" +# +# ## Data format to export. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# data_format = "influx" + + +# # Send telegraf metrics to file(s) +# [[outputs.file]] +# ## Files to write to, "stdout" is a specially handled file. +# files = ["stdout", "/tmp/metrics.out"] +# +# ## Use batch serialization format instead of line based delimiting. The +# ## batch format allows for the production of non line based output formats and +# ## may more efficiently encode metric groups. +# # use_batch_format = false +# +# ## The file will be rotated after the time interval specified. When set +# ## to 0 no time based rotation is performed. +# # rotation_interval = "0d" +# +# ## The logfile will be rotated when it becomes larger than the specified +# ## size. When set to 0 no size based rotation is performed. +# # rotation_max_size = "0MB" +# +# ## Maximum number of rotated archives to keep, any older logs are deleted. +# ## If set to -1, no archives are removed. +# # rotation_max_archives = 5 +# +# ## Data format to output. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# data_format = "influx" + + +# # Configuration for Graphite server to send metrics to +# [[outputs.graphite]] +# ## TCP endpoint for your graphite instance. +# ## If multiple endpoints are configured, output will be load balanced. +# ## Only one of the endpoints will be written to with each iteration. +# servers = ["localhost:2003"] +# ## Prefix metrics name +# prefix = "" +# ## Graphite output template +# ## see https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# template = "host.tags.measurement.field" +# +# ## Enable Graphite tags support +# # graphite_tag_support = false +# +# ## Character for separating metric name and field for Graphite tags +# # graphite_separator = "." +# +# ## Graphite templates patterns +# ## 1. Template for cpu +# ## 2. Template for disk* +# ## 3. Default template +# # templates = [ +# # "cpu tags.measurement.host.field", +# # "disk* measurement.field", +# # "host.measurement.tags.field" +# #] +# +# ## timeout in seconds for the write connection to graphite +# timeout = 2 +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Send telegraf metrics to graylog +# [[outputs.graylog]] +# ## UDP endpoint for your graylog instance. +# servers = ["127.0.0.1:12201"] +# +# ## The field to use as the GELF short_message, if unset the static string +# ## "telegraf" will be used. +# ## example: short_message_field = "message" +# # short_message_field = "" + + +# # Configurable HTTP health check resource based on metrics +# [[outputs.health]] +# ## Address and port to listen on. 
+# ## ex: service_address = "http://localhost:8080" +# ## service_address = "unix:///var/run/telegraf-health.sock" +# # service_address = "http://:8080" +# +# ## The maximum duration for reading the entire request. +# # read_timeout = "5s" +# ## The maximum duration for writing the entire response. +# # write_timeout = "5s" +# +# ## Username and password to accept for HTTP basic authentication. +# # basic_username = "user1" +# # basic_password = "secret" +# +# ## Allowed CA certificates for client certificates. +# # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## TLS server certificate and private key. +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## One or more check sub-tables should be defined, it is also recommended to +# ## use metric filtering to limit the metrics that flow into this output. +# ## +# ## When using the default buffer sizes, this example will fail when the +# ## metric buffer is half full. +# ## +# ## namepass = ["internal_write"] +# ## tagpass = { output = ["influxdb"] } +# ## +# ## [[outputs.health.compares]] +# ## field = "buffer_size" +# ## lt = 5000.0 +# ## +# ## [[outputs.health.contains]] +# ## field = "buffer_size" + + +# # A plugin that can transmit metrics over HTTP +# [[outputs.http]] +# ## URL is the address to send metrics to +# url = "http://127.0.0.1:8080/telegraf" +# +# ## Timeout for HTTP message +# # timeout = "5s" +# +# ## HTTP method, one of: "POST" or "PUT" +# # method = "POST" +# +# ## HTTP Basic Auth credentials +# # username = "username" +# # password = "pa$$word" +# +# ## OAuth2 Client Credentials Grant +# # client_id = "clientid" +# # client_secret = "secret" +# # token_url = "https://indentityprovider/oauth2/v1/token" +# # scopes = ["urn:opc:idm:__myscopes__"] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # 
insecure_skip_verify = false +# +# ## Data format to output. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# # data_format = "influx" +# +# ## HTTP Content-Encoding for write request body, can be set to "gzip" to +# ## compress body or "identity" to apply no encoding. +# # content_encoding = "identity" +# +# ## Additional HTTP headers +# # [outputs.http.headers] +# # # Should be set manually to "application/json" for json data_format +# # Content-Type = "text/plain; charset=utf-8" + + +# # Configuration for sending metrics to InfluxDB +# [[outputs.influxdb_v2]] +# ## The URLs of the InfluxDB cluster nodes. +# ## +# ## Multiple URLs can be specified for a single cluster, only ONE of the +# ## urls will be written to each interval. +# ## ex: urls = ["https://us-west-2-1.aws.cloud2.influxdata.com"] +# urls = ["http://127.0.0.1:9999"] +# +# ## Token for authentication. +# token = "" +# +# ## Organization is the name of the organization you wish to write to; must exist. +# organization = "" +# +# ## Destination bucket to write into. +# bucket = "" +# +# ## The value of this tag will be used to determine the bucket. If this +# ## tag is not set the 'bucket' option is used as the default. +# # bucket_tag = "" +# +# ## If true, the bucket tag will not be added to the metric. +# # exclude_bucket_tag = false +# +# ## Timeout for HTTP messages. +# # timeout = "5s" +# +# ## Additional HTTP headers +# # http_headers = {"X-Special-Header" = "Special-Value"} +# +# ## HTTP Proxy override. If unset, the standard proxy environment +# ## variables are consulted to determine which proxy, if any, should be used.
+# # http_proxy = "http://corporate.proxy:3128" +# +# ## HTTP User-Agent +# # user_agent = "telegraf" +# +# ## Content-Encoding for write request body, can be set to "gzip" to +# ## compress body or "identity" to apply no encoding. +# # content_encoding = "gzip" +# +# ## Enable or disable uint support for writing uints to InfluxDB 2.0. +# # influx_uint_support = false +# +# ## Optional TLS Config for use on HTTP connections. +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Configuration for sending metrics to an Instrumental project +# [[outputs.instrumental]] +# ## Project API Token (required) +# api_token = "API Token" # required +# ## Prefix the metrics with a given name +# prefix = "" +# ## Stats output template (Graphite formatting) +# ## see https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md#graphite +# template = "host.tags.measurement.field" +# ## Timeout in seconds to connect +# timeout = "2s" +# ## Display Communication to Instrumental +# debug = false + + +# # Configuration for the Kafka server to send metrics to +# [[outputs.kafka]] +# ## URLs of kafka brokers +# brokers = ["localhost:9092"] +# ## Kafka topic for producer messages +# topic = "telegraf" +# +# ## The value of this tag will be used as the topic. If not set the 'topic' +# ## option is used. +# # topic_tag = "" +# +# ## If true, the 'topic_tag' will be removed from the metric. +# # exclude_topic_tag = false +# +# ## Optional Client id +# # client_id = "Telegraf" +# +# ## Set the minimal supported Kafka version. Setting this enables the use of new +# ## Kafka features and APIs. Of particular interest, lz4 compression +# ## requires at least version 0.10.0.0. +# ## ex: version = "1.1.0" +# # version = "" +# +# ## Optional topic suffix configuration. +# ## If the section is omitted, no suffix is used.
+# ## Following topic suffix methods are supported: +# ## measurement - suffix equals to separator + measurement's name +# ## tags - suffix equals to separator + specified tags' values +# ## interleaved with separator +# +# ## Suffix equals to "_" + measurement name +# # [outputs.kafka.topic_suffix] +# # method = "measurement" +# # separator = "_" +# +# ## Suffix equals to "__" + measurement's "foo" tag value. +# ## If there is no such tag, suffix equals to an empty string +# # [outputs.kafka.topic_suffix] +# # method = "tags" +# # keys = ["foo"] +# # separator = "__" +# +# ## Suffix equals to "_" + measurement's "foo" and "bar" +# ## tag values, separated by "_". If there are no such tags, +# ## their values are treated as empty strings. +# # [outputs.kafka.topic_suffix] +# # method = "tags" +# # keys = ["foo", "bar"] +# # separator = "_" +# +# ## The routing tag specifies a tagkey on the metric whose value is used as +# ## the message key. The message key is used to determine which partition to +# ## send the message to. This tag is preferred over the routing_key option. +# routing_tag = "host" +# +# ## The routing key is set as the message key and used to determine which +# ## partition to send the message to. This value is only used when no +# ## routing_tag is set or as a fallback when the tag specified in routing tag +# ## is not found. +# ## +# ## If set to "random", a random value will be generated for each message. +# ## +# ## When unset, no message key is added and each message is routed to a random +# ## partition. +# ## +# ## ex: routing_key = "random" +# ## routing_key = "telegraf" +# # routing_key = "" +# +# ## CompressionCodec represents the various compression codecs recognized by +# ## Kafka in messages.
+# ## 0 : No compression +# ## 1 : Gzip compression +# ## 2 : Snappy compression +# ## 3 : LZ4 compression +# # compression_codec = 0 +# +# ## RequiredAcks is used in Produce Requests to tell the broker how many +# ## replica acknowledgements it must see before responding +# ## 0 : the producer never waits for an acknowledgement from the broker. +# ## This option provides the lowest latency but the weakest durability +# ## guarantees (some data will be lost when a server fails). +# ## 1 : the producer gets an acknowledgement after the leader replica has +# ## received the data. This option provides better durability as the +# ## client waits until the server acknowledges the request as successful +# ## (only messages that were written to the now-dead leader but not yet +# ## replicated will be lost). +# ## -1: the producer gets an acknowledgement after all in-sync replicas have +# ## received the data. This option provides the best durability, we +# ## guarantee that no messages will be lost as long as at least one in +# ## sync replica remains. +# # required_acks = -1 +# +# ## The maximum number of times to retry sending a metric before failing +# ## until the next flush. +# # max_retry = 3 +# +# ## The maximum permitted size of a message. Should be set equal to or +# ## smaller than the broker's 'message.max.bytes'. +# # max_message_bytes = 1000000 +# +# ## Optional TLS Config +# # enable_tls = true +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Optional SASL Config +# # sasl_username = "kafka" +# # sasl_password = "secret" +# +# ## SASL protocol version. When connecting to Azure EventHub set to 0. +# # sasl_version = 1 +# +# ## Data format to output. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# # data_format = "influx" + + +# # Configuration for the AWS Kinesis output. +# [[outputs.kinesis]] +# ## Amazon REGION of kinesis endpoint. +# region = "ap-southeast-2" +# +# ## Amazon Credentials +# ## Credentials are loaded in the following order +# ## 1) Assumed credentials via STS if role_arn is specified +# ## 2) explicit credentials from 'access_key' and 'secret_key' +# ## 3) shared profile from 'profile' +# ## 4) environment variables +# ## 5) shared credentials file +# ## 6) EC2 Instance Profile +# #access_key = "" +# #secret_key = "" +# #token = "" +# #role_arn = "" +# #profile = "" +# #shared_credential_file = "" +# +# ## Endpoint to make request against, the correct endpoint is automatically +# ## determined and this option should only be set if you wish to override the +# ## default. +# ## ex: endpoint_url = "http://localhost:8000" +# # endpoint_url = "" +# +# ## Kinesis StreamName must exist prior to starting telegraf. +# streamname = "StreamName" +# ## DEPRECATED: PartitionKey as used for sharding data. +# partitionkey = "PartitionKey" +# ## DEPRECATED: If set the partitionKey will be a random UUID on every put. +# ## This allows for scaling across multiple shards in a stream. +# ## This will cause issues with ordering. 
+# use_random_partitionkey = false +# ## The partition key can be calculated using one of several methods: +# ## +# ## Use a static value for all writes: +# # [outputs.kinesis.partition] +# # method = "static" +# # key = "howdy" +# # +# ## Use a random partition key on each write: +# # [outputs.kinesis.partition] +# # method = "random" +# # +# ## Use the measurement name as the partition key: +# # [outputs.kinesis.partition] +# # method = "measurement" +# # +# ## Use the value of a tag for all writes, if the tag is not set the empty +# ## default option will be used. When no default, defaults to "telegraf" +# # [outputs.kinesis.partition] +# # method = "tag" +# # key = "host" +# # default = "mykey" +# +# +# ## Data format to output. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# data_format = "influx" +# +# ## debug will show upstream aws messages. +# debug = false + + +# # Configuration for Librato API to send metrics to. +# [[outputs.librato]] +# ## Librato API Docs +# ## http://dev.librato.com/v1/metrics-authentication +# ## Librato API user +# api_user = "telegraf@influxdb.com" # required. +# ## Librato API token +# api_token = "my-secret-token" # required. +# ## Debug +# # debug = false +# ## Connection timeout. +# # timeout = "5s" +# ## Output source Template (same as graphite buckets) +# ## see https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md#graphite +# ## This template is used in librato's source (not metric's name) +# template = "host" +# + + +# # Configuration for MQTT server to send metrics to +# [[outputs.mqtt]] +# servers = ["localhost:1883"] # required. 
+# +# ## MQTT outputs send metrics to this topic format +# ## "///" +# ## ex: prefix/web01.example.com/mem +# topic_prefix = "telegraf" +# +# ## QoS policy for messages +# ## 0 = at most once +# ## 1 = at least once +# ## 2 = exactly once +# # qos = 2 +# +# ## username and password to connect MQTT server. +# # username = "telegraf" +# # password = "metricsmetricsmetricsmetrics" +# +# ## client ID, if not set a random ID is generated +# # client_id = "" +# +# ## Timeout for write operations. default: 5s +# # timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## When true, metrics will be sent in one MQTT message per flush. Otherwise, +# ## metrics are written one metric per MQTT message. +# # batch = false +# +# ## When true, metric will have RETAIN flag set, making broker cache entries until someone +# ## actually reads it +# # retain = false +# +# ## Data format to output. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# data_format = "influx" + + +# # Send telegraf measurements to NATS +# [[outputs.nats]] +# ## URLs of NATS servers +# servers = ["nats://localhost:4222"] +# +# ## Optional credentials +# # username = "" +# # password = "" +# +# ## Optional NATS 2.0 and NATS NGS compatible user credentials +# # credentials = "/etc/telegraf/nats.creds" +# +# ## NATS subject for producer messages +# subject = "telegraf" +# +# ## Use Transport Layer Security +# # secure = false +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Data format to output. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# data_format = "influx" + + +# # Send metrics to New Relic metrics endpoint +# [[outputs.newrelic]] +# ## New Relic Insights API key +# insights_key = "insights api key" +# +# ## Prefix to add to metric name for easy identification. +# # metric_prefix = "" +# +# ## Timeout for writes to the New Relic API. +# # timeout = "15s" + + +# # Send telegraf measurements to NSQD +# [[outputs.nsq]] +# ## Location of nsqd instance listening on TCP +# server = "localhost:4150" +# ## NSQ topic for producer messages +# topic = "telegraf" +# +# ## Data format to output.
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# data_format = "influx" + + +# # Configuration for OpenTSDB server to send metrics to +# [[outputs.opentsdb]] +# ## prefix for metrics keys +# prefix = "my.specific.prefix." +# +# ## DNS name of the OpenTSDB server +# ## Using "opentsdb.example.com" or "tcp://opentsdb.example.com" will use the +# ## telnet API. "http://opentsdb.example.com" will use the Http API. +# host = "opentsdb.example.com" +# +# ## Port of the OpenTSDB server +# port = 4242 +# +# ## Number of data points to send to OpenTSDB in Http requests. +# ## Not used with telnet API. +# http_batch_size = 50 +# +# ## URI Path for Http requests to OpenTSDB. +# ## Used in cases where OpenTSDB is located behind a reverse proxy. +# http_path = "/api/put" +# +# ## Debug true - Prints OpenTSDB communication +# debug = false +# +# ## Separator separates measurement name from field +# separator = "_" + + +# # Configuration for the Prometheus client to spawn +# [[outputs.prometheus_client]] +# ## Address to listen on +# listen = ":9273" +# +# ## Metric version controls the mapping from Telegraf metrics into +# ## Prometheus format. When using the prometheus input, use the same value in +# ## both plugins to ensure metrics are round-tripped without modification. +# ## +# ## example: metric_version = 1; deprecated in 1.13 +# ## metric_version = 2; recommended version +# # metric_version = 1 +# +# ## Use HTTP Basic Authentication. +# # basic_username = "Foo" +# # basic_password = "Bar" +# +# ## If set, the IP Ranges which are allowed to access metrics. +# ## ex: ip_range = ["192.168.0.0/24", "192.168.1.0/30"] +# # ip_range = [] +# +# ## Path to publish the metrics on. +# # path = "/metrics" +# +# ## Expiration interval for each metric. 
0 == no expiration +# # expiration_interval = "60s" +# +# ## Collectors to exclude, valid entries are "gocollector" and "process". +# ## If unset, both collectors are enabled. +# # collectors_exclude = ["gocollector", "process"] +# +# ## Send string metrics as Prometheus labels. +# ## Unless set to false all string metrics will be sent as labels. +# # string_as_label = true +# +# ## If set, enable TLS with the given certificate. +# # tls_cert = "/etc/ssl/telegraf.crt" +# # tls_key = "/etc/ssl/telegraf.key" +# +# ## Set one or more allowed client CA certificate file names to +# ## enable mutually authenticated TLS connections +# # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Export metric collection time. +# # export_timestamp = false + + +# # Configuration for the Riemann server to send metrics to +# [[outputs.riemann]] +# ## The full TCP or UDP URL of the Riemann server +# url = "tcp://localhost:5555" +# +# ## Riemann event TTL, floating-point time in seconds. +# ## Defines how long an event is considered valid for in Riemann +# # ttl = 30.0 +# +# ## Separator to use between measurement and field name in Riemann service name +# ## This does not have any effect if 'measurement_as_attribute' is set to 'true' +# separator = "/" +# +# ## Set measurement name as Riemann attribute 'measurement', instead of prepending it to the Riemann service name +# # measurement_as_attribute = false +# +# ## Send string metrics as Riemann event states. +# ## Unless enabled all string metrics will be ignored +# # string_as_state = false +# +# ## A list of tag keys whose values get sent as Riemann tags. +# ## If empty, all Telegraf tag values will be sent as tags +# # tag_keys = ["telegraf","custom_tag"] +# +# ## Additional Riemann tags to send. +# # tags = ["telegraf-output"] +# +# ## Description for Riemann event +# # description_text = "metrics collected from telegraf" +# +# ## Riemann client write timeout, defaults to "5s" if not set.
+# # timeout = "5s" + + +# # Configuration for the Riemann server to send metrics to +# [[outputs.riemann_legacy]] +# ## URL of server +# url = "localhost:5555" +# ## transport protocol to use either tcp or udp +# transport = "tcp" +# ## separator to use between input name and field name in Riemann service name +# separator = " " + + +# # Generic socket writer capable of handling multiple socket types. +# [[outputs.socket_writer]] +# ## URL to connect to +# # address = "tcp://127.0.0.1:8094" +# # address = "tcp://example.com:http" +# # address = "tcp4://127.0.0.1:8094" +# # address = "tcp6://127.0.0.1:8094" +# # address = "tcp6://[2001:db8::1]:8094" +# # address = "udp://127.0.0.1:8094" +# # address = "udp4://127.0.0.1:8094" +# # address = "udp6://127.0.0.1:8094" +# # address = "unix:///tmp/telegraf.sock" +# # address = "unixgram:///tmp/telegraf.sock" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Period between keep alive probes. +# ## Only applies to TCP sockets. +# ## 0 disables keep alive probes. +# ## Defaults to the OS configuration. +# # keep_alive_period = "5m" +# +# ## Content encoding for packet-based connections (i.e. UDP, unixgram). +# ## Can be set to "gzip" or to "identity" to apply no encoding. +# ## +# # content_encoding = "identity" +# +# ## Data format to generate. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# # data_format = "influx" + + +# # Configuration for Google Cloud Stackdriver to send metrics to +# [[outputs.stackdriver]] +# ## GCP Project +# project = "erudite-bloom-151019" +# +# ## The namespace for the metric descriptor +# namespace = "telegraf" +# +# ## Custom resource type +# # resource_type = "generic_node" +# +# ## Additional resource labels +# # [outputs.stackdriver.resource_labels] +# # node_id = "$HOSTNAME" +# # namespace = "myapp" +# # location = "eu-north0" + + +# # Configuration for Syslog server to send metrics to +# [[outputs.syslog]] +# ## URL to connect to +# ## ex: address = "tcp://127.0.0.1:8094" +# ## ex: address = "tcp4://127.0.0.1:8094" +# ## ex: address = "tcp6://127.0.0.1:8094" +# ## ex: address = "tcp6://[2001:db8::1]:8094" +# ## ex: address = "udp://127.0.0.1:8094" +# ## ex: address = "udp4://127.0.0.1:8094" +# ## ex: address = "udp6://127.0.0.1:8094" +# address = "tcp://127.0.0.1:8094" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Period between keep alive probes. +# ## Only applies to TCP sockets. +# ## 0 disables keep alive probes. +# ## Defaults to the OS configuration. +# # keep_alive_period = "5m" +# +# ## The framing technique with which it is expected that messages are +# ## transported (default = "octet-counting"). Whether the messages come +# ## using the octet-counting (RFC5425#section-4.3.1, RFC6587#section-3.4.1), +# ## or the non-transparent framing technique (RFC6587#section-3.4.2). Must +# ## be one of "octet-counting", "non-transparent".
+# # framing = "octet-counting" +# +# ## The trailer to be expected in case of non-transparent framing (default = "LF"). +# ## Must be one of "LF", or "NUL". +# # trailer = "LF" +# +# ## SD-PARAMs settings +# ## Syslog messages can contain key/value pairs within zero or more +# ## structured data sections. For each unrecognized metric tag/field a +# ## SD-PARAMS is created. +# ## +# ## Example: +# ## [[outputs.syslog]] +# ## sdparam_separator = "_" +# ## default_sdid = "default@32473" +# ## sdids = ["foo@123", "bar@456"] +# ## +# ## input => xyzzy,x=y foo@123_value=42,bar@456_value2=84,something_else=1 +# ## output (structured data only) => [foo@123 value=42][bar@456 value2=84][default@32473 something_else=1 x=y] +# +# ## SD-PARAMs separator between the sdid and tag/field key (default = "_") +# # sdparam_separator = "_" +# +# ## Default sdid used for tags/fields that don't contain a prefix defined in +# ## the explicit sdids setting below If no default is specified, no SD-PARAMs +# ## will be used for unrecognized field. +# # default_sdid = "default@32473" +# +# ## List of explicit prefixes to extract from tag/field keys and use as the +# ## SDID, if they match (see above example for more details): +# # sdids = ["foo@123", "bar@456"] +# +# ## Default severity value. Severity and Facility are used to calculate the +# ## message PRI value (RFC5424#section-6.2.1). Used when no metric field +# ## with key "severity_code" is defined. If unset, 5 (notice) is the default +# # default_severity_code = 5 +# +# ## Default facility value. Facility and Severity are used to calculate the +# ## message PRI value (RFC5424#section-6.2.1). Used when no metric field with +# ## key "facility_code" is defined. If unset, 1 (user-level) is the default +# # default_facility_code = 1 +# +# ## Default APP-NAME value (RFC5424#section-6.2.5) +# ## Used when no metric tag with key "appname" is defined. 
+# ## If unset, "Telegraf" is the default +# # default_appname = "Telegraf" + + +# # Write metrics to Warp 10 +# [[outputs.warp10]] +# # Prefix to add to the measurement. +# prefix = "telegraf." +# +# # URL of the Warp 10 server +# warp_url = "http://localhost:8080" +# +# # Write token to access your app on warp 10 +# token = "Token" +# +# # Warp 10 query timeout +# # timeout = "15s" +# +# ## Print Warp 10 error body +# # print_error_body = false +# +# ## Max string error size +# # max_string_error_size = 511 +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Configuration for Wavefront server to send metrics to +# [[outputs.wavefront]] +# ## Url for Wavefront Direct Ingestion or using HTTP with Wavefront Proxy +# ## If using Wavefront Proxy, also specify port. example: http://proxyserver:2878 +# url = "https://metrics.wavefront.com" +# +# ## Authentication Token for Wavefront. Only required if using Direct Ingestion +# #token = "DUMMY_TOKEN" +# +# ## DNS name of the wavefront proxy server. Do not use if url is specified +# #host = "wavefront.example.com" +# +# ## Port that the Wavefront proxy server listens on. Do not use if url is specified +# #port = 2878 +# +# ## prefix for metrics keys +# #prefix = "my.specific.prefix." +# +# ## whether to use "value" for name of simple fields. default is false +# #simple_fields = false +# +# ## character to use between metric and field name. default is . (dot) +# #metric_separator = "." +# +# ## Convert metric name paths to use metricSeparator character +# ## When true will convert all _ (underscore) characters in final metric name. 
default is true +# #convert_paths = true +# +# ## Use Strict rules to sanitize metric and tag names from invalid characters +# ## When enabled forward slash (/) and comma (,) will be accepted +# #use_strict = false +# +# ## Use Regex to sanitize metric and tag names from invalid characters +# ## Regex is more thorough, but significantly slower. default is false +# #use_regex = false +# +# ## point tags to use as the source name for Wavefront (if none found, host will be used) +# #source_override = ["hostname", "address", "agent_host", "node_host"] +# +# ## whether to convert boolean values to numeric values, with false -> 0.0 and true -> 1.0. default is true +# #convert_bool = true +# +# ## Truncate metric tags to a total of 254 characters for the tag name value. Wavefront will reject any +# ## data point exceeding this limit if not truncated. Defaults to 'false' to provide backwards compatibility. +# #truncate_tags = false +# +# ## Define a mapping, namespaced by metric prefix, from string values to numeric values +# ## deprecated in 1.9; use the enum processor plugin +# #[[outputs.wavefront.string_to_number.elasticsearch]] +# # green = 1.0 +# # yellow = 0.5 +# # red = 0.0 + + +############################################################################### +# PROCESSOR PLUGINS # +############################################################################### + + +# # Clone metrics and apply modifications. +# [[processors.clone]] +# ## All modifications on inputs and aggregators can be overridden: +# # name_override = "new_name" +# # name_prefix = "new_name_prefix" +# # name_suffix = "new_name_suffix" +# +# ## Tags to be added (all values must be strings) +# # [processors.clone.tags] +# # additional_tag = "tag_value" + + +# # Convert values to another metric value type +# [[processors.converter]] +# ## Tags to convert +# ## +# ## The table key determines the target type, and the array of key-values +# ## select the keys to convert. The array may contain globs. 
+# ## = [...] +# [processors.converter.tags] +# measurement = [] +# string = [] +# integer = [] +# unsigned = [] +# boolean = [] +# float = [] +# +# ## Fields to convert +# ## +# ## The table key determines the target type, and the array of key-values +# ## select the keys to convert. The array may contain globs. +# ## = [...] +# [processors.converter.fields] +# measurement = [] +# tag = [] +# string = [] +# integer = [] +# unsigned = [] +# boolean = [] +# float = [] + + +# # Dates measurements, tags, and fields that pass through this filter. +# [[processors.date]] +# ## New tag to create +# tag_key = "month" +# +# ## New field to create (cannot set both field_key and tag_key) +# # field_key = "month" +# +# ## Date format string, must be a representation of the Go "reference time" +# ## which is "Mon Jan 2 15:04:05 -0700 MST 2006". +# date_format = "Jan" +# +# ## If destination is a field, date format can also be one of +# ## "unix", "unix_ms", "unix_us", or "unix_ns", which will insert an integer field. +# # date_format = "unix" +# +# ## Offset duration added to the date string when writing the new tag. +# # date_offset = "0s" +# +# ## Timezone to use when creating the tag or field using a reference time +# ## string. This can be set to one of "UTC", "Local", or to a location name +# ## in the IANA Time Zone database. +# ## example: timezone = "America/Los_Angeles" +# # timezone = "UTC" + + +# # Filter metrics with repeating field values +# [[processors.dedup]] +# ## Maximum time to suppress output +# dedup_interval = "600s" + + +# # Defaults sets default value(s) for specified fields that are not set on incoming metrics. +# [[processors.defaults]] +# ## Ensures a set of fields always exists on your metric(s) with their +# ## respective default value. +# ## For any given field pair (key = default), if it's not set, a field +# ## is set on the metric with the specified default. 
+# ## +# ## A field is considered not set if it is nil on the incoming metric; +# ## or it is not nil but its value is an empty string or is a string +# ## of one or more spaces. +# ## = +# # [processors.defaults.fields] +# # field_1 = "bar" +# # time_idle = 0 +# # is_error = true + + +# # Map enum values according to given table. +# [[processors.enum]] +# [[processors.enum.mapping]] +# ## Name of the field to map +# field = "status" +# +# ## Name of the tag to map +# # tag = "status" +# +# ## Destination tag or field to be used for the mapped value. By default the +# ## source tag or field is used, overwriting the original value. +# dest = "status_code" +# +# ## Default value to be used for all values not contained in the mapping +# ## table. When unset, the unmodified value for the field will be used if no +# ## match is found. +# # default = 0 +# +# ## Table of mappings +# [processors.enum.mapping.value_mappings] +# green = 1 +# amber = 2 +# red = 3 + + +# # Run executable as long-running processor plugin +# [[processors.execd]] +# ## Program to run as daemon +# ## eg: command = ["/path/to/your_program", "arg1", "arg2"] +# command = ["cat"] +# +# ## Delay before the process is restarted after an unexpected termination +# restart_delay = "10s" + + +# # Performs file path manipulations on tags and fields +# [[processors.filepath]] +# ## Treat the tag value as a path and convert it to its last element, storing the result in a new tag +# # [[processors.filepath.basename]] +# # tag = "path" +# # dest = "basepath" +# +# ## Treat the field value as a path and keep all but the last element of path, typically the path's directory +# # [[processors.filepath.dirname]] +# # field = "path" +# +# ## Treat the tag value as a path, converting it to its the last element without its suffix +# # [[processors.filepath.stem]] +# # tag = "path" +# +# ## Treat the tag value as a path, converting it to the shortest path name equivalent +# ## to path by purely lexical processing +# # 
[[processors.filepath.clean]] +# # tag = "path" +# +# ## Treat the tag value as a path, converting it to a relative path that is lexically +# ## equivalent to the source path when joined to 'base_path' +# # [[processors.filepath.rel]] +# # tag = "path" +# # base_path = "/var/log" +# +# ## Treat the tag value as a path, replacing each separator character in path with a '/' character. Has only +# ## effect on Windows +# # [[processors.filepath.toslash]] +# # tag = "path" + + +# # Add a tag of the network interface name looked up over SNMP by interface number +# [[processors.ifname]] +# ## Name of tag holding the interface number +# # tag = "ifIndex" +# +# ## Name of output tag where service name will be added +# # dest = "ifName" +# +# ## Name of tag of the SNMP agent to request the interface name from +# # agent = "agent" +# +# ## Timeout for each request. +# # timeout = "5s" +# +# ## SNMP version; can be 1, 2, or 3. +# # version = 2 +# +# ## SNMP community string. +# # community = "public" +# +# ## Number of retries to attempt. +# # retries = 3 +# +# ## The GETBULK max-repetitions parameter. +# # max_repetitions = 10 +# +# ## SNMPv3 authentication and encryption options. +# ## +# ## Security Name. +# # sec_name = "myuser" +# ## Authentication protocol; one of "MD5", "SHA", or "". +# # auth_protocol = "MD5" +# ## Authentication password. +# # auth_password = "pass" +# ## Security Level; one of "noAuthNoPriv", "authNoPriv", or "authPriv". +# # sec_level = "authNoPriv" +# ## Context Name. +# # context_name = "" +# ## Privacy protocol used for encrypted messages; one of "DES", "AES" or "". +# # priv_protocol = "" +# ## Privacy password used for encrypted messages. +# # priv_password = "" +# +# ## max_parallel_lookups is the maximum number of SNMP requests to +# ## make at the same time. +# # max_parallel_lookups = 100 +# +# ## ordered controls whether or not the metrics need to stay in the +# ## same order this plugin received them in. 
If false, this plugin +# ## may change the order when data is cached. If you need metrics to +# ## stay in order set this to true. keeping the metrics ordered may +# ## be slightly slower +# # ordered = false +# +# ## cache_ttl is the amount of time interface names are cached for a +# ## given agent. After this period elapses if names are needed they +# ## will be retrieved again. +# # cache_ttl = "8h" + + +# # Apply metric modifications using override semantics. +# [[processors.override]] +# ## All modifications on inputs and aggregators can be overridden: +# # name_override = "new_name" +# # name_prefix = "new_name_prefix" +# # name_suffix = "new_name_suffix" +# +# ## Tags to be added (all values must be strings) +# # [processors.override.tags] +# # additional_tag = "tag_value" + + +# # Parse a value in a specified field/tag(s) and add the result in a new metric +# [[processors.parser]] +# ## The name of the fields whose value will be parsed. +# parse_fields = [] +# +# ## If true, incoming metrics are not emitted. +# drop_original = false +# +# ## If set to override, emitted metrics will be merged by overriding the +# ## original metric using the newly parsed metrics. +# merge = "override" +# +# ## The dataformat to be read from files +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Rotate a single valued metric into a multi field metric +# [[processors.pivot]] +# ## Tag to use for naming the new field. +# tag_key = "name" +# ## Field to use as the value of the new field. 
+# value_key = "value" + + +# # Given a tag of a TCP or UDP port number, add a tag of the service name looked up in the system services file +# [[processors.port_name]] +# [[processors.port_name]] +# ## Name of tag holding the port number +# # tag = "port" +# +# ## Name of output tag where service name will be added +# # dest = "service" +# +# ## Default tcp or udp +# # default_protocol = "tcp" + + +# # Print all metrics that pass through this filter. +# [[processors.printer]] + + +# # Transforms tag and field values with regex pattern +# [[processors.regex]] +# ## Tag and field conversions defined in separate sub-tables +# # [[processors.regex.tags]] +# # ## Tag to change +# # key = "resp_code" +# # ## Regular expression to match on a tag value +# # pattern = "^(\\d)\\d\\d$" +# # ## Matches of the pattern will be replaced with this string. Use ${1} +# # ## notation to use the text of the first submatch. +# # replacement = "${1}xx" +# +# # [[processors.regex.fields]] +# # ## Field to change +# # key = "request" +# # ## All the power of the Go regular expressions available here +# # ## For example, named subgroups +# # pattern = "^/api(?P<method>/[\\w/]+)\\S*" +# # replacement = "${method}" +# # ## If result_key is present, a new field will be created +# # ## instead of changing existing field +# # result_key = "method" +# +# ## Multiple conversions may be applied for one field sequentially +# ## Let's extract one more value +# # [[processors.regex.fields]] +# # key = "request" +# # pattern = ".*category=(\\w+).*" +# # replacement = "${1}" +# # result_key = "search_category" + + +# # Rename measurements, tags, and fields that pass through this filter. +# [[processors.rename]] + + +# # ReverseDNS does a reverse lookup on IP addresses to retrieve the DNS name +# [[processors.reverse_dns]] +# ## For optimal performance, you may want to limit which metrics are passed to this +# ## processor. 
eg: +# ## namepass = ["my_metric_*"] +# +# ## cache_ttl is how long the dns entries should stay cached for. +# ## generally longer is better, but if you expect a large number of diverse lookups +# ## you'll want to consider memory use. +# cache_ttl = "24h" +# +# ## lookup_timeout is how long should you wait for a single dns request to respond. +# ## this is also the maximum acceptable latency for a metric travelling through +# ## the reverse_dns processor. After lookup_timeout is exceeded, a metric will +# ## be passed on unaltered. +# ## multiple simultaneous resolution requests for the same IP will only make a +# ## single rDNS request, and they will all wait for the answer for this long. +# lookup_timeout = "3s" +# +# ## max_parallel_lookups is the maximum number of dns requests to be in flight +# ## at the same time. Requests hitting cached values do not count against this +# ## total, and neither do multiple requests for the same IP. +# ## It's probably best to keep this number fairly low. +# max_parallel_lookups = 10 +# +# ## ordered controls whether or not the metrics need to stay in the same order +# ## this plugin received them in. If false, this plugin will change the order +# ## with requests hitting cached results moving through immediately and not +# ## waiting on slower lookups. This may cause issues for you if you are +# ## depending on the order of metrics staying the same. If so, set this to true. +# ## keeping the metrics ordered may be slightly slower. +# ordered = false +# +# [[processors.reverse_dns.lookup]] +# ## get the ip from the field "source_ip", and put the result in the field "source_name" +# field = "source_ip" +# dest = "source_name" +# +# [[processors.reverse_dns.lookup]] +# ## get the ip from the tag "destination_ip", and put the result in the tag +# ## "destination_name". 
+# tag = "destination_ip" +# dest = "destination_name" +# +# ## If you would prefer destination_name to be a field instead, you can use a +# ## processors.converter after this one, specifying the order attribute. + + +# # Add the S2 Cell ID as a tag based on latitude and longitude fields +# [[processors.s2geo]] +# ## The name of the lat and lon fields containing WGS-84 latitude and +# ## longitude in decimal degrees. +# # lat_field = "lat" +# # lon_field = "lon" +# +# ## New tag to create +# # tag_key = "s2_cell_id" +# +# ## Cell level (see https://s2geometry.io/resources/s2cell_statistics.html) +# # cell_level = 9 + + +# # Process metrics using a Starlark script +# [[processors.starlark]] +# ## The Starlark source can be set as a string in this configuration file, or +# ## by referencing a file containing the script. Only one source or script +# ## should be set at once. +# ## +# ## Source of the Starlark script. +# source = ''' +# def apply(metric): +# return metric +# ''' +# +# ## File containing a Starlark script. 
+# # script = "/usr/local/bin/myscript.star" + + +# # Perform string processing on tags, fields, and measurements +# [[processors.strings]] +# ## Convert a tag value to uppercase +# # [[processors.strings.uppercase]] +# # tag = "method" +# +# ## Convert a field value to lowercase and store in a new field +# # [[processors.strings.lowercase]] +# # field = "uri_stem" +# # dest = "uri_stem_normalised" +# +# ## Convert a field value to titlecase +# # [[processors.strings.titlecase]] +# # field = "status" +# +# ## Trim leading and trailing whitespace using the default cutset +# # [[processors.strings.trim]] +# # field = "message" +# +# ## Trim leading characters in cutset +# # [[processors.strings.trim_left]] +# # field = "message" +# # cutset = "\t" +# +# ## Trim trailing characters in cutset +# # [[processors.strings.trim_right]] +# # field = "message" +# # cutset = "\r\n" +# +# ## Trim the given prefix from the field +# # [[processors.strings.trim_prefix]] +# # field = "my_value" +# # prefix = "my_" +# +# ## Trim the given suffix from the field +# # [[processors.strings.trim_suffix]] +# # field = "read_count" +# # suffix = "_count" +# +# ## Replace all non-overlapping instances of old with new +# # [[processors.strings.replace]] +# # measurement = "*" +# # old = ":" +# # new = "_" +# +# ## Trims strings based on width +# # [[processors.strings.left]] +# # field = "message" +# # width = 10 +# +# ## Decode a base64 encoded utf-8 string +# # [[processors.strings.base64decode]] +# # field = "message" + + +# # Restricts the number of tags that can pass through this filter and chooses which tags to preserve when over the limit. +# [[processors.tag_limit]] +# ## Maximum number of tags to preserve +# limit = 10 +# +# ## List of tags to preferentially preserve +# keep = ["foo", "bar", "baz"] + + +# # Uses a Go template to create a new tag +# [[processors.template]] +# ## Tag to set with the output of the template. 
+# tag = "topic" +# +# ## Go template used to create the tag value. In order to ease TOML +# ## escaping requirements, you may wish to use single quotes around the +# ## template string. +# template = '{{ .Tag "hostname" }}.{{ .Tag "level" }}' + + +# # Keep only the top k metrics (ranked by an aggregated field value) that pass through this filter. +# [[processors.topk]] +# ## How many seconds between aggregations +# # period = 10 +# +# ## How many top metrics to return +# # k = 10 +# +# ## Over which tags should the aggregation be done. Globs can be specified, in +# ## which case any tag matching the glob will be aggregated over. If set to an +# ## empty list, no aggregation over tags is done +# # group_by = ['*'] +# +# ## Over which fields the top k are calculated +# # fields = ["value"] +# +# ## What aggregation to use. Options: sum, mean, min, max +# # aggregation = "mean" +# +# ## Instead of the top k largest metrics, return the bottom k lowest metrics +# # bottomk = false +# +# ## The plugin assigns each metric a GroupBy tag generated from its name and +# ## tags. If this setting is different than "" the plugin will add a +# ## tag (which name will be the value of this setting) to each metric with +# ## the value of the calculated GroupBy tag. Useful for debugging +# # add_groupby_tag = "" +# +# ## These settings provide a way to know the position of each metric in +# ## the top k. The 'add_rank_fields' setting allows to specify for which +# ## fields the position is required. If the list is non empty, then a field +# ## will be added to each and every metric for each string present in this +# ## setting. This field will contain the ranking of the group that +# ## the metric belonged to when aggregated over that field. +# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_rank' +# # add_rank_fields = [] +# +# ## These settings provide a way to know what values the plugin is generating +# ## when aggregating metrics. 
The 'add_aggregate_field' setting allows to +# ## specify for which fields the final aggregation value is required. If the +# ## list is non empty, then a field will be added to each every metric for +# ## each field present in this setting. This field will contain +# ## the computed aggregation for the group that the metric belonged to when +# ## aggregated over that field. +# ## The name of the field will be set to the name of the aggregation field, +# ## suffixed with the string '_topk_aggregate' +# # add_aggregate_fields = [] + + +# # Rotate multi field metric into several single field metrics +# [[processors.unpivot]] +# ## Tag to use for the name. +# tag_key = "name" +# ## Field to use for the name of the value. +# value_key = "value" + + +############################################################################### +# AGGREGATOR PLUGINS # +############################################################################### + + +# # Keep the aggregate basicstats of each metric passing through. +# [[aggregators.basicstats]] +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# +# ## Configures which basic stats to push as fields +# # stats = ["count", "min", "max", "mean", "stdev", "s2", "sum"] + + +# # Report the final metric of a series +# [[aggregators.final]] +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# +# ## The time that a series is not updated until considering it final. +# series_timeout = "5m" + + +# # Create aggregate histograms. +# [[aggregators.histogram]] +# ## The period in which to flush the aggregator. 
+# period = "30s" +# +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# +# ## If true, the histogram will be reset on flush instead +# ## of accumulating the results. +# reset = false +# +# ## Whether bucket values should be accumulated. If set to false, "gt" tag will be added. +# ## Defaults to true. +# cumulative = true +# +# ## Example config that aggregates all fields of the metric. +# # [[aggregators.histogram.config]] +# # ## Right borders of buckets (with +Inf implicitly added). +# # buckets = [0.0, 15.6, 34.5, 49.1, 71.5, 80.5, 94.5, 100.0] +# # ## The name of metric. +# # measurement_name = "cpu" +# +# ## Example config that aggregates only specific fields of the metric. +# # [[aggregators.histogram.config]] +# # ## Right borders of buckets (with +Inf implicitly added). +# # buckets = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0] +# # ## The name of metric. +# # measurement_name = "diskio" +# # ## The concrete fields of metric +# # fields = ["io_time", "read_time", "write_time"] + + +# # Merge metrics into multifield metrics by series key +# [[aggregators.merge]] +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = true + + +# # Keep the aggregate min/max of each metric passing through. +# [[aggregators.minmax]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. +# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false + + +# # Count the occurrence of values in fields. +# [[aggregators.valuecounter]] +# ## General Aggregator Arguments: +# ## The period on which to flush & clear the aggregator. 
+# period = "30s" +# ## If true, the original metric will be dropped by the +# ## aggregator and will not get sent to the output plugins. +# drop_original = false +# ## The fields for which the values will be counted +# fields = [] + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + + +# Read metrics about cpu usage +[[inputs.cpu]] + ## Whether to report per-cpu stats or not + percpu = true + ## Whether to report total system cpu stats or not + totalcpu = true + ## If true, collect raw CPU time metrics. + collect_cpu_time = false + ## If true, compute and report the sum of all non-idle CPU states. + report_active = false + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default stats will be gathered for all mount points. + ## Set mount_points will restrict the stats to only the specified mount points. + # mount_points = ["/"] + + ## Ignore mount points by filesystem type. + ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] + + +# Read metrics about disk IO by device +[[inputs.diskio]] + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + # devices = ["sda", "sdb", "vd*"] + ## Uncomment the following line if you need disk serial numbers. + # skip_serial_number = false + # + ## On systems which support it, device metadata can be added in the form of + ## tags. + ## Currently only Linux is supported via udev properties. You can view + ## available properties for a device by running: + ## 'udevadm info -q property -n /dev/sda' + ## Note: Most, but not all, udev properties can be accessed this way. Properties + ## that are currently inaccessible include DEVTYPE, DEVNAME, and DEVPATH. 
+ # device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"] + # + ## Using the same metadata source as device_tags, you can also customize the + ## name of the device via templates. + ## The 'name_templates' parameter is a list of templates to try and apply to + ## the device. The template may contain variables in the form of '$PROPERTY' or + ## '${PROPERTY}'. The first template which does not contain any variables not + ## present for the device is used as the device name tag. + ## The typical use case is for LVM volumes, to get the VG/LV name instead of + ## the near-meaningless DM-0 name. + # name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"] + + +# Get kernel statistics from /proc/stat +[[inputs.kernel]] + # no configuration + + +# Read metrics about memory usage +[[inputs.mem]] + # no configuration + + +# Get the number of processes and group them by status +[[inputs.processes]] + # no configuration + + +# Read metrics about swap memory usage +[[inputs.swap]] + # no configuration + + +# Read metrics about system load & uptime +[[inputs.system]] + ## Uncomment to remove deprecated metrics. + # fielddrop = ["uptime_format"] + + +# # Gather ActiveMQ metrics +# [[inputs.activemq]] +# ## ActiveMQ WebConsole URL +# url = "http://127.0.0.1:8161" +# +# ## Required ActiveMQ Endpoint +# ## deprecated in 1.11; use the url option +# # server = "127.0.0.1" +# # port = 8161 +# +# ## Credentials for basic HTTP authentication +# # username = "admin" +# # password = "admin" +# +# ## Required ActiveMQ webadmin root path +# # webadmin = "admin" +# +# ## Maximum time to receive response. 
+# # response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read stats from aerospike server(s) +# [[inputs.aerospike]] +# ## Aerospike servers to connect to (with port) +# ## This plugin will query all namespaces the aerospike +# ## server has configured and get stats for them. +# servers = ["localhost:3000"] +# +# # username = "telegraf" +# # password = "pa$$word" +# +# ## Optional TLS Config +# # enable_tls = false +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## If false, skip chain & host verification +# # insecure_skip_verify = true + + +# # Read Apache status information (mod_status) +# [[inputs.apache]] +# ## An array of URLs to gather from, must be directed at the machine +# ## readable version of the mod_status page including the auto query string. +# ## Default is "http://localhost/server-status?auto". +# urls = ["http://localhost/server-status?auto"] +# +# ## Credentials for basic HTTP authentication. +# # username = "myuser" +# # password = "mypassword" +# +# ## Maximum time to receive response. +# # response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Monitor APC UPSes connected to apcupsd +# [[inputs.apcupsd]] +# # A list of running apcupsd server to connect to. +# # If not provided will default to tcp://127.0.0.1:3551 +# servers = ["tcp://127.0.0.1:3551"] +# +# ## Timeout for dialing server. 
+# timeout = "5s" + + +# # Gather metrics from Apache Aurora schedulers +# [[inputs.aurora]] +# ## Schedulers are the base addresses of your Aurora Schedulers +# schedulers = ["http://127.0.0.1:8081"] +# +# ## Set of role types to collect metrics from. +# ## +# ## The scheduler roles are checked each interval by contacting the +# ## scheduler nodes; zookeeper is not contacted. +# # roles = ["leader", "follower"] +# +# ## Timeout is the max time for total network operations. +# # timeout = "5s" +# +# ## Username and password are sent using HTTP Basic Auth. +# # username = "username" +# # password = "pa$$word" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Gather Azure Storage Queue metrics +# [[inputs.azure_storage_queue]] +# ## Required Azure Storage Account name +# account_name = "mystorageaccount" +# +# ## Required Azure Storage Account access key +# account_key = "storageaccountaccesskey" +# +# ## Set to false to disable peeking age of oldest message (executes faster) +# # peek_oldest_message_age = true + + +# # Read metrics of bcache from stats_total and dirty_data +# [[inputs.bcache]] +# ## Bcache sets path +# ## If not specified, then default is: +# bcachePath = "/sys/fs/bcache" +# +# ## By default, telegraf gather stats for all bcache devices +# ## Setting devices will restrict the stats to the specified +# ## bcache devices. +# bcacheDevs = ["bcache0"] + + +# # Collects Beanstalkd server and tubes stats +# [[inputs.beanstalkd]] +# ## Server to collect data from +# server = "localhost:11300" +# +# ## List of tubes to gather stats about. 
+# ## If no tubes specified then data gathered for each tube on server reported by list-tubes command +# tubes = ["notifications"] + + +# # Read BIND nameserver XML statistics +# [[inputs.bind]] +# ## An array of BIND XML statistics URI to gather stats. +# ## Default is "http://localhost:8053/xml/v3". +# # urls = ["http://localhost:8053/xml/v3"] +# # gather_memory_contexts = false +# # gather_views = false + + +# # Collect bond interface status, slaves statuses and failures count +# [[inputs.bond]] +# ## Sets 'proc' directory path +# ## If not specified, then default is /proc +# # host_proc = "/proc" +# +# ## By default, telegraf gather stats for all bond interfaces +# ## Setting interfaces will restrict the stats to the specified +# ## bond interfaces. +# # bond_interfaces = ["bond0"] + + +# # Collect Kafka topics and consumers status from Burrow HTTP API. +# [[inputs.burrow]] +# ## Burrow API endpoints in format "schema://host:port". +# ## Default is "http://localhost:8000". +# servers = ["http://localhost:8000"] +# +# ## Override Burrow API prefix. +# ## Useful when Burrow is behind reverse-proxy. +# # api_prefix = "/v3/kafka" +# +# ## Maximum time to receive response. +# # response_timeout = "5s" +# +# ## Limit per-server concurrent connections. +# ## Useful in case of large number of topics or consumer groups. +# # concurrent_connections = 20 +# +# ## Filter clusters, default is no filtering. +# ## Values can be specified as glob patterns. +# # clusters_include = [] +# # clusters_exclude = [] +# +# ## Filter consumer groups, default is no filtering. +# ## Values can be specified as glob patterns. +# # groups_include = [] +# # groups_exclude = [] +# +# ## Filter topics, default is no filtering. +# ## Values can be specified as glob patterns. +# # topics_include = [] +# # topics_exclude = [] +# +# ## Credentials for basic HTTP authentication. 
+# # username = "" +# # password = "" +# +# ## Optional SSL config +# # ssl_ca = "/etc/telegraf/ca.pem" +# # ssl_cert = "/etc/telegraf/cert.pem" +# # ssl_key = "/etc/telegraf/key.pem" +# # insecure_skip_verify = false + + +# # Collects performance metrics from the MON, OSD, MDS and RGW nodes in a Ceph storage cluster. +# [[inputs.ceph]] +# ## This is the recommended interval to poll. Too frequent and you will lose +# ## data points due to timeouts during rebalancing and recovery +# interval = '1m' +# +# ## All configuration values are optional, defaults are shown below +# +# ## location of ceph binary +# ceph_binary = "/usr/bin/ceph" +# +# ## directory in which to look for socket files +# socket_dir = "/var/run/ceph" +# +# ## prefix of MON and OSD socket files, used to determine socket type +# mon_prefix = "ceph-mon" +# osd_prefix = "ceph-osd" +# mds_prefix = "ceph-mds" +# rgw_prefix = "ceph-client" +# +# ## suffix used to identify socket files +# socket_suffix = "asok" +# +# ## Ceph user to authenticate as +# ceph_user = "client.admin" +# +# ## Ceph configuration to use to locate the cluster +# ceph_config = "/etc/ceph/ceph.conf" +# +# ## Whether to gather statistics via the admin socket +# gather_admin_socket_stats = true +# +# ## Whether to gather statistics via ceph commands +# gather_cluster_stats = false + + +# # Read specific statistics per cgroup +# [[inputs.cgroup]] +# ## Directories in which to look for files, globs are supported. +# ## Consider restricting paths to the set of cgroups you really +# ## want to monitor if you have a large number of cgroups, to avoid +# ## any cardinality issues. +# # paths = [ +# # "/sys/fs/cgroup/memory", +# # "/sys/fs/cgroup/memory/child1", +# # "/sys/fs/cgroup/memory/child2/*", +# # ] +# ## cgroup stat fields, as file names, globs are supported. +# ## these file names are appended to each path from above. 
+# # files = ["memory.*usage*", "memory.limit_in_bytes"] + + +# # Get standard chrony metrics, requires chronyc executable. +# [[inputs.chrony]] +# ## If true, chronyc tries to perform a DNS lookup for the time server. +# # dns_lookup = false + + +# # Pull Metric Statistics from Amazon CloudWatch +# [[inputs.cloudwatch]] +# ## Amazon Region +# region = "us-east-1" +# +# ## Amazon Credentials +# ## Credentials are loaded in the following order +# ## 1) Assumed credentials via STS if role_arn is specified +# ## 2) explicit credentials from 'access_key' and 'secret_key' +# ## 3) shared profile from 'profile' +# ## 4) environment variables +# ## 5) shared credentials file +# ## 6) EC2 Instance Profile +# # access_key = "" +# # secret_key = "" +# # token = "" +# # role_arn = "" +# # profile = "" +# # shared_credential_file = "" +# +# ## Endpoint to make request against, the correct endpoint is automatically +# ## determined and this option should only be set if you wish to override the +# ## default. +# ## ex: endpoint_url = "http://localhost:8000" +# # endpoint_url = "" +# +# # The minimum period for Cloudwatch metrics is 1 minute (60s). However not all +# # metrics are made available to the 1 minute period. Some are collected at +# # 3 minute, 5 minute, or larger intervals. See https://aws.amazon.com/cloudwatch/faqs/#monitoring. +# # Note that if a period is configured that is smaller than the minimum for a +# # particular metric, that metric will not be returned by the Cloudwatch API +# # and will not be collected by Telegraf. +# # +# ## Requested CloudWatch aggregation Period (required - must be a multiple of 60s) +# period = "5m" +# +# ## Collection Delay (required - must account for metrics availability via CloudWatch API) +# delay = "5m" +# +# ## Recommended: use metric 'interval' that is a multiple of 'period' to avoid +# ## gaps or overlap in pulled data +# interval = "5m" +# +# ## Configure the TTL for the internal cache of metrics. 
+# # cache_ttl = "1h" +# +# ## Metric Statistic Namespace (required) +# namespace = "AWS/ELB" +# +# ## Maximum requests per second. Note that the global default AWS rate limit is +# ## 50 reqs/sec, so if you define multiple namespaces, these should add up to a +# ## maximum of 50. +# ## See http://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/cloudwatch_limits.html +# # ratelimit = 25 +# +# ## Timeout for http requests made by the cloudwatch client. +# # timeout = "5s" +# +# ## Namespace-wide statistic filters. These allow fewer queries to be made to +# ## cloudwatch. +# # statistic_include = [ "average", "sum", "minimum", "maximum", "sample_count" ] +# # statistic_exclude = [] +# +# ## Metrics to Pull +# ## Defaults to all Metrics in Namespace if nothing is provided +# ## Refreshes Namespace available metrics every 1h +# #[[inputs.cloudwatch.metrics]] +# # names = ["Latency", "RequestCount"] +# # +# # ## Statistic filters for Metric. These allow for retrieving specific +# # ## statistics for an individual metric. +# # # statistic_include = [ "average", "sum", "minimum", "maximum", "sample_count" ] +# # # statistic_exclude = [] +# # +# # ## Dimension filters for Metric. All dimensions defined for the metric names +# # ## must be specified in order to retrieve the metric statistics. +# # [[inputs.cloudwatch.metrics.dimensions]] +# # name = "LoadBalancerName" +# # value = "p-example" + + +# # Collects conntrack stats from the configured directories and files. +# [[inputs.conntrack]] +# ## The following defaults would work with multiple versions of conntrack. +# ## Note the nf_ and ip_ filename prefixes are mutually exclusive across +# ## kernel versions, as are the directory locations. +# +# ## Superset of filenames to look for within the conntrack dirs. +# ## Missing files will be ignored. +# files = ["ip_conntrack_count","ip_conntrack_max", +# "nf_conntrack_count","nf_conntrack_max"] +# +# ## Directories to search within for the conntrack files above. 
+# ## Missing directories will be ignored. +# dirs = ["/proc/sys/net/ipv4/netfilter","/proc/sys/net/netfilter"] + + +# # Gather health check statuses from services registered in Consul +# [[inputs.consul]] +# ## Consul server address +# # address = "localhost:8500" +# +# ## URI scheme for the Consul server, one of "http", "https" +# # scheme = "http" +# +# ## ACL token used in every request +# # token = "" +# +# ## HTTP Basic Authentication username and password. +# # username = "" +# # password = "" +# +# ## Data center to query the health checks from +# # datacenter = "" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = true +# +# ## Consul checks' tag splitting +# # When tags are formatted like "key:value" with ":" as a delimiter then +# # they will be split and reported as proper key:value in Telegraf +# # tag_delimiter = ":" + + +# # Read metrics from one or many couchbase clusters +# [[inputs.couchbase]] +# ## specify servers via a url matching: +# ## [protocol://][:password]@address[:port] +# ## e.g. +# ## http://couchbase-0.example.com/ +# ## http://admin:secret@couchbase-0.example.com:8091/ +# ## +# ## If no servers are specified, then localhost is used as the host. +# ## If no protocol is specified, HTTP is used. +# ## If no port is specified, 8091 is used. +# servers = ["http://localhost:8091"] + + +# # Read CouchDB Stats from one or more servers +# [[inputs.couchdb]] +# ## Works with CouchDB stats endpoints out of the box +# ## Multiple Hosts from which to read CouchDB stats: +# hosts = ["http://localhost:8086/_stats"] +# +# ## Use HTTP Basic Authentication. +# # basic_username = "telegraf" +# # basic_password = "p@ssw0rd" + + +# # Input plugin for DC/OS metrics +# [[inputs.dcos]] +# ## The DC/OS cluster URL. 
+# cluster_url = "https://dcos-ee-master-1" +# +# ## The ID of the service account. +# service_account_id = "telegraf" +# ## The private key file for the service account. +# service_account_private_key = "/etc/telegraf/telegraf-sa-key.pem" +# +# ## Path containing login token. If set, will read on every gather. +# # token_file = "/home/dcos/.dcos/token" +# +# ## In all filter options if both include and exclude are empty all items +# ## will be collected. Arrays may contain glob patterns. +# ## +# ## Node IDs to collect metrics from. If a node is excluded, no metrics will +# ## be collected for its containers or apps. +# # node_include = [] +# # node_exclude = [] +# ## Container IDs to collect container metrics from. +# # container_include = [] +# # container_exclude = [] +# ## Container IDs to collect app metrics from. +# # app_include = [] +# # app_exclude = [] +# +# ## Maximum concurrent connections to the cluster. +# # max_connections = 10 +# ## Maximum time to receive a response from cluster. +# # response_timeout = "20s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## If false, skip chain & host verification +# # insecure_skip_verify = true +# +# ## Recommended filtering to reduce series cardinality. +# # [inputs.dcos.tagdrop] +# # path = ["/var/lib/mesos/slave/slaves/*"] + + +# # Read metrics from one or many disque servers +# [[inputs.disque]] +# ## An array of URI to gather stats about. Specify an ip or hostname +# ## with optional port and password. +# ## ie disque://localhost, disque://10.10.3.33:18832, 10.0.0.1:10000, etc. +# ## If no servers are specified, then localhost is used as the host. 
+# servers = ["localhost"] + + +# # Provide a native collection for dmsetup based statistics for dm-cache +# [[inputs.dmcache]] +# ## Whether to report per-device stats or not +# per_device = true + + +# # Query given DNS server and gives statistics +# [[inputs.dns_query]] +# ## servers to query +# servers = ["8.8.8.8"] +# +# ## Network is the network protocol name. +# # network = "udp" +# +# ## Domains or subdomains to query. +# # domains = ["."] +# +# ## Query record type. +# ## Possible values: A, AAAA, CNAME, MX, NS, PTR, TXT, SOA, SPF, SRV. +# # record_type = "A" +# +# ## Dns server port. +# # port = 53 +# +# ## Query timeout in seconds. +# # timeout = 2 + + +# # Read metrics about docker containers +# [[inputs.docker]] +# ## Docker Endpoint +# ## To use TCP, set endpoint = "tcp://[ip]:[port]" +# ## To use environment variables (ie, docker-machine), set endpoint = "ENV" +# endpoint = "unix:///var/run/docker.sock" +# +# ## Set to true to collect Swarm metrics(desired_replicas, running_replicas) +# gather_services = false +# +# ## Only collect metrics for these containers, collect all if empty +# container_names = [] +# +# ## Set the source tag for the metrics to the container ID hostname, eg first 12 chars +# source_tag = false +# +# ## Containers to include and exclude. Globs accepted. +# ## Note that an empty array for both will include all containers +# container_name_include = [] +# container_name_exclude = [] +# +# ## Container states to include and exclude. Globs accepted. +# ## When empty only containers in the "running" state will be captured. 
+# ## example: container_state_include = ["created", "restarting", "running", "removing", "paused", "exited", "dead"] +# ## example: container_state_exclude = ["created", "restarting", "running", "removing", "paused", "exited", "dead"] +# # container_state_include = [] +# # container_state_exclude = [] +# +# ## Timeout for docker list, info, and stats commands +# timeout = "5s" +# +# ## Whether to report for each container per-device blkio (8:0, 8:1...) and +# ## network (eth0, eth1, ...) stats or not +# perdevice = true +# +# ## Whether to report for each container total blkio and network stats or not +# total = false +# +# ## Which environment variables should we use as a tag +# ##tag_env = ["JAVA_HOME", "HEAP_SIZE"] +# +# ## docker labels to include and exclude as tags. Globs accepted. +# ## Note that an empty array for both will include all labels as tags +# docker_label_include = [] +# docker_label_exclude = [] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read statistics from one or many dovecot servers +# [[inputs.dovecot]] +# ## specify dovecot servers via an address:port list +# ## e.g. +# ## localhost:24242 +# ## +# ## If no servers are specified, then localhost is used as the host. +# servers = ["localhost:24242"] +# +# ## Type is one of "user", "domain", "ip", or "global" +# type = "global" +# +# ## Wildcard matches like "*.com". An empty string "" is same as "*" +# ## If type = "ip" filters should be +# filters = [""] + + +# # Read metrics about docker containers from Fargate/ECS v2, v3 meta endpoints. +# [[inputs.ecs]] +# ## ECS metadata url. +# ## Metadata v2 API is used if set explicitly. Otherwise, +# ## v3 metadata endpoint API is used if available. +# # endpoint_url = "" +# +# ## Containers to include and exclude. Globs accepted. 
+# ## Note that an empty array for both will include all containers +# # container_name_include = [] +# # container_name_exclude = [] +# +# ## Container states to include and exclude. Globs accepted. +# ## When empty only containers in the "RUNNING" state will be captured. +# ## Possible values are "NONE", "PULLED", "CREATED", "RUNNING", +# ## "RESOURCES_PROVISIONED", "STOPPED". +# # container_status_include = [] +# # container_status_exclude = [] +# +# ## ecs labels to include and exclude as tags. Globs accepted. +# ## Note that an empty array for both will include all labels as tags +# ecs_label_include = [ "com.amazonaws.ecs.*" ] +# ecs_label_exclude = [] +# +# ## Timeout for queries. +# # timeout = "5s" + + +# # Read stats from one or more Elasticsearch servers or clusters +# [[inputs.elasticsearch]] +# ## specify a list of one or more Elasticsearch servers +# # you can add username and password to your url to use basic authentication: +# # servers = ["http://user:pass@localhost:9200"] +# servers = ["http://localhost:9200"] +# +# ## Timeout for HTTP requests to the elastic search server(s) +# http_timeout = "5s" +# +# ## When local is true (the default), the node will read only its own stats. +# ## Set local to false when you want to read the node stats from all nodes +# ## of the cluster. +# local = true +# +# ## Set cluster_health to true when you want to also obtain cluster health stats +# cluster_health = false +# +# ## Adjust cluster_health_level when you want to also obtain detailed health stats +# ## The options are +# ## - indices (default) +# ## - cluster +# # cluster_health_level = "indices" +# +# ## Set cluster_stats to true when you want to also obtain cluster stats. +# cluster_stats = false +# +# ## Only gather cluster_stats from the master node. 
For this to work, local = true is required +# cluster_stats_only_from_master = true +# +# ## Indices to collect; can be one or more indices names or _all +# indices_include = ["_all"] +# +# ## One of "shards", "cluster", "indices" +# indices_level = "shards" +# +# ## node_stats is a list of sub-stats that you want to have gathered. Valid options +# ## are "indices", "os", "process", "jvm", "thread_pool", "fs", "transport", "http", +# ## "breaker". Per default, all stats are gathered. +# # node_stats = ["jvm", "http"] +# +# ## HTTP Basic Authentication username and password. +# # username = "" +# # password = "" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Returns ethtool statistics for given interfaces +[[inputs.ethtool]] +# ## List of interfaces to pull metrics for +# # interface_include = ["eth0"] +# +# ## List of interfaces to ignore when pulling metrics. +# # interface_exclude = ["eth1"] + + +# # Read metrics from one or more commands that can output to stdout +# [[inputs.exec]] +# ## Commands array +# commands = [ +# "/tmp/test.sh", +# "/usr/bin/mycollector --foo=bar", +# "/tmp/collect_*.sh" +# ] +# +# ## Timeout for each command to complete. +# timeout = "5s" +# +# ## measurement name suffix (for separating different commands) +# name_suffix = "_mycollector" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read metrics from fail2ban. +# [[inputs.fail2ban]] +# ## Use sudo to run fail2ban-client +# use_sudo = false + + +# # Read devices value(s) from a Fibaro controller +# [[inputs.fibaro]] +# ## Required Fibaro controller address/hostname. 
+# ## Note: at the time of writing this plugin, Fibaro only implemented http - no https available +# url = "http://:80" +# +# ## Required credentials to access the API (http://) +# username = "" +# password = "" +# +# ## Amount of time allowed to complete the HTTP request +# # timeout = "5s" + + +# # Parse a complete file each interval +# [[inputs.file]] +# ## Files to parse each interval. Accept standard unix glob matching rules, +# ## as well as ** to match recursive files and directories. +# files = ["/tmp/metrics.out"] +# +# ## Name a tag containing the name of the file the data was parsed from. Leave empty +# ## to disable. +# # file_tag = "" +# +# ## Character encoding to use when interpreting the file contents. Invalid +# ## characters are replaced using the unicode replacement character. When set +# ## to the empty string the data is not decoded to text. +# ## ex: character_encoding = "utf-8" +# ## character_encoding = "utf-16le" +# ## character_encoding = "utf-16be" +# ## character_encoding = "" +# # character_encoding = "" +# +# ## The dataformat to be read from files +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Count files in a directory +# [[inputs.filecount]] +# ## Directory to gather stats about. +# ## deprecated in 1.9; use the directories option +# # directory = "/var/cache/apt/archives" +# +# ## Directories to gather stats about. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". 
ie: +# ## /var/log/** -> recursively find all directories in /var/log and count files in each directories +# ## /var/log/*/* -> find all directories with a parent dir in /var/log and count files in each directories +# ## /var/log -> count all files in /var/log and all of its subdirectories +# directories = ["/var/cache/apt/archives"] +# +# ## Only count files that match the name pattern. Defaults to "*". +# name = "*.deb" +# +# ## Count files in subdirectories. Defaults to true. +# recursive = false +# +# ## Only count regular files. Defaults to true. +# regular_only = true +# +# ## Follow all symlinks while walking the directory tree. Defaults to false. +# follow_symlinks = false +# +# ## Only count files that are at least this size. If size is +# ## a negative number, only count files that are smaller than the +# ## absolute value of size. Acceptable units are B, KiB, MiB, KB, ... +# ## Without quotes and units, interpreted as size in bytes. +# size = "0B" +# +# ## Only count files that have not been touched for at least this +# ## duration. If mtime is negative, only count files that have been +# ## touched in this duration. Defaults to "0s". +# mtime = "0s" + + +# # Read stats about given file(s) +# [[inputs.filestat]] +# ## Files to gather stats about. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". ie: +# ## "/var/log/**.log" -> recursively find all .log files in /var/log +# ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log +# ## "/var/log/apache.log" -> just tail the apache log file +# ## +# ## See https://github.com/gobwas/glob for more examples +# ## +# files = ["/var/log/**.log"] +# +# ## If true, read the entire file and calculate an md5 checksum. 
+# md5 = false + + +# # Read real time temps from fireboard.io servers +# [[inputs.fireboard]] +# ## Specify auth token for your account +# auth_token = "invalidAuthToken" +# ## You can override the fireboard server URL if necessary +# # url = "https://fireboard.io/api/v1/devices.json" +# ## You can set a different http_timeout if you need to +# ## You should set a string using a number and time indicator +# ## for example "12s" for 12 seconds. +# # http_timeout = "4s" + + +# # Read metrics exposed by fluentd in_monitor plugin +# [[inputs.fluentd]] +# ## This plugin reads information exposed by fluentd (using /api/plugins.json endpoint). +# ## +# ## Endpoint: +# ## - only one URI is allowed +# ## - https is not supported +# endpoint = "http://localhost:24220/api/plugins.json" +# +# ## Define which plugins have to be excluded (based on "type" field - e.g. monitor_agent) +# exclude = [ +# "monitor_agent", +# "dummy", +# ] + + +# # Gather repository information from GitHub hosted repositories. +# [[inputs.github]] +# ## List of repositories to monitor. +# repositories = [ +# "influxdata/telegraf", +# "influxdata/influxdb" +# ] +# +# ## Github API access token. Unauthenticated requests are limited to 60 per hour. +# # access_token = "" +# +# ## Github API enterprise url. Github Enterprise accounts must specify their base url. +# # enterprise_base_url = "" +# +# ## Timeout for HTTP requests. +# # http_timeout = "5s" + + +# # Read flattened metrics from one or more GrayLog HTTP endpoints +# [[inputs.graylog]] +# ## API endpoint, currently supported API: +# ## +# ## - multiple (Ex http://:12900/system/metrics/multiple) +# ## - namespace (Ex http://:12900/system/metrics/namespace/{namespace}) +# ## +# ## For namespace endpoint, the metrics array will be ignored for that call. +# ## Endpoint can contain namespace and multiple type calls. 
+# ## +# ## Please check http://[graylog-server-ip]:12900/api-browser for full list +# ## of endpoints +# servers = [ +# "http://[graylog-server-ip]:12900/system/metrics/multiple", +# ] +# +# ## Metrics list +# ## List of metrics can be found on Graylog webservice documentation. +# ## Or by hitting the web service api at: +# ## http://[graylog-host]:12900/system/metrics +# metrics = [ +# "jvm.cl.loaded", +# "jvm.memory.pools.Metaspace.committed" +# ] +# +# ## Username and password +# username = "" +# password = "" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics of haproxy, via socket or csv stats page +# [[inputs.haproxy]] +# ## An array of addresses to gather stats about. Specify an ip or hostname +# ## with optional port. ie localhost, 10.10.3.33:1936, etc. +# ## Make sure you specify the complete path to the stats endpoint +# ## including the protocol, ie http://10.10.3.33:1936/haproxy?stats +# +# ## If no servers are specified, then default to 127.0.0.1:1936/haproxy?stats +# servers = ["http://myhaproxy.com:1936/haproxy?stats"] +# +# ## Credentials for basic HTTP authentication +# # username = "admin" +# # password = "admin" +# +# ## You can also use local socket with standard wildcard globbing. +# ## Server address not starting with 'http' will be treated as a possible +# ## socket, so both examples below are valid. +# # servers = ["socket:/run/haproxy/admin.sock", "/run/haproxy/*.sock"] +# +# ## By default, some of the fields are renamed from what haproxy calls them. +# ## Setting this option to true results in the plugin keeping the original +# ## field names. 
+# # keep_field_names = false +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Monitor disks' temperatures using hddtemp +# [[inputs.hddtemp]] +# ## By default, telegraf gathers temps data from all disks detected by the +# ## hddtemp. +# ## +# ## Only collect temps from the selected disks. +# ## +# ## A * as the device name will return the temperature values of all disks. +# ## +# # address = "127.0.0.1:7634" +# # devices = ["sda", "*"] + + +# # Read formatted metrics from one or more HTTP endpoints +# [[inputs.http]] +# ## One or more URLs from which to read formatted metrics +# urls = [ +# "http://localhost/metrics" +# ] +# +# ## HTTP method +# # method = "GET" +# +# ## Optional HTTP headers +# # headers = {"X-Special-Header" = "Special-Value"} +# +# ## Optional file with Bearer token +# ## file content is added as an Authorization header +# # bearer_token = "/path/to/file" +# +# ## Optional HTTP Basic Auth Credentials +# # username = "username" +# # password = "pa$$word" +# +# ## HTTP entity-body to send with POST/PUT requests. +# # body = "" +# +# ## HTTP Content-Encoding for write request body, can be set to "gzip" to +# ## compress body or "identity" to apply no encoding. +# # content_encoding = "identity" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Amount of time allowed to complete the HTTP request +# # timeout = "5s" +# +# ## List of success status codes +# # success_status_codes = [200] +# +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# # data_format = "influx" + + +# # HTTP/HTTPS request given an address, a method, and a timeout +# [[inputs.http_response]] +# ## Deprecated in 1.12, use 'urls' +# ## Server address (default http://localhost) +# # address = "http://localhost" +# +# ## List of urls to query. +# # urls = ["http://localhost"] +# +# ## Set http_proxy (telegraf uses the system wide proxy settings if it is not set) +# # http_proxy = "http://localhost:8888" +# +# ## Set response_timeout (default 5 seconds) +# # response_timeout = "5s" +# +# ## HTTP Request Method +# # method = "GET" +# +# ## Whether to follow redirects from the server (defaults to false) +# # follow_redirects = false +# +# ## Optional file with Bearer token +# ## file content is added as an Authorization header +# # bearer_token = "/path/to/file" +# +# ## Optional HTTP Basic Auth Credentials +# # username = "username" +# # password = "pa$$word" +# +# ## Optional HTTP Request Body +# # body = ''' +# # {'fake':'data'} +# # ''' +# +# ## Optional name of the field that will contain the body of the response. +# ## By default it is set to an empty String indicating that the body's content won't be added +# # response_body_field = '' +# +# ## Maximum allowed HTTP response body size in bytes. +# ## 0 means to use the default of 32MiB. 
+# ## If the response body size exceeds this limit a "body_read_error" will be raised +# # response_body_max_size = "32MiB" +# +# ## Optional substring or regex match in body of the response (case sensitive) +# # response_string_match = "\"service_status\": \"up\"" +# # response_string_match = "ok" +# # response_string_match = "\".*_status\".?:.?\"up\"" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## HTTP Request Headers (all values must be strings) +# # [inputs.http_response.headers] +# # Host = "github.com" +# +# ## Optional setting to map response http headers into tags +# ## If the http header is not present on the request, no corresponding tag will be added +# ## If multiple instances of the http header are present, only the first value will be used +# # http_header_tags = {"HTTP_HEADER" = "TAG_NAME"} +# +# ## Interface to use when dialing an address +# # interface = "eth0" + + +# # Read flattened metrics from one or more JSON HTTP endpoints +# [[inputs.httpjson]] +# ## NOTE This plugin only reads numerical measurements, strings and booleans +# ## will be ignored. +# +# ## Name for the service being polled. Will be appended to the name of the +# ## measurement e.g. httpjson_webserver_stats +# ## +# ## Deprecated (1.3.0): Use name_override, name_suffix, name_prefix instead. 
+# name = "webserver_stats" +# +# ## URL of each server in the service's cluster +# servers = [ +# "http://localhost:9999/stats/", +# "http://localhost:9998/stats/", +# ] +# ## Set response_timeout (default 5 seconds) +# response_timeout = "5s" +# +# ## HTTP method to use: GET or POST (case-sensitive) +# method = "GET" +# +# ## List of tag names to extract from top-level of JSON server response +# # tag_keys = [ +# # "my_tag_1", +# # "my_tag_2" +# # ] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## HTTP parameters (all values must be strings). For "GET" requests, data +# ## will be included in the query. For "POST" requests, data will be included +# ## in the request body as "x-www-form-urlencoded". +# # [inputs.httpjson.parameters] +# # event_type = "cpu_spike" +# # threshold = "0.75" +# +# ## HTTP Headers (all values must be strings) +# # [inputs.httpjson.headers] +# # X-Auth-Token = "my-xauth-token" +# # apiVersion = "v1" + + +# # Gather Icinga2 status +# [[inputs.icinga2]] +# ## Required Icinga2 server address +# # server = "https://localhost:5665" +# +# ## Required Icinga2 object type ("services" or "hosts") +# # object_type = "services" +# +# ## Credentials for basic HTTP authentication +# # username = "admin" +# # password = "admin" +# +# ## Maximum time to receive response. 
+# # response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = true + + +# # Gets counters from all InfiniBand cards and ports installed +# [[inputs.infiniband]] +# # no configuration + + +# # Read InfluxDB-formatted JSON metrics from one or more HTTP endpoints +# [[inputs.influxdb]] +# ## Works with InfluxDB debug endpoints out of the box, +# ## but other services can use this format too. +# ## See the influxdb plugin's README for more details. +# +# ## Multiple URLs from which to read InfluxDB-formatted JSON +# ## Default is "http://localhost:8086/debug/vars". +# urls = [ +# "http://localhost:8086/debug/vars" +# ] +# +# ## Username and password to send using HTTP Basic Authentication. +# # username = "" +# # password = "" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## http request & header timeout +# timeout = "5s" + + +# # Collect statistics about itself +# [[inputs.internal]] +# ## If true, collect telegraf memory stats. +# # collect_memstats = true + + +# # This plugin gathers interrupts data from /proc/interrupts and /proc/softirqs. +# [[inputs.interrupts]] +# ## When set to true, cpu metrics are tagged with the cpu. Otherwise cpu is +# ## stored as a field. +# ## +# ## The default is false for backwards compatibility, and will be changed to +# ## true in a future version. It is recommended to set to true on new +# ## deployments. +# # cpu_as_tag = false +# +# ## To filter which IRQs to collect, make use of tagpass / tagdrop, i.e. 
+# # [inputs.interrupts.tagdrop] +# # irq = [ "NET_RX", "TASKLET" ] + + +# # Read metrics from the bare metal servers via IPMI +# [[inputs.ipmi_sensor]] +# ## optionally specify the path to the ipmitool executable +# # path = "/usr/bin/ipmitool" +# ## +# ## Setting 'use_sudo' to true will make use of sudo to run ipmitool. +# ## Sudo must be configured to allow the telegraf user to run ipmitool +# ## without a password. +# # use_sudo = false +# ## +# ## optionally force session privilege level. Can be CALLBACK, USER, OPERATOR, ADMINISTRATOR +# # privilege = "ADMINISTRATOR" +# ## +# ## optionally specify one or more servers via a url matching +# ## [username[:password]@][protocol[(address)]] +# ## e.g. +# ## root:passwd@lan(127.0.0.1) +# ## +# ## if no servers are specified, local machine sensor stats will be queried +# ## +# # servers = ["USERID:PASSW0RD@lan(192.168.1.1)"] +# +# ## Recommended: use metric 'interval' that is a multiple of 'timeout' to avoid +# ## gaps or overlap in pulled data +# interval = "30s" +# +# ## Timeout for the ipmitool command to complete +# timeout = "20s" +# +# ## Schema Version: (Optional, defaults to version 1) +# metric_version = 2 + + +# # Gather packets and bytes counters from Linux ipsets +# [[inputs.ipset]] +# ## By default, we only show sets which have already matched at least 1 packet. +# ## set include_unmatched_sets = true to gather them all. +# include_unmatched_sets = false +# ## Adjust your sudo settings appropriately if using this option ("sudo ipset save") +# use_sudo = false +# ## The default timeout of 1s for ipset execution can be overridden here: +# # timeout = "1s" + + +# # Gather packets and bytes throughput from iptables +# [[inputs.iptables]] +# ## iptables require root access on most systems. +# ## Setting 'use_sudo' to true will make use of sudo to run iptables. +# ## Users must configure sudo to allow telegraf user to run iptables with no password. 
+# ## iptables can be restricted to only list command "iptables -nvL". +# use_sudo = false +# ## Setting 'use_lock' to true runs iptables with the "-w" option. +# ## Adjust your sudo settings appropriately if using this option ("iptables -w 5 -nvl") +# use_lock = false +# ## Define an alternate executable, such as "ip6tables". Default is "iptables". +# # binary = "ip6tables" +# ## defines the table to monitor: +# table = "filter" +# ## defines the chains to monitor. +# ## NOTE: iptables rules without a comment will not be monitored. +# ## Read the plugin documentation for more information. +# chains = [ "INPUT" ] + + +# # Collect virtual and real server stats from Linux IPVS +# [[inputs.ipvs]] +# # no configuration + + +# # Read jobs and cluster metrics from Jenkins instances +# [[inputs.jenkins]] +# ## The Jenkins URL in the format "schema://host:port" +# url = "http://my-jenkins-instance:8080" +# # username = "admin" +# # password = "admin" +# +# ## Set response_timeout +# response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use SSL but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Optional Max Job Build Age filter +# ## Default 1 hour, ignore builds older than max_build_age +# # max_build_age = "1h" +# +# ## Optional Sub Job Depth filter +# ## Jenkins can have unlimited layer of sub jobs +# ## This config will limit the layers of pulling, default value 0 means +# ## unlimited pulling until no more sub jobs +# # max_subjob_depth = 0 +# +# ## Optional Sub Job Per Layer +# ## In workflow-multibranch-plugin, each branch will be created as a sub job. 
+# ## This config will limit to call only the latest branches in each layer, +# ## empty will use default value 10 +# # max_subjob_per_layer = 10 +# +# ## Jobs to exclude from gathering +# # job_exclude = [ "job1", "job2/subjob1/subjob2", "job3/*"] +# +# ## Nodes to exclude from gathering +# # node_exclude = [ "node1", "node2" ] +# +# ## Worker pool for jenkins plugin only +# ## Empty this field will use default value 5 +# # max_connections = 5 + + +# # Read JMX metrics through Jolokia +# [[inputs.jolokia]] +# # DEPRECATED: the jolokia plugin has been deprecated in favor of the +# # jolokia2 plugin +# # see https://github.com/influxdata/telegraf/tree/master/plugins/inputs/jolokia2 +# +# ## This is the context root used to compose the jolokia url +# ## NOTE that Jolokia requires a trailing slash at the end of the context root +# ## NOTE that your jolokia security policy must allow for POST requests. +# context = "/jolokia/" +# +# ## This specifies the mode used +# # mode = "proxy" +# # +# ## When in proxy mode this section is used to specify further +# ## proxy address configurations. +# ## Remember to change host address to fit your environment. +# # [inputs.jolokia.proxy] +# # host = "127.0.0.1" +# # port = "8080" +# +# ## Optional http timeouts +# ## +# ## response_header_timeout, if non-zero, specifies the amount of time to wait +# ## for a server's response headers after fully writing the request. +# # response_header_timeout = "3s" +# ## +# ## client_timeout specifies a time limit for requests made by this client. +# ## Includes connection time, any redirects, and reading the response body. +# # client_timeout = "4s" +# +# ## Attribute delimiter +# ## +# ## When multiple attributes are returned for a single +# ## [inputs.jolokia.metrics], the field name is a concatenation of the metric +# ## name, and the attribute name, separated by the given delimiter. 
+# # delimiter = "_" +# +# ## List of servers exposing jolokia read service +# [[inputs.jolokia.servers]] +# name = "as-server-01" +# host = "127.0.0.1" +# port = "8080" +# # username = "myuser" +# # password = "mypassword" +# +# ## List of metrics collected on above servers +# ## Each metric consists in a name, a jmx path and either +# ## a pass or drop slice attribute. +# ## This collect all heap memory usage metrics. +# [[inputs.jolokia.metrics]] +# name = "heap_memory_usage" +# mbean = "java.lang:type=Memory" +# attribute = "HeapMemoryUsage" +# +# ## This collect thread counts metrics. +# [[inputs.jolokia.metrics]] +# name = "thread_count" +# mbean = "java.lang:type=Threading" +# attribute = "TotalStartedThreadCount,ThreadCount,DaemonThreadCount,PeakThreadCount" +# +# ## This collect number of class loaded/unloaded counts metrics. +# [[inputs.jolokia.metrics]] +# name = "class_count" +# mbean = "java.lang:type=ClassLoading" +# attribute = "LoadedClassCount,UnloadedClassCount,TotalLoadedClassCount" + + +# # Read JMX metrics from a Jolokia REST agent endpoint +# [[inputs.jolokia2_agent]] +# # default_tag_prefix = "" +# # default_field_prefix = "" +# # default_field_separator = "." +# +# # Add agents URLs to query +# urls = ["http://localhost:8080/jolokia"] +# # username = "" +# # password = "" +# # response_timeout = "5s" +# +# ## Optional TLS config +# # tls_ca = "/var/private/ca.pem" +# # tls_cert = "/var/private/client.pem" +# # tls_key = "/var/private/client-key.pem" +# # insecure_skip_verify = false +# +# ## Add metrics to read +# [[inputs.jolokia2_agent.metric]] +# name = "java_runtime" +# mbean = "java.lang:type=Runtime" +# paths = ["Uptime"] + + +# # Read JMX metrics from a Jolokia REST proxy endpoint +# [[inputs.jolokia2_proxy]] +# # default_tag_prefix = "" +# # default_field_prefix = "" +# # default_field_separator = "." 
+# +# ## Proxy agent +# url = "http://localhost:8080/jolokia" +# # username = "" +# # password = "" +# # response_timeout = "5s" +# +# ## Optional TLS config +# # tls_ca = "/var/private/ca.pem" +# # tls_cert = "/var/private/client.pem" +# # tls_key = "/var/private/client-key.pem" +# # insecure_skip_verify = false +# +# ## Add proxy targets to query +# # default_target_username = "" +# # default_target_password = "" +# [[inputs.jolokia2_proxy.target]] +# url = "service:jmx:rmi:///jndi/rmi://targethost:9999/jmxrmi" +# # username = "" +# # password = "" +# +# ## Add metrics to read +# [[inputs.jolokia2_proxy.metric]] +# name = "java_runtime" +# mbean = "java.lang:type=Runtime" +# paths = ["Uptime"] + + +# # Read Kapacitor-formatted JSON metrics from one or more HTTP endpoints +# [[inputs.kapacitor]] +# ## Multiple URLs from which to read Kapacitor-formatted JSON +# ## Default is "http://localhost:9092/kapacitor/v1/debug/vars". +# urls = [ +# "http://localhost:9092/kapacitor/v1/debug/vars" +# ] +# +# ## Time limit for http requests +# timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Get kernel statistics from /proc/vmstat +# [[inputs.kernel_vmstat]] +# # no configuration + + +# # Read status information from one or more Kibana servers +# [[inputs.kibana]] +# ## Specify a list of one or more Kibana servers +# servers = ["http://localhost:5601"] +# +# ## Timeout for HTTP requests +# timeout = "5s" +# +# ## HTTP Basic Auth credentials +# # username = "username" +# # password = "pa$$word" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics from the Kubernetes api +# 
[[inputs.kube_inventory]] +# ## URL for the Kubernetes API +# url = "https://127.0.0.1" +# +# ## Namespace to use. Set to "" to use all namespaces. +# # namespace = "default" +# +# ## Use bearer token for authorization. ('bearer_token' takes priority) +# ## If both of these are empty, we'll use the default serviceaccount: +# ## at: /run/secrets/kubernetes.io/serviceaccount/token +# # bearer_token = "/path/to/bearer/token" +# ## OR +# # bearer_token_string = "abc_123" +# +# ## Set response_timeout (default 5 seconds) +# # response_timeout = "5s" +# +# ## Optional Resources to exclude from gathering +# ## Leave this blank to try to gather everything available. +# ## Values can be - "daemonsets", "deployments", "endpoints", "ingress", "nodes", +# ## "persistentvolumes", "persistentvolumeclaims", "pods", "services", "statefulsets" +# # resource_exclude = [ "deployments", "nodes", "statefulsets" ] +# +# ## Optional Resources to include when gathering +# ## Overrides resource_exclude if both set. +# # resource_include = [ "deployments", "nodes", "statefulsets" ] +# +# ## selectors to include and exclude as tags. Globs accepted. +# ## Note that an empty array for both will include all selectors as tags +# ## selector_exclude overrides selector_include if both set. +# # selector_include = [] +# # selector_exclude = ["*"] +# +# ## Optional TLS Config +# # tls_ca = "/path/to/cafile" +# # tls_cert = "/path/to/certfile" +# # tls_key = "/path/to/keyfile" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics from the kubernetes kubelet api +# [[inputs.kubernetes]] +# ## URL for the kubelet +# url = "http://127.0.0.1:10255" +# +# ## Use bearer token for authorization. 
('bearer_token' takes priority) +# ## If both of these are empty, we'll use the default serviceaccount: +# ## at: /run/secrets/kubernetes.io/serviceaccount/token +# # bearer_token = "/path/to/bearer/token" +# ## OR +# # bearer_token_string = "abc_123" +# +# ## Pod labels to be added as tags. An empty array for both include and +# ## exclude will include all labels. +# # label_include = [] +# # label_exclude = ["*"] +# +# ## Set response_timeout (default 5 seconds) +# # response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/path/to/cafile" +# # tls_cert = "/path/to/certfile" +# # tls_key = "/path/to/keyfile" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics from a LeoFS Server via SNMP +# [[inputs.leofs]] +# ## An array of URLs of the form: +# ## host [ ":" port] +# servers = ["127.0.0.1:4020"] + + +# # Provides Linux sysctl fs metrics +# [[inputs.linux_sysctl_fs]] +# # no configuration + + +# # Read metrics exposed by Logstash +# [[inputs.logstash]] +# ## The URL of the exposed Logstash API endpoint. +# url = "http://127.0.0.1:9600" +# +# ## Use Logstash 5 single pipeline API, set to true when monitoring +# ## Logstash 5. +# # single_pipeline = false +# +# ## Enable optional collection components. Can contain +# ## "pipelines", "process", and "jvm". +# # collect = ["pipelines", "process", "jvm"] +# +# ## Timeout for HTTP requests. +# # timeout = "5s" +# +# ## Optional HTTP Basic Auth credentials. +# # username = "username" +# # password = "pa$$word" +# +# ## Optional TLS Config. +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## Use TLS but skip chain & host verification. +# # insecure_skip_verify = false +# +# ## Optional HTTP headers. 
+# # [inputs.logstash.headers] +# # "X-Special-Header" = "Special-Value" + + +# # Read metrics from local Lustre service on OST, MDS +# [[inputs.lustre2]] +# ## An array of /proc globs to search for Lustre stats +# ## If not specified, the default will work on Lustre 2.5.x +# ## +# # ost_procfiles = [ +# # "/proc/fs/lustre/obdfilter/*/stats", +# # "/proc/fs/lustre/osd-ldiskfs/*/stats", +# # "/proc/fs/lustre/obdfilter/*/job_stats", +# # ] +# # mds_procfiles = [ +# # "/proc/fs/lustre/mdt/*/md_stats", +# # "/proc/fs/lustre/mdt/*/job_stats", +# # ] + + +# # Gathers metrics from the /3.0/reports MailChimp API +# [[inputs.mailchimp]] +# ## MailChimp API key +# ## get from https://admin.mailchimp.com/account/api/ +# api_key = "" # required +# ## Reports for campaigns sent more than days_old ago will not be collected. +# ## 0 means collect all. +# days_old = 0 +# ## Campaign ID to get, if empty gets all campaigns, this option overrides days_old +# # campaign_id = "" + + +# # Retrieves information on a specific host in a MarkLogic Cluster +# [[inputs.marklogic]] +# ## Base URL of the MarkLogic HTTP Server. +# url = "http://localhost:8002" +# +# ## List of specific hostnames to retrieve information. At least (1) required. +# # hosts = ["hostname1", "hostname2"] +# +# ## Using HTTP Basic Authentication. Management API requires 'manage-user' role privileges +# # username = "myuser" +# # password = "mypassword" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics from one or many mcrouter servers +# [[inputs.mcrouter]] +# ## An array of address to gather stats about. Specify an ip or hostname +# ## with port. ie tcp://localhost:11211, tcp://10.0.0.1:11211, etc. 
+# servers = ["tcp://localhost:11211", "unix:///var/run/mcrouter.sock"] +# +# ## Timeout for metric collections from all servers. Minimum timeout is "1s". +# # timeout = "5s" + + +# # Read metrics from one or many memcached servers +# [[inputs.memcached]] +# ## An array of addresses to gather stats about. Specify an ip or hostname +# ## with optional port. ie localhost, 10.0.0.1:11211, etc. +# servers = ["localhost:11211"] +# # unix_sockets = ["/var/run/memcached.sock"] + + +# # Telegraf plugin for gathering metrics from N Mesos masters +# [[inputs.mesos]] +# ## Timeout, in ms. +# timeout = 100 +# +# ## A list of Mesos masters. +# masters = ["http://localhost:5050"] +# +# ## Master metrics groups to be collected, by default, all enabled. +# master_collections = [ +# "resources", +# "master", +# "system", +# "agents", +# "frameworks", +# "framework_offers", +# "tasks", +# "messages", +# "evqueue", +# "registrar", +# "allocator", +# ] +# +# ## A list of Mesos slaves, default is [] +# # slaves = [] +# +# ## Slave metrics groups to be collected, by default, all enabled. +# # slave_collections = [ +# # "resources", +# # "agent", +# # "system", +# # "executors", +# # "tasks", +# # "messages", +# # ] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Collects scores from a Minecraft server's scoreboard using the RCON protocol +# [[inputs.minecraft]] +# ## Address of the Minecraft server. +# # server = "localhost" +# +# ## Server RCON Port. +# # port = "25575" +# +# ## Server RCON Password. +# password = "" +# +# ## Uncomment to remove deprecated metric components. 
+# # tagdrop = ["server"] + + +# # Retrieve data from MODBUS slave devices +# [[inputs.modbus]] +# ## Connection Configuration +# ## +# ## The plugin supports connections to PLCs via MODBUS/TCP or +# ## via serial line communication in binary (RTU) or readable (ASCII) encoding +# ## +# ## Device name +# name = "Device" +# +# ## Slave ID - addresses a MODBUS device on the bus +# ## Range: 0 - 255 [0 = broadcast; 248 - 255 = reserved] +# slave_id = 1 +# +# ## Timeout for each request +# timeout = "1s" +# +# ## Maximum number of retries and the time to wait between retries +# ## when a slave-device is busy. +# # busy_retries = 0 +# # busy_retries_wait = "100ms" +# +# # TCP - connect via Modbus/TCP +# controller = "tcp://localhost:502" +# +# ## Serial (RS485; RS232) +# # controller = "file:///dev/ttyUSB0" +# # baud_rate = 9600 +# # data_bits = 8 +# # parity = "N" +# # stop_bits = 1 +# # transmission_mode = "RTU" +# +# +# ## Measurements +# ## +# +# ## Digital Variables, Discrete Inputs and Coils +# ## measurement - the (optional) measurement name, defaults to "modbus" +# ## name - the variable name +# ## address - variable address +# +# discrete_inputs = [ +# { name = "start", address = [0]}, +# { name = "stop", address = [1]}, +# { name = "reset", address = [2]}, +# { name = "emergency_stop", address = [3]}, +# ] +# coils = [ +# { name = "motor1_run", address = [0]}, +# { name = "motor1_jog", address = [1]}, +# { name = "motor1_stop", address = [2]}, +# ] +# +# ## Analog Variables, Input Registers and Holding Registers +# ## measurement - the (optional) measurement name, defaults to "modbus" +# ## name - the variable name +# ## byte_order - the ordering of bytes +# ## |---AB, ABCD - Big Endian +# ## |---BA, DCBA - Little Endian +# ## |---BADC - Mid-Big Endian +# ## |---CDAB - Mid-Little Endian +# ## data_type - INT16, UINT16, INT32, UINT32, INT64, UINT64, FLOAT32-IEEE (the IEEE 754 binary representation) +# ## FLOAT32, FIXED, UFIXED (fixed-point representation on 
input) +# ## scale - the final numeric variable representation +# ## address - variable address +# +# holding_registers = [ +# { name = "power_factor", byte_order = "AB", data_type = "FIXED", scale=0.01, address = [8]}, +# { name = "voltage", byte_order = "AB", data_type = "FIXED", scale=0.1, address = [0]}, +# { name = "energy", byte_order = "ABCD", data_type = "FIXED", scale=0.001, address = [5,6]}, +# { name = "current", byte_order = "ABCD", data_type = "FIXED", scale=0.001, address = [1,2]}, +# { name = "frequency", byte_order = "AB", data_type = "UFIXED", scale=0.1, address = [7]}, +# { name = "power", byte_order = "ABCD", data_type = "UFIXED", scale=0.1, address = [3,4]}, +# ] +# input_registers = [ +# { name = "tank_level", byte_order = "AB", data_type = "INT16", scale=1.0, address = [0]}, +# { name = "tank_ph", byte_order = "AB", data_type = "INT16", scale=1.0, address = [1]}, +# { name = "pump1_speed", byte_order = "ABCD", data_type = "INT32", scale=1.0, address = [3,4]}, +# ] + + +# # Read metrics from one or many MongoDB servers +# [[inputs.mongodb]] +# ## An array of URLs of the form: +# ## "mongodb://" [user ":" pass "@"] host [ ":" port] +# ## For example: +# ## mongodb://user:auth_key@10.10.3.30:27017, +# ## mongodb://10.10.3.33:18832, +# servers = ["mongodb://127.0.0.1:27017"] +# +# ## When true, collect cluster status +# ## Note that the query that counts jumbo chunks triggers a COLLSCAN, which +# ## may have an impact on performance. 
+# # gather_cluster_status = true +# +# ## When true, collect per database stats +# # gather_perdb_stats = false +# +# ## When true, collect per collection stats +# # gather_col_stats = false +# +# ## List of db where collections stats are collected +# ## If empty, all db are concerned +# # col_stats_dbs = ["local"] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics and status information about processes managed by Monit +# [[inputs.monit]] +# ## Monit HTTPD address +# address = "http://127.0.0.1:2812" +# +# ## Username and Password for Monit +# # username = "" +# # password = "" +# +# ## Amount of time allowed to complete the HTTP request +# # timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Aggregates the contents of multiple files into a single point +# [[inputs.multifile]] +# ## Base directory where telegraf will look for files. +# ## Omit this option to use absolute paths. +# base_dir = "/sys/bus/i2c/devices/1-0076/iio:device0" +# +# ## If true, Telegraf discard all data when a single file can't be read. +# ## Else, Telegraf omits the field generated from this file. +# # fail_early = true +# +# ## Files to parse each interval. 
+# [[inputs.multifile.file]] +# file = "in_pressure_input" +# dest = "pressure" +# conversion = "float" +# [[inputs.multifile.file]] +# file = "in_temp_input" +# dest = "temperature" +# conversion = "float(3)" +# [[inputs.multifile.file]] +# file = "in_humidityrelative_input" +# dest = "humidityrelative" +# conversion = "float(3)" + + +# # Read metrics from one or many mysql servers +# [[inputs.mysql]] +# ## specify servers via a url matching: +# ## [username[:password]@][protocol[(address)]]/[?tls=[true|false|skip-verify|custom]] +# ## see https://github.com/go-sql-driver/mysql#dsn-data-source-name +# ## e.g. +# ## servers = ["user:passwd@tcp(127.0.0.1:3306)/?tls=false"] +# ## servers = ["user@tcp(127.0.0.1:3306)/?tls=false"] +# # +# ## If no servers are specified, then localhost is used as the host. +# servers = ["tcp(127.0.0.1:3306)/"] +# +# ## Selects the metric output format. +# ## +# ## This option exists to maintain backwards compatibility, if you have +# ## existing metrics do not set or change this value until you are ready to +# ## migrate to the new format. +# ## +# ## If you do not have existing metrics from this plugin set to the latest +# ## version. 
+# ## +# ## Telegraf >=1.6: metric_version = 2 +# ## <1.6: metric_version = 1 (or unset) +# metric_version = 2 +# +# ## if the list is empty, then metrics are gathered from all database tables +# # table_schema_databases = [] +# +# ## gather metrics from INFORMATION_SCHEMA.TABLES for databases provided in the above list +# # gather_table_schema = false +# +# ## gather thread state counts from INFORMATION_SCHEMA.PROCESSLIST +# # gather_process_list = false +# +# ## gather user statistics from INFORMATION_SCHEMA.USER_STATISTICS +# # gather_user_statistics = false +# +# ## gather auto_increment columns and max values from information schema +# # gather_info_schema_auto_inc = false +# +# ## gather metrics from INFORMATION_SCHEMA.INNODB_METRICS +# # gather_innodb_metrics = false +# +# ## gather metrics from SHOW SLAVE STATUS command output +# # gather_slave_status = false +# +# ## gather metrics from SHOW BINARY LOGS command output +# # gather_binary_logs = false +# +# ## gather metrics from PERFORMANCE_SCHEMA.GLOBAL_VARIABLES +# # gather_global_variables = true +# +# ## gather metrics from PERFORMANCE_SCHEMA.TABLE_IO_WAITS_SUMMARY_BY_TABLE +# # gather_table_io_waits = false +# +# ## gather metrics from PERFORMANCE_SCHEMA.TABLE_LOCK_WAITS +# # gather_table_lock_waits = false +# +# ## gather metrics from PERFORMANCE_SCHEMA.TABLE_IO_WAITS_SUMMARY_BY_INDEX_USAGE +# # gather_index_io_waits = false +# +# ## gather metrics from PERFORMANCE_SCHEMA.EVENT_WAITS +# # gather_event_waits = false +# +# ## gather metrics from PERFORMANCE_SCHEMA.FILE_SUMMARY_BY_EVENT_NAME +# # gather_file_events_stats = false +# +# ## gather metrics from PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_DIGEST +# # gather_perf_events_statements = false +# +# ## the limits for metrics from perf_events_statements +# # perf_events_statements_digest_text_limit = 120 +# # perf_events_statements_limit = 250 +# # perf_events_statements_time_limit = 86400 +# +# ## Some queries we may want to run less often (such as 
SHOW GLOBAL VARIABLES) +# ## example: interval_slow = "30m" +# # interval_slow = "" +# +# ## Optional TLS Config (will be used if tls=custom parameter specified in server uri) +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Provides metrics about the state of a NATS server +# [[inputs.nats]] +# ## The address of the monitoring endpoint of the NATS server +# server = "http://localhost:8222" +# +# ## Maximum time to receive response +# # response_timeout = "5s" + + +# # Neptune Apex data collector +# [[inputs.neptune_apex]] +# ## The Neptune Apex plugin reads the publicly available status.xml data from a local Apex. +# ## Measurements will be logged under "apex". +# +# ## The base URL of the local Apex(es). If you specify more than one server, they will +# ## be differentiated by the "source" tag. +# servers = [ +# "http://apex.local", +# ] +# +# ## The response_timeout specifies how long to wait for a reply from the Apex. +# #response_timeout = "5s" + + +# # Read metrics about network interface usage +[[inputs.net]] +# ## By default, telegraf gathers stats from any up interface (excluding loopback) +# ## Setting interfaces will tell it to gather these explicit interfaces, +# ## regardless of status. +# ## +# # interfaces = ["eth0"] +# ## +# ## On linux systems telegraf also collects protocol stats. +# ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics. +# ## +# # ignore_protocol_stats = false +# ## + + +# # Collect response time of a TCP or UDP connection +# [[inputs.net_response]] +# ## Protocol, must be "tcp" or "udp" +# ## NOTE: because the "udp" protocol does not respond to requests, it requires +# ## a send/expect string pair (see below). 
+# protocol = "tcp" +# ## Server address (default localhost) +# address = "localhost:80" +# +# ## Set timeout +# # timeout = "1s" +# +# ## Set read timeout (only used if expecting a response) +# # read_timeout = "1s" +# +# ## The following options are required for UDP checks. For TCP, they are +# ## optional. The plugin will send the given string to the server and then +# ## expect to receive the given 'expect' string back. +# ## string sent to the server +# # send = "ssh" +# ## expected string in answer +# # expect = "ssh" +# +# ## Uncomment to remove deprecated fields +# # fielddrop = ["result_type", "string_found"] + + +# # Read TCP metrics such as established, time wait and sockets counts. +# [[inputs.netstat]] +# # no configuration + + +# # Read Nginx's basic status information (ngx_http_stub_status_module) +# [[inputs.nginx]] +# # An array of Nginx stub_status URI to gather stats. +# urls = ["http://localhost/server_status"] +# +# ## Optional TLS Config +# tls_ca = "/etc/telegraf/ca.pem" +# tls_cert = "/etc/telegraf/cert.cer" +# tls_key = "/etc/telegraf/key.key" +# ## Use TLS but skip chain & host verification +# insecure_skip_verify = false +# +# # HTTP response timeout (default: 5s) +# response_timeout = "5s" + + +# # Read Nginx Plus' full status information (ngx_http_status_module) +# [[inputs.nginx_plus]] +# ## An array of ngx_http_status_module or status URI to gather stats. +# urls = ["http://localhost/status"] +# +# # HTTP response timeout (default: 5s) +# response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read Nginx Plus Api documentation +# [[inputs.nginx_plus_api]] +# ## An array of API URI to gather stats. 
+# urls = ["http://localhost/api"] +# +# # Nginx API version, default: 3 +# # api_version = 3 +# +# # HTTP response timeout (default: 5s) +# response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read Nginx virtual host traffic status module information (nginx-module-sts) +# [[inputs.nginx_sts]] +# ## An array of ngx_http_status_module or status URI to gather stats. +# urls = ["http://localhost/status"] +# +# ## HTTP response timeout (default: 5s) +# response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read nginx_upstream_check module status information (https://github.com/yaoweibin/nginx_upstream_check_module) +# [[inputs.nginx_upstream_check]] +# ## An URL where Nginx Upstream check module is enabled +# ## It should be set to return a JSON formatted response +# url = "http://127.0.0.1/status?format=json" +# +# ## HTTP method +# # method = "GET" +# +# ## Optional HTTP headers +# # headers = {"X-Special-Header" = "Special-Value"} +# +# ## Override HTTP "Host" header +# # host_header = "check.example.com" +# +# ## Timeout for HTTP requests +# timeout = "5s" +# +# ## Optional HTTP Basic Auth credentials +# # username = "username" +# # password = "pa$$word" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read Nginx virtual host traffic status module information (nginx-module-vts) +# [[inputs.nginx_vts]] +# ## An array of ngx_http_status_module or status URI to gather stats. 
+# urls = ["http://localhost/status"] +# +# ## HTTP response timeout (default: 5s) +# response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read NSQ topic and channel statistics. +# [[inputs.nsq]] +# ## An array of NSQD HTTP API endpoints +# endpoints = ["http://localhost:4151"] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Collect kernel snmp counters and network interface statistics +# [[inputs.nstat]] +# ## file paths for proc files. If empty default paths will be used: +# ## /proc/net/netstat, /proc/net/snmp, /proc/net/snmp6 +# ## These can also be overridden with env variables, see README. +# proc_net_netstat = "/proc/net/netstat" +# proc_net_snmp = "/proc/net/snmp" +# proc_net_snmp6 = "/proc/net/snmp6" +# ## dump metrics with 0 values too +# dump_zeros = true + + +# # Get standard NTP query metrics, requires ntpq executable. +# [[inputs.ntpq]] +# ## If false, set the -n ntpq flag. Can reduce metric gather time. +# dns_lookup = true + + +# # Pulls statistics from nvidia GPUs attached to the host +[[inputs.nvidia_smi]] +# ## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath +# # bin_path = "/usr/bin/nvidia-smi" +# +# ## Optional: timeout for GPU polling +# # timeout = "5s" + + +# # OpenLDAP cn=Monitor plugin +# [[inputs.openldap]] +# host = "localhost" +# port = 389 +# +# # ldaps, starttls, or no encryption. default is an empty string, disabling all encryption. +# # note that port will likely need to be changed to 636 for ldaps +# # valid options: "" | "starttls" | "ldaps" +# tls = "" +# +# # skip peer certificate verification. Default is false. 
+# insecure_skip_verify = false +# +# # Path to PEM-encoded Root certificate to use to verify server certificate +# tls_ca = "/etc/ssl/certs.pem" +# +# # dn/password to bind with. If bind_dn is empty, an anonymous bind is performed. +# bind_dn = "" +# bind_password = "" +# +# # Reverse metric names so they sort more naturally. Recommended. +# # This defaults to false if unset, but is set to true when generating a new config +# reverse_metric_names = true + + +# # Get standard NTP query metrics from OpenNTPD. +# [[inputs.openntpd]] +# ## Run ntpctl binary with sudo. +# # use_sudo = false +# +# ## Location of the ntpctl binary. +# # binary = "/usr/sbin/ntpctl" +# +# ## Maximum time the ntpctl binary is allowed to run. +# # timeout = "5ms" + + +# # A plugin to collect stats from OpenSMTPD - a free implementation of the server-side SMTP protocol +# [[inputs.opensmtpd]] +# ## If running as a restricted user you can prepend sudo for additional access: +# #use_sudo = false +# +# ## The default location of the smtpctl binary can be overridden with: +# binary = "/usr/sbin/smtpctl" +# +# ## The default timeout of 1000ms can be overridden with (in milliseconds): +# timeout = 1000 + + +# # Read current weather and forecasts data from openweathermap.org +# [[inputs.openweathermap]] +# ## OpenWeatherMap API key. +# app_id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" +# +# ## City IDs to collect weather data from. +# city_id = ["5391959"] +# +# ## Language of the description field. Can be one of "ar", "bg", +# ## "ca", "cz", "de", "el", "en", "fa", "fi", "fr", "gl", "hr", "hu", +# ## "it", "ja", "kr", "la", "lt", "mk", "nl", "pl", "pt", "ro", "ru", +# ## "se", "sk", "sl", "es", "tr", "ua", "vi", "zh_cn", "zh_tw" +# # lang = "en" +# +# ## APIs to fetch; can contain "weather" or "forecast". +# fetch = ["weather", "forecast"] +# +# ## OpenWeatherMap base URL +# # base_url = "https://api.openweathermap.org/" +# +# ## Timeout for HTTP response. 
+# # response_timeout = "5s" +# +# ## Preferred unit system for temperature and wind speed. Can be one of +# ## "metric", "imperial", or "standard". +# # units = "metric" +# +# ## Query interval; OpenWeatherMap updates their weather data every 10 +# ## minutes. +# interval = "10m" + + +# # Read metrics of passenger using passenger-status +# [[inputs.passenger]] +# ## Path of passenger-status. +# ## +# ## Plugin gather metric via parsing XML output of passenger-status +# ## More information about the tool: +# ## https://www.phusionpassenger.com/library/admin/apache/overall_status_report.html +# ## +# ## If no path is specified, then the plugin simply execute passenger-status +# ## hopefully it can be found in your PATH +# command = "passenger-status -v --show=xml" + + +# # Gather counters from PF +# [[inputs.pf]] +# ## PF require root access on most systems. +# ## Setting 'use_sudo' to true will make use of sudo to run pfctl. +# ## Users must configure sudo to allow telegraf user to run pfctl with no password. +# ## pfctl can be restricted to only list command "pfctl -s info". +# use_sudo = false + + +# # Read metrics of phpfpm, via HTTP status page or socket +# [[inputs.phpfpm]] +# ## An array of addresses to gather stats about. 
Specify an ip or hostname +# ## with optional port and path +# ## +# ## Plugin can be configured in three modes (either can be used): +# ## - http: the URL must start with http:// or https://, ie: +# ## "http://localhost/status" +# ## "http://192.168.130.1/status?full" +# ## +# ## - unixsocket: path to fpm socket, ie: +# ## "/var/run/php5-fpm.sock" +# ## or using a custom fpm status path: +# ## "/var/run/php5-fpm.sock:fpm-custom-status-path" +# ## +# ## - fcgi: the URL must start with fcgi:// or cgi://, and port must be present, ie: +# ## "fcgi://10.0.0.12:9000/status" +# ## "cgi://10.0.10.12:9001/status" +# ## +# ## Example of multiple gathering from local socket and remote host +# ## urls = ["http://192.168.1.20/status", "/tmp/fpm.sock"] +# urls = ["http://localhost/status"] +# +# ## Duration allowed to complete HTTP requests. +# # timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Ping given url(s) and return statistics +# [[inputs.ping]] +# ## Hosts to send ping packets to. +# urls = ["example.org"] +# +# ## Method used for sending pings, can be either "exec" or "native". When set +# ## to "exec" the systems ping command will be executed. When set to "native" +# ## the plugin will send pings directly. +# ## +# ## While the default is "exec" for backwards compatibility, new deployments +# ## are encouraged to use the "native" method for improved compatibility and +# ## performance. +# # method = "exec" +# +# ## Number of ping packets to send per interval. Corresponds to the "-c" +# ## option of the ping command. +# # count = 1 +# +# ## Time to wait between sending ping packets in seconds. Operates like the +# ## "-i" option of the ping command. +# # ping_interval = 1.0 +# +# ## If set, the time to wait for a ping response in seconds. 
Operates like +# ## the "-W" option of the ping command. +# # timeout = 1.0 +# +# ## If set, the total ping deadline, in seconds. Operates like the -w option +# ## of the ping command. +# # deadline = 10 +# +# ## Interface or source address to send ping from. Operates like the -I or -S +# ## option of the ping command. +# # interface = "" +# +# ## Specify the ping executable binary. +# # binary = "ping" +# +# ## Arguments for ping command. When arguments is not empty, the command from +# ## the binary option will be used and other options (ping_interval, timeout, +# ## etc) will be ignored. +# # arguments = ["-c", "3"] +# +# ## Use only IPv6 addresses when resolving a hostname. +# # ipv6 = false + + +# # Measure postfix queue statistics +# [[inputs.postfix]] +# ## Postfix queue directory. If not provided, telegraf will try to use +# ## 'postconf -h queue_directory' to determine it. +# # queue_directory = "/var/spool/postfix" + + +# # Read metrics from one or many PowerDNS servers +# [[inputs.powerdns]] +# ## An array of sockets to gather stats about. +# ## Specify a path to unix socket. +# unix_sockets = ["/var/run/pdns.controlsocket"] + + +# # Read metrics from one or many PowerDNS Recursor servers +# [[inputs.powerdns_recursor]] +# ## Path to the Recursor control socket. +# unix_sockets = ["/var/run/pdns_recursor.controlsocket"] +# +# ## Directory to create receive socket. This default is likely not writable, +# ## please reference the full plugin documentation for a recommended setup. +# # socket_dir = "/var/run/" +# ## Socket permissions for the receive socket. 
+# # socket_mode = "0666" + + +# # Monitor process cpu and memory usage +# [[inputs.procstat]] +# ## PID file to monitor process +# pid_file = "/var/run/nginx.pid" +# ## executable name (ie, pgrep <exe>) +# # exe = "nginx" +# ## pattern as argument for pgrep (ie, pgrep -f <pattern>) +# # pattern = "nginx" +# ## user as argument for pgrep (ie, pgrep -u <user>) +# # user = "nginx" +# ## Systemd unit name +# # systemd_unit = "nginx.service" +# ## CGroup name or path +# # cgroup = "systemd/system.slice/nginx.service" +# +# ## Windows service name +# # win_service = "" +# +# ## override for process_name +# ## This is optional; default is sourced from /proc/<pid>/status +# # process_name = "bar" +# +# ## Field name prefix +# # prefix = "" +# +# ## When true add the full cmdline as a tag. +# # cmdline_tag = false +# +# ## Add the PID as a tag instead of as a field. When collecting multiple +# ## processes with otherwise matching tags this setting should be enabled to +# ## ensure each process has a unique identity. +# ## +# ## Enabling this option may result in a large number of series, especially +# ## when processes have a short lifetime. +# # pid_tag = false +# +# ## Method to use when finding process IDs. Can be one of 'pgrep', or +# ## 'native'. The pgrep finder calls the pgrep executable in the PATH while +# ## the native finder performs the search directly in a manner dependent on the +# ## platform. Default is 'pgrep' +# # pid_finder = "pgrep" + + +# # Reads last_run_summary.yaml file and converts to measurements +# [[inputs.puppetagent]] +# ## Location of puppet last run summary file +# location = "/var/lib/puppet/state/last_run_summary.yaml" + + +# # Reads metrics from RabbitMQ servers via the Management Plugin +# [[inputs.rabbitmq]] +# ## Management Plugin url. 
(default: http://localhost:15672) +# # url = "http://localhost:15672" +# ## Tag added to rabbitmq_overview series; deprecated: use tags +# # name = "rmq-server-1" +# ## Credentials +# # username = "guest" +# # password = "guest" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Optional request timeouts +# ## +# ## ResponseHeaderTimeout, if non-zero, specifies the amount of time to wait +# ## for a server's response headers after fully writing the request. +# # header_timeout = "3s" +# ## +# ## client_timeout specifies a time limit for requests made by this client. +# ## Includes connection time, any redirects, and reading the response body. +# # client_timeout = "4s" +# +# ## A list of nodes to gather as the rabbitmq_node measurement. If not +# ## specified, metrics for all nodes are gathered. +# # nodes = ["rabbit@node1", "rabbit@node2"] +# +# ## A list of queues to gather as the rabbitmq_queue measurement. If not +# ## specified, metrics for all queues are gathered. +# # queues = ["telegraf"] +# +# ## A list of exchanges to gather as the rabbitmq_exchange measurement. If not +# ## specified, metrics for all exchanges are gathered. +# # exchanges = ["telegraf"] +# +# ## Queues to include and exclude. Globs accepted. +# ## Note that an empty array for both will include all queues +# queue_name_include = [] +# queue_name_exclude = [] +# +# ## Federation upstreams include and exclude when gathering the rabbitmq_federation measurement. +# ## If neither are specified, metrics for all federation upstreams are gathered. +# ## Federation link metrics will only be gathered for queues and exchanges +# ## whose non-federation metrics will be collected (e.g a queue excluded +# ## by the 'queue_name_exclude' option will also be excluded from federation). +# ## Globs accepted. 
+# # federation_upstream_include = ["dataCentre-*"] +# # federation_upstream_exclude = [] + + +# # Read raindrops stats (raindrops - real-time stats for preforking Rack servers) +# [[inputs.raindrops]] +# ## An array of raindrops middleware URI to gather stats. +# urls = ["http://localhost:8080/_raindrops"] + + +# # Read CPU, Fans, Powersupply and Voltage metrics of hardware server through redfish APIs +# [[inputs.redfish]] +# ## Server url +# address = "https://127.0.0.1:5000" +# +# ## Username, Password for hardware server +# username = "root" +# password = "password123456" +# +# ## ComputerSystemId +# computer_system_id="2M220100SL" +# +# ## Amount of time allowed to complete the HTTP request +# # timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics from one or many redis servers +# [[inputs.redis]] +# ## specify servers via a url matching: +# ## [protocol://][:password]@address[:port] +# ## e.g. +# ## tcp://localhost:6379 +# ## tcp://:password@192.168.99.100 +# ## unix:///var/run/redis.sock +# ## +# ## If no servers are specified, then localhost is used as the host. +# ## If no port is specified, 6379 is used +# servers = ["tcp://localhost:6379"] +# +# ## specify server password +# # password = "s#cr@t%" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = true + + +# # Read metrics from one or many RethinkDB servers +# [[inputs.rethinkdb]] +# ## An array of URI to gather stats about. Specify an ip or hostname +# ## with optional port add password. ie, +# ## rethinkdb://user:auth_key@10.10.3.30:28105, +# ## rethinkdb://10.10.3.33:18832, +# ## 10.0.0.1:10000, etc. 
+# servers = ["127.0.0.1:28015"] +# ## +# ## If you use actual rethinkdb of > 2.3.0 with username/password authorization, +# ## protocol has to be named "rethinkdb2" - it will use 1_0 H. +# # servers = ["rethinkdb2://username:password@127.0.0.1:28015"] +# ## +# ## If you use older versions of rethinkdb (<2.2) with auth_key, protocol +# ## has to be named "rethinkdb". +# # servers = ["rethinkdb://username:auth_key@127.0.0.1:28015"] + + +# # Read metrics from one or many Riak servers +# [[inputs.riak]] +# # Specify a list of one or more riak http servers +# servers = ["http://localhost:8098"] + + +# # Read API usage and limits for a Salesforce organisation +# [[inputs.salesforce]] +# ## specify your credentials +# ## +# username = "your_username" +# password = "your_password" +# ## +# ## (optional) security token +# # security_token = "your_security_token" +# ## +# ## (optional) environment type (sandbox or production) +# ## default is: production +# ## +# # environment = "production" +# ## +# ## (optional) API version (default: "39.0") +# ## +# # version = "39.0" + + +# # Monitor sensors, requires lm-sensors package +# [[inputs.sensors]] +# ## Remove numbers from field names. +# ## If true, a field name like 'temp1_input' will be changed to 'temp_input'. +# # remove_numbers = true +# +# ## Timeout is the maximum amount of time that the sensors command can run. +# # timeout = "5s" + + +# # Read metrics from storage devices supporting S.M.A.R.T. +# [[inputs.smart]] +# ## Optionally specify the path to the smartctl executable +# # path = "/usr/bin/smartctl" +# +# ## On most platforms smartctl requires root access. +# ## Setting 'use_sudo' to true will make use of sudo to run smartctl. +# ## Sudo must be configured to allow the telegraf user to run smartctl +# ## without a password. +# # use_sudo = false +# +# ## Skip checking disks in this power mode. Defaults to +# ## "standby" to not wake up disks that have stopped rotating. 
+# ## See --nocheck in the man pages for smartctl. +# ## smartctl version 5.41 and 5.42 have faulty detection of +# ## power mode and might require changing this value to +# ## "never" depending on your disks. +# # nocheck = "standby" +# +# ## Gather all returned S.M.A.R.T. attribute metrics and the detailed +# ## information from each drive into the 'smart_attribute' measurement. +# # attributes = false +# +# ## Optionally specify devices to exclude from reporting. +# # excludes = [ "/dev/pass6" ] +# +# ## Optionally specify devices and device type, if unset +# ## a scan (smartctl --scan) for S.M.A.R.T. devices will +# ## be done and all found will be included except for those +# ## listed in excludes. +# # devices = [ "/dev/ada0 -d atacam" ] +# +# ## Timeout for the smartctl command to complete. +# # timeout = "30s" + + +# # Retrieves SNMP values from remote agents +# [[inputs.snmp]] +# ## Agent addresses to retrieve values from. +# ## example: agents = ["udp://127.0.0.1:161"] +# ## agents = ["tcp://127.0.0.1:161"] +# agents = ["udp://127.0.0.1:161"] +# +# ## Timeout for each request. +# # timeout = "5s" +# +# ## SNMP version; can be 1, 2, or 3. +# # version = 2 +# +# ## SNMP community string. +# # community = "public" +# +# ## Number of retries to attempt. +# # retries = 3 +# +# ## The GETBULK max-repetitions parameter. +# # max_repetitions = 10 +# +# ## SNMPv3 authentication and encryption options. +# ## +# ## Security Name. +# # sec_name = "myuser" +# ## Authentication protocol; one of "MD5", "SHA", or "". +# # auth_protocol = "MD5" +# ## Authentication password. +# # auth_password = "pass" +# ## Security Level; one of "noAuthNoPriv", "authNoPriv", or "authPriv". +# # sec_level = "authNoPriv" +# ## Context Name. +# # context_name = "" +# ## Privacy protocol used for encrypted messages; one of "DES", "AES" or "". +# # priv_protocol = "" +# ## Privacy password used for encrypted messages. 
+# # priv_password = "" +# +# ## Add fields and tables defining the variables you wish to collect. This +# ## example collects the system uptime and interface variables. Reference the +# ## full plugin documentation for configuration details. + + +# # DEPRECATED! PLEASE USE inputs.snmp INSTEAD. +# [[inputs.snmp_legacy]] +# ## Use 'oids.txt' file to translate oids to names +# ## To generate 'oids.txt' you need to run: +# ## snmptranslate -m all -Tz -On | sed -e 's/"//g' > /tmp/oids.txt +# ## Or if you have an other MIB folder with custom MIBs +# ## snmptranslate -M /mycustommibfolder -Tz -On -m all | sed -e 's/"//g' > oids.txt +# snmptranslate_file = "/tmp/oids.txt" +# [[inputs.snmp.host]] +# address = "192.168.2.2:161" +# # SNMP community +# community = "public" # default public +# # SNMP version (1, 2 or 3) +# # Version 3 not supported yet +# version = 2 # default 2 +# # SNMP response timeout +# timeout = 2.0 # default 2.0 +# # SNMP request retries +# retries = 2 # default 2 +# # Which get/bulk do you want to collect for this host +# collect = ["mybulk", "sysservices", "sysdescr"] +# # Simple list of OIDs to get, in addition to "collect" +# get_oids = [] +# +# [[inputs.snmp.host]] +# address = "192.168.2.3:161" +# community = "public" +# version = 2 +# timeout = 2.0 +# retries = 2 +# collect = ["mybulk"] +# get_oids = [ +# "ifNumber", +# ".1.3.6.1.2.1.1.3.0", +# ] +# +# [[inputs.snmp.get]] +# name = "ifnumber" +# oid = "ifNumber" +# +# [[inputs.snmp.get]] +# name = "interface_speed" +# oid = "ifSpeed" +# instance = "0" +# +# [[inputs.snmp.get]] +# name = "sysuptime" +# oid = ".1.3.6.1.2.1.1.3.0" +# unit = "second" +# +# [[inputs.snmp.bulk]] +# name = "mybulk" +# max_repetition = 127 +# oid = ".1.3.6.1.2.1.1" +# +# [[inputs.snmp.bulk]] +# name = "ifoutoctets" +# max_repetition = 127 +# oid = "ifOutOctets" +# +# [[inputs.snmp.host]] +# address = "192.168.2.13:161" +# #address = "127.0.0.1:161" +# community = "public" +# version = 2 +# timeout = 2.0 +# retries = 2 +# 
#collect = ["mybulk", "sysservices", "sysdescr", "systype"] +# collect = ["sysuptime" ] +# [[inputs.snmp.host.table]] +# name = "iftable3" +# include_instances = ["enp5s0", "eth1"] +# +# # SNMP TABLEs +# # table without mapping neither subtables +# [[inputs.snmp.table]] +# name = "iftable1" +# oid = ".1.3.6.1.2.1.31.1.1.1" +# +# # table without mapping but with subtables +# [[inputs.snmp.table]] +# name = "iftable2" +# oid = ".1.3.6.1.2.1.31.1.1.1" +# sub_tables = [".1.3.6.1.2.1.2.2.1.13"] +# +# # table with mapping but without subtables +# [[inputs.snmp.table]] +# name = "iftable3" +# oid = ".1.3.6.1.2.1.31.1.1.1" +# # if empty. get all instances +# mapping_table = ".1.3.6.1.2.1.31.1.1.1.1" +# # if empty, get all subtables +# +# # table with both mapping and subtables +# [[inputs.snmp.table]] +# name = "iftable4" +# oid = ".1.3.6.1.2.1.31.1.1.1" +# # if empty get all instances +# mapping_table = ".1.3.6.1.2.1.31.1.1.1.1" +# # if empty get all subtables +# # sub_tables could be not "real subtables" +# sub_tables=[".1.3.6.1.2.1.2.2.1.13", "bytes_recv", "bytes_send"] + + +# # Read stats from one or more Solr servers or cores +# [[inputs.solr]] +# ## specify a list of one or more Solr servers +# servers = ["http://localhost:8983"] +# +# ## specify a list of one or more Solr cores (default - all) +# # cores = ["main"] +# +# ## Optional HTTP Basic Auth Credentials +# # username = "username" +# # password = "pa$$word" + + +# # Read metrics from Microsoft SQL Server +# [[inputs.sqlserver]] +# ## Specify instances to monitor with a list of connection strings. +# ## All connection parameters are optional. +# ## By default, the host is localhost, listening on default port, TCP 1433. +# ## for Windows, the user is the currently running AD user (SSO). 
+# ## See https://github.com/denisenkom/go-mssqldb for detailed connection +# ## parameters, in particular, tls connections can be created like so: +# ## "encrypt=true;certificate=;hostNameInCertificate=" +# # servers = [ +# # "Server=192.168.1.10;Port=1433;User Id=;Password=;app name=telegraf;log=1;", +# # ] +# +# ## Optional parameter, setting this to 2 will use a new version +# ## of the collection queries that break compatibility with the original +# ## dashboards. +# ## Version 2 - is compatible from SQL Server 2012 and later versions and also for SQL Azure DB +# query_version = 2 +# +# ## If you are using AzureDB, setting this to true will gather resource utilization metrics +# # azuredb = false +# +# ## Possible queries +# ## Version 2: +# ## - PerformanceCounters +# ## - WaitStatsCategorized +# ## - DatabaseIO +# ## - ServerProperties +# ## - MemoryClerk +# ## - Schedulers +# ## - SqlRequests +# ## - VolumeSpace +# ## - Cpu +# ## Version 1: +# ## - PerformanceCounters +# ## - WaitStatsCategorized +# ## - CPUHistory +# ## - DatabaseIO +# ## - DatabaseSize +# ## - DatabaseStats +# ## - DatabaseProperties +# ## - MemoryClerk +# ## - VolumeSpace +# ## - PerformanceMetrics +# +# ## A list of queries to include. If not specified, all the above listed queries are used. +# # include_query = [] +# +# ## A list of queries to explicitly ignore. +# exclude_query = [ 'Schedulers' , 'SqlRequests'] + + +# # Gather timeseries from Google Cloud Platform v3 monitoring API +# [[inputs.stackdriver]] +# ## GCP Project +# project = "erudite-bloom-151019" +# +# ## Include timeseries that start with the given metric type. +# metric_type_prefix_include = [ +# "compute.googleapis.com/", +# ] +# +# ## Exclude timeseries that start with the given metric type. +# # metric_type_prefix_exclude = [] +# +# ## Many metrics are updated once per minute; it is recommended to override +# ## the agent level interval with a value of 1m or greater. 
+# interval = "1m" +# +# ## Maximum number of API calls to make per second. The quota for accounts +# ## varies, it can be viewed on the API dashboard: +# ## https://cloud.google.com/monitoring/quotas#quotas_and_limits +# # rate_limit = 14 +# +# ## The delay and window options control the number of points selected on +# ## each gather. When set, metrics are gathered between: +# ## start: now() - delay - window +# ## end: now() - delay +# # +# ## Collection delay; if set too low metrics may not yet be available. +# # delay = "5m" +# # +# ## If unset, the window will start at 1m and be updated dynamically to span +# ## the time between calls (approximately the length of the plugin interval). +# # window = "1m" +# +# ## TTL for cached list of metric types. This is the maximum amount of time +# ## it may take to discover new metrics. +# # cache_ttl = "1h" +# +# ## If true, raw bucket counts are collected for distribution value types. +# ## For a more lightweight collection, you may wish to disable and use +# ## distribution_aggregation_aligners instead. +# # gather_raw_distribution_buckets = true +# +# ## Aggregate functions to be used for metrics whose value type is +# ## distribution. These aggregate values are recorded in addition to raw +# ## bucket counts, if they are enabled. +# ## +# ## For a list of aligner strings see: +# ## https://cloud.google.com/monitoring/api/ref_v3/rpc/google.monitoring.v3#aligner +# # distribution_aggregation_aligners = [ +# # "ALIGN_PERCENTILE_99", +# # "ALIGN_PERCENTILE_95", +# # "ALIGN_PERCENTILE_50", +# # ] +# +# ## Filters can be added to reduce the number of time series matched. All +# ## functions are supported: starts_with, ends_with, has_substring, and +# ## one_of. Only the '=' operator is supported. 
+# ## +# ## The logical operators when combining filters are defined statically using +# ## the following values: +# ## filter ::= {AND } +# ## resource_labels ::= {OR } +# ## metric_labels ::= {OR } +# ## +# ## For more details, see https://cloud.google.com/monitoring/api/v3/filters +# # +# ## Resource labels refine the time series selection with the following expression: +# ## resource.labels. = +# # [[inputs.stackdriver.filter.resource_labels]] +# # key = "instance_name" +# # value = 'starts_with("localhost")' +# # +# ## Metric labels refine the time series selection with the following expression: +# ## metric.labels. = +# # [[inputs.stackdriver.filter.metric_labels]] +# # key = "device_name" +# # value = 'one_of("sda", "sdb")' + + +# # Get synproxy counter statistics from procfs +# [[inputs.synproxy]] +# # no configuration + + +# # Sysstat metrics collector +# [[inputs.sysstat]] +# ## Path to the sadc command. +# # +# ## Common Defaults: +# ## Debian/Ubuntu: /usr/lib/sysstat/sadc +# ## Arch: /usr/lib/sa/sadc +# ## RHEL/CentOS: /usr/lib64/sa/sadc +# sadc_path = "/usr/lib/sa/sadc" # required +# +# ## Path to the sadf command, if it is not in PATH +# # sadf_path = "/usr/bin/sadf" +# +# ## Activities is a list of activities, that are passed as argument to the +# ## sadc collector utility (e.g: DISK, SNMP etc...) +# ## The more activities that are added, the more data is collected. +# # activities = ["DISK"] +# +# ## Group metrics to measurements. +# ## +# ## If group is false each metric will be prefixed with a description +# ## and represents itself a measurement. +# ## +# ## If Group is true, corresponding metrics are grouped to a single measurement. +# # group = true +# +# ## Options for the sadf command. The values on the left represent the sadf +# ## options and the values on the right their description (which are used for +# ## grouping and prefixing metrics). 
+# ## +# ## Run 'sar -h' or 'man sar' to find out the supported options for your +# ## sysstat version. +# [inputs.sysstat.options] +# -C = "cpu" +# -B = "paging" +# -b = "io" +# -d = "disk" # requires DISK activity +# "-n ALL" = "network" +# "-P ALL" = "per_cpu" +# -q = "queue" +# -R = "mem" +# -r = "mem_util" +# -S = "swap_util" +# -u = "cpu_util" +# -v = "inode" +# -W = "swap" +# -w = "task" +# # -H = "hugepages" # only available for newer linux distributions +# # "-I ALL" = "interrupts" # requires INT activity +# +# ## Device tags can be used to add additional tags for devices. +# ## For example the configuration below adds a tag vg with value rootvg for +# ## all metrics with sda devices. +# # [[inputs.sysstat.device_tags.sda]] +# # vg = "rootvg" + + +# # Gather systemd units state +# [[inputs.systemd_units]] +# ## Set timeout for systemctl execution +# # timeout = "1s" +# # +# ## Filter for a specific unit type, default is "service", other possible +# ## values are "socket", "target", "device", "mount", "automount", "swap", +# ## "timer", "path", "slice" and "scope ": +# # unittype = "service" + + +# # Reads metrics from a Teamspeak 3 Server via ServerQuery +# [[inputs.teamspeak]] +# ## Server address for Teamspeak 3 ServerQuery +# # server = "127.0.0.1:10011" +# ## Username for ServerQuery +# username = "serverqueryuser" +# ## Password for ServerQuery +# password = "secret" +# ## Array of virtual servers +# # virtual_servers = [1] + + +# # Read metrics about temperature +# [[inputs.temp]] +# # no configuration + + +# # Read Tengine's basic status information (ngx_http_reqstat_module) +# [[inputs.tengine]] +# # An array of Tengine reqstat module URI to gather stats. 
+# urls = ["http://127.0.0.1/us"] +# +# # HTTP response timeout (default: 5s) +# # response_timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.cer" +# # tls_key = "/etc/telegraf/key.key" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Gather metrics from the Tomcat server status page. +# [[inputs.tomcat]] +# ## URL of the Tomcat server status +# # url = "http://127.0.0.1:8080/manager/status/all?XML=true" +# +# ## HTTP Basic Auth Credentials +# # username = "tomcat" +# # password = "s3cret" +# +# ## Request timeout +# # timeout = "5s" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Inserts sine and cosine waves for demonstration purposes +# [[inputs.trig]] +# ## Set the amplitude +# amplitude = 10.0 + + +# # Read Twemproxy stats data +# [[inputs.twemproxy]] +# ## Twemproxy stats address and port (no scheme) +# addr = "localhost:22222" +# ## Monitor pool name +# pools = ["redis_pool", "mc_pool"] + + +# # A plugin to collect stats from the Unbound DNS resolver +# [[inputs.unbound]] +# ## Address of server to connect to, read from unbound conf default, optionally ':port' +# ## Will lookup IP if given a hostname +# server = "127.0.0.1:8953" +# +# ## If running as a restricted user you can prepend sudo for additional access: +# # use_sudo = false +# +# ## The default location of the unbound-control binary can be overridden with: +# # binary = "/usr/sbin/unbound-control" +# +# ## The default location of the unbound config file can be overridden with: +# # config_file = "/etc/unbound/unbound.conf" +# +# ## The default timeout of 1s can be overridden with: +# # timeout = "1s" +# +# ## When set to true, thread metrics are tagged with the thread id. 
+# ## +# ## The default is false for backwards compatibility, and will be changed to +# ## true in a future version. It is recommended to set to true on new +# ## deployments. +# thread_as_tag = false + + +# # Read uWSGI metrics. +# [[inputs.uwsgi]] +# ## List with urls of uWSGI Stats servers. URL must match pattern: +# ## scheme://address[:port] +# ## +# ## For example: +# ## servers = ["tcp://localhost:5050", "http://localhost:1717", "unix:///tmp/statsock"] +# servers = ["tcp://127.0.0.1:1717"] +# +# ## General connection timeout +# # timeout = "5s" + + +# # A plugin to collect stats from Varnish HTTP Cache +# [[inputs.varnish]] +# ## If running as a restricted user you can prepend sudo for additional access: +# #use_sudo = false +# +# ## The default location of the varnishstat binary can be overridden with: +# binary = "/usr/bin/varnishstat" +# +# ## By default, telegraf gather stats for 3 metric points. +# ## Setting stats will override the defaults shown below. +# ## Glob matching can be used, ie, stats = ["MAIN.*"] +# ## stats may also be set to ["*"], which will collect all stats +# stats = ["MAIN.cache_hit", "MAIN.cache_miss", "MAIN.uptime"] +# +# ## Optional name for the varnish instance (or working directory) to query +# ## Usually append after -n in varnish cli +# # instance_name = instanceName +# +# ## Timeout for varnishstat command +# # timeout = "1s" + + +# # Collect Wireguard server interface and peer statistics +# [[inputs.wireguard]] +# ## Optional list of Wireguard device/interface names to query. +# ## If omitted, all Wireguard interfaces are queried. 
+# # devices = ["wg0"] + + +# # Monitor wifi signal strength and quality +# [[inputs.wireless]] +# ## Sets 'proc' directory path +# ## If not specified, then default is /proc +# # host_proc = "/proc" + + +# # Reads metrics from a SSL certificate +# [[inputs.x509_cert]] +# ## List certificate sources +# sources = ["/etc/ssl/certs/ssl-cert-snakeoil.pem", "tcp://example.org:443"] +# +# ## Timeout for SSL connection +# # timeout = "5s" +# +# ## Pass a different name into the TLS request (Server Name Indication) +# ## example: server_name = "myhost.example.org" +# # server_name = "" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" + + +# # Read metrics of ZFS from arcstats, zfetchstats, vdev_cache_stats, and pools +# [[inputs.zfs]] +# ## ZFS kstat path. Ignored on FreeBSD +# ## If not specified, then default is: +# # kstatPath = "/proc/spl/kstat/zfs" +# +# ## By default, telegraf gather all zfs stats +# ## If not specified, then default is: +# # kstatMetrics = ["arcstats", "zfetchstats", "vdev_cache_stats"] +# ## For Linux, the default is: +# # kstatMetrics = ["abdstats", "arcstats", "dnodestats", "dbufcachestats", +# # "dmu_tx", "fm", "vdev_mirror_stats", "zfetchstats", "zil"] +# ## By default, don't gather zpool stats +# # poolMetrics = false + + +# # Reads 'mntr' stats from one or many zookeeper servers +# [[inputs.zookeeper]] +# ## An array of address to gather stats about. Specify an ip or hostname +# ## with port. ie localhost:2181, 10.0.0.1:2181, etc. +# +# ## If no servers are specified, then localhost is used as the host. +# ## If no port is specified, 2181 is used +# servers = [":2181"] +# +# ## Timeout for metric collections from all servers. Minimum timeout is "1s". 
+# # timeout = "5s" +# +# ## Optional TLS Config +# # enable_tls = true +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## If false, skip chain & host verification +# # insecure_skip_verify = true + + +############################################################################### +# SERVICE INPUT PLUGINS # +############################################################################### + + +# # AMQP consumer plugin +# [[inputs.amqp_consumer]] +# ## Broker to consume from. +# ## deprecated in 1.7; use the brokers option +# # url = "amqp://localhost:5672/influxdb" +# +# ## Brokers to consume from. If multiple brokers are specified a random broker +# ## will be selected anytime a connection is established. This can be +# ## helpful for load balancing when not using a dedicated load balancer. +# brokers = ["amqp://localhost:5672/influxdb"] +# +# ## Authentication credentials for the PLAIN auth_method. +# # username = "" +# # password = "" +# +# ## Name of the exchange to declare. If unset, no exchange will be declared. +# exchange = "telegraf" +# +# ## Exchange type; common types are "direct", "fanout", "topic", "header", "x-consistent-hash". +# # exchange_type = "topic" +# +# ## If true, exchange will be passively declared. +# # exchange_passive = false +# +# ## Exchange durability can be either "transient" or "durable". +# # exchange_durability = "durable" +# +# ## Additional exchange arguments. +# # exchange_arguments = { } +# # exchange_arguments = {"hash_property" = "timestamp"} +# +# ## AMQP queue name. +# queue = "telegraf" +# +# ## AMQP queue durability can be "transient" or "durable". +# queue_durability = "durable" +# +# ## If true, queue will be passively declared. +# # queue_passive = false +# +# ## A binding between the exchange and queue using this binding key is +# ## created. If unset, no binding is created. 
+# binding_key = "#" +# +# ## Maximum number of messages server should give to the worker. +# # prefetch_count = 50 +# +# ## Maximum messages to read from the broker that have not been written by an +# ## output. For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. +# ## +# ## For example, if each message from the queue contains 10 metrics and the +# ## output metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Auth method. PLAIN and EXTERNAL are supported +# ## Using EXTERNAL requires enabling the rabbitmq_auth_mechanism_ssl plugin as +# ## described here: https://www.rabbitmq.com/plugins.html +# # auth_method = "PLAIN" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Content encoding for message payloads, can be set to "gzip" or +# ## "identity" to apply no encoding. +# # content_encoding = "identity" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read Cassandra metrics through Jolokia +# [[inputs.cassandra]] +# ## DEPRECATED: The cassandra plugin has been deprecated. Please use the +# ## jolokia2 plugin instead. 
+# ## +# ## see https://github.com/influxdata/telegraf/tree/master/plugins/inputs/jolokia2 +# +# context = "/jolokia/read" +# ## List of cassandra servers exposing jolokia read service +# servers = ["myuser:mypassword@10.10.10.1:8778","10.10.10.2:8778",":8778"] +# ## List of metrics collected on above servers +# ## Each metric consists of a jmx path. +# ## This will collect all heap memory usage metrics from the jvm and +# ## ReadLatency metrics for all keyspaces and tables. +# ## "type=Table" in the query works with Cassandra3.0. Older versions might +# ## need to use "type=ColumnFamily" +# metrics = [ +# "/java.lang:type=Memory/HeapMemoryUsage", +# "/org.apache.cassandra.metrics:type=Table,keyspace=*,scope=*,name=ReadLatency" +# ] + + +# # Cisco model-driven telemetry (MDT) input plugin for IOS XR, IOS XE and NX-OS platforms +# [[inputs.cisco_telemetry_mdt]] +# ## Telemetry transport can be "tcp" or "grpc". TLS is only supported when +# ## using the grpc transport. +# transport = "grpc" +# +# ## Address and port to host telemetry listener +# service_address = ":57000" +# +# ## Enable TLS; grpc transport only. +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## Enable TLS client authentication and define allowed CA certificates; grpc +# ## transport only. 
+# # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Define (for certain nested telemetry measurements with embedded tags) which fields are tags +# # embedded_tags = ["Cisco-IOS-XR-qos-ma-oper:qos/interface-table/interface/input/service-policy-names/service-policy-instance/statistics/class-stats/class-name"] +# +# ## Define aliases to map telemetry encoding paths to simple measurement names +# [inputs.cisco_telemetry_mdt.aliases] +# ifstats = "ietf-interfaces:interfaces-state/interface/statistics" + + +# # Read metrics from one or many ClickHouse servers +# [[inputs.clickhouse]] +# ## Username for authorization on ClickHouse server +# ## example: user = "default" +# username = "default" +# +# ## Password for authorization on ClickHouse server +# ## example: password = "super_secret" +# +# ## HTTP(s) timeout while getting metrics values +# ## The timeout includes connection time, any redirects, and reading the response body. +# ## example: timeout = 1s +# # timeout = 5s +# +# ## List of servers for metrics scraping +# ## metrics scrape via HTTP(s) clickhouse interface +# ## https://clickhouse.tech/docs/en/interfaces/http/ +# ## example: servers = ["http://127.0.0.1:8123","https://custom-server.mdb.yandexcloud.net"] +# servers = ["http://127.0.0.1:8123"] +# +# ## If "auto_discovery" is "true" the plugin tries to connect to all servers available in the cluster +# ## using the same "user:password" described in "user" and "password" parameters +# ## and get this server hostname list from "system.clusters" table +# ## see +# ## - https://clickhouse.tech/docs/en/operations/system_tables/#system-clusters +# ## - https://clickhouse.tech/docs/en/operations/server_settings/settings/#server_settings_remote_servers +# ## - https://clickhouse.tech/docs/en/operations/table_engines/distributed/ +# ## - https://clickhouse.tech/docs/en/operations/table_engines/replication/#creating-replicated-tables +# ## example: auto_discovery = false +# # auto_discovery = true +# +# 
## Filter cluster names in "system.clusters" when "auto_discovery" is "true" +# ## when this filter present then "WHERE cluster IN (...)" filter will apply +# ## please use only full cluster names here, regexp and glob filters is not allowed +# ## for "/etc/clickhouse-server/config.d/remote.xml" +# ## +# ## +# ## +# ## +# ## clickhouse-ru-1.local9000 +# ## clickhouse-ru-2.local9000 +# ## +# ## +# ## clickhouse-eu-1.local9000 +# ## clickhouse-eu-2.local9000 +# ## +# ## +# ## +# ## +# ## +# ## +# ## example: cluster_include = ["my-own-cluster"] +# # cluster_include = [] +# +# ## Filter cluster names in "system.clusters" when "auto_discovery" is "true" +# ## when this filter present then "WHERE cluster NOT IN (...)" filter will apply +# ## example: cluster_exclude = ["my-internal-not-discovered-cluster"] +# # cluster_exclude = [] +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Read metrics from Google PubSub +# [[inputs.cloud_pubsub]] +# ## Required. Name of Google Cloud Platform (GCP) Project that owns +# ## the given PubSub subscription. +# project = "my-project" +# +# ## Required. Name of PubSub subscription to ingest metrics from. +# subscription = "my-subscription" +# +# ## Required. Data format to consume. +# ## Each data format has its own unique set of configuration options. +# ## Read more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" +# +# ## Optional. Filepath for GCP credentials JSON file to authorize calls to +# ## PubSub APIs. If not set explicitly, Telegraf will attempt to use +# ## Application Default Credentials, which is preferred. +# # credentials_file = "path/to/my/creds.json" +# +# ## Optional. 
Number of seconds to wait before attempting to restart the +# ## PubSub subscription receiver after an unexpected error. +# ## If the streaming pull for a PubSub Subscription fails (receiver), +# ## the agent attempts to restart receiving messages after this many seconds. +# # retry_delay_seconds = 5 +# +# ## Optional. Maximum byte length of a message to consume. +# ## Larger messages are dropped with an error. If less than 0 or unspecified, +# ## treated as no limit. +# # max_message_len = 1000000 +# +# ## Optional. Maximum messages to read from PubSub that have not been written +# ## to an output. Defaults to 1000. +# ## For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. +# ## +# ## For example, if each message contains 10 metrics and the output +# ## metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## The following are optional Subscription ReceiveSettings in PubSub. +# ## Read more about these values: +# ## https://godoc.org/cloud.google.com/go/pubsub#ReceiveSettings +# +# ## Optional. Maximum number of seconds for which a PubSub subscription +# ## should auto-extend the PubSub ACK deadline for each message. If less than +# ## 0, auto-extension is disabled. +# # max_extension = 0 +# +# ## Optional. Maximum number of unprocessed messages in PubSub +# ## (unacknowledged but not yet expired in PubSub). +# ## A value of 0 is treated as the default PubSub value. +# ## Negative values will be treated as unlimited. +# # max_outstanding_messages = 0 +# +# ## Optional. Maximum size in bytes of unprocessed messages in PubSub +# ## (unacknowledged but not yet expired in PubSub). +# ## A value of 0 is treated as the default PubSub value. +# ## Negative values will be treated as unlimited. 
+# # max_outstanding_bytes = 0 +# +# ## Optional. Max number of goroutines a PubSub Subscription receiver can spawn +# ## to pull messages from PubSub concurrently. This limit applies to each +# ## subscription separately and is treated as the PubSub default if less than +# ## 1. Note this setting does not limit the number of messages that can be +# ## processed concurrently (use "max_outstanding_messages" instead). +# # max_receiver_go_routines = 0 +# +# ## Optional. If true, Telegraf will attempt to base64 decode the +# ## PubSub message data before parsing +# # base64_data = false + + +# # Google Cloud Pub/Sub Push HTTP listener +# [[inputs.cloud_pubsub_push]] +# ## Address and port to host HTTP listener on +# service_address = ":8080" +# +# ## Application secret to verify messages originate from Cloud Pub/Sub +# # token = "" +# +# ## Path to listen to. +# # path = "/" +# +# ## Maximum duration before timing out read of the request +# # read_timeout = "10s" +# ## Maximum duration before timing out write of the response. This should be set to a value +# ## large enough that you can send at least 'metric_batch_size' number of messages within the +# ## duration. +# # write_timeout = "10s" +# +# ## Maximum allowed http request body size in bytes. +# ## 0 means to use the default of 524,288,000 bytes (500 mebibytes) +# # max_body_size = "500MB" +# +# ## Whether to add the pubsub metadata, such as message attributes and subscription as a tag. +# # add_meta = false +# +# ## Optional. Maximum messages to read from PubSub that have not been written +# ## to an output. Defaults to 1000. +# ## For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. 
+# ## +# ## For example, if each message contains 10 metrics and the output +# ## metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Set one or more allowed client CA certificate file names to +# ## enable mutually authenticated TLS connections +# # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Add service certificate and key +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read logging output from the Docker engine +# [[inputs.docker_log]] +# ## Docker Endpoint +# ## To use TCP, set endpoint = "tcp://[ip]:[port]" +# ## To use environment variables (ie, docker-machine), set endpoint = "ENV" +# # endpoint = "unix:///var/run/docker.sock" +# +# ## When true, container logs are read from the beginning; otherwise +# ## reading begins at the end of the log. +# # from_beginning = false +# +# ## Timeout for Docker API calls. +# # timeout = "5s" +# +# ## Containers to include and exclude. Globs accepted. +# ## Note that an empty array for both will include all containers +# # container_name_include = [] +# # container_name_exclude = [] +# +# ## Container states to include and exclude. Globs accepted. +# ## When empty only containers in the "running" state will be captured. +# # container_state_include = [] +# # container_state_exclude = [] +# +# ## docker labels to include and exclude as tags. Globs accepted. 
+# ## Note that an empty array for both will include all labels as tags +# # docker_label_include = [] +# # docker_label_exclude = [] +# +# ## Set the source tag for the metrics to the container ID hostname, eg first 12 chars +# source_tag = false +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # Azure Event Hubs service input plugin +# [[inputs.eventhub_consumer]] +# ## The default behavior is to create a new Event Hub client from environment variables. +# ## This requires one of the following sets of environment variables to be set: +# ## +# ## 1) Expected Environment Variables: +# ## - "EVENTHUB_NAMESPACE" +# ## - "EVENTHUB_NAME" +# ## - "EVENTHUB_CONNECTION_STRING" +# ## +# ## 2) Expected Environment Variables: +# ## - "EVENTHUB_NAMESPACE" +# ## - "EVENTHUB_NAME" +# ## - "EVENTHUB_KEY_NAME" +# ## - "EVENTHUB_KEY_VALUE" +# +# ## Uncommenting the option below will create an Event Hub client based solely on the connection string. +# ## This can either be the associated environment variable or hard coded directly. +# # connection_string = "" +# +# ## Set persistence directory to a valid folder to use a file persister instead of an in-memory persister +# # persistence_dir = "" +# +# ## Change the default consumer group +# # consumer_group = "" +# +# ## By default the event hub receives all messages present on the broker, alternative modes can be set below. +# ## The timestamp should be in https://github.com/toml-lang/toml#offset-date-time format (RFC 3339). +# ## The 3 options below only apply if no valid persister is read from memory or file (e.g. first run). 
+# # from_timestamp = +# # latest = true +# +# ## Set a custom prefetch count for the receiver(s) +# # prefetch_count = 1000 +# +# ## Add an epoch to the receiver(s) +# # epoch = 0 +# +# ## Change to set a custom user agent, "telegraf" is used by default +# # user_agent = "telegraf" +# +# ## To consume from a specific partition, set the partition_ids option. +# ## An empty array will result in receiving from all partitions. +# # partition_ids = ["0","1"] +# +# ## Max undelivered messages +# # max_undelivered_messages = 1000 +# +# ## Set either option below to true to use a system property as timestamp. +# ## You have the choice between EnqueuedTime and IoTHubEnqueuedTime. +# ## It is recommended to use this setting when the data itself has no timestamp. +# # enqueued_time_as_ts = true +# # iot_hub_enqueued_time_as_ts = true +# +# ## Tags or fields to create from keys present in the application property bag. +# ## These could for example be set by message enrichments in Azure IoT Hub. +# # application_property_tags = [] +# # application_property_fields = [] +# +# ## Tag or field name to use for metadata +# ## By default all metadata is disabled +# # sequence_number_field = "SequenceNumber" +# # enqueued_time_field = "EnqueuedTime" +# # offset_field = "Offset" +# # partition_id_tag = "PartitionID" +# # partition_key_tag = "PartitionKey" +# # iot_hub_device_connection_id_tag = "IoTHubDeviceConnectionID" +# # iot_hub_auth_generation_id_tag = "IoTHubAuthGenerationID" +# # iot_hub_connection_auth_method_tag = "IoTHubConnectionAuthMethod" +# # iot_hub_connection_module_id_tag = "IoTHubConnectionModuleID" +# # iot_hub_enqueued_time_field = "IoTHubEnqueuedTime" +# +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Run executable as long-running input plugin +# [[inputs.execd]] +# ## Program to run as daemon +# command = ["telegraf-smartctl", "-d", "/dev/sda"] +# +# ## Define how the process is signaled on each collection interval. +# ## Valid values are: +# ## "none" : Do not signal anything. +# ## The process must output metrics by itself. +# ## "STDIN" : Send a newline on STDIN. +# ## "SIGHUP" : Send a HUP signal. Not available on Windows. +# ## "SIGUSR1" : Send a USR1 signal. Not available on Windows. +# ## "SIGUSR2" : Send a USR2 signal. Not available on Windows. +# signal = "none" +# +# ## Delay before the process is restarted after an unexpected termination +# restart_delay = "10s" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # gNMI telemetry input plugin +# [[inputs.gnmi]] +# ## Address and port of the gNMI GRPC server +# addresses = ["10.49.234.114:57777"] +# +# ## define credentials +# username = "cisco" +# password = "cisco" +# +# ## gNMI encoding requested (one of: "proto", "json", "json_ietf") +# # encoding = "proto" +# +# ## redial in case of failures after +# redial = "10s" +# +# ## enable client-side TLS and define CA to authenticate the device +# # enable_tls = true +# # tls_ca = "/etc/telegraf/ca.pem" +# # insecure_skip_verify = true +# +# ## define client-side TLS certificate & key to authenticate to the device +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## gNMI subscription prefix (optional, can usually be left empty) +# ## See: 
https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-specification.md#222-paths +# # origin = "" +# # prefix = "" +# # target = "" +# +# ## Define additional aliases to map telemetry encoding paths to simple measurement names +# #[inputs.gnmi.aliases] +# # ifcounters = "openconfig:/interfaces/interface/state/counters" +# +# [[inputs.gnmi.subscription]] +# ## Name of the measurement that will be emitted +# name = "ifcounters" +# +# ## Origin and path of the subscription +# ## See: https://github.com/openconfig/reference/blob/master/rpc/gnmi/gnmi-specification.md#222-paths +# ## +# ## origin usually refers to a (YANG) data model implemented by the device +# ## and path to a specific substructure inside it that should be subscribed to (similar to an XPath) +# ## YANG models can be found e.g. here: https://github.com/YangModels/yang/tree/master/vendor/cisco/xr +# origin = "openconfig-interfaces" +# path = "/interfaces/interface/state/counters" +# +# # Subscription mode (one of: "target_defined", "sample", "on_change") and interval +# subscription_mode = "sample" +# sample_interval = "10s" +# +# ## Suppress redundant transmissions when measured values are unchanged +# # suppress_redundant = false +# +# ## If suppression is enabled, send updates at least every X seconds anyway +# # heartbeat_interval = "60s" + + +# # Accept metrics over InfluxDB 1.x HTTP API +# [[inputs.http_listener]] +# ## Address and port to host InfluxDB listener on +# service_address = ":8186" +# +# ## maximum duration before timing out read of the request +# read_timeout = "10s" +# ## maximum duration before timing out write of the response +# write_timeout = "10s" +# +# ## Maximum allowed HTTP request body size in bytes. +# ## 0 means to use the default of 32MiB. +# max_body_size = "32MiB" +# +# ## Optional tag name used to store the database. +# ## If the write has a database in the query string then it will be kept in this tag name. +# ## This tag can be used in downstream outputs. 
+# ## The default value of nothing means it will be off and the database will not be recorded. +# # database_tag = "" +# +# ## If set the retention policy specified in the write query will be added as +# ## the value of this tag name. +# # retention_policy_tag = "" +# +# ## Set one or more allowed client CA certificate file names to +# ## enable mutually authenticated TLS connections +# tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Add service certificate and key +# tls_cert = "/etc/telegraf/cert.pem" +# tls_key = "/etc/telegraf/key.pem" +# +# ## Optional username and password to accept for HTTP basic authentication. +# ## You probably want to make sure you have TLS configured above for this. +# # basic_username = "foobar" +# # basic_password = "barfoo" + + +# # Generic HTTP write listener +# [[inputs.http_listener_v2]] +# ## Address and port to host HTTP listener on +# service_address = ":8080" +# +# ## Path to listen to. +# # path = "/telegraf" +# +# ## HTTP methods to accept. +# # methods = ["POST", "PUT"] +# +# ## maximum duration before timing out read of the request +# # read_timeout = "10s" +# ## maximum duration before timing out write of the response +# # write_timeout = "10s" +# +# ## Maximum allowed http request body size in bytes. +# ## 0 means to use the default of 524,288,000 bytes (500 mebibytes) +# # max_body_size = "500MB" +# +# ## Part of the request to consume. Available options are "body" and +# ## "query". +# # data_source = "body" +# +# ## Set one or more allowed client CA certificate file names to +# ## enable mutually authenticated TLS connections +# # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Add service certificate and key +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## Optional username and password to accept for HTTP basic authentication. +# ## You probably want to make sure you have TLS configured above for this. 
+# # basic_username = "foobar" +# # basic_password = "barfoo" +# +# ## Optional setting to map http headers into tags +# ## If the http header is not present on the request, no corresponding tag will be added +# ## If multiple instances of the http header are present, only the first value will be used +# # http_header_tags = {"HTTP_HEADER" = "TAG_NAME"} +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Accept metrics over InfluxDB 1.x HTTP API +# [[inputs.influxdb_listener]] +# ## Address and port to host InfluxDB listener on +# service_address = ":8186" +# +# ## maximum duration before timing out read of the request +# read_timeout = "10s" +# ## maximum duration before timing out write of the response +# write_timeout = "10s" +# +# ## Maximum allowed HTTP request body size in bytes. +# ## 0 means to use the default of 32MiB. +# max_body_size = "32MiB" +# +# ## Optional tag name used to store the database. +# ## If the write has a database in the query string then it will be kept in this tag name. +# ## This tag can be used in downstream outputs. +# ## The default value of nothing means it will be off and the database will not be recorded. +# # database_tag = "" +# +# ## If set the retention policy specified in the write query will be added as +# ## the value of this tag name. +# # retention_policy_tag = "" +# +# ## Set one or more allowed client CA certificate file names to +# ## enable mutually authenticated TLS connections +# tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Add service certificate and key +# tls_cert = "/etc/telegraf/cert.pem" +# tls_key = "/etc/telegraf/key.pem" +# +# ## Optional username and password to accept for HTTP basic authentication. +# ## You probably want to make sure you have TLS configured above for this. 
+# # basic_username = "foobar" +# # basic_password = "barfoo" + + +# # Read JTI OpenConfig Telemetry from listed sensors +# [[inputs.jti_openconfig_telemetry]] +# ## List of device addresses to collect telemetry from +# servers = ["localhost:1883"] +# +# ## Authentication details. Username and password are required if the device expects +# ## authentication. Client ID must be unique when connecting from multiple instances +# ## of telegraf to the same device +# username = "user" +# password = "pass" +# client_id = "telegraf" +# +# ## Frequency to get data +# sample_frequency = "1000ms" +# +# ## Sensors to subscribe for +# ## An identifier for each sensor can be provided in path by separating with space +# ## Else sensor path will be used as identifier +# ## When identifier is used, we can provide a list of space separated sensors. +# ## A single subscription will be created with all these sensors and data will +# ## be saved to measurement with this identifier name +# sensors = [ +# "/interfaces/", +# "collection /components/ /lldp", +# ] +# +# ## We allow specifying sensor group level reporting rate. To do this, specify the +# ## reporting rate in Duration at the beginning of sensor paths / collection +# ## name. For entries without reporting rate, we use configured sample frequency +# sensors = [ +# "1000ms customReporting /interfaces /lldp", +# "2000ms collection /components", +# "/interfaces", +# ] +# +# ## Optional TLS Config +# # enable_tls = true +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Delay between retry attempts of failed RPC calls or streams. Defaults to 1000ms. 
+# ## Failed streams/calls will not be retried if 0 is provided +# retry_delay = "1000ms" +# +# ## To treat all string values as tags, set this to true +# str_as_tags = false + + +# # Read metrics from Kafka topics +# [[inputs.kafka_consumer]] +# ## Kafka brokers. +# brokers = ["localhost:9092"] +# +# ## Topics to consume. +# topics = ["telegraf"] +# +# ## When set this tag will be added to all metrics with the topic as the value. +# # topic_tag = "" +# +# ## Optional Client id +# # client_id = "Telegraf" +# +# ## Set the minimal supported Kafka version. Setting this enables the use of new +# ## Kafka features and APIs. Must be 0.10.2.0 or greater. +# ## ex: version = "1.1.0" +# # version = "" +# +# ## Optional TLS Config +# # enable_tls = true +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## SASL authentication credentials. These settings should typically be used +# ## with TLS encryption enabled using the "enable_tls" option. +# # sasl_username = "kafka" +# # sasl_password = "secret" +# +# ## SASL protocol version. When connecting to Azure EventHub set to 0. +# # sasl_version = 1 +# +# ## Name of the consumer group. +# # consumer_group = "telegraf_metrics_consumers" +# +# ## Initial offset position; one of "oldest" or "newest". +# # offset = "oldest" +# +# ## Consumer group partition assignment strategy; one of "range", "roundrobin" or "sticky". +# # balance_strategy = "range" +# +# ## Maximum length of a message to consume, in bytes (default 0/unlimited); +# ## larger messages are dropped +# max_message_len = 1000000 +# +# ## Maximum messages to read from the broker that have not been written by an +# ## output. For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. 
+# ## +# ## For example, if each message from the queue contains 10 metrics and the +# ## output metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read metrics from Kafka topic(s) +# [[inputs.kafka_consumer_legacy]] +# ## topic(s) to consume +# topics = ["telegraf"] +# +# ## an array of Zookeeper connection strings +# zookeeper_peers = ["localhost:2181"] +# +# ## Zookeeper Chroot +# zookeeper_chroot = "" +# +# ## the name of the consumer group +# consumer_group = "telegraf_metrics_consumers" +# +# ## Offset (must be either "oldest" or "newest") +# offset = "oldest" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" +# +# ## Maximum length of a message to consume, in bytes (default 0/unlimited); +# ## larger messages are dropped +# max_message_len = 65536 + + +# # Configuration for the AWS Kinesis input. +# [[inputs.kinesis_consumer]] +# ## Amazon REGION of kinesis endpoint. 
+# region = "ap-southeast-2" +# +# ## Amazon Credentials +# ## Credentials are loaded in the following order +# ## 1) Assumed credentials via STS if role_arn is specified +# ## 2) explicit credentials from 'access_key' and 'secret_key' +# ## 3) shared profile from 'profile' +# ## 4) environment variables +# ## 5) shared credentials file +# ## 6) EC2 Instance Profile +# # access_key = "" +# # secret_key = "" +# # token = "" +# # role_arn = "" +# # profile = "" +# # shared_credential_file = "" +# +# ## Endpoint to make request against, the correct endpoint is automatically +# ## determined and this option should only be set if you wish to override the +# ## default. +# ## ex: endpoint_url = "http://localhost:8000" +# # endpoint_url = "" +# +# ## Kinesis StreamName must exist prior to starting telegraf. +# streamname = "StreamName" +# +# ## Shard iterator type (only 'TRIM_HORIZON' and 'LATEST' currently supported) +# # shard_iterator_type = "TRIM_HORIZON" +# +# ## Maximum messages to read from the broker that have not been written by an +# ## output. For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. +# ## +# ## For example, if each message from the queue contains 10 metrics and the +# ## output metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" +# +# ## Optional +# ## Configuration for a dynamodb checkpoint +# [inputs.kinesis_consumer.checkpoint_dynamodb] +# ## unique name for this consumer +# app_name = "default" +# table_name = "default" + + +# # Read metrics off Arista LANZ, via socket +# [[inputs.lanz]] +# ## URL to Arista LANZ endpoint +# servers = [ +# "tcp://127.0.0.1:50001" +# ] + + +# # Stream and parse log file(s). +# [[inputs.logparser]] +# ## Log files to parse. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". ie: +# ## /var/log/**.log -> recursively find all .log files in /var/log +# ## /var/log/*/*.log -> find all .log files with a parent dir in /var/log +# ## /var/log/apache.log -> only tail the apache log file +# files = ["/var/log/apache/access.log"] +# +# ## Read files that currently exist from the beginning. Files that are created +# ## while telegraf is running (and that match the "files" globs) will always +# ## be read from the beginning. +# from_beginning = false +# +# ## Method used to watch for file updates. Can be either "inotify" or "poll". +# # watch_method = "inotify" +# +# ## Parse logstash-style "grok" patterns: +# [inputs.logparser.grok] +# ## This is a list of patterns to check the given log file(s) for. +# ## Note that adding patterns here increases processing time. The most +# ## efficient configuration is to have one pattern per logparser. +# ## Other common built-in patterns are: +# ## %{COMMON_LOG_FORMAT} (plain apache & nginx access logs) +# ## %{COMBINED_LOG_FORMAT} (access logs + referrer & agent) +# patterns = ["%{COMBINED_LOG_FORMAT}"] +# +# ## Name of the outputted measurement name. +# measurement = "apache_access_log" +# +# ## Full path(s) to custom pattern files. 
+# custom_pattern_files = [] +# +# ## Custom patterns can also be defined here. Put one pattern per line. +# custom_patterns = ''' +# ''' +# +# ## Timezone allows you to provide an override for timestamps that +# ## don't already include an offset +# ## e.g. 04/06/2016 12:41:45 data one two 5.43µs +# ## +# ## Default: "" which renders UTC +# ## Options are as follows: +# ## 1. Local -- interpret based on machine localtime +# ## 2. "Canada/Eastern" -- Unix TZ values like those found in https://en.wikipedia.org/wiki/List_of_tz_database_time_zones +# ## 3. UTC -- or blank/unspecified, will return timestamp in UTC +# # timezone = "Canada/Eastern" +# +# ## When set to "disable", timestamp will not be incremented if there is a +# ## duplicate. +# # unique_timestamp = "auto" + + +# # Read metrics from MQTT topic(s) +# [[inputs.mqtt_consumer]] +# ## Broker URLs for the MQTT server or cluster. To connect to multiple +# ## clusters or standalone servers, use a separate plugin instance. +# ## example: servers = ["tcp://localhost:1883"] +# ## servers = ["ssl://localhost:1883"] +# ## servers = ["ws://localhost:1883"] +# servers = ["tcp://127.0.0.1:1883"] +# +# ## Topics that will be subscribed to. +# topics = [ +# "telegraf/host01/cpu", +# "telegraf/+/mem", +# "sensors/#", +# ] +# +# ## The message topic will be stored in a tag specified by this value. If set +# ## to the empty string no topic tag will be created. +# # topic_tag = "topic" +# +# ## QoS policy for messages +# ## 0 = at most once +# ## 1 = at least once +# ## 2 = exactly once +# ## +# ## When using a QoS of 1 or 2, you should enable persistent_session to allow +# ## resuming unacknowledged messages. +# # qos = 0 +# +# ## Connection timeout for initial connection in seconds +# # connection_timeout = "30s" +# +# ## Maximum messages to read from the broker that have not been written by an +# ## output.
For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. +# ## +# ## For example, if each message from the queue contains 10 metrics and the +# ## output metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Persistent session disables clearing of the client session on connection. +# ## In order for this option to work you must also set client_id to identify +# ## the client. To receive messages that arrived while the client is offline, +# ## also set the qos option to 1 or 2 and don't forget to also set the QoS when +# ## publishing. +# # persistent_session = false +# +# ## If unset, a random client ID will be generated. +# # client_id = "" +# +# ## Username and password to connect MQTT server. +# # username = "telegraf" +# # password = "metricsmetricsmetricsmetrics" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read metrics from NATS subject(s) +# [[inputs.nats_consumer]] +# ## urls of NATS servers +# servers = ["nats://localhost:4222"] +# +# ## subject(s) to consume +# subjects = ["telegraf"] +# +# ## name a queue group +# queue_group = "telegraf_consumers" +# +# ## Optional credentials +# # username = "" +# # password = "" +# +# ## Optional NATS 2.0 and NATS NGS compatible user credentials +# # credentials = "/etc/telegraf/nats.creds" +# +# ## Use Transport Layer Security +# # secure = false +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false +# +# ## Sets the limits for pending msgs and bytes for each subscription +# ## These shouldn't need to be adjusted except in very high throughput scenarios +# # pending_message_limit = 65536 +# # pending_bytes_limit = 67108864 +# +# ## Maximum messages to read from the broker that have not been written by an +# ## output. For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. +# ## +# ## For example, if each message from the queue contains 10 metrics and the +# ## output metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Data format to consume. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read NSQ topic for metrics. +# [[inputs.nsq_consumer]] +# ## Server option still works but is deprecated, we just prepend it to the nsqd array. +# # server = "localhost:4150" +# +# ## An array representing the NSQD TCP HTTP Endpoints +# nsqd = ["localhost:4150"] +# +# ## An array representing the NSQLookupd HTTP Endpoints +# nsqlookupd = ["localhost:4161"] +# topic = "telegraf" +# channel = "consumer" +# max_in_flight = 100 +# +# ## Maximum messages to read from the broker that have not been written by an +# ## output. For best throughput set based on the number of metrics within +# ## each message and the size of the output's metric_batch_size. +# ## +# ## For example, if each message from the queue contains 10 metrics and the +# ## output metric_batch_size is 1000, setting this to 100 will ensure that a +# ## full batch is collected and the write is triggered immediately without +# ## waiting until the next flush_interval. +# # max_undelivered_messages = 1000 +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Read metrics from one or many pgbouncer servers +# [[inputs.pgbouncer]] +# ## specify address via a url matching: +# ## postgres://[pqgotest[:password]]@localhost[/dbname]\ +# ## ?sslmode=[disable|verify-ca|verify-full] +# ## or a simple string: +# ## host=localhost user=pqgotest password=... sslmode=... dbname=app_production +# ## +# ## All connection parameters are optional. 
+# ## +# address = "host=localhost user=pgbouncer sslmode=disable" + + +# # Read metrics from one or many postgresql servers +# [[inputs.postgresql]] +# ## specify address via a url matching: +# ## postgres://[pqgotest[:password]]@localhost[/dbname]\ +# ## ?sslmode=[disable|verify-ca|verify-full] +# ## or a simple string: +# ## host=localhost user=pqgotest password=... sslmode=... dbname=app_production +# ## +# ## All connection parameters are optional. +# ## +# ## Without the dbname parameter, the driver will default to a database +# ## with the same name as the user. This dbname is just for instantiating a +# ## connection with the server and doesn't restrict the databases we are trying +# ## to grab metrics for. +# ## +# address = "host=localhost user=postgres sslmode=disable" +# ## A custom name for the database that will be used as the "server" tag in the +# ## measurement output. If not specified, a default one generated from +# ## the connection address is used. +# # outputaddress = "db01" +# +# ## connection configuration. +# ## maxlifetime - specify the maximum lifetime of a connection. +# ## default is forever (0s) +# max_lifetime = "0s" +# +# ## A list of databases to explicitly ignore. If not specified, metrics for all +# ## databases are gathered. Do NOT use with the 'databases' option. +# # ignored_databases = ["postgres", "template0", "template1"] +# +# ## A list of databases to pull metrics about. If not specified, metrics for all +# ## databases are gathered. Do NOT use with the 'ignored_databases' option. +# # databases = ["app_production", "testing"] + + +# # Read metrics from one or many postgresql servers +# [[inputs.postgresql_extensible]] +# ## specify address via a url matching: +# ## postgres://[pqgotest[:password]]@localhost[/dbname]\ +# ## ?sslmode=[disable|verify-ca|verify-full] +# ## or a simple string: +# ## host=localhost user=pqgotest password=... sslmode=... dbname=app_production +# # +# ## All connection parameters are optional. 
# +# ## Without the dbname parameter, the driver will default to a database +# ## with the same name as the user. This dbname is just for instantiating a +# ## connection with the server and doesn't restrict the databases we are trying +# ## to grab metrics for. +# # +# address = "host=localhost user=postgres sslmode=disable" +# +# ## connection configuration. +# ## maxlifetime - specify the maximum lifetime of a connection. +# ## default is forever (0s) +# max_lifetime = "0s" +# +# ## A list of databases to pull metrics about. If not specified, metrics for all +# ## databases are gathered. +# ## databases = ["app_production", "testing"] +# # +# ## A custom name for the database that will be used as the "server" tag in the +# ## measurement output. If not specified, a default one generated from +# ## the connection address is used. +# # outputaddress = "db01" +# # +# ## Define the toml config where the sql queries are stored +# ## New queries can be added, if the withdbname is set to true and there is no +# ## databases defined in the 'databases field', the sql query is ended by a +# ## 'is not null' in order to make the query succeed. +# ## Example : +# ## The sqlquery : "SELECT * FROM pg_stat_database where datname" become +# ## "SELECT * FROM pg_stat_database where datname IN ('postgres', 'pgbench')" +# ## because the databases variable was set to ['postgres', 'pgbench' ] and the +# ## withdbname was true. Be careful that if the withdbname is set to false you +# ## don't have to define the where clause (aka with the dbname) the tagvalue +# ## field is used to define custom tags (separated by commas) +# ## The optional "measurement" value can be used to override the default +# ## output measurement name ("postgresql"). +# ## +# ## The script option can be used to specify the .sql file path. 
+# ## If script and sqlquery options specified at same time, sqlquery will be used +# ## +# ## Structure : +# ## [[inputs.postgresql_extensible.query]] +# ## sqlquery string +# ## version string +# ## withdbname boolean +# ## tagvalue string (comma separated) +# ## measurement string +# [[inputs.postgresql_extensible.query]] +# sqlquery="SELECT * FROM pg_stat_database" +# version=901 +# withdbname=false +# tagvalue="" +# measurement="" +# [[inputs.postgresql_extensible.query]] +# sqlquery="SELECT * FROM pg_stat_bgwriter" +# version=901 +# withdbname=false +# tagvalue="postgresql.stats" + + +# # Read metrics from one or many prometheus clients +# [[inputs.prometheus]] +# ## An array of urls to scrape metrics from. +# urls = ["http://localhost:9100/metrics"] +# +# ## Metric version controls the mapping from Prometheus metrics into +# ## Telegraf metrics. When using the prometheus_client output, use the same +# ## value in both plugins to ensure metrics are round-tripped without +# ## modification. +# ## +# ## example: metric_version = 1; deprecated in 1.13 +# ## metric_version = 2; recommended version +# # metric_version = 1 +# +# ## Url tag name (tag containing scrapped url. optional, default is "url") +# # url_tag = "scrapeUrl" +# +# ## An array of Kubernetes services to scrape metrics from. +# # kubernetes_services = ["http://my-service-dns.my-namespace:9100/metrics"] +# +# ## Kubernetes config file to create client from. +# # kube_config = "/path/to/kubernetes.config" +# +# ## Scrape Kubernetes pods for the following prometheus annotations: +# ## - prometheus.io/scrape: Enable scraping for this pod +# ## - prometheus.io/scheme: If the metrics endpoint is secured then you will need to +# ## set this to 'https' & most likely set the tls config. +# ## - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. 
+# ## - prometheus.io/port: If port is not 9102 use this annotation +# # monitor_kubernetes_pods = true +# ## Restricts Kubernetes monitoring to a single namespace +# ## ex: monitor_kubernetes_pods_namespace = "default" +# # monitor_kubernetes_pods_namespace = "" +# # label selector to target pods which have the label +# # kubernetes_label_selector = "env=dev,app=nginx" +# # field selector to target pods +# # eg. To scrape pods on a specific node +# # kubernetes_field_selector = "spec.nodeName=$HOSTNAME" +# +# ## Use bearer token for authorization. ('bearer_token' takes priority) +# # bearer_token = "/path/to/bearer/token" +# ## OR +# # bearer_token_string = "abc_123" +# +# ## HTTP Basic Authentication username and password. ('bearer_token' and +# ## 'bearer_token_string' take priority) +# # username = "" +# # password = "" +# +# ## Specify timeout duration for slower prometheus clients (default is 3s) +# # response_timeout = "3s" +# +# ## Optional TLS Config +# # tls_ca = /path/to/cafile +# # tls_cert = /path/to/certfile +# # tls_key = /path/to/keyfile +# ## Use TLS but skip chain & host verification +# # insecure_skip_verify = false + + +# # SFlow V5 Protocol Listener +# [[inputs.sflow]] +# ## Address to listen for sFlow packets. +# ## example: service_address = "udp://:6343" +# ## service_address = "udp4://:6343" +# ## service_address = "udp6://:6343" +# service_address = "udp://:6343" +# +# ## Set the size of the operating system's receive buffer. +# ## example: read_buffer_size = "64KiB" +# # read_buffer_size = "" + + +# # Receive SNMP traps +# [[inputs.snmp_trap]] +# ## Transport, local address, and port to listen on. Transport must +# ## be "udp://". Omit local address to listen on all interfaces. +# ## example: "udp://127.0.0.1:1234" +# ## +# ## Special permissions may be required to listen on a port less than +# ## 1024. 
See README.md for details +# ## +# # service_address = "udp://:162" +# ## Timeout running snmptranslate command +# # timeout = "5s" +# ## Snmp version, defaults to 2c +# # version = "2c" +# ## SNMPv3 authentication and encryption options. +# ## +# ## Security Name. +# # sec_name = "myuser" +# ## Authentication protocol; one of "MD5", "SHA" or "". +# # auth_protocol = "MD5" +# ## Authentication password. +# # auth_password = "pass" +# ## Security Level; one of "noAuthNoPriv", "authNoPriv", or "authPriv". +# # sec_level = "authNoPriv" +# ## Privacy protocol used for encrypted messages; one of "DES", "AES", "AES192", "AES192C", "AES256", "AES256C" or "". +# # priv_protocol = "" +# ## Privacy password used for encrypted messages. +# # priv_password = "" + + +# # Generic socket listener capable of handling multiple socket types. +# [[inputs.socket_listener]] +# ## URL to listen on +# # service_address = "tcp://:8094" +# # service_address = "tcp://127.0.0.1:http" +# # service_address = "tcp4://:8094" +# # service_address = "tcp6://:8094" +# # service_address = "tcp6://[2001:db8::1]:8094" +# # service_address = "udp://:8094" +# # service_address = "udp4://:8094" +# # service_address = "udp6://:8094" +# # service_address = "unix:///tmp/telegraf.sock" +# # service_address = "unixgram:///tmp/telegraf.sock" +# +# ## Change the file mode bits on unix sockets. These permissions may not be +# ## respected by some platforms, to safely restrict write permissions it is best +# ## to place the socket into a directory that has previously been created +# ## with the desired permissions. +# ## ex: socket_mode = "777" +# # socket_mode = "" +# +# ## Maximum number of concurrent connections. +# ## Only applies to stream sockets (e.g. TCP). +# ## 0 (default) is unlimited. +# # max_connections = 1024 +# +# ## Read timeout. +# ## Only applies to stream sockets (e.g. TCP). +# ## 0 (default) is unlimited. +# # read_timeout = "30s" +# +# ## Optional TLS configuration. 
+# ## Only applies to stream sockets (e.g. TCP). +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Enables client authentication if set. +# # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +# +# ## Maximum socket buffer size (in bytes when no unit specified). +# ## For stream sockets, once the buffer fills up, the sender will start backing up. +# ## For datagram sockets, once the buffer fills up, metrics will start dropping. +# ## Defaults to the OS default. +# # read_buffer_size = "64KiB" +# +# ## Period between keep alive probes. +# ## Only applies to TCP sockets. +# ## 0 disables keep alive probes. +# ## Defaults to the OS configuration. +# # keep_alive_period = "5m" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# # data_format = "influx" +# +# ## Content encoding for message payloads, can be set to "gzip" or +# ## "identity" to apply no encoding. +# # content_encoding = "identity" + + +# # Statsd UDP/TCP Server +# [[inputs.statsd]] +# ## Protocol, must be "tcp", "udp", "udp4" or "udp6" (default=udp) +# protocol = "udp" +# +# ## MaxTCPConnection - applicable when protocol is set to tcp (default=250) +# max_tcp_connections = 250 +# +# ## Enable TCP keep alive probes (default=false) +# tcp_keep_alive = false +# +# ## Specifies the keep-alive period for an active network connection. +# ## Only applies to TCP sockets and will be ignored if tcp_keep_alive is false. +# ## Defaults to the OS configuration. +# # tcp_keep_alive_period = "2h" +# +# ## Address and port to host UDP listener on +# service_address = ":8125" +# +# ## The following configuration options control when telegraf clears its cache +# ## of previous values. If set to false, then telegraf will only clear its +# ## cache when the daemon is restarted.
+# ## Reset gauges every interval (default=true) +# delete_gauges = true +# ## Reset counters every interval (default=true) +# delete_counters = true +# ## Reset sets every interval (default=true) +# delete_sets = true +# ## Reset timings & histograms every interval (default=true) +# delete_timings = true +# +# ## Percentiles to calculate for timing & histogram stats +# percentiles = [50.0, 90.0, 99.0, 99.9, 99.95, 100.0] +# +# ## separator to use between elements of a statsd metric +# metric_separator = "_" +# +# ## Parses tags in the datadog statsd format +# ## http://docs.datadoghq.com/guides/dogstatsd/ +# parse_data_dog_tags = false +# +# ## Parses datadog extensions to the statsd format +# datadog_extensions = false +# +# ## Statsd data translation templates, more info can be read here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/TEMPLATE_PATTERN.md +# # templates = [ +# # "cpu.* measurement*" +# # ] +# +# ## Number of UDP messages allowed to queue up, once filled, +# ## the statsd server will start dropping packets +# allowed_pending_messages = 10000 +# +# ## Number of timing/histogram values to track per-measurement in the +# ## calculation of percentiles. Raising this limit increases the accuracy +# ## of percentiles but also increases the memory usage and cpu time. +# percentile_limit = 1000 + + +# # Suricata stats plugin +# [[inputs.suricata]] +# ## Data sink for Suricata stats log +# # This is expected to be a filename of a +# # unix socket to be created for listening. +# source = "/var/run/suricata-stats.sock" +# +# # Delimiter for flattening field keys, e.g. subitem "alert" of "detect" +# # becomes "detect_alert" when delimiter is "_". +# delimiter = "_" + + +# # Accepts syslog messages following RFC5424 format with transports as per RFC5426, RFC5425, or RFC6587 +# [[inputs.syslog]] +# ## Specify an ip or hostname with port - eg., tcp://localhost:6514, tcp://10.0.0.1:6514 +# ## Protocol, address and port to host the syslog receiver. 
+# ## If no host is specified, then localhost is used. +# ## If no port is specified, 6514 is used (RFC5425#section-4.1). +# server = "tcp://:6514" +# +# ## TLS Config +# # tls_allowed_cacerts = ["/etc/telegraf/ca.pem"] +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# +# ## Period between keep alive probes. +# ## 0 disables keep alive probes. +# ## Defaults to the OS configuration. +# ## Only applies to stream sockets (e.g. TCP). +# # keep_alive_period = "5m" +# +# ## Maximum number of concurrent connections (default = 0). +# ## 0 means unlimited. +# ## Only applies to stream sockets (e.g. TCP). +# # max_connections = 1024 +# +# ## Read timeout is the maximum time allowed for reading a single message (default = 5s). +# ## 0 means unlimited. +# # read_timeout = "5s" +# +# ## The framing technique with which it is expected that messages are transported (default = "octet-counting"). +# ## Whether the messages come using the octet-counting (RFC5425#section-4.3.1, RFC6587#section-3.4.1), +# ## or the non-transparent framing technique (RFC6587#section-3.4.2). +# ## Must be one of "octet-counting", "non-transparent". +# # framing = "octet-counting" +# +# ## The trailer to be expected in case of non-transparent framing (default = "LF"). +# ## Must be one of "LF", or "NUL". +# # trailer = "LF" +# +# ## Whether to parse in best effort mode or not (default = false). +# ## By default best effort parsing is off. +# # best_effort = false +# +# ## Character to prepend to SD-PARAMs (default = "_"). +# ## A syslog message can contain multiple parameters and multiple identifiers within structured data section. +# ## Eg., [id1 name1="val1" name2="val2"][id2 name1="val1" nameA="valA"] +# ## For each combination a field is created. +# ## Its name is created concatenating identifier, sdparam_separator, and parameter name.
+# # sdparam_separator = "_" + + +# # Parse the new lines appended to a file +# [[inputs.tail]] +# ## File names or a pattern to tail. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". ie: +# ## "/var/log/**.log" -> recursively find all .log files in /var/log +# ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log +# ## "/var/log/apache.log" -> just tail the apache log file +# ## +# ## See https://github.com/gobwas/glob for more examples +# ## +# files = ["/var/mymetrics.out"] +# +# ## Read file from beginning. +# # from_beginning = false +# +# ## Whether file is a named pipe +# # pipe = false +# +# ## Method used to watch for file updates. Can be either "inotify" or "poll". +# # watch_method = "inotify" +# +# ## Maximum lines of the file to process that have not yet been written by the +# ## output. For best throughput set based on the number of metrics on each +# ## line and the size of the output's metric_batch_size. +# # max_undelivered_lines = 1000 +# +# ## Character encoding to use when interpreting the file contents. Invalid +# ## characters are replaced using the unicode replacement character. When set +# ## to the empty string the data is not decoded to text. +# ## ex: character_encoding = "utf-8" +# ## character_encoding = "utf-16le" +# ## character_encoding = "utf-16be" +# ## character_encoding = "" +# # character_encoding = "" +# +# ## Data format to consume.
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + +# # Generic TCP listener +# [[inputs.tcp_listener]] +# # DEPRECATED: the TCP listener plugin has been deprecated in favor of the +# # socket_listener plugin +# # see https://github.com/influxdata/telegraf/tree/master/plugins/inputs/socket_listener + + +# # Generic UDP listener +# [[inputs.udp_listener]] +# # DEPRECATED: the TCP listener plugin has been deprecated in favor of the +# # socket_listener plugin +# # see https://github.com/influxdata/telegraf/tree/master/plugins/inputs/socket_listener + + +# # Read metrics from VMware vCenter +# [[inputs.vsphere]] +# ## List of vCenter URLs to be monitored. These three lines must be uncommented +# ## and edited for the plugin to work. +# vcenters = [ "https://vcenter.local/sdk" ] +# username = "user@corp.local" +# password = "secret" +# +# ## VMs +# ## Typical VM metrics (if omitted or empty, all metrics are collected) +# # vm_include = [ "/*/vm/**"] # Inventory path to VMs to collect (by default all are collected) +# # vm_exclude = [] # Inventory paths to exclude +# vm_metric_include = [ +# "cpu.demand.average", +# "cpu.idle.summation", +# "cpu.latency.average", +# "cpu.readiness.average", +# "cpu.ready.summation", +# "cpu.run.summation", +# "cpu.usagemhz.average", +# "cpu.used.summation", +# "cpu.wait.summation", +# "mem.active.average", +# "mem.granted.average", +# "mem.latency.average", +# "mem.swapin.average", +# "mem.swapinRate.average", +# "mem.swapout.average", +# "mem.swapoutRate.average", +# "mem.usage.average", +# "mem.vmmemctl.average", +# "net.bytesRx.average", +# "net.bytesTx.average", +# "net.droppedRx.summation", +# "net.droppedTx.summation", +# "net.usage.average", +# "power.power.average", +# "virtualDisk.numberReadAveraged.average", +# "virtualDisk.numberWriteAveraged.average", +# 
"virtualDisk.read.average", +# "virtualDisk.readOIO.latest", +# "virtualDisk.throughput.usage.average", +# "virtualDisk.totalReadLatency.average", +# "virtualDisk.totalWriteLatency.average", +# "virtualDisk.write.average", +# "virtualDisk.writeOIO.latest", +# "sys.uptime.latest", +# ] +# # vm_metric_exclude = [] ## Nothing is excluded by default +# # vm_instances = true ## true by default +# +# ## Hosts +# ## Typical host metrics (if omitted or empty, all metrics are collected) +# # host_include = [ "/*/host/**"] # Inventory path to hosts to collect (by default all are collected) +# # host_exclude [] # Inventory paths to exclude +# host_metric_include = [ +# "cpu.coreUtilization.average", +# "cpu.costop.summation", +# "cpu.demand.average", +# "cpu.idle.summation", +# "cpu.latency.average", +# "cpu.readiness.average", +# "cpu.ready.summation", +# "cpu.swapwait.summation", +# "cpu.usage.average", +# "cpu.usagemhz.average", +# "cpu.used.summation", +# "cpu.utilization.average", +# "cpu.wait.summation", +# "disk.deviceReadLatency.average", +# "disk.deviceWriteLatency.average", +# "disk.kernelReadLatency.average", +# "disk.kernelWriteLatency.average", +# "disk.numberReadAveraged.average", +# "disk.numberWriteAveraged.average", +# "disk.read.average", +# "disk.totalReadLatency.average", +# "disk.totalWriteLatency.average", +# "disk.write.average", +# "mem.active.average", +# "mem.latency.average", +# "mem.state.latest", +# "mem.swapin.average", +# "mem.swapinRate.average", +# "mem.swapout.average", +# "mem.swapoutRate.average", +# "mem.totalCapacity.average", +# "mem.usage.average", +# "mem.vmmemctl.average", +# "net.bytesRx.average", +# "net.bytesTx.average", +# "net.droppedRx.summation", +# "net.droppedTx.summation", +# "net.errorsRx.summation", +# "net.errorsTx.summation", +# "net.usage.average", +# "power.power.average", +# "storageAdapter.numberReadAveraged.average", +# "storageAdapter.numberWriteAveraged.average", +# "storageAdapter.read.average", +# 
"storageAdapter.write.average", +# "sys.uptime.latest", +# ] +# ## Collect IP addresses? Valid values are "ipv4" and "ipv6" +# # ip_addresses = ["ipv6", "ipv4" ] +# +# # host_metric_exclude = [] ## Nothing excluded by default +# # host_instances = true ## true by default +# +# +# ## Clusters +# # cluster_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected) +# # cluster_exclude = [] # Inventory paths to exclude +# # cluster_metric_include = [] ## if omitted or empty, all metrics are collected +# # cluster_metric_exclude = [] ## Nothing excluded by default +# # cluster_instances = false ## false by default +# +# ## Datastores +# # datastore_include = [ "/*/datastore/**"] # Inventory path to datastores to collect (by default all are collected) +# # datastore_exclude = [] # Inventory paths to exclude +# # datastore_metric_include = [] ## if omitted or empty, all metrics are collected +# # datastore_metric_exclude = [] ## Nothing excluded by default +# # datastore_instances = false ## false by default +# +# ## Datacenters +# # datacenter_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected) +# # datacenter_exclude = [] # Inventory paths to exclude +# datacenter_metric_include = [] ## if omitted or empty, all metrics are collected +# datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default. 
+# # datacenter_instances = false ## false by default +# +# ## Plugin Settings +# ## separator character to use for measurement and field names (default: "_") +# # separator = "_" +# +# ## number of objects to retrieve per query for realtime resources (vms and hosts) +# ## set to 64 for vCenter 5.5 and 6.0 (default: 256) +# # max_query_objects = 256 +# +# ## number of metrics to retrieve per query for non-realtime resources (clusters and datastores) +# ## set to 64 for vCenter 5.5 and 6.0 (default: 256) +# # max_query_metrics = 256 +# +# ## number of go routines to use for collection and discovery of objects and metrics +# # collect_concurrency = 1 +# # discover_concurrency = 1 +# +# ## the interval before (re)discovering objects subject to metrics collection (default: 300s) +# # object_discovery_interval = "300s" +# +# ## timeout applies to any of the api request made to vcenter +# # timeout = "60s" +# +# ## When set to true, all samples are sent as integers. This makes the output +# ## data types backwards compatible with Telegraf 1.9 or lower. Normally all +# ## samples from vCenter, with the exception of percentages, are integer +# ## values, but under some conditions, some averaging takes place internally in +# ## the plugin. Setting this flag to "false" will send values as floats to +# ## preserve the full precision when averaging takes place. +# # use_int_samples = true +# +# ## Custom attributes from vCenter can be very useful for queries in order to slice the +# ## metrics along different dimension and for forming ad-hoc relationships. They are disabled +# ## by default, since they can add a considerable amount of tags to the resulting metrics. To +# ## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include +# ## to select the attributes you want to include. +# ## By default, since they can add a considerable amount of tags to the resulting metrics. 
To +# ## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include +# ## to select the attributes you want to include. +# # custom_attribute_include = [] +# # custom_attribute_exclude = ["*"] +# +# ## Optional SSL Config +# # ssl_ca = "/path/to/cafile" +# # ssl_cert = "/path/to/certfile" +# # ssl_key = "/path/to/keyfile" +# ## Use SSL but skip chain & host verification +# # insecure_skip_verify = false + + +# # A Webhooks Event collector +# [[inputs.webhooks]] +# ## Address and port to host Webhook listener on +# service_address = ":1619" +# +# [inputs.webhooks.filestack] +# path = "/filestack" +# +# [inputs.webhooks.github] +# path = "/github" +# # secret = "" +# +# [inputs.webhooks.mandrill] +# path = "/mandrill" +# +# [inputs.webhooks.rollbar] +# path = "/rollbar" +# +# [inputs.webhooks.papertrail] +# path = "/papertrail" +# +# [inputs.webhooks.particle] +# path = "/particle" + + +# # This plugin implements the Zipkin http server to gather trace and timing data needed to troubleshoot latency problems in microservice architectures. 
+# [[inputs.zipkin]] +# # path = "/api/v1/spans" # URL path for span data +# # port = 9411 # Port on which Telegraf listens + diff --git a/playbooks/roles/telegraf/tasks/common.yml b/playbooks/roles/telegraf/tasks/common.yml index 6e531449..e0904f7b 100644 --- a/playbooks/roles/telegraf/tasks/common.yml +++ b/playbooks/roles/telegraf/tasks/common.yml @@ -16,18 +16,21 @@ include_role: name: safe_yum +- name: Check for nvidia-smi + shell: nvidia-smi + register: nvidiasmi + ignore_errors: yes + - name: copy telegraf.conf become: true - copy: - src: "{{ item }}" - dest: /etc/telegraf/{{item}} + copy: + src: "{% if nvidiasmi is failed %}telegraf.conf{% else%}telegraf_gpu.conf{% endif %}" + dest: /etc/telegraf/telegraf.conf force: yes backup: yes owner: telegraf group: telegraf mode: 0744 - with_items: - - telegraf.conf - name: render conf files become: true @@ -43,7 +46,24 @@ - infiniband.conf - influxdb.conf - net.conf - - infiniband_hw_counters.conf + - ethtool_counters.conf + - infiniband_mlx5_0_hw_counters.conf + - infiniband_mlx5_1_hw_counters.conf + - infiniband_mlx5_2_hw_counters.conf + - infiniband_mlx5_3_hw_counters.conf + - infiniband_mlx5_4_hw_counters.conf + - infiniband_mlx5_5_hw_counters.conf + - infiniband_mlx5_6_hw_counters.conf + - infiniband_mlx5_7_hw_counters.conf + - infiniband_mlx5_8_hw_counters.conf + - infiniband_mlx5_9_hw_counters.conf + - infiniband_mlx5_10_hw_counters.conf + - infiniband_mlx5_11_hw_counters.conf + - infiniband_mlx5_12_hw_counters.conf + - infiniband_mlx5_13_hw_counters.conf + - infiniband_mlx5_14_hw_counters.conf + - infiniband_mlx5_15_hw_counters.conf + - infiniband_mlx5_16_hw_counters.conf - name: restart telegraf become: true service: diff --git a/playbooks/roles/telegraf/templates/ethtool_counters.conf.j2 b/playbooks/roles/telegraf/templates/ethtool_counters.conf.j2 new file mode 100644 index 00000000..1f2a8ec4 --- /dev/null +++ b/playbooks/roles/telegraf/templates/ethtool_counters.conf.j2 @@ -0,0 +1,12 @@ +# Returns 
ethtool statistics for given interfaces + [[inputs.ethtool]] + + interface_include = ["rdma*","enp*"] + interval = "300s" + fieldpass = ["rx_steer_missed_packets","rx_vport_multicast_bytes","rx_vport_rdma_unicast_packets", + "rx_vport_rdma_unicast_bytes","tx_vport_rdma_unicast_packets","tx_vport_rdma_unicast_bytes","tx_packets_phy","rx_packets_phy", + "tx_bytes_phy","rx_bytes_phy", "rx_multicast_phy","rx_65_to_127_bytes_phy","rx_2048_to_4095_bytes_phy", "rx_4096_to_8191_bytes_phy", + "rx_crc_errors_phy","rx_symbol_err_phy", "rx_discards_phy","tx_discards_phy","tx_errors_phy","rx_64_bytes_phy","link_down_events_phy", + "rx_out_of_buffer","module_bus_stuck","module_high_temp","rx_buffer_passed_thres_phy","tx_pause_storm_warning_events","tx_pause_storm_error_events", + "rx_pcs_symbol_err_phy","rx_pci_signal_integrity","tx_pci_signal_integrity","rx_prio0_bytes","rx_prio0_packets","tx_prio0_bytes", + "tx_prio0_packets","rx_prio0_buf_discard","rx_prio0_cong_discard","rx_prio0_marked","outbound_pci_buffer_overflow"] diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_0_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_0_hw_counters.conf.j2 new file mode 100644 index 00000000..7af06d64 --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_0_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_0_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_0" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_0/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_0/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_0/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_0/ports/1/hw_counters/local_ack_timeout_err" +
conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_0/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_0/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_0/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_0/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_0/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_0/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_10_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_10_hw_counters.conf.j2 new file mode 100644 index 00000000..ec03e76f --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_10_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_10_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_10" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_10/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_10/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_10/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_10/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_10/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_10/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_10/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = 
"mlx5_10/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_10/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_10/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_11_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_11_hw_counters.conf.j2 new file mode 100644 index 00000000..1aa74384 --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_11_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_11_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_11" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_11/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_11/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_11/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_11/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_11/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_11/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_11/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_11/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_11/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_11/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_12_hw_counters.conf.j2
b/playbooks/roles/telegraf/templates/infiniband_mlx5_12_hw_counters.conf.j2 new file mode 100644 index 00000000..69400e45 --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_12_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_12_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_12" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_12/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_12/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_12/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_12/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_12/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_12/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_12/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_12/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_12/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_12/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_13_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_13_hw_counters.conf.j2 new file mode 100644 index 00000000..c83f844b --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_13_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_13_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + 
[[inputs.multifile.tags]] + device="mlx5_13" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_13/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_13/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_13/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_13/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_13/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_13/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_13/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_13/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_13/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_13/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_14_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_14_hw_counters.conf.j2 new file mode 100644 index 00000000..c6396f7c --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_14_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_14_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_14" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_14/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_14/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_14/ports/1/hw_counters/packet_seq_err" + 
conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_14/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_14/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_14/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_14/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_14/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_14/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_14/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_15_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_15_hw_counters.conf.j2 new file mode 100644 index 00000000..3a23905d --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_15_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_15_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_15" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_15/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_15/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_15/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_15/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_15/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_15/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = 
"mlx5_15/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_15/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_15/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_15/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_16_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_16_hw_counters.conf.j2 new file mode 100644 index 00000000..4f6d4ce0 --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_16_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_16_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_16" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_16/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_16/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_16/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_16/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_16/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_16/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_16/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_16/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_16/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_16/ports/1/hw_counters/roce_slow_restart" + conversion = "int" 
diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_1_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_1_hw_counters.conf.j2 new file mode 100644 index 00000000..e7a5a263 --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_1_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_1_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_1" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_1/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_1/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_1/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_1/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_1/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_1/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_1/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_1/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_1/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_1/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_2_hw_counters.conf.j2 similarity index 99% rename from playbooks/roles/telegraf/templates/infiniband_hw_counters.conf.j2 rename to playbooks/roles/telegraf/templates/infiniband_mlx5_2_hw_counters.conf.j2 index 42b87363..5fdda547 100644 --- 
a/playbooks/roles/telegraf/templates/infiniband_hw_counters.conf.j2 +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_2_hw_counters.conf.j2 @@ -3,7 +3,7 @@ [[inputs.multifile]] name_override = "infiniband_mlx5_2_hw_counters" base_dir = "/sys/class/infiniband" - interval = "60s" + interval = "300s" [[inputs.multifile.tags]] device="mlx5_2" @@ -110,4 +110,5 @@ file = "mlx5_2/ports/1/hw_counters/rx_write_requests" conversion = "int" - + + diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_3_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_3_hw_counters.conf.j2 new file mode 100644 index 00000000..3ec24947 --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_3_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_3_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_3" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_3/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_3/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_3/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_3/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_3/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_3/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_3/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_3/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_3/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = 
"mlx5_3/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_4_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_4_hw_counters.conf.j2 new file mode 100644 index 00000000..265a6828 --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_4_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_4_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_4" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_4/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_4/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_4/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_4/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_4/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_4/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_4/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_4/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_4/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_4/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_5_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_5_hw_counters.conf.j2 new file mode 100644 index 00000000..46c3bc42 --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_5_hw_counters.conf.j2 @@ -0,0 +1,49 @@ 
+[[inputs.multifile]] + name_override = "infiniband_mlx5_5_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_5" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_5/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_5/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_5/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_5/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_5/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_5/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_5/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_5/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_5/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_5/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_6_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_6_hw_counters.conf.j2 new file mode 100644 index 00000000..d0e0e6c9 --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_6_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_6_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_6" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_6/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = 
"mlx5_6/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_6/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_6/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_6/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_6/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_6/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_6/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_6/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_6/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_7_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_7_hw_counters.conf.j2 new file mode 100644 index 00000000..3451ad57 --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_7_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_7_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_7" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_7/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_7/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_7/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_7/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_7/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + 
[[inputs.multifile.file]] + file = "mlx5_7/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_7/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_7/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_7/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_7/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_8_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_8_hw_counters.conf.j2 new file mode 100644 index 00000000..c6d9eba5 --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_8_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_8_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_8" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_8/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_8/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_8/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_8/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_8/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_8/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_8/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_8/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_8/ports/1/hw_counters/rx_icrc_encapsulated" + 
conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_8/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/roles/telegraf/templates/infiniband_mlx5_9_hw_counters.conf.j2 b/playbooks/roles/telegraf/templates/infiniband_mlx5_9_hw_counters.conf.j2 new file mode 100644 index 00000000..d666c4e0 --- /dev/null +++ b/playbooks/roles/telegraf/templates/infiniband_mlx5_9_hw_counters.conf.j2 @@ -0,0 +1,49 @@ +[[inputs.multifile]] + name_override = "infiniband_mlx5_9_hw_counters" + base_dir = "/sys/class/infiniband" + interval = "300s" + + [[inputs.multifile.tags]] + device="mlx5_9" + port="1" + type="hw_counters" + + [[inputs.multifile.file]] + file = "mlx5_9/ports/1/hw_counters/np_ecn_marked_roce_packets" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_9/ports/1/hw_counters/out_of_sequence" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_9/ports/1/hw_counters/packet_seq_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_9/ports/1/hw_counters/local_ack_timeout_err" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_9/ports/1/hw_counters/roce_adp_retrans" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_9/ports/1/hw_counters/np_cnp_sent" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_9/ports/1/hw_counters/rp_cnp_handled" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_9/ports/1/hw_counters/rp_cnp_ignored" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_9/ports/1/hw_counters/rx_icrc_encapsulated" + conversion = "int" + + [[inputs.multifile.file]] + file = "mlx5_9/ports/1/hw_counters/roce_slow_restart" + conversion = "int" diff --git a/playbooks/site.yml b/playbooks/site.yml index 7f3dc2c5..0ee6eb8f 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -6,8 +6,10 @@ vars_files: - "/opt/oci-hpc/conf/queues.conf" gather_facts: true - roles: - - hostname + tasks: + - include_role: + name: 
hostname + when: slurm | default(false) | bool # for ubuntu, on all compute nodes, run --fix-broken install - hosts: compute, login @@ -138,7 +140,7 @@ local_path: "/home" export_host: "{{ hostvars[groups['bastion'][0]]['ansible_default_ipv4']['address'] }}" export_path: "/home" - options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,cto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" + options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,nocto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" lock: "all" when: home_nfs|bool and (not home_fss|bool) - include_role: @@ -205,7 +207,7 @@ vars: local_path: "{{ cluster_nfs_path }}" export_host: "{{ hostvars[groups['bastion'][0]]['ansible_default_ipv4']['address'] }}" - options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,cto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" + options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,nocto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" export_path: "/export/cluster" lock: "all" when: cluster_nfs|default(true)|bool @@ -215,7 +217,7 @@ local_path: "{{ scratch_nfs_path }}" export_host: "{{ hostvars[groups['nfs'][0]]['ansible_default_ipv4']['address'] }}" export_path: "/mnt/localdisk/nfs" - options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,cto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" + options: "defaults,noatime,bg,timeo=100,ac,actimeo=120,nocto,rsize=1048576,wsize=1048576,nolock,local_lock={{ lock }},mountproto=tcp,sec=sys,_netdev" lock: "none" when: scratch_nfs|default(true)|bool and ( (autoscaling|default(true)|bool and 'compute' in group_names) or not autoscaling|default(true)|bool ) - include_role: diff --git a/queues.conf b/queues.conf index 5b2bc355..2c43b0a2 100644 --- a/queues.conf +++ b/queues.conf @@ -9,6 +9,7 @@ instance_keyword: hpc permanent: false 
cluster_network: ${cluster_network} + compute_cluster: ${compute_cluster} max_number_nodes: 1000 max_cluster_size: 50 max_cluster_count: 1000 @@ -32,6 +33,7 @@ instance_keyword: permanent permanent: true cluster_network: ${cluster_network} + compute_cluster: ${compute_cluster} max_number_nodes: 1000 max_cluster_size: 50 max_cluster_count: 1000 diff --git a/samples/gpu/nccl_run_allreduce.sh b/samples/gpu/nccl_run_allreduce.sh index 7548ede3..a24d8dde 100644 --- a/samples/gpu/nccl_run_allreduce.sh +++ b/samples/gpu/nccl_run_allreduce.sh @@ -14,6 +14,7 @@ else fi ORDEREDMACHINEFILE="ordered_hostfile_system_name" +ORDEREDRANKMACHINEFILE="rankfile_system_name" echo INPUTFILE cat $hostfile @@ -26,9 +27,12 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then fi hostfile=$ORDEREDMACHINEFILE +rankfile=$ORDEREDRANKMACHINEFILE echo ORDEREDMACHINEFILE cat $ORDEREDMACHINEFILE +echo ORDEREDRANKMACHINEFILE +cat $ORDEREDRANKMACHINEFILE # The number of GPUs to use for the test. Has to be multiplier of 8. If not passed, all GPUs will be used. 
if [ -n "$3" ]; then @@ -46,7 +50,7 @@ do echo $x >> $logfile date >> $logfile - hostfile=$hostfile; np=$np ; iter=20; + rankfile=$rankfile; np=$np ; iter=20; mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` source $mpivars_path @@ -80,7 +84,7 @@ fi -x NCCL_IB_GID_INDEX=3 \ -x NCCL_ALGO=Ring \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ - --np $np --hostfile $hostfile -N 8 /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n $iter >> $logfile + --np $np --rankfile $rankfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n $iter >> $logfile tail -n 32 $logfile diff --git a/samples/gpu/qfabv1_nccl_run_allreduce.sh b/samples/gpu/qfabv1_nccl_run_allreduce.sh index 82062b34..1e00f3c6 100644 --- a/samples/gpu/qfabv1_nccl_run_allreduce.sh +++ b/samples/gpu/qfabv1_nccl_run_allreduce.sh @@ -14,6 +14,7 @@ else fi ORDEREDMACHINEFILE="ordered_hostfile_system_name" +ORDEREDRANKMACHINEFILE="rankfile_system_name" echo INPUTFILE cat $hostfile @@ -25,10 +26,12 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then fi hostfile=$ORDEREDMACHINEFILE +rankfile=$ORDEREDRANKMACHINEFILE echo ORDEREDMACHINEFILE cat $ORDEREDMACHINEFILE - +echo ORDEREDRANKMACHINEFILE +cat $ORDEREDRANKMACHINEFILE # The number of GPUs to use for the test. Has to be multiplier of 8. If not passed, all GPUs will be used. 
if [ -n "$3" ]; then @@ -46,7 +49,7 @@ do echo $x >> $logfile date >> $logfile - hostfile=$hostfile; np=$np ; iter=20; + rankfile=$rankfile; np=$np ; iter=20; mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` source $mpivars_path @@ -82,7 +85,7 @@ fi -x NCCL_IB_GID_INDEX=3 \ -x NCCL_ALGO=Ring \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ - --np $np --hostfile $hostfile -N 8 /opt/oci-hpc/nccl-test/build/all_reduce_perf -b8 -e 4G -f 2 -n $iter >> $logfile + --np $np --rankfile $rankfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b8 -e 4G -f 2 -n $iter >> $logfile tail -n 32 $logfile diff --git a/schema.yaml b/schema.yaml index c128bfcc..42a9890d 100755 --- a/schema.yaml +++ b/schema.yaml @@ -34,6 +34,8 @@ variableGroups: - ${bastion_custom_memory} - ${bastion_memory} - ${bastion_boot_volume_size} + - ${bastion_boot_volume_backup} + - ${bastion_object_storage_par} - title: "Compute node options" variables: - ${use_multiple_ads} @@ -41,6 +43,10 @@ variableGroups: - ${secondary_ad} - ${third_ad} - ${cluster_network} + - ${compute_cluster} + - ${compute_cluster_exists} + - ${compute_cluster_id} + - ${compute_cluster_start_index} - ${cluster_network_shape} - ${instance_pool_shape} - ${instance_pool_ocpus} @@ -184,6 +190,10 @@ variableGroups: - ${ssh_cidr} - ${marketplace_source_images} - ${marketplace_version_id} + - ${bastion_boot_volume_backup_period} + - ${bastion_boot_volume_backup_retention_seconds} + - ${bastion_boot_volume_backup_time_zone} + - ${bastion_boot_volume_backup_type} visible: false - title: "Debug" variables: @@ -357,7 +367,12 @@ variables: - and: - ${bastion_custom_memory} required: true - + bastion_object_storage_par: + title: Create Object Storage PAR + description: "Create a PAR (i.e. Pre-Authenticated Request), so that user could use that PAR to upload monitoring metrics to + to Object Storage and share the URL with OCI service teams." 
+ type: boolean + default: true use_standard_image: type: boolean title: "use standard bastion image" @@ -372,8 +387,13 @@ variables: type: boolean default: false visible: - not: - - ${use_standard_image} + or: + - not: + - ${use_standard_image} + - not: + - eq: + - ${compute_username} + - "opc" bastion_username: title: "Default username for bastion" @@ -382,8 +402,13 @@ variables: default: "opc" required: true visible: - not: - - ${use_standard_image} + or: + - not: + - ${use_standard_image} + - not: + - eq: + - ${compute_username} + - "opc" unsupported_bastion_image: title: "Image OCID" @@ -392,9 +417,13 @@ variables: required: true visible: and: - - ${unsupported_bastion} - - not: + - or: + - not: - ${use_standard_image} + - not: + - eq: + - ${compute_username} + - "opc" default: "image.ocid" bastion_image_compartment: @@ -403,24 +432,34 @@ variables: default: ${targetCompartment} visible: and: + - or: - not: - - ${use_standard_image} - - not: - - ${unsupported_bastion} + - ${use_standard_image} + - not: + - eq: + - ${compute_username} + - "opc" + - not: + - ${unsupported_bastion} required: true custom_bastion_image: title: "Bastion Image ID" - description: "Custom image ID for bastion nodes. Please note that only Oracle Linux 7 and Ubuntu 20.04 are supported as bastion image at this moment. " + description: "Custom image ID for bastion nodes. Please note that only Oracle Linux 7, 8 and Ubuntu 20.04 are supported as bastion image at this moment. 
" type: oci:core:image:id dependsOn: compartmentId: ${bastion_image_compartment} visible: and: + - or: - not: - - ${use_standard_image} - - not: - - ${unsupported_bastion} + - ${use_standard_image} + - not: + - eq: + - ${compute_username} + - "opc" + - not: + - ${unsupported_bastion} required: true bastion_boot_volume_size: @@ -429,6 +468,13 @@ variables: minimum: 50 title: "Size of the boot volume in GB" default: 50 + + bastion_boot_volume_backup: + type: boolean + title: "Enable boot volume backup" + description: "Schedule: Daily, Type: Incremental, Start Time: 00:00 Regional Time, Retention: 90 days." + default: true + bastion_block: type: boolean title: Additional block volume for shared space @@ -496,12 +542,41 @@ variables: - ${use_advanced} - ${use_cluster_nfs} reguired: true + cluster_network: title: Use cluster network type: boolean description: Use ROCEv2 cluster network default: true + compute_cluster: + title: Use compute cluster rather than cluster network + type: boolean + description: Use compute cluster instead of cluster network + default: false + visible: ${cluster_network} + + compute_cluster_exists: + title: Use existing Compute Cluster + type: boolean + description: The compute cluster already exists + default: false + visible: ${compute_cluster} + + compute_cluster_id: + title: Compute Cluster ID + type: string + description: Specify the compute cluster OCID + default: "" + visible: ${compute_cluster_exists} + + compute_cluster_start_index: + title: Start Index + type: integer + description: Specify the start Index of the nodes in the compute cluster + default: 0 + visible: ${compute_cluster_exists} + cluster_network_shape: type: enum enum: @@ -510,7 +585,7 @@ variables: - "BM.GPU.B4.8" - "BM.GPU.A100-v2.8" - "BM.Optimized3.36" - - "BM.HPC.E5.128" + - "BM.HPC.E5.144" default: "BM.HPC2.36" title: "Shape of the Compute Nodes" description: "Shape of compute nodes used in permanent/initial cluster" @@ -1044,7 +1119,7 @@ variables: type: boolean 
title: "Enable Limits for Slurm jobs" default: false - description: "Enable Limits for the Slurm cluster When enabled, users will not be able to submit jobs of the right limits are not set" + description: "Enable Limits for the Slurm cluster When enabled, users will not be able to submit jobs if the right limits are not set" visible: ${slurm} monitoring: @@ -1560,4 +1635,4 @@ variables: - ${use_marketplace_image_login} - ${use_old_marketplace_image_login} - not: - - ${use_standard_image_login} \ No newline at end of file + - ${use_standard_image_login} diff --git a/scripts/check_firmware_version.sh b/scripts/check_firmware_version.sh new file mode 100644 index 00000000..56de8ddc --- /dev/null +++ b/scripts/check_firmware_version.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# check_firmware_version.sh + +# Script to check the firmware version on the nodes. +# Needs one argument which is a hostfile (one host per line). +# Example: ./check_firmware_version.sh hosts + +# check if host file is passed +if [ -n "$1" ]; then + HOST_FILE=$1 +else + echo "scriptname " + echo "host file is missing, pass a file with list of hostname, one host per line" + exit 1; +fi + +# check if ubuntu or oracle +source /etc/os-release + +if [ $ID == "ol" ] ; then + echo "oracle" + USERNAME=opc +fi + +if [ $ID == "ubuntu" ] ; then + echo "ubuntu" + USERNAME=ubuntu +fi + +for h in `less $HOST_FILE` ; + do + echo $h + ssh $USERNAME@$h "/usr/sbin/ibstat | grep 'Firmware version'" + done \ No newline at end of file diff --git a/scripts/collect_logs.py b/scripts/collect_logs.py new file mode 100644 index 00000000..05506261 --- /dev/null +++ b/scripts/collect_logs.py @@ -0,0 +1,241 @@ +import os +from datetime import datetime +import argparse +import shlex +import subprocess +import sys +import requests + +def getDateTime(): + # datetime object containing current date and time + now = datetime.now() + dt_string = now.strftime("%m%d%Y%H%M%S") + return dt_string + + +# create directory to hold results +def 
createDir(hostname): + # Parent Directory path + username = os.getlogin() + parent_dir = "/home/" + username + "/" + # directory name + directory = str(hostname) + "_" + getDateTime() + # Path + path = os.path.join(parent_dir, directory) + try: + os.mkdir(path) + except OSError as error: + print(error) + sys.exit(-1) + return path + + +def run_cmd(cmd=None): + """ Run command on shell""" + cmd_split = shlex.split(cmd) + try: + results = subprocess.run(cmd_split, shell=False, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, check=True, encoding='utf8') + output = results.stdout.splitlines() + except subprocess.CalledProcessError as e_process_error: + return (9000, f"Error code: {e_process_error.returncode} Output: {e_process_error.output}") + return output + +def run_sosreport(cmd, host, path): + raw_result = run_cmd(cmd) + if isinstance(raw_result, tuple): + if raw_result[0] == 9000: + print("Error in running sosreport for " + host) + print(raw_result[1]) + return False + else: + filename = [match for match in raw_result if ".tar.xz" in match] + sosfile = filename[0].strip() + sosrepfile = sosfile.rsplit('/', 1)[1] + cmd = f'ssh {host} "sudo mv /tmp/{sosrepfile} {path}"' + run_cmd(cmd) + changeOwner(path) + sosrepfile_sha256 = sosrepfile + ".sha256" + cmd = f'ssh {host} "sudo mv /tmp/{sosrepfile_sha256} {path}"' + run_cmd(cmd) + changeOwner(path) + return True + +# run nvidia bug report +def nvidiaBugReport(host, path): + cmd = f'ssh {host} "cd {path}; sudo /usr/bin/nvidia-bug-report.sh"' + raw_result = run_cmd(cmd) + if isinstance(raw_result, tuple): + if raw_result[0] == 9000: + print("Error in running nvidia bug report script for " + host) + print(raw_result[1]) + return False + else: + username = os.getlogin() + cmd = f'mv /home/{username}/nvidia-bug-report.log.gz {path}' + run_cmd(cmd) + changeOwner(path) + return True + + +# run sosreport +def sosReport(host, path): + os_version = getOS(host) + if os_version == "error": + return False + result = 
getSosReport(host, os_version) + if result == "error": + return False + install_cmd = "" + if result != "installed" and os_version == "Oracle": + install_cmd = f'ssh {host} "sudo yum install -y sos"' + if result != "installed" and os_version == "Ubuntu": + install_cmd = f'ssh {host} "sudo apt install -y sosreport"' + if os_version == "Oracle": + cmd = f'ssh {host} "sudo sosreport --batch -q -k rpm.rpmva=off --tmp-dir /tmp/"' + if os_version == "Ubuntu": + cmd = f'ssh {host} "sudo sos report --batch -q -k rpm.rpmva=off --tmp-dir /tmp/"' + if install_cmd != "": + install_sos = run_cmd(install_cmd) + if isinstance(install_sos, tuple): + if install_sos[0] == 9000: + print("Error in running sosreport for " + host) + print(install_sos[1]) + return False + return run_sosreport(cmd, host, path) + else: + return run_sosreport(cmd, host, path) + +# get console history logs +def consoleHistoryLogs(host, path, compartment): + if compartment is None: + res = requests.get('http://169.254.169.254/opc/v1/instance') + compartment_id = res.json()['compartmentId'] + else: + compartment_id = compartment + out = subprocess.Popen(["cat /etc/hosts | grep "+host+" | awk '{print $4}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + name = stdout.split() + instance_display_name = name[0] + cmd = f'oci compute instance list --compartment-id {compartment_id} --display-name {instance_display_name} --auth instance_principal --query data[0].id' + instance_id = run_cmd(cmd) + if isinstance(instance_id, tuple): + if instance_id[0] == 9000: + print("Error in getting instance OCID." 
+ instance_id[1]) + return False + instance_id_str = instance_id[0] + cmd = f'oci compute console-history capture --instance-id {instance_id_str} --auth instance_principal --query data.id' + instance_console_history = run_cmd(cmd) + if isinstance(instance_console_history, tuple): + if instance_console_history[0] == 9000: + print("Error in getting instance OCID." + instance_console_history[1]) + return False + instance_console_history_str = instance_console_history[0] + filename = "/" + host + "_" + "console_history" + full_path = path + filename + cmd = f'oci compute console-history get-content --file {full_path} --instance-console-history-id {instance_console_history_str} --length 100000 --auth instance_principal' + raw_result = run_cmd(cmd) + if isinstance(raw_result, tuple): + if raw_result[0] == 9000: + print("Error in getting console history log for " + host) + print(raw_result[1]) + return False + else: + return True + + +# change ownership of all files to user so that the files can be copied +def changeOwner(path): + username = os.getlogin() + cmd = f'sudo chown -R {username}:{username} {path}' + run_cmd(cmd) + + +def isNodeSshable(host): + cmd = f'ssh -o ConnectTimeout=10 {host} "cat /etc/os-release | grep PRETTY_NAME"' + raw_result = run_cmd(cmd) + if isinstance(raw_result, tuple): + if raw_result[0] == 9000: + return False + elif 'PRETTY_NAME' in raw_result[0]: + return True + else: + return False + +def getOS(host): + cmd = f'ssh -o ConnectTimeout=10 {host} "cat /etc/os-release | grep PRETTY_NAME"' + raw_result = run_cmd(cmd) + if isinstance(raw_result, tuple): + print("Error in determining OS") + if raw_result[0] == 9000: + return "error" + elif 'Oracle' in raw_result[0]: + return "Oracle" + elif 'Ubuntu' in raw_result[0]: + return "Ubuntu" + else: + return "error" + +def getSosReport(host, os_version): + if os_version == "Oracle": + cmd = f'ssh -o ConnectTimeout=10 {host} "sudo yum list installed | grep sos"' + elif os_version == "Ubuntu": + cmd = 
f'ssh -o ConnectTimeout=10 {host} "sudo apt list --installed | grep sosreport"' + else: + return "error" + raw_result = run_cmd(cmd) + if isinstance(raw_result, tuple): + if raw_result[0] == 9000: + return "notinstalled" + if os_version == "Oracle": + if 'sos' in raw_result[0]: + return "installed" + elif os_version == "Ubuntu": + if 'sosreport' in raw_result[3]: + return "installed" + else: + print("Cannot determine if sosreport is installed or not") + return "error" + + +parser = argparse.ArgumentParser(description = 'Get nvidia bug report, sosreport, console history logs for a particular host if it is reachable. If it is not reachable, then console history logs are generated.') +parser.add_argument('--hostname', help = "Provide a hostname", required = True) +parser.add_argument('--compartment-id', help = "Provide the compartment OCID where the given host is") +args = parser.parse_args() +hostname = args.hostname +compartment = args.compartment_id +if hostname is None: + sys.exit(-1) +else: + path = createDir(hostname) + changeOwner(path) + node_reachable = isNodeSshable(hostname) + if node_reachable: + bug_report = False + sos_report = False + console_logs = False + if nvidiaBugReport(hostname, path): + bug_report = True + if sosReport(hostname, path): + sos_report = True + if consoleHistoryLogs(hostname, path, compartment): + console_logs = True + if bug_report and sos_report and console_logs: + print("The nvidia bug report, sosreport, and console history logs for " + hostname + " are at " + path) + elif bug_report and not sos_report and not console_logs: + print("The nvidia bug report for " + hostname + " is at " + path) + elif bug_report and sos_report and not console_logs: + print("The nvidia bug report and sosreport for " + hostname + " are at " + path) + elif bug_report and not sos_report and console_logs: + print("The nvidia bug report and console history logs for " + hostname + " are at " + path) + elif not bug_report and sos_report and not console_logs: 
+ print("The sosreport for " + hostname + " is at " + path) + elif not bug_report and not sos_report and console_logs: + print("The console history logs for " + hostname + " is at " + path) + elif not bug_report and sos_report and console_logs: + print("The sosreport and console history logs for " + hostname + " are at " + path) + else: + sys.exit(-1) + else: + if consoleHistoryLogs(hostname, path, compartment): + print(hostname + " is not reachable. The console history logs are at " + path) diff --git a/bin/gpu_throttle.sh b/scripts/gpu_throttle.sh similarity index 100% rename from bin/gpu_throttle.sh rename to scripts/gpu_throttle.sh diff --git a/scripts/ib_write_bw.sh b/scripts/ib_write_bw.sh new file mode 100644 index 00000000..4af85b6e --- /dev/null +++ b/scripts/ib_write_bw.sh @@ -0,0 +1,263 @@ +#!/bin/bash +#ib_write_bw.sh +#This script can be used to check ib_write_bw between two gpu nodes in the cluster. +#Currently supported shapes are BM.GPU.B4.8,BM.GPU.A100-v2.8,BM.GPU4.8 +#If cuda is installed on the node, script execution will recompile perftest with cuda. + +dis_help() +{ + echo + echo "Usage:" + echo + echo "./ib_write_bw.sh -s -n -c -g " + echo + echo "Options:" + echo "s Server hostname" + echo "n Client hostname." + echo "c Enable cuda (Default: Disabled)" + echo "g GPU id (Default: 0)" + echo "h Print this help." + echo + echo "Logs are stored at /tmp/logs" + echo + echo "e.g., sh ./ib_write_bw.sh -s compute-permanent-node-1 -n compute-permanent-node-2 -c y -g 2 + echo + echo "Supported shapes: BM.GPU.B4.8,BM.GPU.A100-v2.8,BM.GPU4.8" + echo +} + +#Exit if no arguments passed +if [ "$#" -eq 0 ] +then + dis_help + exit 1 +fi + +#Display options +gid=0 +cuda=n +while getopts "s:n:c:g:h" option +do + case $option in + s) server=${OPTARG};; + n) client=${OPTARG};; + c) cuda=${OPTARG};; + g) gid=${OPTARG};; + h) dis_help + exit;; + \?) 
# Invalid option + echo "Error: Invalid option" + exit;; + esac +done + +#Set variables +cuda_path=`ssh $server /usr/sbin/alternatives --list|grep cuda | awk -F" " '{print $3}'|tail -1`/targets/x86_64-linux/include/cuda.h +server_ip=`grep $server /etc/hosts |grep -v rdma|awk '{print $1}'` +logdir=/tmp/logs/ib_bw/`date +%F-%H` +outdir=/tmp/ib_bw/ +gpu_count=`ssh $server nvidia-smi -L |wc -l` + + +#Check node shape +shape=`ssh $server 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape'` +if [ "$shape" == \"BM.GPU.B4.8\" ] || [ "$shape" == \"BM.GPU.A100-v2.8\" ] || [ "$shape" == \"BM.GPU4.8\" ]; +then +echo +echo "Shape: $shape" +echo "Server: $server" +echo "Client: $client" +echo "Cuda: $cuda" +echo "GPU id: $gid" +else + echo + echo "Shape $shape is not supported by this script" + dis_help +exit +fi + +#check cuda installation +ssh -q $server [[ -f $cuda_path ]] && echo " " || echo "Please check cuda installation; exit 1"; + +#Set interface to be skipped based on node shape +if [ "$shape" == \"BM.GPU.B4.8\" ] || [ "$shape" == \"BM.GPU.A100-v2.8\" ] +then +skip_if=mlx5_0 + elif [ "$shape" == \"BM.GPU4.8\" ] + then + skip_if=mlx5_4 +fi + +#Validate GPU ID +if [ "$gid" -gt "$gpu_count" ] +then +echo +echo "GPU id value should be less than or equal to the total number of GPUs installed. That is $gpu_count" +exit 1 +fi + +#Check active interfaces +echo +printf "Checking interfaces...\n" +srv_if_count=`ssh $server ibv_devinfo |egrep "hca_id|state"|tac|sed '/PORT_DOWN/I,+1d'|tac|sed -e '/PORT_ACTIVE/d'|awk -F: '{print $2}'|sed 's/[[:space:]]//g'|sort -t _ -k2.2|grep -v $skip_if|wc -l` +client_if_count=`ssh $client ibv_devinfo |egrep "hca_id|state"|tac|sed '/PORT_DOWN/I,+1d'|tac|sed -e '/PORT_ACTIVE/d'|awk -F: '{print $2}'|sed 's/[[:space:]]//g'|sort -t _ -k2.2|grep -v $skip_if|wc -l` + +if [ "$srv_if_count" != "$client_if_count" ] +then + echo + echo "Active interfaces are different on both nodes. 
Please fix it before running this script" + echo "Interface count on server: $srv_if_count" + echo "Interface count on client: $client_if_count" + exit 1 +fi + +#Generate ansible playbook +if [ "$cuda" == "y" ] || [ "$cuda" == "yes" ]; +then +cat > /tmp/ib_bw_gpu.yml << EOF +--- +- hosts: all + become: true + tasks: + - name: check cuda + stat: + path: $cuda_path + register: cuda_data + + - block: + - name: yum remove perftest + yum: + name: perftest + state: absent + + - name: Git checkout perftest + ansible.builtin.git: + repo: 'https://github.com/linux-rdma/perftest.git' + dest: /tmp/perftest + + - name: Run autogen.sh + ansible.builtin.shell: /tmp/perftest/autogen.sh + args: + chdir: /tmp/perftest + + - name: Run configure + ansible.builtin.shell: ./configure CUDA_H_PATH=$cuda_path + args: + chdir: /tmp/perftest + + - name: Build 'all' target with extra arguments + make: + chdir: /tmp/perftest + target: all + + - name: Copy files + shell: cp /tmp/perftest/ib_* /usr/bin + when: + - use_cuda is defined + - use_cuda == "yes" or use_cuda == "y" + - cuda_data.stat.exists +EOF + +#Create ansible inventory +cat > /tmp/inventory << EOF +$server +$client +EOF +ansible-playbook /tmp/ib_bw_gpu.yml -i /tmp/inventory -e "use_cuda=$cuda" +fi + +#Set interface to be skipped based on node shape +shape=`ssh $server 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape'` +if [ "$shape" == \"BM.GPU.B4.8\" ] || [ "$shape" == \"BM.GPU.A100-v2.8\" ] +then +skip_if=mlx5_0 + elif [ "$shape" == \"BM.GPU4.8\" ] + then + skip_if=mlx5_4 +fi + +#Check active interfaces +printf "Testing active interfaces...\n" +echo +ssh $server ibv_devinfo |egrep "hca_id|state"|tac|sed '/PORT_DOWN/I,+1d'|tac|sed -e '/PORT_ACTIVE/d'|awk -F: '{print $2}'|sed 's/[[:space:]]//g'|sort -t _ -k2.2|grep -v $skip_if + +#Generate server script +cat > /tmp/ib_server.sh << 'EOF' +#! 
/bin/bash + +out_dir=/tmp/ib_bw +mkdir -p $out_dir +shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` +if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] +then +skip_if=mlx5_0 + elif [ $shape == \"BM.GPU4.8\" ] + then + skip_if=mlx5_4 +fi +for interface in `ibv_devinfo |egrep "hca_id|state"|tac|sed '/PORT_DOWN/I,+1d'|tac|sed -e '/PORT_ACTIVE/d'|awk -F: '{print $2}'|sed 's/[[:space:]]//g'|sort -t _ -k2.2|grep -v $skip_if` +do +echo +echo "Server Interface: $interface" +echo +ib_write_bw -d $interface -a -F &> $out_dir/ib_server-$interface +sleep 10 +done +EOF + +#Generate client script +cat > /tmp/ib_client.sh << 'EOF' +#! /bin/bash + +out_dir=/tmp/ib_bw +mkdir -p $out_dir +#interfaces +shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` +if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] +then +skip_if=mlx5_0 + elif [ $shape == \"BM.GPU4.8\" ] + then + skip_if=mlx5_4 +fi +for interface in `ibv_devinfo |egrep "hca_id|state"|tac|sed '/PORT_DOWN/I,+1d'|tac|sed -e '/PORT_ACTIVE/d'|awk -F: '{print $2}'|sed 's/[[:space:]]//g'|sort -t _ -k2.2|grep -v $skip_if` +do +ib_write_bw -d $interface -F $server_ip -D 10 --cpu_util --report_gbits &> $out_dir/ib_client-$interface +cat $out_dir/ib_client-$interface +sleep 15 +done +EOF + +#Update server ip in ib_client.sh +sed -i "/#interfaces/a server_ip=$server_ip" /tmp/ib_client.sh +chmod +x /tmp/ib_server.sh /tmp/ib_client.sh + +#Update scripts to use cuda if selected +if [ "$cuda" == "yes" ] || [ "$cuda" == "y" ]; +then + sed -i 's/ib_write_bw.*/ib_write_bw -d $interface --use_cuda=0 -F > $out_dir\/ib_server-$interface/g' /tmp/ib_server.sh + sed -i 's/ib_write_bw.*/ib_write_bw -d $interface --use_cuda=0 -D 10 -I 0 $server_ip --cpu_util --report_gbits/g' /tmp/ib_client.sh + sed -i -e "s/--use_cuda=0/--use_cuda=${gid:=0}/g" /tmp/ib_server.sh + sed -i -e 
"s/--use_cuda=0/--use_cuda=${gid:=0}/g" /tmp/ib_client.sh +fi +echo + +#Copy and run scripts +scp /tmp/ib_server.sh $server:/tmp +scp /tmp/ib_client.sh $client:/tmp +ssh $server "/tmp/ib_server.sh" & +ssh $client "/tmp/ib_client.sh" + +#Sync results to bastion +mkdir -p $logdir +rsync -a opc@$client:$outdir $logdir + +#Generate test summary +echo +echo "************** Test Summary **************" +for i in `ls -ltr $logdir | awk -F" " '{print $9}'|awk -F- '{print $2}'`; do +echo +echo Server interface: $i | tee -a /tmp/ib_write_bw_log.txt +echo +grep -A2 MsgRate $logdir/ib_client-$i | tee -a /tmp/ib_write_bw_log.txt +done diff --git a/bin/max_nodes_partition.py b/scripts/max_nodes_partition.py similarity index 100% rename from bin/max_nodes_partition.py rename to scripts/max_nodes_partition.py diff --git a/scripts/mlx_firmware_update.sh b/scripts/mlx_firmware_update.sh new file mode 100644 index 00000000..3c8fb2eb --- /dev/null +++ b/scripts/mlx_firmware_update.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# mlx_firmware_update.sh + +# This script updates the roce_tx_window_en setting and oci-cn-auth package. +# It needs mandatory one argument which is a hostfile (one host per line). +# After updating, it also returns the roce_tx_window_en setting and oci-cn-auth version. +# If you specify the optional 2nd argument "check", then it will not update but only return the current roce_tx_window_en setting +# and oci-cn-auth version. 
+# Example: +# ./mlx_firmware_update.sh hosts +# ./mlx_firmware_update.sh hosts check + +MODE=update + +# check if host file is passed +if [ -n "$1" ]; then + HOST_FILE=$1 +else + echo "scriptname " + echo "host file is missing, pass a file with list of hostname, one host per line" + exit 1; +fi + +# optional parameter to check the changes +if [ -n "$2" ]; then + if [ "$2" == "check" ]; then + MODE="check" + fi +fi + + + +# check if ubuntu or oracle +source /etc/os-release + +if [ $ID == "ol" ] ; then + echo "oracle" + USERNAME=opc +fi + +if [ $ID == "ubuntu" ] ; then + echo "ubuntu" + USERNAME=ubuntu +fi + + +function check_roce_tx_window_en { + cat > ./check_roce_tx_window_en.sh << EOF +#!/bin/bash +# check roce_tx_window_en setting +# +# +mlxreg=\$(which mlxreg) +shape=\$(curl -q -s 169.254.169.254/opc/v1/instance/shape) +for pci_id in \$(cat /opt/oci-hpc/oci-cn-auth/configs/shapes.json | jq '.["hpc-shapes"]' | jq ".[] | select(.shape==\"\$shape\") " | jq -r '.["rdma-nics"] | .[].pci') ; do +echo \$pci_id ; \$mlxreg --yes -d \$pci_id --reg_name ROCE_ACCL --get | grep roce_tx_window_en +done + +EOF + +chmod +x ./check_roce_tx_window_en.sh + +for h in `less $HOST_FILE` ; + do + echo $h + scp ./check_roce_tx_window_en.sh $USERNAME@$h:/tmp/ + done + + +for h in `less $HOST_FILE` ; + do + echo $h + ssh $USERNAME@$h "sudo /tmp/check_roce_tx_window_en.sh" + done + } + +function check_oci_cn_auth_version { + for h in `less $HOST_FILE` ; + do + echo $h + ssh $USERNAME@$h "cat /opt/oci-hpc/oci-cn-auth/.version-oci_cn_auth" + done +} + +if [ $MODE == "check" ] ; then + check_roce_tx_window_en + check_oci_cn_auth_version + +else + + +# generate ./update_roce_tx_window_en.sh file +cat > ./update_roce_tx_window_en.sh << EOF +#!/bin/bash +# Script to set roce_tx_window_en=0 +# +# +mlxreg=\$(which mlxreg) +shape=\$(curl -q -s 169.254.169.254/opc/v1/instance/shape) +for pci_id in \$(cat /opt/oci-hpc/oci-cn-auth/configs/shapes.json | jq '.["hpc-shapes"]' | jq ".[] | 
select(.shape==\"\$shape\") " | jq -r '.["rdma-nics"] | .[].pci') ; do +echo \$pci_id ; \$mlxreg --yes -d \$pci_id --reg_name ROCE_ACCL --set roce_tx_window_en=0 +done + +EOF + +chmod +x ./update_roce_tx_window_en.sh + +# generate install file +cat > ./install_oci-cn-auth-package.sh << EOF +#!/bin/bash + +#DEBIAN_FRONTEND=noninteractive + +# check if ubuntu or oracle +source /etc/os-release + +# download file +UBUNTU_PACKAGE_URL="https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/F7gihhVuJbrnsV8KjAMA7XblkZYRBYJ2xAH2FPmaIJrgtYcuy5wJRWAQXMfw9hLD/n/hpc/b/source/o/oci-cn-auth_2.1.4-compute_all.deb" +UBUNTU_PACKAGE="/tmp/oci-cn-auth_2.1.4-compute_all.deb" +ORACLE_PACKAGE_URL="https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/F7gihhVuJbrnsV8KjAMA7XblkZYRBYJ2xAH2FPmaIJrgtYcuy5wJRWAQXMfw9hLD/n/hpc/b/source/o/oci-cn-auth-2.1.4-compute.el7.noarch.rpm" +ORACLE_PACKAGE="/tmp/oci-cn-auth-2.1.4-compute.el7.noarch.rpm" + + +if [ \$ID == "ol" ] ; then + echo "oracle" + USERNAME=opc + wget -O \$ORACLE_PACKAGE \$ORACLE_PACKAGE_URL + sudo yum localinstall -y -q \$ORACLE_PACKAGE +fi + +if [ \$ID == "ubuntu" ] ; then + echo "ubuntu" + USERNAME=ubuntu + wget -O \$UBUNTU_PACKAGE \$UBUNTU_PACKAGE_URL + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -q \$UBUNTU_PACKAGE +fi + + +EOF + +chmod +x ./install_oci-cn-auth-package.sh + + +# Run for loop to copy file to all nodes and execute them +for h in `less $HOST_FILE` ; + do + echo $h + scp ./install_oci-cn-auth-package.sh $USERNAME@$h:/tmp/ + scp ./update_roce_tx_window_en.sh $USERNAME@$h:/tmp/ + done + +#exit 0 + +for h in `less $HOST_FILE` ; + do + echo $h + ssh $USERNAME@$h "sudo /tmp/update_roce_tx_window_en.sh" + ssh $USERNAME@$h "sudo /tmp/install_oci-cn-auth-package.sh" + done + +check_roce_tx_window_en +check_oci_cn_auth_version + +fi diff --git a/scripts/pcie.sh b/scripts/pcie.sh new file mode 100644 index 00000000..a57e0ee7 --- /dev/null +++ b/scripts/pcie.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +source /etc/os-release 
+if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + for dev in `/usr/sbin/lspci | grep ConnectX-5 | awk '{print $1}'` + do + echo ${dev} + sudo lspci -vvv -s ${dev} | grep LnkSta: + done +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + for dev in `/usr/bin/lspci | grep ConnectX-5 | awk '{print $1}'` + do + echo ${dev} + sudo lspci -vvv -s ${dev} | grep LnkSta: + done +fi + diff --git a/bin/validation.py b/scripts/validation.py similarity index 94% rename from bin/validation.py rename to scripts/validation.py index 8c28b2a1..d3c56174 100644 --- a/bin/validation.py +++ b/scripts/validation.py @@ -412,26 +412,14 @@ def inventoryNodes(metadata, cluster_names): def pcie_check(hostfile, path): - out = subprocess.Popen(["cat /etc/os-release | grep PRETTY_NAME="],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + out = subprocess.Popen(["sudo cp /opt/oci-hpc/scripts/pcie.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() - os_name = stdout.split("\n") - del os_name[-1] - if "Linux" in os_name[0]: - out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie_el.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) - stdout,stderr = out.communicate() - out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie_el.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) - stdout,stderr = out.communicate() - elif "Ubuntu" in os_name[0]: - out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie_ubuntu.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) - 
stdout,stderr = out.communicate() - out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie_ubuntu.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) - stdout,stderr = out.communicate() - else: - print("Cannot run pcie check as OS is not determined to be Linux or Ubuntu") def gpu_throttle(hostfile, path): - out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/gpu_throttle.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + out = subprocess.Popen(["sudo cp /opt/oci-hpc/scripts/gpu_throttle.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/gpu_throttle.sh\" ; done > "+path+"/gpu-throttle-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) stdout,stderr = out.communicate() diff --git a/slurm_ha.tf b/slurm_ha.tf index ee137421..896b5d28 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -186,7 +186,7 @@ resource "null_resource" "backup" { } resource "null_resource" "cluster_backup" { count = var.slurm_ha ? 
1 : 0 - depends_on = [null_resource.backup, oci_core_cluster_network.cluster_network, oci_core_instance.backup, oci_core_volume_attachment.backup_volume_attachment ] + depends_on = [null_resource.backup, oci_core_compute_cluster.compute_cluster, oci_core_cluster_network.cluster_network, oci_core_instance.backup, oci_core_volume_attachment.backup_volume_attachment ] triggers = { cluster_instances = join(", ", local.cluster_instances_names) } @@ -299,6 +299,7 @@ resource "null_resource" "cluster_backup" { provisioner "file" { content = templatefile("${path.module}/queues.conf", { cluster_network = var.cluster_network, + compute_cluster = var.compute_cluster, marketplace_listing = var.marketplace_listing, image = local.image_ocid, use_marketplace_image = var.use_marketplace_image, diff --git a/variables.tf b/variables.tf index 28a615a0..b49f100d 100755 --- a/variables.tf +++ b/variables.tf @@ -7,10 +7,15 @@ variable "third_ad" { default = "" } variable "use_multiple_ads" { default = false } variable "ssh_key" { } variable "cluster_network" { default = true } +variable "compute_cluster" { default = false } +variable "compute_cluster_exists" { default = false } +variable "compute_cluster_id" { default = "" } +variable "compute_cluster_start_index" { default = 0 } variable "use_custom_name" { default = false } variable "cluster_name" { default = "" } variable "bastion_ad" {} variable "bastion_shape" { default = "VM.Standard2.4" } +variable "bastion_object_storage_par" { default = true } variable "use_standard_image" { default= true } variable "use_standard_image_login" { default= true } variable "custom_bastion_image" { @@ -22,6 +27,11 @@ variable "custom_login_image" { default = "image.ocid" } variable "bastion_boot_volume_size" {} +variable "bastion_boot_volume_backup" {} +variable "bastion_boot_volume_backup_type" {default = "INCREMENTAL"} +variable "bastion_boot_volume_backup_period" {default = "ONE_DAY"} +variable "bastion_boot_volume_backup_retention_seconds" 
{default = "7776000"} +variable "bastion_boot_volume_backup_time_zone" {default = "REGIONAL_DATA_CENTER_TIME"} variable "cluster_network_shape" { default = "BM.HPC2.36" } variable "instance_pool_shape" { default = "VM.Standard2.4" } variable "node_count" { default = 2 } @@ -49,7 +59,7 @@ variable "private_subnet" { default = "172.16.4.0/22" } variable "ssh_cidr" { default = "0.0.0.0/0" } variable "slurm" { default = false } variable "slurm_ha" { default = false } -variable "login_node" { default = false } +variable "login_node" { default = true } variable "login_ad" {default = ""} variable "login_shape" { default = "VM.Standard2.4" } variable "login_boot_volume_size" {default = 50}