From 128eff9b0e219b5dc154a9fabd877012ac2cccd8 Mon Sep 17 00:00:00 2001 From: galal-hussein Date: Thu, 31 Oct 2019 23:06:54 +0200 Subject: [PATCH] Add perf test automation --- tests/perf/.gitignore | 7 + tests/perf/Makefile | 21 + tests/perf/agents/data.tf | 44 + .../agents/files/pool_worker_userdata.tmpl | 33 + tests/perf/agents/main.tf | 79 ++ tests/perf/agents/outputs.tf | 0 tests/perf/agents/variables.tf | 28 + tests/perf/agents/versions.tf | 4 + tests/perf/scripts/config | 28 + tests/perf/scripts/perf | 83 ++ tests/perf/scripts/test | 48 ++ tests/perf/server/data.tf | 52 ++ tests/perf/server/files/metrics.yaml | 227 ++++++ tests/perf/server/files/prom.yaml | 86 ++ tests/perf/server/files/server_userdata.tmpl | 55 ++ tests/perf/server/files/worker_userdata.tmpl | 29 + tests/perf/server/main.tf | 188 +++++ tests/perf/server/outputs.tf | 15 + tests/perf/server/variables.tf | 78 ++ tests/perf/server/versions.tf | 4 + .../tests/density/2000_nodes/override.yaml | 1 + .../tests/density/5000_nodes/override.yaml | 1 + .../600_nodes/high_density_override.yaml | 1 + tests/perf/tests/density/config.yaml | 248 ++++++ tests/perf/tests/density/deployment.yaml | 37 + tests/perf/tests/load/config.yaml | 765 ++++++++++++++++++ tests/perf/tests/load/configmap.yaml | 9 + .../tests/load/daemonset-priorityclass.yaml | 9 + tests/perf/tests/load/daemonset.yaml | 41 + tests/perf/tests/load/deployment.yaml | 63 ++ tests/perf/tests/load/job.yaml | 39 + tests/perf/tests/load/networkpolicy.yaml | 19 + tests/perf/tests/load/pvc.yaml | 4 + tests/perf/tests/load/secret.yaml | 7 + tests/perf/tests/load/service.yaml | 16 + tests/perf/tests/load/statefulset.yaml | 61 ++ .../perf/tests/load/statefulset_service.yaml | 10 + 37 files changed, 2440 insertions(+) create mode 100644 tests/perf/.gitignore create mode 100644 tests/perf/Makefile create mode 100644 tests/perf/agents/data.tf create mode 100644 tests/perf/agents/files/pool_worker_userdata.tmpl create mode 100644 tests/perf/agents/main.tf create mode 100644 tests/perf/agents/outputs.tf create mode 100644 tests/perf/agents/variables.tf create mode 100644 tests/perf/agents/versions.tf create mode 100755 tests/perf/scripts/config create mode 100755 tests/perf/scripts/perf create mode 100755 tests/perf/scripts/test create mode 100644 tests/perf/server/data.tf create mode 100644 tests/perf/server/files/metrics.yaml create mode 100644 tests/perf/server/files/prom.yaml create mode 100644 tests/perf/server/files/server_userdata.tmpl create mode 100644 tests/perf/server/files/worker_userdata.tmpl create mode 100644 tests/perf/server/main.tf create mode 100644 tests/perf/server/outputs.tf create mode 100644 tests/perf/server/variables.tf create mode 100644 tests/perf/server/versions.tf create mode 100644 tests/perf/tests/density/2000_nodes/override.yaml create mode 100644 tests/perf/tests/density/5000_nodes/override.yaml create mode 100644 tests/perf/tests/density/600_nodes/high_density_override.yaml create mode 100644 tests/perf/tests/density/config.yaml create mode 100644 tests/perf/tests/density/deployment.yaml create mode 100644 tests/perf/tests/load/config.yaml create mode 100644 tests/perf/tests/load/configmap.yaml create mode 100644 tests/perf/tests/load/daemonset-priorityclass.yaml create mode 100644 tests/perf/tests/load/daemonset.yaml create mode 100644 tests/perf/tests/load/deployment.yaml create mode 100644 tests/perf/tests/load/job.yaml create mode 100644 tests/perf/tests/load/networkpolicy.yaml create mode 100644 tests/perf/tests/load/pvc.yaml create mode 100644 
tests/perf/tests/load/secret.yaml create mode 100644 tests/perf/tests/load/service.yaml create mode 100644 tests/perf/tests/load/statefulset.yaml create mode 100644 tests/perf/tests/load/statefulset_service.yaml diff --git a/tests/perf/.gitignore b/tests/perf/.gitignore new file mode 100644 index 0000000000..99829f7ce8 --- /dev/null +++ b/tests/perf/.gitignore @@ -0,0 +1,7 @@ +.terraform* +*.tfstate* +*.tfvars* +*.plan* +*tests_results* +*junit.xml +*kubeconfig.yaml diff --git a/tests/perf/Makefile b/tests/perf/Makefile new file mode 100644 index 0000000000..a1d63a52ad --- /dev/null +++ b/tests/perf/Makefile @@ -0,0 +1,21 @@ +MODULE := $(shell basename $$PWD) + +.PHONY: init config apply destroy clean test + +init: + @scripts/perf init + +config: + @scripts/perf config + +apply: + @scripts/perf apply + +destroy: + @scripts/perf destroy + +clean: + @scripts/perf clean + +test: + @scripts/test test_load diff --git a/tests/perf/agents/data.tf b/tests/perf/agents/data.tf new file mode 100644 index 0000000000..bff5eb3ea3 --- /dev/null +++ b/tests/perf/agents/data.tf @@ -0,0 +1,44 @@ +data "terraform_remote_state" "server" { + backend = "local" + + config = { + path = "${path.module}/../server/server.tfstate" + } +} + +data "aws_vpc" "default" { + default = true +} + +data "aws_subnet_ids" "available" { + vpc_id = data.aws_vpc.default.id +} + +data "aws_subnet" "selected" { + id = "${tolist(data.aws_subnet_ids.available.ids)[1]}" +} + +data "aws_ami" "ubuntu" { + most_recent = true + owners = ["099720109477"] + + filter { + name = "name" + values = ["ubuntu-minimal/images/*/ubuntu-bionic-18.04-*"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } + + filter { + name = "root-device-type" + values = ["ebs"] + } + + filter { + name = "architecture" + values = ["x86_64"] + } +} diff --git a/tests/perf/agents/files/pool_worker_userdata.tmpl b/tests/perf/agents/files/pool_worker_userdata.tmpl new file mode 100644 index 0000000000..6e08a5d300 --- /dev/null +++ b/tests/perf/agents/files/pool_worker_userdata.tmpl @@ -0,0 +1,33 @@ +#cloud-config +%{ if length(extra_ssh_keys) > 0 } +ssh_authorized_keys: +%{ for ssh_key in extra_ssh_keys } +- ${ssh_key} +%{ endfor } +%{ endif } +runcmd: +- echo "net.ipv4.neigh.default.gc_interval = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_stale_time = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh3 = 16384" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh2 = 8192" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh1 = 4096" >> /etc/sysctl.conf +- echo "fs.file-max = 12000500" >> /etc/sysctl.conf +- echo "fs.nr_open = 20000500" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_mem = '10000000 10000000 10000000'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_rmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_wmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.core.rmem_max = 16384" >> /etc/sysctl.conf +- echo "net.core.wmem_max = 16384" >> /etc/sysctl.conf +- ulimit -n 20000000 +- echo "# " >> /etc/security/limits.d/limits.conf +- echo " * soft nofile 20000" >> /etc/security/limits.d/limits.conf +- echo " * hard nofile 20000" >> /etc/security/limits.d/limits.conf +- sysctl -p +- apt-get update +- apt-get install -y software-properties-common +- apt-get install -y resolvconf linux-headers-$(uname -r) && echo "nameserver 1.1.1.1" > /etc/resolvconf/resolv.conf.d/tail && systemctl start resolvconf +- DEBIAN_FRONTEND=noninteractive apt-get upgrade -y +- wget 
https://raw.githubusercontent.com/galal-hussein/k3s/k3s_with_kine_fix/k3s +- cp k3s /usr/local/bin/k3s +- chmod +x /usr/local/bin/k3s +- until (curl -sfL https://get.k3s.io | K3S_URL=https://${k3s_url}:6443 K3S_CLUSTER_SECRET="${k3s_cluster_secret}" K3S_CLUSTER_SECRET="${k3s_cluster_secret}" INSTALL_K3S_VERSION="${install_k3s_version}" sh -); do echo 'Error installing k3s agent'; sleep 1; done diff --git a/tests/perf/agents/main.tf b/tests/perf/agents/main.tf new file mode 100644 index 0000000000..975117cddd --- /dev/null +++ b/tests/perf/agents/main.tf @@ -0,0 +1,79 @@ +terraform { + backend "local" { + path = "pool.tfstate" + } +} + +locals { + name = var.name + k3s_cluster_secret = "pvc-6476dcaf-73a0-11e9-b8e5-06943b744282" +} + +provider "aws" { + region = "us-west-2" + profile = "rancher-eng" +} + +resource "aws_security_group" "k3s" { + name = "${local.name}-pool" + vpc_id = data.aws_vpc.default.id + + ingress { + from_port = 22 + to_port = 22 + protocol = "TCP" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 0 + to_port = 0 + protocol = "-1" + self = true + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +module "k3s-pool-worker-asg" { + source = "terraform-aws-modules/autoscaling/aws" + version = "3.0.0" + name = "${local.name}-pool" + asg_name = "${local.name}-pool" + instance_type = var.worker_instance_type + image_id = data.aws_ami.ubuntu.id + user_data = base64encode(templatefile("${path.module}/files/pool_worker_userdata.tmpl", { k3s_url = data.terraform_remote_state.server.outputs.public_ip, k3s_cluster_secret = local.k3s_cluster_secret, extra_ssh_keys = var.extra_ssh_keys, install_k3s_version = var.k3s_version })) + ebs_optimized = true + + desired_capacity = var.node_count + health_check_type = "EC2" + max_size = var.node_count + min_size = var.node_count + vpc_zone_identifier = [data.aws_subnet.selected.id] + spot_price = "0.680" + + security_groups = [ + aws_security_group.k3s.id, + ] + + lc_name = "${local.name}-pool" + + root_block_device = [ + { + volume_size = "100" + volume_type = "gp2" + }, + ] +} diff --git a/tests/perf/agents/outputs.tf b/tests/perf/agents/outputs.tf new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/perf/agents/variables.tf b/tests/perf/agents/variables.tf new file mode 100644 index 0000000000..37a587d413 --- /dev/null +++ b/tests/perf/agents/variables.tf @@ -0,0 +1,28 @@ +variable "node_count" { + description = "Number of nodes to run k3s agents on." 
+ type = number + # default = 10 +} + +variable "worker_instance_type" { + type = string + default = "t3.2xlarge" +} + +variable "extra_ssh_keys" { + type = list + default = [] + description = "Extra ssh keys to inject into Rancher instances" +} + +variable "k3s_version" { + default = "v0.9.1" + type = string + description = "Version of K3S to install" +} + +variable "name" { + default = "k3s-loadtest" + type = string + description = "Name to identify this cluster" +} diff --git a/tests/perf/agents/versions.tf b/tests/perf/agents/versions.tf new file mode 100644 index 0000000000..ac97c6ac8e --- /dev/null +++ b/tests/perf/agents/versions.tf @@ -0,0 +1,4 @@ + +terraform { + required_version = ">= 0.12" +} diff --git a/tests/perf/scripts/config b/tests/perf/scripts/config new file mode 100755 index 0000000000..8e5f09a3fd --- /dev/null +++ b/tests/perf/scripts/config @@ -0,0 +1,28 @@ +## MAIN VARIABLES ## +#################### +CLUSTER_NAME="hgalal-k3s" +K3S_VERSION="v0.10.0" +EXTRA_SSH_KEYS="ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDZBAE6I9J733HJfCBVu7iWSUuJ7th0U4P4IFfpFDca52n/Hk4yFFr8SPR8JJc1n42c3vEVCbExp/MD4ihqEBy9+pLewxA+fkb7UAT4cT2eLfvZdTTVe8KSiw6lVN6tWSoNXmNqY+wH7zWQ04lfjXPa/c01L1n2XwV/O+5xii9vEuSxN9YhfQ/s61SdLFqQ5yS8gPsM0qQW+bFt5KGGbapqztDO+h9lxGbZRcRAKbCzZ5kF1mhjI/+VubTWKtoVLCumjzjYqILYyx9g/mLSo26qjDEZvtwBQB9KLugDAtnalLVp0HgivC5YfLHr8PxViVSHfIIKS2DhUpn07jr8eKi9" +PRIVATE_KEY_PATH="/home/hussein/.ssh/id_rsa" #this has to be a full path + + +## K3S SERVER VARIABLES ## +########################## +K3S_HA=1 +MASTER_COUNT=3 +DB_INSTANCE_TYPE="db.m4.4xlarge" +SERVER_INSTANCE_TYPE="m5.2xlarge" +DEBUG=1 + + +## PROMETHEUS SERVER VARIABLES ## +################################# +PROM_WORKER_NODE_COUNT=1 +PROM_HOST="prometheus-load.eng.rancher.space" +GRAF_HOST="prometheus-load.eng.rancher.space" + + +## K3S AGENTS VARIABLES ## +########################## +AGENT_NODE_COUNT=100 +WORKER_INSTANCE_TYPE="m5.xlarge" diff --git a/tests/perf/scripts/perf b/tests/perf/scripts/perf new file mode 100755 index 0000000000..bf60a715e0 --- /dev/null +++ b/tests/perf/scripts/perf @@ -0,0 +1,83 @@ +#!/bin/bash -ex + +TERRAFORM_PLAN_CMD="terraform plan --var-file variables.tfvars --out k3s.plan" +TERRAFORM_APPLY_CMD="terraform apply k3s.plan" +TERRAFORM_DESTROY_CMD="terraform destroy --var-file variables.tfvars --force" + +for bin in docker kubectl terraform; do + type $bin >/dev/null 2>&1 || (echo "$bin is not in the path. Please make sure it is installed and in PATH."; exit 1) +done + +init() { + for i in server agents; do + pushd $i + terraform init + popd + done +} + +apply() { + # init terraform + config + # Run apply for server and agents + for i in server agents; do + if [ $i == "agents" ]; then + echo "Sleeping 1 minute until server(s) is initialized" + sleep 60 + fi + pushd $i + $TERRAFORM_PLAN_CMD + $TERRAFORM_APPLY_CMD + popd + done +} + +config() { + source scripts/config + pushd ./server +cat <
variables.tfvars +name = "${CLUSTER_NAME}" +db_instance_type = "${DB_INSTANCE_TYPE}" +server_instance_type = "${SERVER_INSTANCE_TYPE}" +extra_ssh_keys = ["${EXTRA_SSH_KEYS}"] +master_count = ${MASTER_COUNT} +k3s_ha = ${K3S_HA} +k3s_version = "${K3S_VERSION}" +prom_worker_node_count = ${PROM_WORKER_NODE_COUNT} +prom_host = "${PROM_HOST}" +graf_host = "${GRAF_HOST}" +ssh_key_path = "${PRIVATE_KEY_PATH}" +debug = ${DEBUG} +MAIN +popd + +pushd ./agents +cat <
variables.tfvars +name = "${CLUSTER_NAME}" +node_count = ${AGENT_NODE_COUNT} +extra_ssh_keys = ["${EXTRA_SSH_KEYS}"] +k3s_version = "${K3S_VERSION}" +worker_instance_type = "${WORKER_INSTANCE_TYPE}" +MAIN +popd +} + +clean() { + # clean server and agents + for i in server agents; do + pushd $i + rm -f *.plan *.tfvars *.tfstate* + popd + done +} + +destroy() { + for i in agents server; do + pushd $i + terraform destroy --var-file variables.tfvars --force + popd + done + clean +} + +$@ diff --git a/tests/perf/scripts/test b/tests/perf/scripts/test new file mode 100755 index 0000000000..150bd9eff9 --- /dev/null +++ b/tests/perf/scripts/test @@ -0,0 +1,48 @@ +#!/bin/bash -ex + +test_load() { + source scripts/config + masterips=`terraform output -state=server/server.tfstate | grep k3s_server_ips | cut -d "=" -f 2` + pushd tests/ + docker run -v $PRIVATE_KEY_PATH:/opt/priv_key \ + -e KUBE_SSH_USER=ubuntu \ + -e LOCAL_SSH_KEY=/opt/priv_key \ + -it -v $PWD/:/opt/k3s/perf-tests husseingalal/clusterloader:dev \ + clusterloader --testconfig /opt/k3s/perf-tests/load/config.yaml \ + --kubeconfig /opt/k3s/perf-tests/kubeconfig.yaml \ + --masterip $masterips \ + --provider=local \ + --report-dir /opt/k3s/perf-tests/load_tests_results \ + --enable-prometheus-server \ + --tear-down-prometheus-server=0 + popd +} + +test_density() { + source scripts/config + masterips=`terraform output -state=server/server.tfstate | grep k3s_server_ips | cut -d "=" -f 2` + pushd tests/ + docker run -e KUBE_SSH_USER=ubuntu \ + -v $PRIVATE_KEY_PATH:/opt/priv_key \ + -e LOCAL_SSH_KEY=/opt/priv_key \ + -it -v $PWD/:/opt/k3s/perf-tests husseingalal/clusterloader:dev \ + clusterloader --testconfig /opt/k3s/perf-tests/density/config.yaml \ + --kubeconfig /opt/k3s/perf-tests/kubeconfig.yaml \ + --masterip $masterips \ + --provider=local \ + --report-dir /opt/k3s/perf-tests/density_tests_results \ + --enable-prometheus-server \ + --tear-down-prometheus-server=0 + popd +} + +clean() { + # clean kubeconfig + pushd tests/ + rm -f kubeconfig + rm -rf load_tests_results/ + rm -rf density_tests_results/ + popd +} + +$@ diff --git a/tests/perf/server/data.tf b/tests/perf/server/data.tf new file mode 100644 index 0000000000..9a269d4e1e --- /dev/null +++ b/tests/perf/server/data.tf @@ -0,0 +1,52 @@ +data "aws_vpc" "default" { + default = true +} + +data "aws_subnet_ids" "available" { + vpc_id = data.aws_vpc.default.id +} + +data "aws_subnet" "selected" { + id = "${tolist(data.aws_subnet_ids.available.ids)[1]}" +} + +data "aws_ami" "ubuntu" { + most_recent = true + owners = ["099720109477"] + + filter { + name = "name" + values = ["ubuntu-minimal/images/*/ubuntu-bionic-18.04-*"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } + + filter { + name = "root-device-type" + values = ["ebs"] + } + + filter { + name = "architecture" + values = ["x86_64"] + } +} + +data "template_file" "metrics" { + template = file("${path.module}/files/metrics.yaml") + vars = { + prom_worker_node_count = local.prom_worker_node_count + + } +} +data "template_file" "k3s-prom-yaml" { + template = file("${path.module}/files/prom.yaml") + vars = { + prom_host = var.prom_host + graf_host = var.graf_host + prom_worker_node_count = local.prom_worker_node_count + } +} diff --git a/tests/perf/server/files/metrics.yaml b/tests/perf/server/files/metrics.yaml new file mode 100644 index 0000000000..d3cfb79659 --- /dev/null +++ b/tests/perf/server/files/metrics.yaml @@ -0,0 +1,227 @@ +%{ if prom_worker_node_count != 0 } +--- +apiVersion: 
rbac.authorization.k8s.io/v1 +# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: +- apiGroups: [""] + resources: + - configmaps + - secrets + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: ["list", "watch"] +- apiGroups: ["extensions"] + resources: + - daemonsets + - deployments + - replicasets + - ingresses + verbs: ["list", "watch"] +- apiGroups: ["apps"] + resources: + - daemonsets + - deployments + - replicasets + - statefulsets + verbs: ["list", "watch"] +- apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: ["list", "watch"] +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] +- apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] +- apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] +- apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + verbs: ["list", "watch"] +- apiGroups: ["autoscaling.k8s.io"] + resources: + - verticalpodautoscalers + verbs: ["list", "watch"] +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + k8s-app: kube-state-metrics + name: kube-state-metrics + namespace: kube-system +spec: + selector: + matchLabels: + k8s-app: kube-state-metrics + replicas: 1 + template: + metadata: + labels: + k8s-app: kube-state-metrics + spec: + serviceAccountName: kube-state-metrics + containers: + - name: kube-state-metrics + image: quay.io/coreos/kube-state-metrics:v1.7.2 + ports: + - name: http-metrics + containerPort: 8080 + - name: telemetry + containerPort: 8081 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: kube-system +--- +apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: kube-system + labels: + k8s-app: kube-state-metrics + annotations: + prometheus.io/scrape: 'true' +spec: + ports: + - name: http-metrics + port: 8080 + targetPort: http-metrics + protocol: TCP + - name: telemetry + port: 8081 + targetPort: telemetry + protocol: TCP + selector: + k8s-app: kube-state-metrics +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: slo-monitor +subjects: +- kind: ServiceAccount + name: slo-monitor + namespace: kube-system +roleRef: + kind: ClusterRole + name: slo-monitor + apiGroup: rbac.authorization.k8s.io +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: slo-monitor + namespace: kube-system +rules: +- apiGroups: [""] + resources: ["pods", "events"] + verbs: ["get", "watch", "list"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: slo-monitor + namespace: kube-system +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: slo-monitor + namespace: 
kube-system + labels: + app: slo-monitor +spec: + selector: + matchLabels: + app: slo-monitor + template: + metadata: + labels: + app: slo-monitor + annotations: + prometheus.io/scrape: "true" + spec: + containers: + - name: slo-monitor + image: gcr.io/google-containers/slo-monitor:0.12.0 + command: + - /slo-monitor + - --alsologtostderr=true + imagePullPolicy: Always + ports: + - name: metrics + containerPort: 8080 + resources: + requests: + cpu: 300m + memory: 100Mi + limits: + cpu: 300m + memory: 100Mi + restartPolicy: Always + serviceAccountName: slo-monitor +--- +apiVersion: v1 +kind: Service +metadata: + name: slo-monitor + namespace: kube-system + labels: + app: slo-monitor +spec: + selector: + app: slo-monitor + ports: + - name: metrics + port: 80 + targetPort: metrics + type: ClusterIP +%{ endif } diff --git a/tests/perf/server/files/prom.yaml b/tests/perf/server/files/prom.yaml new file mode 100644 index 0000000000..369a922548 --- /dev/null +++ b/tests/perf/server/files/prom.yaml @@ -0,0 +1,86 @@ +%{ if prom_worker_node_count != 0 } +--- +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring + +--- +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: prometheus + namespace: kube-system +spec: + chart: https://raw.githubusercontent.com/galal-hussein/charts/master/prometheus-9.2.0.tgz + targetNamespace: monitoring + valuesContent: |- + alertmanager: + nodeSelector: + prom: "true" + persistentVolume: + enabled: false + kubeStateMetrics: + nodeSelector: + prom: "true" + nodeExporter: + nodeSelector: + prom: "true" + server: + nodeSelector: + prom: "true" + ingress: + enabled: true + hosts: + - ${prom_host} + persistentVolume: + enabled: false + pushgateway: + nodeSelector: + prom: "true" + persistentVolume: + enabled: false + serverFiles: + prometheus.yml: + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - job_name: kubernetes-apiservers + scrape_interval: 10s + scrape_timeout: 10s + metrics_path: /metrics + scheme: https + kubernetes_sd_configs: + - api_server: null + role: endpoints + namespaces: + names: [] + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + separator: ; + regex: default;kubernetes;https + replacement: $1 + action: keep +--- +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: grafana + namespace: kube-system +spec: + chart: stable/grafana + targetNamespace: monitoring + valuesContent: |- + ingress: + enabled: true + hosts: + - ${graf_host} + nodeSelector: + prom: "true" +%{ endif } diff --git a/tests/perf/server/files/server_userdata.tmpl b/tests/perf/server/files/server_userdata.tmpl new file mode 100644 index 0000000000..17cad50b88 --- /dev/null +++ b/tests/perf/server/files/server_userdata.tmpl @@ -0,0 +1,55 @@ +#cloud-config +%{ if length(extra_ssh_keys) > 0 } +ssh_authorized_keys: +%{ for ssh_key in extra_ssh_keys } +- ${ssh_key} +%{ endfor } +%{ endif } +write_files: +- path: /var/lib/rancher/k3s/server/manifests/metrics.yaml + permissions: "0755" + owner: root:root + encoding: b64 + content: ${metrics_yaml} +- path: /var/lib/rancher/k3s/server/manifests/prom.yaml + permissions: "0755" + owner: root:root + encoding: b64 + content: ${prom_yaml} +runcmd: +- echo "net.ipv4.neigh.default.gc_interval = 3600" >> 
/etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_stale_time = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh3 = 16384" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh2 = 8192" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh1 = 4096" >> /etc/sysctl.conf +- echo "fs.file-max = 12000500" >> /etc/sysctl.conf +- echo "fs.nr_open = 20000500" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_mem = '10000000 10000000 10000000'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_rmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_wmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.core.rmem_max = 16384" >> /etc/sysctl.conf +- echo "net.core.wmem_max = 16384" >> /etc/sysctl.conf +- ulimit -n 20000000 +- echo "# " >> /etc/security/limits.d/limits.conf +- echo " * soft nofile 20000" >> /etc/security/limits.d/limits.conf +- echo " * hard nofile 20000" >> /etc/security/limits.d/limits.conf +- sysctl -p +- apt-get update +- apt-get install -y git vim software-properties-common resolvconf linux-headers-$(uname -r) +- echo "nameserver 1.1.1.1" > /etc/resolvconf/resolv.conf.d/tail +- echo "RateLimitIntervalSec=0" >> /etc/systemd/journald.conf +- echo "RateLimitBurst=0" >> /etc/systemd/journald.conf +- systemctl restart systemd-journald.service +- systemctl start resolvconf +- wget https://raw.githubusercontent.com/galal-hussein/k3s/k3s_with_kine_fix/k3s +- cp k3s /usr/local/bin/k3s +- chmod +x /usr/local/bin/k3s +%{if master_index != 0 } +- sleep 20 +%{ endif } +- until (curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_DOWNLOAD=true K3S_CLUSTER_SECRET="${k3s_cluster_secret}" INSTALL_K3S_VERSION="${install_k3s_version}" INSTALL_K3S_EXEC="${k3s_server_args} --cluster-cidr=10.0.0.0/8 --no-deploy traefik --no-deploy servicelb --tls-san ${lb_address} %{ if use_ha == "true" } --storage-endpoint="postgres://${db_username}:${db_password}@${db_address}:5432/${db_name}" %{ if master_index == 0 }--bootstrap-save%{ endif } %{ endif }" sh -); do echo 'Error installing k3s'; sleep 1; done +%{if debug != 0 } +- sed -i 's/bin\/k3s/bin\/k3s --debug/g' /etc/systemd/system/k3s.service +- systemctl daemon-reload +- systemctl restart k3s +%{ endif } diff --git a/tests/perf/server/files/worker_userdata.tmpl b/tests/perf/server/files/worker_userdata.tmpl new file mode 100644 index 0000000000..90712c0bdc --- /dev/null +++ b/tests/perf/server/files/worker_userdata.tmpl @@ -0,0 +1,29 @@ +#cloud-config +%{ if length(extra_ssh_keys) > 0 } +ssh_authorized_keys: +%{ for ssh_key in extra_ssh_keys } +- ${ssh_key} +%{ endfor } +%{ endif } +runcmd: +- echo "net.ipv4.neigh.default.gc_interval = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_stale_time = 3600" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh3 = 16384" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh2 = 8192" >> /etc/sysctl.conf +- echo "net.ipv4.neigh.default.gc_thresh1 = 4096" >> /etc/sysctl.conf +- echo "fs.file-max = 12000500" >> /etc/sysctl.conf +- echo "fs.nr_open = 20000500" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_mem = '10000000 10000000 10000000'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_rmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.ipv4.tcp_wmem = '1024 4096 16384'" >> /etc/sysctl.conf +- echo "net.core.rmem_max = 16384" >> /etc/sysctl.conf +- echo "net.core.wmem_max = 16384" >> /etc/sysctl.conf +- ulimit -n 20000 +- echo "# " >> /etc/security/limits.d/limits.conf +- echo " * soft nofile 20000" >> /etc/security/limits.d/limits.conf +- echo " * hard nofile 
20000" >> /etc/security/limits.d/limits.conf +- sysctl -p +- wget https://raw.githubusercontent.com/galal-hussein/k3s/k3s_with_kine_fix/k3s +- cp k3s /usr/local/bin/k3s +- chmod +x /usr/local/bin/k3s +- until (curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=${install_k3s_version} INSTALL_K3S_EXEC="${k3s_exec}" K3S_URL=https://${k3s_url}:6443 K3S_CLUSTER_SECRET="${k3s_cluster_secret}" sh -); do echo 'k3s did not install correctly'; sleep 1; done diff --git a/tests/perf/server/main.tf b/tests/perf/server/main.tf new file mode 100644 index 0000000000..bffc863b2a --- /dev/null +++ b/tests/perf/server/main.tf @@ -0,0 +1,188 @@ +terraform { + backend "local" { + path = "server.tfstate" + } +} + +locals { + name = var.name + k3s_cluster_secret = var.k3s_cluster_secret + install_k3s_version = var.k3s_version + prom_worker_node_count = var.prom_worker_node_count +} + +provider "aws" { + region = "us-west-2" + profile = "rancher-eng" +} + +resource "aws_security_group" "k3s" { + name = "${local.name}-sg" + vpc_id = data.aws_vpc.default.id + + ingress { + from_port = 22 + to_port = 22 + protocol = "TCP" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 6443 + to_port = 6443 + protocol = "TCP" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 0 + to_port = 0 + protocol = "-1" + self = true + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_db_instance" "k3s_db" { + count = "${var.k3s_ha}" + allocated_storage = 100 #baseline iops is 300 with gp2 + storage_type = "io1" + iops = "3000" + engine = "postgres" + engine_version = "11.5" + instance_class = "${var.db_instance_type}" + name = "${var.db_name}" + username = "${var.db_username}" + password = "${var.db_password}" + skip_final_snapshot = true + multi_az = false +} + +resource "aws_lb" "k3s-master-nlb" { + name = "${local.name}-nlb" + internal = false + load_balancer_type = "network" + subnets = [data.aws_subnet.selected.id] +} + +resource "aws_lb_target_group" "k3s-master-nlb-tg" { + name = "${local.name}-nlb-tg" + port = "6443" + protocol = "TCP" + vpc_id = data.aws_vpc.default.id + deregistration_delay = "300" + health_check { + interval = "30" + port = "6443" + protocol = "TCP" + healthy_threshold = "10" + unhealthy_threshold= "10" + } +} + +resource "aws_lb_listener" "k3s-master-nlb-tg" { + load_balancer_arn = "${aws_lb.k3s-master-nlb.arn}" + port = "6443" + protocol = "TCP" + default_action { + target_group_arn = "${aws_lb_target_group.k3s-master-nlb-tg.arn}" + type = "forward" + } +} + +resource "aws_lb_target_group_attachment" "test" { + count = "${var.master_count}" + target_group_arn = "${aws_lb_target_group.k3s-master-nlb-tg.arn}" + target_id = "${aws_spot_instance_request.k3s-server[count.index].spot_instance_id}" + port = 6443 +} + +resource "aws_spot_instance_request" "k3s-server" { + count = "${var.master_count}" + instance_type = var.server_instance_type + ami = data.aws_ami.ubuntu.id + user_data = base64encode(templatefile("${path.module}/files/server_userdata.tmpl", + { + extra_ssh_keys = var.extra_ssh_keys, + metrics_yaml = base64encode(data.template_file.metrics.rendered), + prom_yaml = base64encode(data.template_file.k3s-prom-yaml.rendered), + k3s_cluster_secret = local.k3s_cluster_secret, + install_k3s_version = local.install_k3s_version, + k3s_server_args = var.k3s_server_args, + db_address = aws_db_instance.k3s_db[0].address, + db_name = 
aws_db_instance.k3s_db[0].name, + db_username = aws_db_instance.k3s_db[0].username, + db_password = aws_db_instance.k3s_db[0].password, + use_ha = "${var.k3s_ha == 1 ? "true": "false"}", + master_index = count.index, + lb_address = aws_lb.k3s-master-nlb.dns_name, + prom_worker_node_count = local.prom_worker_node_count, + debug = var.debug,})) + + wait_for_fulfillment = true + security_groups = [ + aws_security_group.k3s.name, + ] + + root_block_device { + volume_size = "100" + volume_type = "gp2" + } + + tags = { + Name = "${local.name}-server-${count.index}" + } + provisioner "local-exec" { + command = "sleep 10" + } +} + +module "k3s-prom-worker-asg" { + source = "terraform-aws-modules/autoscaling/aws" + version = "3.0.0" + name = "${local.name}-prom-worker" + asg_name = "${local.name}-prom-worker" + instance_type = "m5.large" + image_id = data.aws_ami.ubuntu.id + user_data = base64encode(templatefile("${path.module}/files/worker_userdata.tmpl", { extra_ssh_keys = var.extra_ssh_keys, k3s_url = aws_lb.k3s-master-nlb.dns_name, k3s_cluster_secret = local.k3s_cluster_secret, install_k3s_version = local.install_k3s_version, k3s_exec = "--node-label prom=true" })) + + desired_capacity = local.prom_worker_node_count + health_check_type = "EC2" + max_size = local.prom_worker_node_count + min_size = local.prom_worker_node_count + vpc_zone_identifier = [data.aws_subnet.selected.id] + spot_price = "0.340" + + security_groups = [ + aws_security_group.k3s.id, + ] + + lc_name = "${local.name}-prom-worker" + + root_block_device = [ + { + volume_size = "100" + volume_type = "gp2" + }, + ] +} + +resource "null_resource" "get-kubeconfig" { + provisioner "local-exec" { + interpreter = ["bash", "-c"] + command = "until ssh -i ${var.ssh_key_path} ubuntu@${aws_spot_instance_request.k3s-server[0].public_ip} 'sudo sed \"s/localhost/$aws_lb.k3s-master-nlb.dns_name}/g;s/127.0.0.1/${aws_lb.k3s-master-nlb.dns_name}/g\" /etc/rancher/k3s/k3s.yaml' >| ../tests/kubeconfig.yaml; do sleep 5; done" + } +} diff --git a/tests/perf/server/outputs.tf b/tests/perf/server/outputs.tf new file mode 100644 index 0000000000..6e2ffd61ea --- /dev/null +++ b/tests/perf/server/outputs.tf @@ -0,0 +1,15 @@ +output "public_ip" { + value = aws_lb.k3s-master-nlb.dns_name +} + +output "install_k3s_version" { + value = local.install_k3s_version +} + +output "k3s_cluster_secret" { + value = local.k3s_cluster_secret +} + +output "k3s_server_ips" { + value = join(",", aws_spot_instance_request.k3s-server.*.public_ip) +} diff --git a/tests/perf/server/variables.tf b/tests/perf/server/variables.tf new file mode 100644 index 0000000000..0a7209ed42 --- /dev/null +++ b/tests/perf/server/variables.tf @@ -0,0 +1,78 @@ +variable "server_instance_type" { + # default = "c4.8xlarge" +} + +variable "k3s_version" { + default = "v0.9.1" + type = string + description = "Version of K3S to install" +} + +variable "k3s_server_args" { + default = "" +} + +variable "prom_worker_node_count" { + default = 0 + type = number + description = "The number of workers to create labeled for prometheus" +} + +variable "k3s_cluster_secret" { + default = "pvc-6476dcaf-73a0-11e9-b8e5-06943b744282" + type = string + description = "Cluster secret for k3s cluster registration" +} +variable "prom_host" { + default = "" +} +variable "graf_host" { + default = "" +} +variable "name" { + default = "k3s-loadtest" + type = string + description = "Name to identify this cluster" +} + +variable "ssh_key_path" { + default = "~/.ssh/id_rsa" + type = string + description = "Path of the private 
key to ssh to the nodes" +} + +variable "extra_ssh_keys" { + type = list + default = [] + description = "Extra ssh keys to inject into Rancher instances" +} + +variable "k3s_ha" { + default = 0 + description = "Enable k3s in HA mode" +} + +variable "db_instance_type" { +} + +variable "db_name" { + default = "k3s" +} + +variable "db_username" { + default = "postgres" +} + +variable "db_password" { + default = "b58bf234c4bd0133fc7a92b782e498a6" +} + +variable "master_count" { + default = 1 + description = "Count of k3s master servers" +} + +variable "debug" { + default = 0 + description = "Enable Debug log" +} diff --git a/tests/perf/server/versions.tf b/tests/perf/server/versions.tf new file mode 100644 index 0000000000..ac97c6ac8e --- /dev/null +++ b/tests/perf/server/versions.tf @@ -0,0 +1,4 @@ + +terraform { + required_version = ">= 0.12" +} diff --git a/tests/perf/tests/density/2000_nodes/override.yaml b/tests/perf/tests/density/2000_nodes/override.yaml new file mode 100644 index 0000000000..8d38cbac56 --- /dev/null +++ b/tests/perf/tests/density/2000_nodes/override.yaml @@ -0,0 +1 @@ +NODE_MODE: masteranddns diff --git a/tests/perf/tests/density/5000_nodes/override.yaml b/tests/perf/tests/density/5000_nodes/override.yaml new file mode 100644 index 0000000000..8d38cbac56 --- /dev/null +++ b/tests/perf/tests/density/5000_nodes/override.yaml @@ -0,0 +1 @@ +NODE_MODE: masteranddns diff --git a/tests/perf/tests/density/600_nodes/high_density_override.yaml b/tests/perf/tests/density/600_nodes/high_density_override.yaml new file mode 100644 index 0000000000..56d78a0775 --- /dev/null +++ b/tests/perf/tests/density/600_nodes/high_density_override.yaml @@ -0,0 +1 @@ +PODS_PER_NODE: 95 diff --git a/tests/perf/tests/density/config.yaml b/tests/perf/tests/density/config.yaml new file mode 100644 index 0000000000..802d47acde --- /dev/null +++ b/tests/perf/tests/density/config.yaml @@ -0,0 +1,248 @@ +# ASSUMPTIONS: +# - Underlying cluster should have 100+ nodes. +# - Number of nodes should be divisible by NODES_PER_NAMESPACE (default 100). + +#Constants +{{$DENSITY_RESOURCE_CONSTRAINTS_FILE := DefaultParam .DENSITY_RESOURCE_CONSTRAINTS_FILE ""}} +{{$NODE_MODE := DefaultParam .NODE_MODE "allnodes"}} +{{$NODES_PER_NAMESPACE := DefaultParam .NODES_PER_NAMESPACE 100}} +{{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 30}} +{{$DENSITY_TEST_THROUGHPUT := DefaultParam .DENSITY_TEST_THROUGHPUT 20}} +# LATENCY_POD_MEMORY and LATENCY_POD_CPU are calculated for 1-core 4GB node. +# Increasing allocation of both memory and cpu by 10% +# decreases the value of priority function in scheduler by one point. +# This results in decreased probability of choosing the same node again. 
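# Illustrative note, not part of the original comment: with the defaults below
# (100m CPU, 350M memory), each latency pod requests roughly a 10% slice of a
# 1-core/4GB node, so every extra latency pod already on a node lowers that
# node's priority score by about one point and the latency pods spread out
# across nodes.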
+{{$LATENCY_POD_CPU := DefaultParam .LATENCY_POD_CPU 100}} +{{$LATENCY_POD_MEMORY := DefaultParam .LATENCY_POD_MEMORY 350}} +{{$MIN_LATENCY_PODS := 500}} +{{$MIN_SATURATION_PODS_TIMEOUT := 180}} +{{$ENABLE_CHAOSMONKEY := DefaultParam .ENABLE_CHAOSMONKEY false}} +{{$ENABLE_PROMETHEUS_API_RESPONSIVENESS := DefaultParam .ENABLE_PROMETHEUS_API_RESPONSIVENESS false}} +{{$ENABLE_SYSTEM_POD_METRICS:= DefaultParam .ENABLE_SYSTEM_POD_METRICS true}} +{{$USE_SIMPLE_LATENCY_QUERY := DefaultParam .USE_SIMPLE_LATENCY_QUERY false}} +#Variables +{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}} +{{$podsPerNamespace := MultiplyInt $PODS_PER_NODE $NODES_PER_NAMESPACE}} +{{$totalPods := MultiplyInt $podsPerNamespace $namespaces}} +{{$latencyReplicas := DivideInt (MaxInt $MIN_LATENCY_PODS .Nodes) $namespaces}} +{{$totalLatencyPods := MultiplyInt $namespaces $latencyReplicas}} +{{$saturationDeploymentTimeout := DivideFloat $totalPods $DENSITY_TEST_THROUGHPUT | AddInt $MIN_SATURATION_PODS_TIMEOUT}} +# saturationDeploymentHardTimeout must be at least 20m to make sure that ~10m node +# failure won't fail the test. See https://github.com/kubernetes/kubernetes/issues/73461#issuecomment-467338711 +{{$saturationDeploymentHardTimeout := MaxInt $saturationDeploymentTimeout 1200}} + +name: density +automanagedNamespaces: {{$namespaces}} +tuningSets: +- name: Uniform5qps + qpsLoad: + qps: 5 +{{if $ENABLE_CHAOSMONKEY}} +chaosMonkey: + nodeFailure: + failureRate: 0.01 + interval: 1m + jitterFactor: 10.0 + simulatedDowntime: 10m +{{end}} +steps: +- name: Starting measurements + measurements: + - Identifier: APIResponsiveness + Method: APIResponsiveness + Params: + action: reset + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: start + # TODO(oxddr): figure out how many probers to run in function of cluster + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: start + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + - Identifier: DnsLookupLatency + Method: DnsLookupLatency + Params: + action: start + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: start + nodeMode: {{$NODE_MODE}} + resourceConstraints: {{$DENSITY_RESOURCE_CONSTRAINTS_FILE}} + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + +- name: Starting saturation pod measurements + measurements: + - Identifier: SaturationPodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = saturation + threshold: {{$saturationDeploymentTimeout}}s + - Identifier: WaitForRunningSaturationDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = saturation + operationTimeout: {{$saturationDeploymentHardTimeout}}s + - Identifier: SchedulingThroughput + Method: SchedulingThroughput + Params: + action: start + labelSelector: group = saturation + +- name: Creating saturation pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: Uniform5qps + objectBundle: + - basename: saturation-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + Replicas: {{$podsPerNamespace}} + Group: saturation + CpuRequest: 1m + MemoryRequest: 10M + +- name: Collecting saturation pod measurements + measurements: + - Identifier: WaitForRunningSaturationDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- 
measurements: + - Identifier: SaturationPodStartupLatency + Method: PodStartupLatency + Params: + action: gather +- measurements: + - Identifier: SchedulingThroughput + Method: SchedulingThroughput + Params: + action: gather + +- name: Starting latency pod measurements + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = latency + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = latency + operationTimeout: 15m + +- name: Creating latency pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$latencyReplicas}} + tuningSet: Uniform5qps + objectBundle: + - basename: latency-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + Replicas: 1 + Group: latency + CpuRequest: {{$LATENCY_POD_CPU}}m + MemoryRequest: {{$LATENCY_POD_MEMORY}}M + +- name: Waiting for latency pods to be running + measurements: + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Deleting latency pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Uniform5qps + objectBundle: + - basename: latency-deployment + objectTemplatePath: deployment.yaml + +- name: Waiting for latency pods to be deleted + measurements: + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Collecting pod startup latency + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather + +- name: Deleting saturation pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Uniform5qps + objectBundle: + - basename: saturation-deployment + objectTemplatePath: deployment.yaml + +- name: Waiting for saturation pods to be deleted + measurements: + - Identifier: WaitForRunningSaturationDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Collecting measurements + measurements: + - Identifier: APIResponsiveness + Method: APIResponsiveness + Params: + action: gather + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + {{if $ENABLE_PROMETHEUS_API_RESPONSIVENESS}} + enableViolations: true + {{end}} + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple + {{if not $USE_SIMPLE_LATENCY_QUERY}} + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: gather + {{end}} + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: gather + - Identifier: DnsLookupLatency + Method: DnsLookupLatency + Params: + action: gather + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: gather + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} diff --git a/tests/perf/tests/density/deployment.yaml b/tests/perf/tests/density/deployment.yaml new file mode 100644 index 0000000000..1903dbaf89 --- /dev/null +++ b/tests/perf/tests/density/deployment.yaml @@ -0,0 +1,37 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: {{.Group}} + spec: + 
containers: + - image: k8s.gcr.io/pause:3.1 + imagePullPolicy: IfNotPresent + name: {{.Name}} + ports: + resources: + requests: + cpu: {{.CpuRequest}} + memory: {{.MemoryRequest}} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/tests/perf/tests/load/config.yaml b/tests/perf/tests/load/config.yaml new file mode 100644 index 0000000000..413fd81eec --- /dev/null +++ b/tests/perf/tests/load/config.yaml @@ -0,0 +1,765 @@ +# ASSUMPTIONS: +# - Underlying cluster should have 100+ nodes. +# - Number of nodes should be divisible by NODES_PER_NAMESPACE (default 100). +# - The number of created SVCs is half the number of created Deployments. +# - Only half of Deployments will be assigned 1-1 to existing SVCs. + +#Constants +{{$NODE_MODE := DefaultParam .NODE_MODE "allnodes"}} +{{$NODES_PER_NAMESPACE := DefaultParam .NODES_PER_NAMESPACE 100}} +{{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 30}} +{{$LOAD_TEST_THROUGHPUT := DefaultParam .LOAD_TEST_THROUGHPUT 10}} +{{$BIG_GROUP_SIZE := 1000}} +{{$MEDIUM_GROUP_SIZE := 500}} +{{$SMALL_GROUP_SIZE := 50}} +{{$SMALL_STATEFUL_SETS_PER_NAMESPACE := 1}} +{{$MEDIUM_STATEFUL_SETS_PER_NAMESPACE := 1}} +{{$ENABLE_CHAOSMONKEY := DefaultParam .ENABLE_CHAOSMONKEY false}} +{{$PROMETHEUS_SCRAPE_KUBE_PROXY := DefaultParam .PROMETHEUS_SCRAPE_KUBE_PROXY true}} +{{$ENABLE_PROMETHEUS_API_RESPONSIVENESS := DefaultParam .ENABLE_PROMETHEUS_API_RESPONSIVENESS false}} +{{$ENABLE_CONFIGMAPS := DefaultParam .ENABLE_CONFIGMAPS false}} +{{$ENABLE_DAEMONSETS := DefaultParam .ENABLE_DAEMONSETS false}} +{{$ENABLE_JOBS := DefaultParam .ENABLE_JOBS false}} +{{$ENABLE_PVS := DefaultParam .ENABLE_PVS false}} +{{$ENABLE_SECRETS := DefaultParam .ENABLE_SECRETS false}} +{{$ENABLE_STATEFULSETS := DefaultParam .ENABLE_STATEFULSETS false}} +{{$ENABLE_NETWORKPOLICIES := DefaultParam .ENABLE_NETWORKPOLICIES false}} +{{$ENABLE_SYSTEM_POD_METRICS:= DefaultParam .ENABLE_SYSTEM_POD_METRICS true}} +{{$USE_SIMPLE_LATENCY_QUERY := DefaultParam .USE_SIMPLE_LATENCY_QUERY false}} +#Variables +{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}} +{{$totalPods := MultiplyInt $namespaces $NODES_PER_NAMESPACE $PODS_PER_NODE}} +{{$podsPerNamespace := DivideInt $totalPods $namespaces}} +{{$saturationTime := DivideInt $totalPods $LOAD_TEST_THROUGHPUT}} +# bigDeployments - 1/4 of namespace pods should be in big Deployments. +{{$bigDeploymentsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 4 $BIG_GROUP_SIZE)}} +# mediumDeployments - 1/4 of namespace pods should be in medium Deployments. +{{$mediumDeploymentsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 4 $MEDIUM_GROUP_SIZE)}} +# smallDeployments - 1/2 of namespace pods should be in small Deployments. 
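# Illustrative sizing, assuming the defaults above: podsPerNamespace =
# 100 nodes per namespace x 30 pods per node = 3000, so
# smallDeploymentsPerNamespace = 3000 / (2 x 50) = 30 small Deployments per
# namespace, before the StatefulSet and Job adjustments below.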
+{{$smallDeploymentsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 2 $SMALL_GROUP_SIZE)}} +# If StatefulSets are enabled reduce the number of small and medium deployments per namespace +{{$smallDeploymentsPerNamespace := SubtractInt $smallDeploymentsPerNamespace (IfThenElse $ENABLE_STATEFULSETS $SMALL_STATEFUL_SETS_PER_NAMESPACE 0)}} +{{$mediumDeploymentsPerNamespace := SubtractInt $mediumDeploymentsPerNamespace (IfThenElse $ENABLE_STATEFULSETS $MEDIUM_STATEFUL_SETS_PER_NAMESPACE 0)}} + +# If Jobs are enabled reduce the number of small, medium, big deployments per namespace. +{{$smallDeploymentsPerNamespace := SubtractInt $smallDeploymentsPerNamespace (IfThenElse $ENABLE_JOBS 1 0)}} +{{$mediumDeploymentsPerNamespace := SubtractInt $mediumDeploymentsPerNamespace (IfThenElse $ENABLE_JOBS 1 0)}} +{{$bigDeploymentsPerNamespace := SubtractInt $bigDeploymentsPerNamespace (IfThenElse $ENABLE_JOBS 1 0)}} + +name: load +automanagedNamespaces: {{$namespaces}} +tuningSets: +- name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 +- name: RandomizedSaturationTimeLimited + RandomizedTimeLimitedLoad: + timeLimit: {{$saturationTime}}s +- name: RandomizedScalingTimeLimited + RandomizedTimeLimitedLoad: + # The expected number of created/deleted pods is totalPods/4 when scaling, + # as each RS changes its size from X to a uniform random value in [X/2, 3X/2]. + # To match 10 [pods/s] requirement, we need to divide saturationTime by 4. + timeLimit: {{DivideInt $saturationTime 4}}s +{{if $ENABLE_CHAOSMONKEY}} +chaosMonkey: + nodeFailure: + failureRate: 0.01 + interval: 1m + jitterFactor: 10.0 + simulatedDowntime: 10m +{{end}} +steps: +- name: Starting measurements + measurements: + - Identifier: APIResponsiveness + Method: APIResponsiveness + Params: + action: reset + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: start + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = load + threshold: 1h + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: start + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + - Identifier: DnsLookupLatency + Method: DnsLookupLatency + Params: + action: start + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + {{if $PROMETHEUS_SCRAPE_KUBE_PROXY}} + - Identifier: NetworkProgrammingLatency + Method: NetworkProgrammingLatency + Params: + action: start + {{end}} + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: start + nodeMode: {{$NODE_MODE}} + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + +- name: Creating SVCs + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{DivideInt (AddInt $bigDeploymentsPerNamespace 1) 2}} + tuningSet: Sequence + objectBundle: + - basename: big-service + objectTemplatePath: service.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{DivideInt (AddInt $mediumDeploymentsPerNamespace 1) 2}} + tuningSet: Sequence + objectBundle: + - basename: medium-service + objectTemplatePath: service.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{DivideInt (AddInt $smallDeploymentsPerNamespace 1) 2}} + tuningSet: Sequence + objectBundle: + - basename: small-service + objectTemplatePath: service.yaml + +{{if $ENABLE_DAEMONSETS}} +- name: Creating PriorityClass for DaemonSets + phases: + - replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: 
daemonset-priorityclass + objectTemplatePath: daemonset-priorityclass.yaml +{{end}} + +- name: Starting measurement for waiting for pods + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = load + operationTimeout: 15m + {{if $ENABLE_STATEFULSETS}} + - Identifier: WaitForRunningStatefulSets + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: StatefulSet + labelSelector: group = load + operationTimeout: 15m + {{end}} + {{if $ENABLE_DAEMONSETS}} + - Identifier: WaitForRunningDaemonSets + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: DaemonSet + labelSelector: group = load + operationTimeout: 15m + {{end}} + {{if $ENABLE_JOBS}} + - Identifier: WaitForRunningJobs + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: batch/v1 + kind: Job + labelSelector: group = load + operationTimeout: 15m + {{end}} + +- name: Creating objects + phases: + {{if $ENABLE_DAEMONSETS}} + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 1 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: daemonset + objectTemplatePath: daemonset.yaml + templateFillMap: + Image: k8s.gcr.io/pause:3.0 + {{end}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$bigDeploymentsPerNamespace}} + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + {{if $ENABLE_CONFIGMAPS}} + - basename: big-deployment + objectTemplatePath: configmap.yaml + {{end}} + {{if $ENABLE_SECRETS}} + - basename: big-deployment + objectTemplatePath: secret.yaml + {{end}} + {{if $ENABLE_NETWORKPOLICIES}} + - basename: big-deployment + objectTemplatePath: networkpolicy.yaml + {{end}} + - basename: big-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + ReplicasMin: {{$BIG_GROUP_SIZE}} + ReplicasMax: {{$BIG_GROUP_SIZE}} + SvcName: big-service + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$mediumDeploymentsPerNamespace}} + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + {{if $ENABLE_CONFIGMAPS}} + - basename: medium-deployment + objectTemplatePath: configmap.yaml + {{end}} + {{if $ENABLE_SECRETS}} + - basename: medium-deployment + objectTemplatePath: secret.yaml + {{end}} + {{if $ENABLE_NETWORKPOLICIES}} + - basename: medium-deployment + objectTemplatePath: networkpolicy.yaml + {{end}} + - basename: medium-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + ReplicasMin: {{$MEDIUM_GROUP_SIZE}} + ReplicasMax: {{$MEDIUM_GROUP_SIZE}} + SvcName: medium-service + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$smallDeploymentsPerNamespace}} + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + {{if $ENABLE_CONFIGMAPS}} + - basename: small-deployment + objectTemplatePath: configmap.yaml + {{end}} + {{if $ENABLE_SECRETS}} + - basename: small-deployment + objectTemplatePath: secret.yaml + {{end}} + {{if $ENABLE_NETWORKPOLICIES}} + - basename: small-deployment + objectTemplatePath: networkpolicy.yaml + {{end}} + - basename: small-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + ReplicasMin: {{$SMALL_GROUP_SIZE}} + ReplicasMax: {{$SMALL_GROUP_SIZE}} + SvcName: small-service + {{if $ENABLE_STATEFULSETS}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$SMALL_STATEFUL_SETS_PER_NAMESPACE}} + 
tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: small-statefulset + objectTemplatePath: statefulset_service.yaml + - basename: small-statefulset + objectTemplatePath: statefulset.yaml + templateFillMap: + ReplicasMin: {{$SMALL_GROUP_SIZE}} + ReplicasMax: {{$SMALL_GROUP_SIZE}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$MEDIUM_STATEFUL_SETS_PER_NAMESPACE}} + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: medium-statefulset + objectTemplatePath: statefulset_service.yaml + - basename: medium-statefulset + objectTemplatePath: statefulset.yaml + templateFillMap: + ReplicasMin: {{$MEDIUM_GROUP_SIZE}} + ReplicasMax: {{$MEDIUM_GROUP_SIZE}} + {{end}} + {{if $ENABLE_JOBS}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: small-job + objectTemplatePath: job.yaml + templateFillMap: + ReplicasMin: {{$SMALL_GROUP_SIZE}} + ReplicasMax: {{$SMALL_GROUP_SIZE}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: medium-job + objectTemplatePath: job.yaml + templateFillMap: + ReplicasMin: {{$MEDIUM_GROUP_SIZE}} + ReplicasMax: {{$MEDIUM_GROUP_SIZE}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: RandomizedSaturationTimeLimited + objectBundle: + - basename: big-job + objectTemplatePath: job.yaml + templateFillMap: + ReplicasMin: {{$BIG_GROUP_SIZE}} + ReplicasMax: {{$BIG_GROUP_SIZE}} + {{end}} + +- name: Waiting for pods to be running + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + {{if $ENABLE_STATEFULSETS}} + - Identifier: WaitForRunningStatefulSets + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} + {{if $ENABLE_DAEMONSETS}} + - Identifier: WaitForRunningDaemonSets + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} + {{if $ENABLE_JOBS}} + - Identifier: WaitForRunningJobs + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} + +- name: Scaling and updating objects + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$bigDeploymentsPerNamespace}} + tuningSet: RandomizedScalingTimeLimited + objectBundle: + - basename: big-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $BIG_GROUP_SIZE 0.5}} + ReplicasMax: {{MultiplyInt $BIG_GROUP_SIZE 1.5}} + SvcName: big-service + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$mediumDeploymentsPerNamespace}} + tuningSet: RandomizedScalingTimeLimited + objectBundle: + - basename: medium-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $MEDIUM_GROUP_SIZE 0.5}} + ReplicasMax: {{MultiplyInt $MEDIUM_GROUP_SIZE 1.5}} + SvcName: medium-service + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$smallDeploymentsPerNamespace}} + tuningSet: RandomizedScalingTimeLimited + objectBundle: + - basename: small-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $SMALL_GROUP_SIZE 0.5}} + ReplicasMax: {{MultiplyInt $SMALL_GROUP_SIZE 1.5}} + SvcName: small-service + {{if $ENABLE_STATEFULSETS}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 
+    tuningSet: RandomizedScalingTimeLimited
+    objectBundle:
+    - basename: small-statefulset
+      objectTemplatePath: statefulset.yaml
+      templateFillMap:
+        ReplicasMin: {{MultiplyInt $SMALL_GROUP_SIZE 0.5}}
+        ReplicasMax: {{MultiplyInt $SMALL_GROUP_SIZE 1.5}}
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: {{$MEDIUM_STATEFUL_SETS_PER_NAMESPACE}}
+    tuningSet: RandomizedScalingTimeLimited
+    objectBundle:
+    - basename: medium-statefulset
+      objectTemplatePath: statefulset.yaml
+      templateFillMap:
+        ReplicasMin: {{MultiplyInt $MEDIUM_GROUP_SIZE 0.5}}
+        ReplicasMax: {{MultiplyInt $MEDIUM_GROUP_SIZE 1.5}}
+  {{end}}
+  {{if $ENABLE_DAEMONSETS}}
+  - namespaceRange:
+      min: 1
+      max: 1
+    replicasPerNamespace: 1
+    tuningSet: RandomizedScalingTimeLimited
+    objectBundle:
+    - basename: daemonset
+      objectTemplatePath: daemonset.yaml
+      templateFillMap:
+        Image: k8s.gcr.io/pause:3.1
+  {{end}}
+  {{if $ENABLE_JOBS}}
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 1
+    tuningSet: RandomizedScalingTimeLimited
+    objectBundle:
+    - basename: small-job
+      objectTemplatePath: job.yaml
+      templateFillMap:
+        ReplicasMin: {{MultiplyInt $SMALL_GROUP_SIZE 0.5}}
+        ReplicasMax: {{MultiplyInt $SMALL_GROUP_SIZE 1.5}}
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 1
+    tuningSet: RandomizedScalingTimeLimited
+    objectBundle:
+    - basename: medium-job
+      objectTemplatePath: job.yaml
+      templateFillMap:
+        ReplicasMin: {{MultiplyInt $MEDIUM_GROUP_SIZE 0.5}}
+        ReplicasMax: {{MultiplyInt $MEDIUM_GROUP_SIZE 1.5}}
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 1
+    tuningSet: RandomizedScalingTimeLimited
+    objectBundle:
+    - basename: big-job
+      objectTemplatePath: job.yaml
+      templateFillMap:
+        ReplicasMin: {{MultiplyInt $BIG_GROUP_SIZE 0.5}}
+        ReplicasMax: {{MultiplyInt $BIG_GROUP_SIZE 1.5}}
+  {{end}}
+
+- name: Waiting for objects to become scaled
+  measurements:
+  - Identifier: WaitForRunningDeployments
+    Method: WaitForControlledPodsRunning
+    Params:
+      action: gather
+  {{if $ENABLE_STATEFULSETS}}
+  - Identifier: WaitForRunningStatefulSets
+    Method: WaitForControlledPodsRunning
+    Params:
+      action: gather
+  {{end}}
+  {{if $ENABLE_DAEMONSETS}}
+  - Identifier: WaitForRunningDaemonSets
+    Method: WaitForControlledPodsRunning
+    Params:
+      action: gather
+  {{end}}
+  {{if $ENABLE_JOBS}}
+  - Identifier: WaitForRunningJobs
+    Method: WaitForControlledPodsRunning
+    Params:
+      action: gather
+  {{end}}
+
+- name: Deleting objects
+  phases:
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: RandomizedSaturationTimeLimited
+    objectBundle:
+    - basename: big-deployment
+      objectTemplatePath: deployment.yaml
+    {{if $ENABLE_CONFIGMAPS}}
+    - basename: big-deployment
+      objectTemplatePath: configmap.yaml
+    {{end}}
+    {{if $ENABLE_SECRETS}}
+    - basename: big-deployment
+      objectTemplatePath: secret.yaml
+    {{end}}
+    {{if $ENABLE_NETWORKPOLICIES}}
+    - basename: big-deployment
+      objectTemplatePath: networkpolicy.yaml
+    {{end}}
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: RandomizedSaturationTimeLimited
+    objectBundle:
+    - basename: medium-deployment
+      objectTemplatePath: deployment.yaml
+    {{if $ENABLE_CONFIGMAPS}}
+    - basename: medium-deployment
+      objectTemplatePath: configmap.yaml
+    {{end}}
+    {{if $ENABLE_SECRETS}}
+    - basename: medium-deployment
+      objectTemplatePath: secret.yaml
+    {{end}}
+    {{if $ENABLE_NETWORKPOLICIES}}
+    - basename: medium-deployment
+      objectTemplatePath: networkpolicy.yaml
+    {{end}}
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: RandomizedSaturationTimeLimited
+    objectBundle:
+    - basename: small-deployment
+      objectTemplatePath: deployment.yaml
+    {{if $ENABLE_CONFIGMAPS}}
+    - basename: small-deployment
+      objectTemplatePath: configmap.yaml
+    {{end}}
+    {{if $ENABLE_SECRETS}}
+    - basename: small-deployment
+      objectTemplatePath: secret.yaml
+    {{end}}
+    {{if $ENABLE_NETWORKPOLICIES}}
+    - basename: small-deployment
+      objectTemplatePath: networkpolicy.yaml
+    {{end}}
+  {{if $ENABLE_STATEFULSETS}}
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: RandomizedSaturationTimeLimited
+    objectBundle:
+    - basename: small-statefulset
+      objectTemplatePath: statefulset.yaml
+    - basename: small-statefulset
+      objectTemplatePath: statefulset_service.yaml
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: RandomizedSaturationTimeLimited
+    objectBundle:
+    - basename: medium-statefulset
+      objectTemplatePath: statefulset.yaml
+    - basename: medium-statefulset
+      objectTemplatePath: statefulset_service.yaml
+  {{end}}
+  {{if $ENABLE_DAEMONSETS}}
+  - namespaceRange:
+      min: 1
+      max: 1
+    replicasPerNamespace: 0
+    tuningSet: RandomizedSaturationTimeLimited
+    objectBundle:
+    - basename: daemonset
+      objectTemplatePath: daemonset.yaml
+  {{end}}
+  {{if $ENABLE_JOBS}}
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: RandomizedSaturationTimeLimited
+    objectBundle:
+    - basename: small-job
+      objectTemplatePath: job.yaml
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: RandomizedSaturationTimeLimited
+    objectBundle:
+    - basename: medium-job
+      objectTemplatePath: job.yaml
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: RandomizedSaturationTimeLimited
+    objectBundle:
+    - basename: big-job
+      objectTemplatePath: job.yaml
+  {{end}}
+  # If both StatefulSets and PVs were enabled we need to delete PVs manually.
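+  # (Deleting a StatefulSet does not delete the PVCs created from its
+  # volumeClaimTemplates, so the phases below select the leftover claims by
+  # label and delete them explicitly with replicasPerNamespace: 0.)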
+  {{if and $ENABLE_STATEFULSETS $ENABLE_PVS}}
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: RandomizedSaturationTimeLimited
+    objectBundle:
+    {{range $ssIndex := Seq $SMALL_STATEFUL_SETS_PER_NAMESPACE}}
+    - basename: pv-small-statefulset-{{$ssIndex}}
+      objectTemplatePath: pvc.yaml
+      listUnknownObjectOptions:
+        labelSelector:
+          matchLabels:
+            name: small-statefulset-{{$ssIndex}}
+    {{end}}
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: RandomizedSaturationTimeLimited
+    objectBundle:
+    {{range $ssIndex := Seq $MEDIUM_STATEFUL_SETS_PER_NAMESPACE}}
+    - basename: pv-medium-statefulset-{{$ssIndex}}
+      objectTemplatePath: pvc.yaml
+      listUnknownObjectOptions:
+        labelSelector:
+          matchLabels:
+            name: medium-statefulset-{{$ssIndex}}
+    {{end}}
+  {{end}}
+
+- name: Waiting for pods to be deleted
+  measurements:
+  - Identifier: WaitForRunningDeployments
+    Method: WaitForControlledPodsRunning
+    Params:
+      action: gather
+  {{if $ENABLE_STATEFULSETS}}
+  - Identifier: WaitForRunningStatefulSets
+    Method: WaitForControlledPodsRunning
+    Params:
+      action: gather
+  {{end}}
+  {{if $ENABLE_DAEMONSETS}}
+  - Identifier: WaitForRunningDaemonSets
+    Method: WaitForControlledPodsRunning
+    Params:
+      action: gather
+  {{end}}
+  {{if $ENABLE_JOBS}}
+  - Identifier: WaitForRunningJobs
+    Method: WaitForControlledPodsRunning
+    Params:
+      action: gather
+  {{end}}
+  {{if and $ENABLE_STATEFULSETS $ENABLE_PVS}}
+  - Identifier: WaitForPVCsToBeDeleted
+    Method: WaitForBoundPVCs
+    Params:
+      desiredPVCCount: 0
+      labelSelector: group = load
+      timeout: 15m
+  {{end}}
+
+{{if $ENABLE_DAEMONSETS}}
+- name: Deleting PriorityClass for DaemonSets
+  phases:
+  - replicasPerNamespace: 0
+    tuningSet: Sequence
+    objectBundle:
+    - basename: daemonset-priorityclass
+      objectTemplatePath: daemonset-priorityclass.yaml
+{{end}}
+
+- name: Deleting SVCs
+  phases:
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: Sequence
+    objectBundle:
+    - basename: big-service
+      objectTemplatePath: service.yaml
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: Sequence
+    objectBundle:
+    - basename: medium-service
+      objectTemplatePath: service.yaml
+  - namespaceRange:
+      min: 1
+      max: {{$namespaces}}
+    replicasPerNamespace: 0
+    tuningSet: Sequence
+    objectBundle:
+    - basename: small-service
+      objectTemplatePath: service.yaml
+
+- name: Collecting measurements
+  measurements:
+  - Identifier: APIResponsiveness
+    Method: APIResponsiveness
+    Params:
+      action: gather
+  - Identifier: APIResponsivenessPrometheusSimple
+    Method: APIResponsivenessPrometheus
+    Params:
+      action: gather
+      {{if $ENABLE_PROMETHEUS_API_RESPONSIVENESS}}
+      enableViolations: true
+      {{end}}
+      useSimpleLatencyQuery: true
+      summaryName: APIResponsivenessPrometheus_simple
+  {{if not $USE_SIMPLE_LATENCY_QUERY}}
+  - Identifier: APIResponsivenessPrometheus
+    Method: APIResponsivenessPrometheus
+    Params:
+      action: gather
+  {{end}}
+  - Identifier: PodStartupLatency
+    Method: PodStartupLatency
+    Params:
+      action: gather
+  - Identifier: InClusterNetworkLatency
+    Method: InClusterNetworkLatency
+    Params:
+      action: gather
+  - Identifier: DnsLookupLatency
+    Method: DnsLookupLatency
+    Params:
+      action: gather
+  {{if $PROMETHEUS_SCRAPE_KUBE_PROXY}}
+  - Identifier: NetworkProgrammingLatency
+    Method: NetworkProgrammingLatency
+    Params:
+      action: gather
+  {{end}}
+  - Identifier: TestMetrics
+    Method: TestMetrics
+    Params:
+      action: gather
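+      # "gather" closes out a measurement that was started earlier and emits its
+      # summary to the test report; the value below comes from the
+      # ENABLE_SYSTEM_POD_METRICS template variable defined earlier in this file.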
+      systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}}
diff --git a/tests/perf/tests/load/configmap.yaml b/tests/perf/tests/load/configmap.yaml
new file mode 100644
index 0000000000..b249a39143
--- /dev/null
+++ b/tests/perf/tests/load/configmap.yaml
@@ -0,0 +1,9 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{.Name}}
+data:
+  data.yaml: |-
+    a: 1
+    b: 2
+    c: 3
diff --git a/tests/perf/tests/load/daemonset-priorityclass.yaml b/tests/perf/tests/load/daemonset-priorityclass.yaml
new file mode 100644
index 0000000000..e264a740d5
--- /dev/null
+++ b/tests/perf/tests/load/daemonset-priorityclass.yaml
@@ -0,0 +1,9 @@
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: {{.Name}}
+value: 1000000
+globalDefault: false
+description: "Designated priority class to be used for DaemonSet pods. This is
+  to make sure they have higher priority than other test pods and there is always
+  place for them on each node, see kubernetes/kubernetes#82818."
diff --git a/tests/perf/tests/load/daemonset.yaml b/tests/perf/tests/load/daemonset.yaml
new file mode 100644
index 0000000000..68acfefaec
--- /dev/null
+++ b/tests/perf/tests/load/daemonset.yaml
@@ -0,0 +1,41 @@
+{{$Image := DefaultParam .Image "k8s.gcr.io/pause:3.1"}}
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: {{.Name}}
+  labels:
+    group: load
+spec:
+  updateStrategy:
+    rollingUpdate:
+      maxUnavailable: {{MaxInt 10 (DivideInt .Nodes 20)}} # 5% of nodes, but not less than 10
+  selector:
+    matchLabels:
+      name: {{.Name}}
+  template:
+    metadata:
+      labels:
+        group: load
+        name: {{.Name}}
+    spec:
+      containers:
+      - name: {{.Name}}
+        image: {{$Image}}
+        resources:
+          requests:
+            cpu: 10m
+            memory: "10M"
+      priorityClassName: daemonset-priorityclass-0 # Name is autogenerated, hence the -0 suffix.
+      terminationGracePeriodSeconds: 1
+      # Add not-ready/unreachable tolerations for 15 minutes so that node
+      # failure doesn't trigger pod deletion.
+      tolerations:
+      - key: "node.kubernetes.io/not-ready"
+        operator: "Exists"
+        effect: "NoExecute"
+        tolerationSeconds: 900
+      - key: "node.kubernetes.io/unreachable"
+        operator: "Exists"
+        effect: "NoExecute"
+        tolerationSeconds: 900
diff --git a/tests/perf/tests/load/deployment.yaml b/tests/perf/tests/load/deployment.yaml
new file mode 100644
index 0000000000..8a2f3a798b
--- /dev/null
+++ b/tests/perf/tests/load/deployment.yaml
@@ -0,0 +1,63 @@
+{{$EnableConfigMaps := DefaultParam .ENABLE_CONFIGMAPS false}}
+{{$EnableSecrets := DefaultParam .ENABLE_SECRETS false}}
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{.Name}}
+  labels:
+    group: load
+    svc: {{.SvcName}}-{{.Index}}
+spec:
+  replicas: {{RandIntRange .ReplicasMin .ReplicasMax}}
+  selector:
+    matchLabels:
+      name: {{.Name}}
+  template:
+    metadata:
+      labels:
+        group: load
+        name: {{.Name}}
+        svc: {{.SvcName}}-{{.Index}}
+    spec:
+      containers:
+      - image: k8s.gcr.io/pause:3.1
+        name: {{.Name}}
+        resources:
+          requests:
+            cpu: 10m
+            memory: "10M"
+        volumeMounts:
+          {{if and $EnableConfigMaps (eq (Mod .Index 20) 0 19) }} # .Index % 20 in {0,19} - 10% of deployments will have a ConfigMap
+          - name: configmap
+            mountPath: /var/configmap
+          {{end}}
+          {{if and $EnableSecrets (eq (Mod .Index 20) 10 19) }} # .Index % 20 in {10,19} - 10% of deployments will have a Secret
+          - name: secret
+            mountPath: /var/secret
+          {{end}}
+      dnsPolicy: Default
+      terminationGracePeriodSeconds: 1
+      # Add not-ready/unreachable tolerations for 15 minutes so that node
+      # failure doesn't trigger pod deletion.
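+      # (The 900s below corresponds to those 15 minutes; without an explicit
+      # toleration, the DefaultTolerationSeconds admission plugin would normally
+      # add one of only 300s, so pods would be evicted much sooner.)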
+      tolerations:
+      - key: "node.kubernetes.io/not-ready"
+        operator: "Exists"
+        effect: "NoExecute"
+        tolerationSeconds: 900
+      - key: "node.kubernetes.io/unreachable"
+        operator: "Exists"
+        effect: "NoExecute"
+        tolerationSeconds: 900
+      volumes:
+        {{if and $EnableConfigMaps (eq (Mod .Index 20) 0 19) }} # .Index % 20 in {0,19} - 10% of deployments will have a ConfigMap
+        - name: configmap
+          configMap:
+            name: {{.BaseName}}-{{.Index}}
+        {{end}}
+        {{if and $EnableSecrets (eq (Mod .Index 20) 10 19) }} # .Index % 20 in {10,19} - 10% of deployments will have a Secret
+        - name: secret
+          secret:
+            secretName: {{.BaseName}}-{{.Index}}
+        {{end}}
+
diff --git a/tests/perf/tests/load/job.yaml b/tests/perf/tests/load/job.yaml
new file mode 100644
index 0000000000..f28e1b3ee2
--- /dev/null
+++ b/tests/perf/tests/load/job.yaml
@@ -0,0 +1,39 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{.Name}}
+  labels:
+    group: load
+spec:
+  manualSelector: true
+  parallelism: {{RandIntRange .ReplicasMin .ReplicasMax}}
+  selector:
+    matchLabels:
+      name: {{.Name}}
+  template:
+    metadata:
+      labels:
+        group: load
+        name: {{.Name}}
+    spec:
+      containers:
+      - name: {{.Name}}
+        # TODO(#799): We should test the "run-to-completion" workflow and hence shouldn't use pause pods.
+        image: k8s.gcr.io/pause:3.1
+        resources:
+          requests:
+            cpu: 10m
+            memory: "10M"
+      restartPolicy: Never
+      terminationGracePeriodSeconds: 1
+      # Add not-ready/unreachable tolerations for 15 minutes so that node
+      # failure doesn't trigger pod deletion.
+      tolerations:
+      - key: "node.kubernetes.io/not-ready"
+        operator: "Exists"
+        effect: "NoExecute"
+        tolerationSeconds: 900
+      - key: "node.kubernetes.io/unreachable"
+        operator: "Exists"
+        effect: "NoExecute"
+        tolerationSeconds: 900
diff --git a/tests/perf/tests/load/networkpolicy.yaml b/tests/perf/tests/load/networkpolicy.yaml
new file mode 100644
index 0000000000..1aae9b23c0
--- /dev/null
+++ b/tests/perf/tests/load/networkpolicy.yaml
@@ -0,0 +1,19 @@
+{{if eq (Mod .Index 10) 0}} # Create for only 10% of deployments
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: {{.Name}}
+spec:
+  podSelector:
+    matchLabels:
+      name: {{.BaseName}}-{{.Index}}
+  policyTypes:
+  - Egress
+  egress:
+  - to:
+    - ipBlock:
+        cidr: 10.0.0.0/24
+    ports:
+    - protocol: TCP
+      port: 8080
+{{end}}
diff --git a/tests/perf/tests/load/pvc.yaml b/tests/perf/tests/load/pvc.yaml
new file mode 100644
index 0000000000..d19d23053e
--- /dev/null
+++ b/tests/perf/tests/load/pvc.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{.Name}}
diff --git a/tests/perf/tests/load/secret.yaml b/tests/perf/tests/load/secret.yaml
new file mode 100644
index 0000000000..67134b355f
--- /dev/null
+++ b/tests/perf/tests/load/secret.yaml
@@ -0,0 +1,7 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: {{.Name}}
+type: Opaque
+data:
+  password: c2NhbGFiaWxpdHkK
diff --git a/tests/perf/tests/load/service.yaml b/tests/perf/tests/load/service.yaml
new file mode 100644
index 0000000000..ed6a22c8cf
--- /dev/null
+++ b/tests/perf/tests/load/service.yaml
@@ -0,0 +1,16 @@
+{{$SetServiceProxyLabel := DefaultParam .SetServiceProxyLabel false}}
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{.Name}}
+{{if and $SetServiceProxyLabel (eq (Mod .Index 2) 0)}}
+  labels:
+    service.kubernetes.io/service-proxy-name: foo
+{{end}}
+spec:
+  selector:
+    svc: {{.Name}}
+  ports:
+  - port: 80
+    targetPort: 80
diff --git a/tests/perf/tests/load/statefulset.yaml b/tests/perf/tests/load/statefulset.yaml
new file mode 100644
index 0000000000..43157b7928
--- /dev/null
+++ b/tests/perf/tests/load/statefulset.yaml
@@ -0,0 +1,61 @@
+{{$EnablePVs := DefaultParam .ENABLE_PVS false}}
+
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: {{.Name}}
+  labels:
+    group: load
+spec:
+  podManagementPolicy: Parallel
+  selector:
+    matchLabels:
+      group: load
+      name: {{.Name}}
+  serviceName: {{.Name}}
+  replicas: {{RandIntRange .ReplicasMin .ReplicasMax}}
+  template:
+    metadata:
+      labels:
+        group: load
+        name: {{.Name}}
+    spec:
+      containers:
+      - name: {{.Name}}
+        image: k8s.gcr.io/pause:3.1
+        ports:
+        - containerPort: 80
+          name: web
+        resources:
+          requests:
+            cpu: 10m
+            memory: "10M"
+        {{if $EnablePVs}}
+        volumeMounts:
+        - name: pv
+          mountPath: /var/pv
+        {{end}}
+      terminationGracePeriodSeconds: 1
+      # Add not-ready/unreachable tolerations for 15 minutes so that node
+      # failure doesn't trigger pod deletion.
+      tolerations:
+      - key: "node.kubernetes.io/not-ready"
+        operator: "Exists"
+        effect: "NoExecute"
+        tolerationSeconds: 900
+      - key: "node.kubernetes.io/unreachable"
+        operator: "Exists"
+        effect: "NoExecute"
+        tolerationSeconds: 900
+  {{if $EnablePVs}}
+  # NOTE: PVs created this way should be cleaned up manually, as deleting the StatefulSet doesn't automatically delete PVs.
+  # To avoid deleting all the PVs at once during namespace deletion, they should be deleted explicitly via a Phase.
+  volumeClaimTemplates:
+  - metadata:
+      name: pv
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 100Mi
+  {{end}}
diff --git a/tests/perf/tests/load/statefulset_service.yaml b/tests/perf/tests/load/statefulset_service.yaml
new file mode 100644
index 0000000000..5e16a47a19
--- /dev/null
+++ b/tests/perf/tests/load/statefulset_service.yaml
@@ -0,0 +1,10 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{.Name}}
+  labels:
+    name: {{.Name}}
+spec:
+  clusterIP: None
+  selector:
+    name: {{.Name}}
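+# Note: clusterIP: None makes this a headless Service; statefulset.yaml sets
+# serviceName to the same generated name, so it acts as the StatefulSet's
+# governing Service for per-pod DNS records.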