Merge branch 'master' into add-cilium
commit
4262df1669
|
@@ -17,7 +17,8 @@ defaults:
|
||||||
scope:
|
scope:
|
||||||
path: ""
|
path: ""
|
||||||
values:
|
values:
|
||||||
version: "v1.6.0"
|
fullversion: "v1.6.0"
|
||||||
|
version: "v1.6"
|
||||||
githubbranch: "master"
|
githubbranch: "master"
|
||||||
docsbranch: "master"
|
docsbranch: "master"
|
||||||
-
|
-
|
||||||
|
|
|
@@ -10,6 +10,8 @@ toc:
|
||||||
- title: Working with Kubernetes Objects
|
- title: Working with Kubernetes Objects
|
||||||
section:
|
section:
|
||||||
- docs/concepts/overview/working-with-objects/kubernetes-objects.md
|
- docs/concepts/overview/working-with-objects/kubernetes-objects.md
|
||||||
|
- docs/concepts/overview/working-with-objects/names.md
|
||||||
|
- docs/concepts/overview/working-with-objects/namespaces.md
|
||||||
- docs/concepts/overview/working-with-objects/labels.md
|
- docs/concepts/overview/working-with-objects/labels.md
|
||||||
- docs/concepts/overview/working-with-objects/annotations.md
|
- docs/concepts/overview/working-with-objects/annotations.md
|
||||||
- docs/concepts/overview/kubernetes-api.md
|
- docs/concepts/overview/kubernetes-api.md
|
||||||
|
@@ -24,17 +26,26 @@ toc:
|
||||||
- title: Pods
|
- title: Pods
|
||||||
section:
|
section:
|
||||||
- docs/concepts/workloads/pods/pod-overview.md
|
- docs/concepts/workloads/pods/pod-overview.md
|
||||||
|
- docs/concepts/workloads/pods/pod.md
|
||||||
- docs/concepts/workloads/pods/pod-lifecycle.md
|
- docs/concepts/workloads/pods/pod-lifecycle.md
|
||||||
- docs/concepts/workloads/pods/init-containers.md
|
- docs/concepts/workloads/pods/init-containers.md
|
||||||
- title: Controllers
|
- title: Controllers
|
||||||
section:
|
section:
|
||||||
|
- docs/concepts/workloads/controllers/replicaset.md
|
||||||
|
- docs/concepts/workloads/controllers/replicationcontroller.md
|
||||||
|
- docs/concepts/workloads/controllers/deployment.md
|
||||||
- docs/concepts/workloads/controllers/statefulset.md
|
- docs/concepts/workloads/controllers/statefulset.md
|
||||||
- docs/concepts/workloads/controllers/petset.md
|
- docs/concepts/workloads/controllers/petset.md
|
||||||
|
- docs/concepts/workloads/controllers/daemonset.md
|
||||||
- docs/concepts/workloads/controllers/garbage-collection.md
|
- docs/concepts/workloads/controllers/garbage-collection.md
|
||||||
- title: Jobs
|
- title: Jobs
|
||||||
section:
|
section:
|
||||||
- docs/concepts/jobs/run-to-completion-finite-workloads.md
|
- docs/concepts/jobs/run-to-completion-finite-workloads.md
|
||||||
|
|
||||||
|
- title: Nodes
|
||||||
|
section:
|
||||||
|
- docs/concepts/nodes/node.md
|
||||||
|
|
||||||
- title: Cluster Administration
|
- title: Cluster Administration
|
||||||
section:
|
section:
|
||||||
- docs/concepts/cluster-administration/manage-deployment.md
|
- docs/concepts/cluster-administration/manage-deployment.md
|
||||||
|
@@ -44,6 +55,7 @@ toc:
|
||||||
- docs/concepts/cluster-administration/audit.md
|
- docs/concepts/cluster-administration/audit.md
|
||||||
- docs/concepts/cluster-administration/resource-usage-monitoring.md
|
- docs/concepts/cluster-administration/resource-usage-monitoring.md
|
||||||
- docs/concepts/cluster-administration/out-of-resource.md
|
- docs/concepts/cluster-administration/out-of-resource.md
|
||||||
|
- docs/concepts/cluster-administration/cluster-management.md
|
||||||
- docs/concepts/cluster-administration/multiple-clusters.md
|
- docs/concepts/cluster-administration/multiple-clusters.md
|
||||||
- docs/concepts/cluster-administration/federation.md
|
- docs/concepts/cluster-administration/federation.md
|
||||||
- docs/concepts/cluster-administration/federation-service-discovery.md
|
- docs/concepts/cluster-administration/federation-service-discovery.md
|
||||||
|
@@ -57,11 +69,14 @@ toc:
|
||||||
- title: Storage
|
- title: Storage
|
||||||
section:
|
section:
|
||||||
- docs/concepts/storage/volumes.md
|
- docs/concepts/storage/volumes.md
|
||||||
|
- docs/concepts/storage/persistent-volumes.md
|
||||||
|
|
||||||
- title: Services, Load Balancing, and Networking
|
- title: Services, Load Balancing, and Networking
|
||||||
section:
|
section:
|
||||||
- docs/concepts/services-networking/dns-pod-service.md
|
- docs/concepts/services-networking/dns-pod-service.md
|
||||||
- docs/concepts/services-networking/connect-applications-service.md
|
- docs/concepts/services-networking/connect-applications-service.md
|
||||||
|
- docs/concepts/services-networking/ingress.md
|
||||||
|
- docs/concepts/services-networking/networkpolicies.md
|
||||||
|
|
||||||
- title: Configuration
|
- title: Configuration
|
||||||
section:
|
section:
|
||||||
|
@@ -69,6 +84,7 @@ toc:
|
||||||
- docs/concepts/configuration/container-command-args.md
|
- docs/concepts/configuration/container-command-args.md
|
||||||
- docs/concepts/configuration/manage-compute-resources-container.md
|
- docs/concepts/configuration/manage-compute-resources-container.md
|
||||||
- docs/concepts/configuration/assign-pod-node.md
|
- docs/concepts/configuration/assign-pod-node.md
|
||||||
|
- docs/concepts/configuration/secret.md
|
||||||
|
|
||||||
- title: Policies
|
- title: Policies
|
||||||
section:
|
section:
|
||||||
|
|
|
@@ -1,10 +1,9 @@
|
||||||
bigheader: "Kubernetes Documentation"
|
bigheader: "Documentation Home"
|
||||||
abstract: "Documentation for using and learning about Kubernetes."
|
abstract: "Documentation for using and learning about Kubernetes."
|
||||||
toc:
|
toc:
|
||||||
- docs/home/index.md
|
- docs/home/index.md
|
||||||
|
|
||||||
- title: Kubernetes Documentation Home
|
- docs/home/index.md
|
||||||
path: index.md
|
|
||||||
|
|
||||||
- title: Release Notes
|
- title: Release Notes
|
||||||
path: https://github.com/kubernetes/kubernetes/releases/
|
path: https://github.com/kubernetes/kubernetes/releases/
|
||||||
|
|
|
@@ -41,7 +41,6 @@ toc:
|
||||||
- docs/api-reference/v1/operations.html
|
- docs/api-reference/v1/operations.html
|
||||||
- docs/api-reference/v1/definitions.html
|
- docs/api-reference/v1/definitions.html
|
||||||
- docs/api-reference/labels-annotations-taints.md
|
- docs/api-reference/labels-annotations-taints.md
|
||||||
- kubernetes/third_party/swagger-ui/index.md
|
|
||||||
|
|
||||||
- title: Autoscaling API
|
- title: Autoscaling API
|
||||||
section:
|
section:
|
||||||
|
|
|
@@ -5,7 +5,7 @@
|
||||||
<td>
|
<td>
|
||||||
<p><b>NOTICE</b></p>
|
<p><b>NOTICE</b></p>
|
||||||
<p>As of March 14, 2017, the Kubernetes SIG-Docs-Maintainers group have begun migration of the User Guide content as announced previously to the <a href="https://github.com/kubernetes/community/tree/master/sig-docs">SIG Docs community</a> through the <a href="https://groups.google.com/forum/#!forum/kubernetes-sig-docs">kubernetes-sig-docs</a> group and <a href="https://kubernetes.slack.com/messages/sig-docs/">kubernetes.slack.com #sig-docs</a> channel.</p>
|
<p>As of March 14, 2017, the Kubernetes SIG-Docs-Maintainers group have begun migration of the User Guide content as announced previously to the <a href="https://github.com/kubernetes/community/tree/master/sig-docs">SIG Docs community</a> through the <a href="https://groups.google.com/forum/#!forum/kubernetes-sig-docs">kubernetes-sig-docs</a> group and <a href="https://kubernetes.slack.com/messages/sig-docs/">kubernetes.slack.com #sig-docs</a> channel.</p>
|
||||||
<p>The user guides within this section are being refactored into topics within Tutorials, Tasks, and Concepts. Anything that has been moved will have a notice placed in its previous location as well as a link to its new location. The reorganization implements the table of contents (ToC) outlined in the <a href="https://docs.google.com/a/google.com/document/d/18hRCIorVarExB2eBVHTUR6eEJ2VVk5xq1iBmkQv8O6I/edit?usp=sharing">kubernetes-docs-toc</a> document and should improve the documentation's findability and readability for a wider range of audiences.</p>
|
<p>The user guides within this section are being refactored into topics within Tutorials, Tasks, and Concepts. Anything that has been moved will have a notice placed in its previous location as well as a link to its new location. The reorganization implements a new table of contents and should improve the documentation's findability and readability for a wider range of audiences.</p>
|
||||||
<p>For any questions, please contact: <a href="mailto:kubernetes-sig-docs@googlegroups.com">kubernetes-sig-docs@googlegroups.com</a></p>
|
<p>For any questions, please contact: <a href="mailto:kubernetes-sig-docs@googlegroups.com">kubernetes-sig-docs@googlegroups.com</a></p>
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
|
|
@@ -5,197 +5,6 @@ assignees:
|
||||||
title: Cluster Management Guide
|
title: Cluster Management Guide
|
||||||
---
|
---
|
||||||
|
|
||||||
* TOC
|
{% include user-guide-content-moved.md %}
|
||||||
{:toc}
|
|
||||||
|
|
||||||
This document describes several topics related to the lifecycle of a cluster: creating a new cluster,
|
[Cluster Management](/docs/concepts/cluster-administration/cluster-management/)
|
||||||
upgrading your cluster's
|
|
||||||
master and worker nodes, performing node maintenance (e.g. kernel upgrades), and upgrading the Kubernetes API version of a
|
|
||||||
running cluster.
|
|
||||||
|
|
||||||
## Creating and configuring a Cluster
|
|
||||||
|
|
||||||
To install Kubernetes on a set of machines, consult one of the existing [Getting Started guides](/docs/getting-started-guides/) depending on your environment.
|
|
||||||
|
|
||||||
## Upgrading a cluster
|
|
||||||
|
|
||||||
The current state of cluster upgrades is provider dependent, and some releases may require special care when upgrading. It is recommended that administrators consult both the [release notes](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG.md), as well as the version specific upgrade notes prior to upgrading their clusters.
|
|
||||||
|
|
||||||
* [Upgrading to 1.6](/docs/admin/upgrade-1-6)
|
|
||||||
|
|
||||||
### Upgrading Google Compute Engine clusters
|
|
||||||
|
|
||||||
Google Compute Engine Open Source (GCE-OSS) supports master upgrades by deleting and
|
|
||||||
recreating the master, while maintaining the same Persistent Disk (PD) to ensure that data is retained across the
|
|
||||||
upgrade.
|
|
||||||
|
|
||||||
Node upgrades for GCE use a [Managed Instance Group](https://cloud.google.com/compute/docs/instance-groups/); each node
|
|
||||||
is sequentially destroyed and then recreated with new software. Any Pods that are running on that node need to be
|
|
||||||
controlled by a Replication Controller, or manually re-created after the roll out.
|
|
||||||
|
|
||||||
Upgrades on open source Google Compute Engine (GCE) clusters are controlled by the `cluster/gce/upgrade.sh` script.
|
|
||||||
|
|
||||||
Get its usage by running `cluster/gce/upgrade.sh -h`.
|
|
||||||
|
|
||||||
For example, to upgrade just your master to a specific version (v1.0.2):
|
|
||||||
|
|
||||||
```shell
|
|
||||||
cluster/gce/upgrade.sh -M v1.0.2
|
|
||||||
```
|
|
||||||
|
|
||||||
Alternatively, to upgrade your entire cluster to the latest stable release:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
cluster/gce/upgrade.sh release/stable
|
|
||||||
```
|
|
||||||
|
|
||||||
### Upgrading Google Container Engine (GKE) clusters
|
|
||||||
|
|
||||||
Google Container Engine automatically updates master components (e.g. `kube-apiserver`, `kube-scheduler`) to the latest
|
|
||||||
version. It also handles upgrading the operating system and other components that the master runs on.
|
|
||||||
|
|
||||||
The node upgrade process is user-initiated and is described in the [GKE documentation.](https://cloud.google.com/container-engine/docs/clusters/upgrade)
|
|
||||||
|
|
||||||
### Upgrading clusters on other platforms
|
|
||||||
|
|
||||||
Different providers, and tools, will manage upgrades differently. It is recommended that you consult their main documentation regarding upgrades.
|
|
||||||
|
|
||||||
* [kops](https://github.com/kubernetes/kops)
|
|
||||||
* [kargo](https://github.com/kubernetes-incubator/kargo)
|
|
||||||
* [CoreOS Tectonic](https://coreos.com/tectonic/docs/latest/admin/upgrade.html)
|
|
||||||
* ...
|
|
||||||
|
|
||||||
## Resizing a cluster
|
|
||||||
|
|
||||||
If your cluster runs short on resources you can easily add more machines to it if your cluster is running in [Node self-registration mode](/docs/admin/node/#self-registration-of-nodes).
|
|
||||||
If you're using GCE or GKE, you can do this by resizing the Instance Group that manages your Nodes, either by modifying the number of instances on the `Compute > Compute Engine > Instance groups > your group > Edit group` [Google Cloud Console page](https://console.developers.google.com) or by using the gcloud CLI:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
gcloud compute instance-groups managed resize kubernetes-minion-group --size 42 --zone $ZONE
|
|
||||||
```
|
|
||||||
|
|
||||||
The Instance Group will take care of putting the appropriate image on the new machines and starting them, while the Kubelet will register its Node with the API server to make it available for scheduling. If you scale the instance group down, the system will randomly choose Nodes to kill.
|
|
||||||
|
|
||||||
In other environments you may need to configure the machine yourself and tell the Kubelet on which machine the API server is running.
|
|
||||||
|
|
||||||
### Cluster autoscaling
|
|
||||||
|
|
||||||
If you are using GCE or GKE, you can configure your cluster so that it is automatically rescaled based on
|
|
||||||
pod needs.
|
|
||||||
|
|
||||||
As described in [Compute Resource](/docs/user-guide/compute-resources/), users can request how much CPU and memory is allocated to pods.
|
|
||||||
This information is used by the Kubernetes scheduler to find a place to run the pod. If there is
|
|
||||||
no node that has enough free capacity (or doesn't match other pod requirements) then the pod has
|
|
||||||
to wait until some pods are terminated or a new node is added.
|
|
||||||
|
|
||||||
Cluster autoscaler looks for the pods that cannot be scheduled and checks if adding a new node, similar
|
|
||||||
to the others in the cluster, would help. If so, it resizes the cluster to accommodate the waiting pods.
|
|
||||||
|
|
||||||
Cluster autoscaler also scales down the cluster if it notices that some node is not needed anymore for
|
|
||||||
an extended period of time (currently 10 minutes, though this may change in the future).
|
|
||||||
|
|
||||||
Cluster autoscaler is configured per instance group (GCE) or node pool (GKE).
|
|
||||||
|
|
||||||
If you are using GCE, you can enable cluster autoscaler while creating a cluster with the kube-up.sh script.
|
|
||||||
To configure cluster autoscaler you have to set three environment variables:
|
|
||||||
|
|
||||||
* `KUBE_ENABLE_CLUSTER_AUTOSCALER` - enables cluster autoscaler if set to true.
|
|
||||||
* `KUBE_AUTOSCALER_MIN_NODES` - minimum number of nodes in the cluster.
|
|
||||||
* `KUBE_AUTOSCALER_MAX_NODES` - maximum number of nodes in the cluster.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
KUBE_ENABLE_CLUSTER_AUTOSCALER=true KUBE_AUTOSCALER_MIN_NODES=3 KUBE_AUTOSCALER_MAX_NODES=10 NUM_NODES=5 ./cluster/kube-up.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
On GKE you configure cluster autoscaler either on cluster creation or update or when creating a particular node pool
|
|
||||||
(which you want to be autoscaled) by passing the flags `--enable-autoscaling`, `--min-nodes`, and `--max-nodes`
|
|
||||||
to the corresponding `gcloud` commands.
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
gcloud container clusters create mytestcluster --zone=us-central1-b --enable-autoscaling --min-nodes=3 --max-nodes=10 --num-nodes=5
|
|
||||||
```
|
|
||||||
|
|
||||||
```shell
|
|
||||||
gcloud container clusters update mytestcluster --enable-autoscaling --min-nodes=1 --max-nodes=15
|
|
||||||
```
|
|
||||||
|
|
||||||
**Cluster autoscaler expects that nodes have not been manually modified (e.g. by adding labels via kubectl) as those properties would not be propagated to the new nodes within the same instance group.**
|
|
||||||
|
|
||||||
## Maintenance on a Node
|
|
||||||
|
|
||||||
If you need to reboot a node (such as for a kernel upgrade, libc upgrade, hardware repair, etc.), and the downtime is
|
|
||||||
brief, then when the Kubelet restarts, it will attempt to restart the pods scheduled to it. If the reboot takes longer
|
|
||||||
(the default time is 5 minutes, controlled by `--pod-eviction-timeout` on the controller-manager),
|
|
||||||
then the node controller will terminate the pods that are bound to the unavailable node. If there is a corresponding
|
|
||||||
replica set (or replication controller), then a new copy of the pod will be started on a different node. So, in the case where all
|
|
||||||
pods are replicated, upgrades can be done without special coordination, assuming that not all nodes will go down at the same time.
|
|
||||||
|
|
||||||
If you want more control over the upgrading process, you may use the following workflow:
|
|
||||||
|
|
||||||
Use `kubectl drain` to gracefully terminate all pods on the node while marking the node as unschedulable:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
kubectl drain $NODENAME
|
|
||||||
```
|
|
||||||
|
|
||||||
This keeps new pods from landing on the node while you are trying to get them off.
|
|
||||||
|
|
||||||
For pods with a replica set, the pod will be replaced by a new pod which will be scheduled to a new node. Additionally, if the pod is part of a service, then clients will automatically be redirected to the new pod.
|
|
||||||
|
|
||||||
For pods with no replica set, you need to bring up a new copy of the pod, and assuming it is not part of a service, redirect clients to it.
|
|
||||||
|
|
||||||
Perform maintenance work on the node.
|
|
||||||
|
|
||||||
Make the node schedulable again:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
kubectl uncordon $NODENAME
|
|
||||||
```
|
|
||||||
|
|
||||||
If you deleted the node's VM instance and created a new one, then a new schedulable node resource will
|
|
||||||
be created automatically (if you're using a cloud provider that supports
|
|
||||||
node discovery; currently this is only Google Compute Engine, not including CoreOS on Google Compute Engine using kube-register). See [Node](/docs/admin/node) for more details.
|
|
||||||
|
|
||||||
## Advanced Topics
|
|
||||||
|
|
||||||
### Upgrading to a different API version
|
|
||||||
|
|
||||||
When a new API version is released, you may need to upgrade a cluster to support the new API version (e.g. switching from 'v1' to 'v2' when 'v2' is launched).
|
|
||||||
|
|
||||||
This is an infrequent event, but it requires careful management. There is a sequence of steps to upgrade to a new API version.
|
|
||||||
|
|
||||||
1. Turn on the new API version.
|
|
||||||
1. Upgrade the cluster's storage to use the new version.
|
|
||||||
1. Upgrade all config files. Identify users of the old API version endpoints.
|
|
||||||
1. Update existing objects in the storage to new version by running `cluster/update-storage-objects.sh`.
|
|
||||||
1. Turn off the old API version.
|
|
||||||
|
|
||||||
### Turn on or off an API version for your cluster
|
|
||||||
|
|
||||||
Specific API versions can be turned on or off by passing `--runtime-config=api/<version>` flag while bringing up the API server. For example: to turn off v1 API, pass `--runtime-config=api/v1=false`.
|
|
||||||
`--runtime-config` also supports two special keys, `api/all` and `api/legacy`, to control all and legacy APIs respectively.
|
|
||||||
For example, for turning off all API versions except v1, pass `--runtime-config=api/all=false,api/v1=true`.
|
|
||||||
For the purposes of these flags, _legacy_ APIs are those APIs which have been explicitly deprecated (e.g. `v1beta3`).
|
|
||||||
|
|
||||||
### Switching your cluster's storage API version
|
|
||||||
|
|
||||||
The objects that are stored to disk for a cluster's internal representation of the Kubernetes resources active in the cluster are written using a particular version of the API.
|
|
||||||
When the supported API changes, these objects may need to be rewritten in the newer API. Failure to do this will eventually result in resources that are no longer decodable or usable
|
|
||||||
by the Kubernetes API server.
|
|
||||||
|
|
||||||
The `KUBE_API_VERSIONS` environment variable for the `kube-apiserver` binary controls the API versions that are supported in the cluster. The first version in the list is used as the cluster's storage version. Hence, to set a specific version as the storage version, bring it to the front of the list of versions in the value of `KUBE_API_VERSIONS`. You need to restart the `kube-apiserver` binary
|
|
||||||
for changes to this variable to take effect.
|
|
||||||
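As a hedged sketch (the exact version list depends on your cluster, and `$(EXISTING_ARGS)` is a placeholder for your existing API server arguments, as in the `--storage-backend` example elsewhere in this document), putting `v1` first makes it the storage version:

```shell
# v1 is listed first, so it becomes the storage version for this cluster.
KUBE_API_VERSIONS=v1,extensions/v1beta1 kube-apiserver $(EXISTING_ARGS)
```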
|
|
||||||
### Switching your config files to a new API version
|
|
||||||
|
|
||||||
You can use the `kubectl convert` command to convert config files between different API versions.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
kubectl convert -f pod.yaml --output-version v1
|
|
||||||
```
|
|
||||||
|
|
||||||
For more options, please refer to the usage of [kubectl convert](/docs/user-guide/kubectl/kubectl_convert/) command.
|
|
||||||
|
|
|
@@ -4,170 +4,6 @@ assignees:
|
||||||
title: Daemon Sets
|
title: Daemon Sets
|
||||||
---
|
---
|
||||||
|
|
||||||
* TOC
|
{% include user-guide-content-moved.md %}
|
||||||
{:toc}
|
|
||||||
|
|
||||||
## What is a DaemonSet?
|
[DaemonSets](/docs/concepts/workloads/controllers/daemonset/)
|
||||||
|
|
||||||
A _DaemonSet_ ensures that all (or some) nodes run a copy of a pod. As nodes are added to the
|
|
||||||
cluster, pods are added to them. As nodes are removed from the cluster, those pods are garbage
|
|
||||||
collected. Deleting a DaemonSet will clean up the pods it created.
|
|
||||||
|
|
||||||
Some typical uses of a DaemonSet are:
|
|
||||||
|
|
||||||
- running a cluster storage daemon, such as `glusterd`, `ceph`, on each node.
|
|
||||||
- running a logs collection daemon on every node, such as `fluentd` or `logstash`.
|
|
||||||
- running a node monitoring daemon on every node, such as [Prometheus Node Exporter](
|
|
||||||
https://github.com/prometheus/node_exporter), `collectd`, New Relic agent, or Ganglia `gmond`.
|
|
||||||
|
|
||||||
In a simple case, one DaemonSet, covering all nodes, would be used for each type of daemon.
|
|
||||||
A more complex setup might use multiple DaemonSets for a single type of daemon, but with
|
|
||||||
different flags and/or different memory and cpu requests for different hardware types.
|
|
||||||
|
|
||||||
## Writing a DaemonSet Spec
|
|
||||||
|
|
||||||
### Required Fields
|
|
||||||
|
|
||||||
As with all other Kubernetes config, a DaemonSet needs `apiVersion`, `kind`, and `metadata` fields. For
|
|
||||||
general information about working with config files, see [deploying applications](/docs/user-guide/deploying-applications/),
|
|
||||||
[configuring containers](/docs/user-guide/configuring-containers/), and [working with resources](/docs/user-guide/working-with-resources/) documents.
|
|
||||||
|
|
||||||
A DaemonSet also needs a [`.spec`](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status) section.
|
|
||||||
|
|
||||||
### Pod Template
|
|
||||||
|
|
||||||
The `.spec.template` is the only required field of the `.spec`.
|
|
||||||
|
|
||||||
The `.spec.template` is a [pod template](/docs/user-guide/replication-controller/#pod-template).
|
|
||||||
It has exactly the same schema as a [pod](/docs/user-guide/pods), except
|
|
||||||
it is nested and does not have an `apiVersion` or `kind`.
|
|
||||||
|
|
||||||
In addition to required fields for a pod, a pod template in a DaemonSet has to specify appropriate
|
|
||||||
labels (see [pod selector](#pod-selector)).
|
|
||||||
|
|
||||||
A pod template in a DaemonSet must have a [`RestartPolicy`](/docs/user-guide/pod-states)
|
|
||||||
equal to `Always`, or be unspecified, which defaults to `Always`.
|
|
||||||
|
|
||||||
### Pod Selector
|
|
||||||
|
|
||||||
The `.spec.selector` field is a pod selector. It works the same as the `.spec.selector` of
|
|
||||||
a [Job](/docs/concepts/jobs/run-to-completion-finite-workloads/) or other new resources.
|
|
||||||
|
|
||||||
The `spec.selector` is an object consisting of two fields:
|
|
||||||
|
|
||||||
* `matchLabels` - works the same as the `.spec.selector` of a [ReplicationController](/docs/user-guide/replication-controller/)
|
|
||||||
* `matchExpressions` - allows you to build more sophisticated selectors by specifying a key, a
|
|
||||||
list of values and an operator that relates the key and values.
|
|
||||||
|
|
||||||
When both are specified, the result is ANDed.
|
|
||||||
|
|
||||||
If the `.spec.selector` is specified, it must match the `.spec.template.metadata.labels`. If not
|
|
||||||
specified, they are defaulted to be equal. Configuration in which these do not match will be rejected by the API.
|
|
||||||
|
|
||||||
Also you should not normally create any pods whose labels match this selector, either directly, via
|
|
||||||
another DaemonSet, or via another controller such as a ReplicationController. Otherwise, the DaemonSet
|
|
||||||
controller will think that those pods were created by it. Kubernetes will not stop you from doing
|
|
||||||
this. One case where you might want to do this is to manually create a pod with a different value on
|
|
||||||
a node for testing.
|
|
||||||
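Putting the pod template, labels, and selector defaulting together, a minimal sketch of a DaemonSet manifest might look like the following (the `example-daemon` name, label, and image are hypothetical placeholders, and the selector is omitted so that it defaults to the template labels):

```shell
cat <<EOF | kubectl create -f -
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: example-daemon
spec:
  template:
    metadata:
      labels:
        name: example-daemon    # labels that the defaulted selector will match
    spec:
      containers:
      - name: example-daemon
        image: gcr.io/google_containers/pause:0.8.0   # placeholder image; substitute your daemon's image
EOF
```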
|
|
||||||
### Running Pods on Only Some Nodes
|
|
||||||
|
|
||||||
If you specify a `.spec.template.spec.nodeSelector`, then the DaemonSet controller will
|
|
||||||
create pods on nodes which match that [node
|
|
||||||
selector](/docs/user-guide/node-selection/). Likewise if you specify a `.spec.template.spec.affinity`
|
|
||||||
then the DaemonSet controller will create pods on nodes which match that [node affinity](/docs/user-guide/node-selection/).
|
|
||||||
If you do not specify either, then the DaemonSet controller will create pods on all nodes.
|
|
||||||
|
|
||||||
## How Daemon Pods are Scheduled
|
|
||||||
|
|
||||||
Normally, the machine that a pod runs on is selected by the Kubernetes scheduler. However, pods
|
|
||||||
created by the Daemon controller have the machine already selected (`.spec.nodeName` is specified
|
|
||||||
when the pod is created, so it is ignored by the scheduler). Therefore:
|
|
||||||
|
|
||||||
- the [`unschedulable`](/docs/admin/node/#manual-node-administration) field of a node is not respected
|
|
||||||
by the DaemonSet controller.
|
|
||||||
- The DaemonSet controller can create pods even when the scheduler has not been started, which can help with cluster
|
|
||||||
bootstrap.
|
|
||||||
|
|
||||||
Daemon pods do respect [taints and tolerations](/docs/user-guide/node-selection/index.md#taints-and-tolerations-beta-feature), but they are
|
|
||||||
created with `NoExecute` tolerations for the `node.alpha.kubernetes.io/notReady` and `node.alpha.kubernetes.io/unreachable`
|
|
||||||
taints with no `tolerationSeconds`. This ensures that when the `TaintBasedEvictions` alpha feature is enabled,
|
|
||||||
they will not be evicted when there are node problems such as a network partition. (When the
|
|
||||||
`TaintBasedEvictions` feature is not enabled, they are also not evicted in these scenarios, but
|
|
||||||
due to hard-coded behavior of the NodeController rather than due to tolerations).
|
|
||||||
|
|
||||||
|
|
||||||
## Communicating with Daemon Pods
|
|
||||||
|
|
||||||
Some possible patterns for communicating with pods in a DaemonSet are:
|
|
||||||
|
|
||||||
- **Push**: Pods in the DaemonSet are configured to send updates to another service, such
|
|
||||||
as a stats database. They do not have clients.
|
|
||||||
- **NodeIP and Known Port**: Pods in the DaemonSet use a `hostPort`, so that the pods are reachable via the node IPs. Clients know the list of node IPs somehow, and know the port by convention.
|
|
||||||
- **DNS**: Create a [headless service](/docs/user-guide/services/#headless-services) with the same pod selector,
|
|
||||||
and then discover DaemonSets using the `endpoints` resource or retrieve multiple A records from
|
|
||||||
DNS.
|
|
||||||
- **Service**: Create a service with the same pod selector, and use the service to reach a
|
|
||||||
daemon on a random node. (No way to reach specific node.)
|
|
||||||
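As an illustration of the **DNS** pattern above, a headless service selecting the daemon pods might be created like this sketch (the name, label, and port are hypothetical placeholders):

```shell
cat <<EOF | kubectl create -f -
apiVersion: v1
kind: Service
metadata:
  name: example-daemon
spec:
  clusterIP: None           # headless: no cluster IP, DNS returns the pod IPs
  selector:
    name: example-daemon    # same labels as the DaemonSet's pods
  ports:
  - port: 80                # adjust to the port your daemon actually serves
EOF
```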
|
|
||||||
## Updating a DaemonSet
|
|
||||||
|
|
||||||
If node labels are changed, the DaemonSet will promptly add pods to newly matching nodes and delete
|
|
||||||
pods from newly not-matching nodes.
|
|
||||||
|
|
||||||
You can modify the pods that a DaemonSet creates. However, pods do not allow all
|
|
||||||
fields to be updated. Also, the DaemonSet controller will use the original template the next
|
|
||||||
time a node (even with the same name) is created.
|
|
||||||
|
|
||||||
|
|
||||||
You can delete a DaemonSet. If you specify `--cascade=false` with `kubectl`, then the pods
|
|
||||||
will be left on the nodes. You can then create a new DaemonSet with a different template.
|
|
||||||
The new DaemonSet with the different template will recognize all the existing pods as having
|
|
||||||
matching labels. It will not modify or delete them despite a mismatch in the pod template.
|
|
||||||
You will need to force new pod creation by deleting the pod or deleting the node.
|
|
||||||
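For instance, orphaning the pods while deleting the DaemonSet itself might look like this (the DaemonSet name is a placeholder):

```shell
# Delete the DaemonSet object but leave its pods running on the nodes.
kubectl delete daemonset <daemonset-name> --cascade=false
```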
|
|
||||||
In Kubernetes version 1.6 and later, you can [perform a rolling update](/docs/tasks/manage-daemon/update-daemon-set/) on a DaemonSet.
|
|
||||||
|
|
||||||
Future releases of Kubernetes will support controlled updating of nodes.
|
|
||||||
|
|
||||||
## Alternatives to DaemonSet
|
|
||||||
|
|
||||||
### Init Scripts
|
|
||||||
|
|
||||||
It is certainly possible to run daemon processes by directly starting them on a node (e.g. using
|
|
||||||
`init`, `upstartd`, or `systemd`). This is perfectly fine. However, there are several advantages to
|
|
||||||
running such processes via a DaemonSet:
|
|
||||||
|
|
||||||
- Ability to monitor and manage logs for daemons in the same way as applications.
|
|
||||||
- Same config language and tools (e.g. pod templates, `kubectl`) for daemons and applications.
|
|
||||||
- Future versions of Kubernetes will likely support integration between DaemonSet-created
|
|
||||||
pods and node upgrade workflows.
|
|
||||||
- Running daemons in containers with resource limits increases isolation between daemons and app
|
|
||||||
containers. However, this can also be accomplished by running the daemons in a container but not in a pod
|
|
||||||
(e.g. start directly via Docker).
|
|
||||||
|
|
||||||
### Bare Pods
|
|
||||||
|
|
||||||
It is possible to create pods directly which specify a particular node to run on. However,
|
|
||||||
a DaemonSet replaces pods that are deleted or terminated for any reason, such as in the case of
|
|
||||||
node failure or disruptive node maintenance, such as a kernel upgrade. For this reason, you should
|
|
||||||
use a DaemonSet rather than creating individual pods.
|
|
||||||
|
|
||||||
### Static Pods
|
|
||||||
|
|
||||||
It is possible to create pods by writing a file to a certain directory watched by Kubelet. These
|
|
||||||
are called [static pods](/docs/admin/static-pods/).
|
|
||||||
Unlike DaemonSet, static pods cannot be managed with kubectl
|
|
||||||
or other Kubernetes API clients. Static pods do not depend on the apiserver, making them useful
|
|
||||||
in cluster bootstrapping cases. Also, static pods may be deprecated in the future.
|
|
||||||
|
|
||||||
### Replication Controller
|
|
||||||
|
|
||||||
DaemonSets are similar to [Replication Controllers](/docs/user-guide/replication-controller) in that
|
|
||||||
they both create pods, and those pods have processes which are not expected to terminate (e.g. web servers,
|
|
||||||
storage servers).
|
|
||||||
|
|
||||||
Use a replication controller for stateless services, like frontends, where scaling up and down the
|
|
||||||
number of replicas and rolling out updates are more important than controlling exactly which host
|
|
||||||
the pod runs on. Use a Daemon Controller when it is important that a copy of a pod always run on
|
|
||||||
all or certain hosts, and when it needs to start before other pods.
|
|
||||||
|
|
|
@@ -84,7 +84,7 @@ sudo docker run -it --rm --privileged --net=host \
|
||||||
gcr.io/google_containers/node-test:0.2
|
gcr.io/google_containers/node-test:0.2
|
||||||
```
|
```
|
||||||
|
|
||||||
Node conformance test is a containerized version of [node e2e test](https://github.com/kubernetes/kubernetes/blob/{{page.version}}/docs/devel/e2e-node-tests.md).
|
Node conformance test is a containerized version of [node e2e test](https://github.com/kubernetes/community/blob/master/contributors/devel/e2e-node-tests.md).
|
||||||
By default, it runs all conformance tests.
|
By default, it runs all conformance tests.
|
||||||
|
|
||||||
Theoretically, you can run any node e2e test if you configure the container and
|
Theoretically, you can run any node e2e test if you configure the container and
|
||||||
|
|
|
@@ -5,255 +5,6 @@ assignees:
|
||||||
title: Nodes
|
title: Nodes
|
||||||
---
|
---
|
||||||
|
|
||||||
* TOC
|
{% include user-guide-content-moved.md %}
|
||||||
{:toc}
|
|
||||||
|
|
||||||
## What is a node?
|
[Nodes](/docs/concepts/nodes/node/)
|
||||||
|
|
||||||
A `node` is a worker machine in Kubernetes, previously known as a `minion`. A node
|
|
||||||
may be a VM or physical machine, depending on the cluster. Each node has
|
|
||||||
the services necessary to run [pods](/docs/user-guide/pods) and is managed by the master
|
|
||||||
components. The services on a node include Docker, kubelet and kube-proxy. See
|
|
||||||
[The Kubernetes Node](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/architecture.md#the-kubernetes-node) section in the
|
|
||||||
architecture design doc for more details.
|
|
||||||
|
|
||||||
## Node Status
|
|
||||||
|
|
||||||
A node's status contains the following information:
|
|
||||||
|
|
||||||
* [Addresses](#Addresses)
|
|
||||||
* ~~[Phase](#Phase)~~ **deprecated**
|
|
||||||
* [Condition](#Condition)
|
|
||||||
* [Capacity](#Capacity)
|
|
||||||
* [Info](#Info)
|
|
||||||
|
|
||||||
Each section is described in detail below.
|
|
||||||
|
|
||||||
### Addresses
|
|
||||||
|
|
||||||
The usage of these fields varies depending on your cloud provider or bare metal configuration.
|
|
||||||
|
|
||||||
* HostName: The hostname as reported by the node's kernel. Can be overridden via the kubelet `--hostname-override` parameter.
|
|
||||||
* ExternalIP: Typically the IP address of the node that is externally routable (available from outside the cluster).
|
|
||||||
* InternalIP: Typically the IP address of the node that is routable only within the cluster.
|
|
||||||
|
|
||||||
### Phase
|
|
||||||
|
|
||||||
Deprecated: node phase is no longer used.
|
|
||||||
|
|
||||||
### Condition
|
|
||||||
|
|
||||||
The `conditions` field describes the status of all `Running` nodes.
|
|
||||||
|
|
||||||
| Node Condition | Description |
|
|
||||||
|----------------|-------------|
|
|
||||||
| `OutOfDisk` | `True` if there is insufficient free space on the node for adding new pods, otherwise `False` |
|
|
||||||
| `Ready` | `True` if the node is healthy and ready to accept pods, `False` if the node is not healthy and is not accepting pods, and `Unknown` if the node controller has not heard from the node in the last 40 seconds |
|
|
||||||
|
|
||||||
The node condition is represented as a JSON object. For example, the following response describes a healthy node.
|
|
||||||
|
|
||||||
```json
|
|
||||||
"conditions": [
|
|
||||||
{
|
|
||||||
"kind": "Ready",
|
|
||||||
"status": "True"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
```
|
|
||||||
|
|
||||||
If the Status of the Ready condition is "Unknown" or "False" for longer than the `pod-eviction-timeout`, an argument passed to the [kube-controller-manager](/docs/admin/kube-controller-manager/), all of the Pods on the node are scheduled for deletion by the Node Controller. The default eviction timeout duration is **five minutes**. In some cases when the node is unreachable, the apiserver is unable to communicate with the kubelet on it. The decision to delete the pods cannot be communicated to the kubelet until it re-establishes communication with the apiserver. In the meantime, the pods which are scheduled for deletion may continue to run on the partitioned node.
|
|
||||||
|
|
||||||
In versions of Kubernetes prior to 1.5, the node controller would [force delete](/docs/user-guide/pods/#force-deletion-of-pods) these unreachable pods from the apiserver. However, in 1.5 and higher, the node controller does not force delete pods until it is confirmed that they have stopped running in the cluster. One can see these pods which may be running on an unreachable node as being in the "Terminating" or "Unknown" states. In cases where Kubernetes cannot deduce from the underlying infrastructure if a node has permanently left a cluster, the cluster administrator may need to delete the node object by hand. Deleting the node object from Kubernetes causes all the Pod objects running on it to be deleted from the apiserver, freeing up their names.
|
|
||||||
|
|
||||||
### Capacity
|
|
||||||
|
|
||||||
Describes the resources available on the node: CPU, memory and the maximum
|
|
||||||
number of pods that can be scheduled onto the node.
|
|
||||||
|
|
||||||
### Info
|
|
||||||
|
|
||||||
General information about the node, such as kernel version, Kubernetes version
|
|
||||||
(kubelet and kube-proxy version), Docker version (if used), OS name.
|
|
||||||
The information is gathered by Kubelet from the node.
|
|
||||||
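All of this status information for a particular node can be inspected with kubectl, for example (the node name is a placeholder):

```shell
# Shows the node's addresses, conditions, capacity, and system info.
kubectl describe node <node-name>
```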
|
|
||||||
## Management
|
|
||||||
|
|
||||||
Unlike [pods](/docs/user-guide/pods) and [services](/docs/user-guide/services),
|
|
||||||
a node is not inherently created by Kubernetes: it is created externally by cloud
|
|
||||||
providers like Google Compute Engine, or exists in your pool of physical or virtual
|
|
||||||
machines. What this means is that when Kubernetes creates a node, it is really
|
|
||||||
just creating an object that represents the node. After creation, Kubernetes
|
|
||||||
will check whether the node is valid or not. For example, if you try to create
|
|
||||||
a node from the following content:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"kind": "Node",
|
|
||||||
"apiVersion": "v1",
|
|
||||||
"metadata": {
|
|
||||||
"name": "10.240.79.157",
|
|
||||||
"labels": {
|
|
||||||
"name": "my-first-k8s-node"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Kubernetes will create a node object internally (the representation), and
|
|
||||||
validate the node by health checking based on the `metadata.name` field (we
|
|
||||||
assume `metadata.name` can be resolved). If the node is valid, i.e. all necessary
|
|
||||||
services are running, it is eligible to run a pod; otherwise, it will be
|
|
||||||
ignored for any cluster activity until it becomes valid. Note that Kubernetes
|
|
||||||
will keep the object for the invalid node unless it is explicitly deleted by
|
|
||||||
the client, and it will keep checking to see if it becomes valid.
|
|
||||||
|
|
||||||
Currently, there are three components that interact with the Kubernetes node
|
|
||||||
interface: node controller, kubelet, and kubectl.
|
|
||||||
|
|
||||||
### Node Controller
|
|
||||||
|
|
||||||
The node controller is a Kubernetes master component which manages various
|
|
||||||
aspects of nodes.
|
|
||||||
|
|
||||||
The node controller has multiple roles in a node's life. The first is assigning a
|
|
||||||
CIDR block to the node when it is registered (if CIDR assignment is turned on).
|
|
||||||
|
|
||||||
The second is keeping the node controller's internal list of nodes up to date with
|
|
||||||
the cloud provider's list of available machines. When running in a cloud
|
|
||||||
environment, whenever a node is unhealthy the node controller asks the cloud
|
|
||||||
provider if the VM for that node is still available. If not, the node
|
|
||||||
controller deletes the node from its list of nodes.
|
|
||||||
|
|
||||||
The third is monitoring the nodes' health. The node controller is
|
|
||||||
responsible for updating the NodeReady condition of NodeStatus to
|
|
||||||
ConditionUnknown when a node becomes unreachable (i.e. the node controller stops
|
|
||||||
receiving heartbeats for some reason, e.g. due to the node being down), and then later evicting
|
|
||||||
all the pods from the node (using graceful termination) if the node continues
|
|
||||||
to be unreachable. (The default timeouts are 40s to start reporting
|
|
||||||
ConditionUnknown and 5m after that to start evicting pods.) The node controller
|
|
||||||
checks the state of each node every `--node-monitor-period` seconds.
|
|
||||||
|
|
||||||
In Kubernetes 1.4, we updated the logic of the node controller to better handle
|
|
||||||
cases when a big number of nodes have problems with reaching the master
|
|
||||||
(e.g. because the master has networking problem). Starting with 1.4, the node
|
|
||||||
controller will look at the state of all nodes in the cluster when making a
|
|
||||||
decision about pod eviction.
|
|
||||||
|
|
||||||
In most cases, the node controller limits the eviction rate to
|
|
||||||
`--node-eviction-rate` (default 0.1) per second, meaning it won't evict pods
|
|
||||||
from more than 1 node per 10 seconds.
|
|
||||||
|
|
||||||
The node eviction behavior changes when a node in a given availability zone
|
|
||||||
becomes unhealthy. The node controller checks what percentage of nodes in the zone
|
|
||||||
are unhealthy (NodeReady condition is ConditionUnknown or ConditionFalse) at
|
|
||||||
the same time. If the fraction of unhealthy nodes is at least
|
|
||||||
`--unhealthy-zone-threshold` (default 0.55) then the eviction rate is reduced:
|
|
||||||
if the cluster is small (i.e. has less than or equal to
|
|
||||||
`--large-cluster-size-threshold` nodes - default 50) then evictions are
|
|
||||||
stopped, otherwise the eviction rate is reduced to
|
|
||||||
`--secondary-node-eviction-rate` (default 0.01) per second. The reason these
|
|
||||||
policies are implemented per availability zone is because one availability zone
|
|
||||||
might become partitioned from the master while the others remain connected. If
|
|
||||||
your cluster does not span multiple cloud provider availability zones, then
|
|
||||||
there is only one availability zone (the whole cluster).
|
|
||||||
|
|
||||||
A key reason for spreading your nodes across availability zones is so that the
|
|
||||||
workload can be shifted to healthy zones when one entire zone goes down.
|
|
||||||
Therefore, if all nodes in a zone are unhealthy then node controller evicts at
|
|
||||||
the normal rate `--node-eviction-rate`. The corner case is when all zones are
|
|
||||||
completely unhealthy (i.e. there are no healthy nodes in the cluster). In such a
|
|
||||||
case, the node controller assumes that there's some problem with master
|
|
||||||
connectivity and stops all evictions until some connectivity is restored.
|
|
||||||
|
|
||||||
Starting in Kubernetes 1.6, the NodeController is also responsible for evicting
|
|
||||||
pods that are running on nodes with `NoExecute` taints, when the pods do not tolerate
|
|
||||||
the taints. Additionally, as an alpha feature that is disabled by default, the
|
|
||||||
NodeController is responsible for adding taints corresponding to node problems like
|
|
||||||
node unreachable or not ready. See [this documentation](/docs/user-guide/node-selection/index.md#taints-and-tolerations-beta-feature)
|
|
||||||
for details about `NoExecute` taints and the alpha feature.
|
|
||||||
|
|
||||||
### Self-Registration of Nodes
|
|
||||||
|
|
||||||
When the kubelet flag `--register-node` is true (the default), the kubelet will attempt to
|
|
||||||
register itself with the API server. This is the preferred pattern, used by most distros.
|
|
||||||
|
|
||||||
For self-registration, the kubelet is started with the following options:
|
|
||||||
|
|
||||||
- `--api-servers` - Location of the apiservers.
|
|
||||||
- `--kubeconfig` - Path to credentials to authenticate itself to the apiserver.
|
|
||||||
- `--cloud-provider` - How to talk to a cloud provider to read metadata about itself.
|
|
||||||
- `--register-node` - Automatically register with the API server.
|
|
||||||
- `--register-with-taints` - Register the node with the given list of taints (comma-separated `<key>=<value>:<effect>`). No-op if `register-node` is false.
|
|
||||||
- `--node-ip` IP address of the node.
|
|
||||||
- `--node-labels` - Labels to add when registering the node in the cluster.
|
|
||||||
- `--node-status-update-frequency` - Specifies how often kubelet posts node status to master.
|
|
||||||
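For example, a self-registering kubelet invocation using a few of these flags might look like the following sketch (the address, path, and label values are placeholders):

```shell
kubelet --api-servers=https://<master-ip>:6443 \
  --kubeconfig=/var/lib/kubelet/kubeconfig \
  --register-node=true \
  --node-labels=role=worker
```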
|
|
||||||
Currently, any kubelet is authorized to create/modify any node resource, but in practice it only creates/modifies
|
|
||||||
its own. (In the future, we plan to only allow a kubelet to modify its own node resource.)
|
|
||||||
|
|
||||||
#### Manual Node Administration
|
|
||||||
|
|
||||||
A cluster administrator can create and modify node objects.
|
|
||||||
|
|
||||||
If the administrator wishes to create node objects manually, set the kubelet flag
|
|
||||||
`--register-node=false`.
|
|
||||||
|
|
||||||
The administrator can modify node resources (regardless of the setting of `--register-node`).
|
|
||||||
Modifications include setting labels on the node and marking it unschedulable.
|
|
||||||
|
|
||||||
Labels on nodes can be used in conjunction with node selectors on pods to control scheduling,
|
|
||||||
e.g. to constrain a pod to only be eligible to run on a subset of the nodes.
|
|
||||||
|
|
||||||
Marking a node as unschedulable will prevent new pods from being scheduled to that
|
|
||||||
node, but will not affect any existing pods on the node. This is useful as a
|
|
||||||
preparatory step before a node reboot, etc. For example, to mark a node
|
|
||||||
unschedulable, run this command:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
kubectl cordon $NODENAME
|
|
||||||
```
|
|
||||||
|
|
||||||
Note that pods which are created by a DaemonSet controller bypass the Kubernetes scheduler,
|
|
||||||
and do not respect the unschedulable attribute on a node. The assumption is that daemons belong on
|
|
||||||
the machine even if it is being drained of applications in preparation for a reboot.
|
|
||||||
|
|
||||||
### Node capacity
|
|
||||||
|
|
||||||
The capacity of the node (number of cpus and amount of memory) is part of the node object.
|
|
||||||
Normally, nodes register themselves and report their capacity when creating the node object. If
|
|
||||||
you are doing [manual node administration](#manual-node-administration), then you need to set node
|
|
||||||
capacity when adding a node.
|
|
||||||
|
|
||||||
The Kubernetes scheduler ensures that there are enough resources for all the pods on a node. It
|
|
||||||
checks that the sum of the limits of containers on the node is no greater than the node capacity. It
|
|
||||||
includes all containers started by the kubelet, but not containers started directly by Docker nor
|
|
||||||
processes not in containers.
|
|
||||||
|
|
||||||
If you want to explicitly reserve resources for non-pod processes, you can create a placeholder
|
|
||||||
pod. Use the following template:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Pod
|
|
||||||
metadata:
|
|
||||||
name: resource-reserver
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: sleep-forever
|
|
||||||
image: gcr.io/google_containers/pause:0.8.0
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpu: 100m
|
|
||||||
memory: 100Mi
|
|
||||||
```
|
|
||||||
|
|
||||||
Set the `cpu` and `memory` values to the amount of resources you want to reserve.
|
|
||||||
Place the file in the manifest directory (`--config=DIR` flag of kubelet). Do this
|
|
||||||
on each kubelet where you want to reserve resources.
|
|
||||||
|
|
||||||
|
|
||||||
## API Object
|
|
||||||
|
|
||||||
Node is a top-level resource in the Kubernetes REST API. More details about the
|
|
||||||
API object can be found at: [Node API
|
|
||||||
object](/docs/api-reference/v1.6/#node-v1-core).
|
|
||||||
|
|
|
@@ -1,7 +1,7 @@
|
||||||
---
|
---
|
||||||
assignees:
|
assignees:
|
||||||
- mml
|
- mml
|
||||||
title: Cluster Management Guide
|
title: Cluster Management Guide for Version 1.6
|
||||||
---
|
---
|
||||||
|
|
||||||
* TOC
|
* TOC
|
||||||
|
@@ -23,4 +23,4 @@ startup:
|
||||||
$ kube-apiserver --storage-backend='etcd2' $(EXISTING_ARGS)
|
$ kube-apiserver --storage-backend='etcd2' $(EXISTING_ARGS)
|
||||||
```
|
```
|
||||||
|
|
||||||
However, for long-term maintenance of the cluster, we recommend that the operator plan an outage window in order to perform a [v2->v3 data upgrade](https://coreos.com/etcd/docs/latest/upgrades/upgrade_3_0.html).
|
However, for long-term maintenance of the cluster, we recommend that the operator plan an outage window in order to perform a [v2->v3 data upgrade](https://coreos.com/etcd/docs/latest/upgrades/upgrade_3_0.html).
|
||||||
|
|
|
@@ -0,0 +1,201 @@
|
||||||
|
---
|
||||||
|
assignees:
|
||||||
|
- lavalamp
|
||||||
|
- thockin
|
||||||
|
title: Cluster Management
|
||||||
|
---
|
||||||
|
|
||||||
|
* TOC
|
||||||
|
{:toc}
|
||||||
|
|
||||||
|
This document describes several topics related to the lifecycle of a cluster: creating a new cluster,
|
||||||
|
upgrading your cluster's
|
||||||
|
master and worker nodes, performing node maintenance (e.g. kernel upgrades), and upgrading the Kubernetes API version of a
|
||||||
|
running cluster.
|
||||||
|
|
||||||
|
## Creating and configuring a Cluster
|
||||||
|
|
||||||
|
To install Kubernetes on a set of machines, consult one of the existing [Getting Started guides](/docs/getting-started-guides/) depending on your environment.
|
||||||
|
|
||||||
|
## Upgrading a cluster
|
||||||
|
|
||||||
|
The current state of cluster upgrades is provider dependent, and some releases may require special care when upgrading. It is recommended that administrators consult both the [release notes](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG.md), as well as the version specific upgrade notes prior to upgrading their clusters.
|
||||||
|
|
||||||
|
* [Upgrading to 1.6](/docs/admin/upgrade-1-6)
|
||||||
|
|
||||||
|
### Upgrading Google Compute Engine clusters
|
||||||
|
|
||||||
|
Google Compute Engine Open Source (GCE-OSS) supports master upgrades by deleting and
|
||||||
|
recreating the master, while maintaining the same Persistent Disk (PD) to ensure that data is retained across the
|
||||||
|
upgrade.
|
||||||
|
|
||||||
|
Node upgrades for GCE use a [Managed Instance Group](https://cloud.google.com/compute/docs/instance-groups/); each node
|
||||||
|
is sequentially destroyed and then recreated with new software. Any Pods that are running on that node need to be
|
||||||
|
controlled by a Replication Controller, or manually re-created after the roll out.
|
||||||
|
|
||||||
|
Upgrades on open source Google Compute Engine (GCE) clusters are controlled by the `cluster/gce/upgrade.sh` script.
|
||||||
|
|
||||||
|
Get its usage by running `cluster/gce/upgrade.sh -h`.
|
||||||
|
|
||||||
|
For example, to upgrade just your master to a specific version (v1.0.2):
|
||||||
|
|
||||||
|
```shell
|
||||||
|
cluster/gce/upgrade.sh -M v1.0.2
|
||||||
|
```
|
||||||
|
|
||||||
|
Alternatively, to upgrade your entire cluster to the latest stable release:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
cluster/gce/upgrade.sh release/stable
|
||||||
|
```
|
||||||
|
|
||||||
|
### Upgrading Google Container Engine (GKE) clusters
|
||||||
|
|
||||||
|
Google Container Engine automatically updates master components (e.g. `kube-apiserver`, `kube-scheduler`) to the latest
|
||||||
|
version. It also handles upgrading the operating system and other components that the master runs on.
|
||||||
|
|
||||||
|
The node upgrade process is user-initiated and is described in the [GKE documentation.](https://cloud.google.com/container-engine/docs/clusters/upgrade)
|
||||||
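As a hedged illustration (the cluster name and zone below are placeholders reused from the autoscaling examples on this page, not required values), a user-initiated node upgrade can be started with the `gcloud` CLI:

```shell
# Upgrade the nodes of an existing GKE cluster to the current master version.
gcloud container clusters upgrade mytestcluster --zone=us-central1-b
```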
|
|
||||||
|
### Upgrading clusters on other platforms
|
||||||
|
|
||||||
|
Different providers, and tools, will manage upgrades differently. It is recommended that you consult their main documentation regarding upgrades.
|
||||||
|
|
||||||
|
* [kops](https://github.com/kubernetes/kops)
|
||||||
|
* [kargo](https://github.com/kubernetes-incubator/kargo)
|
||||||
|
* [CoreOS Tectonic](https://coreos.com/tectonic/docs/latest/admin/upgrade.html)
|
||||||
|
* ...
|
||||||
|
|
||||||
|
## Resizing a cluster
|
||||||
|
|
||||||
|
If your cluster runs short on resources you can easily add more machines to it if your cluster is running in [Node self-registration mode](/docs/admin/node/#self-registration-of-nodes).
|
||||||
|
If you're using GCE or GKE, you can do this by resizing the Instance Group that manages your Nodes, either by modifying the number of instances on the `Compute > Compute Engine > Instance groups > your group > Edit group` [Google Cloud Console page](https://console.developers.google.com) or by using the gcloud CLI:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
gcloud compute instance-groups managed resize kubernetes-minion-group --size 42 --zone $ZONE
|
||||||
|
```
|
||||||
|
|
||||||
|
The Instance Group will take care of putting the appropriate image on the new machines and starting them, while the Kubelet will register its Node with the API server to make it available for scheduling. If you scale the instance group down, the system will randomly choose Nodes to kill.
|
||||||
|
|
||||||
|
In other environments you may need to configure the machine yourself and tell the Kubelet on which machine the API server is running.
|
||||||
|
|
||||||
|
### Cluster autoscaling
|
||||||
|
|
||||||
|
If you are using GCE or GKE, you can configure your cluster so that it is automatically rescaled based on
|
||||||
|
pod needs.
|
||||||
|
|
||||||
|
As described in [Compute Resource](/docs/user-guide/compute-resources/), users can request how much CPU and memory is allocated to pods.
|
||||||
|
This information is used by the Kubernetes scheduler to find a place to run the pod. If there is
|
||||||
|
no node that has enough free capacity (or doesn't match other pod requirements) then the pod has
|
||||||
|
to wait until some pods are terminated or a new node is added.
|
||||||
|
|
||||||
|
Cluster autoscaler looks for the pods that cannot be scheduled and checks if adding a new node, similar
|
||||||
|
to the others in the cluster, would help. If so, it resizes the cluster to accommodate the waiting pods.
|
||||||
|
|
||||||
|
Cluster autoscaler also scales down the cluster if it notices that some node is not needed anymore for
|
||||||
|
an extended period of time (currently 10 minutes, though this may change in the future).
|
||||||
|
|
||||||
|
Cluster autoscaler is configured per instance group (GCE) or node pool (GKE).
|
||||||
|
|
||||||
|
If you are using GCE, you can enable cluster autoscaler while creating a cluster with the kube-up.sh script.
|
||||||
|
To configure cluster autoscaler you have to set three environment variables:
|
||||||
|
|
||||||
|
* `KUBE_ENABLE_CLUSTER_AUTOSCALER` - enables cluster autoscaler if set to true.
|
||||||
|
* `KUBE_AUTOSCALER_MIN_NODES` - minimum number of nodes in the cluster.
|
||||||
|
* `KUBE_AUTOSCALER_MAX_NODES` - maximum number of nodes in the cluster.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
KUBE_ENABLE_CLUSTER_AUTOSCALER=true KUBE_AUTOSCALER_MIN_NODES=3 KUBE_AUTOSCALER_MAX_NODES=10 NUM_NODES=5 ./cluster/kube-up.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
On GKE you configure cluster autoscaler either on cluster creation or update or when creating a particular node pool
|
||||||
|
(which you want to be autoscaled) by passing the flags `--enable-autoscaling`, `--min-nodes`, and `--max-nodes`
|
||||||
|
to the corresponding `gcloud` commands.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
gcloud container clusters create mytestcluster --zone=us-central1-b --enable-autoscaling --min-nodes=3 --max-nodes=10 --num-nodes=5
|
||||||
|
```
|
||||||
|
|
||||||
|
```shell
|
||||||
|
gcloud container clusters update mytestcluster --enable-autoscaling --min-nodes=1 --max-nodes=15
|
||||||
|
```
|
||||||
|
|
||||||
|
**Cluster autoscaler expects that nodes have not been manually modified (e.g. by adding labels via kubectl) as those properties would not be propagated to the new nodes within the same instance group.**
|
||||||
|
|
||||||
|
## Maintenance on a Node
|
||||||
|
|
||||||
|
If you need to reboot a node (such as for a kernel upgrade, libc upgrade, hardware repair, etc.), and the downtime is
|
||||||
|
brief, then when the Kubelet restarts, it will attempt to restart the pods scheduled to it. If the reboot takes longer
|
||||||
|
(the default time is 5 minutes, controlled by `--pod-eviction-timeout` on the controller-manager),
|
||||||
|
then the node controller will terminate the pods that are bound to the unavailable node. If there is a corresponding
|
||||||
|
replica set (or replication controller), then a new copy of the pod will be started on a different node. So, in the case where all
|
||||||
|
pods are replicated, upgrades can be done without special coordination, assuming that not all nodes will go down at the same time.
|
||||||
|
|
||||||
|
If you want more control over the upgrading process, you may use the following workflow:
|
||||||
|
|
||||||
|
Use `kubectl drain` to gracefully terminate all pods on the node while marking the node as unschedulable:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
kubectl drain $NODENAME
|
||||||
|
```
|
||||||
|
|
||||||
|
This keeps new pods from landing on the node while you are trying to get them off.
|
||||||
|
|
||||||
|
For pods with a replica set, the pod will be replaced by a new pod which will be scheduled to a new node. Additionally, if the pod is part of a service, then clients will automatically be redirected to the new pod.
|
||||||
|
|
||||||
|
For pods with no replica set, you need to bring up a new copy of the pod, and assuming it is not part of a service, redirect clients to it.
|
||||||
|
|
||||||
|
Perform maintenance work on the node.
|
||||||
|
|
||||||
|
Make the node schedulable again:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
kubectl uncordon $NODENAME
|
||||||
|
```
|
||||||
|
|
||||||
|
If you deleted the node's VM instance and created a new one, then a new schedulable node resource will
|
||||||
|
be created automatically (if you're using a cloud provider that supports
|
||||||
|
node discovery; currently this is only Google Compute Engine, not including CoreOS on Google Compute Engine using kube-register). See [Node](/docs/admin/node) for more details.
|
||||||
|
|
||||||
|
## Advanced Topics
|
||||||
|
|
||||||
|
### Upgrading to a different API version
|
||||||
|
|
||||||
|
When a new API version is released, you may need to upgrade a cluster to support the new API version (e.g. switching from 'v1' to 'v2' when 'v2' is launched).
|
||||||
|
|
||||||
|
This is an infrequent event, but it requires careful management. There is a sequence of steps to upgrade to a new API version.
|
||||||
|
|
||||||
|
1. Turn on the new API version.
|
||||||
|
1. Upgrade the cluster's storage to use the new version.
|
||||||
|
1. Upgrade all config files. Identify users of the old API version endpoints.
|
||||||
|
1. Update existing objects in the storage to new version by running `cluster/update-storage-objects.sh`.
|
||||||
|
1. Turn off the old API version.
|
||||||
|
|
||||||
|
### Turn on or off an API version for your cluster
|
||||||
|
|
||||||
|
Specific API versions can be turned on or off by passing `--runtime-config=api/<version>` flag while bringing up the API server. For example: to turn off v1 API, pass `--runtime-config=api/v1=false`.
|
||||||
|
`--runtime-config` also supports two special keys: `api/all` and `api/legacy`, to control all and legacy APIs respectively.
|
||||||
|
For example, for turning off all API versions except v1, pass `--runtime-config=api/all=false,api/v1=true`.
|
||||||
|
For the purposes of these flags, _legacy_ APIs are those APIs which have been explicitly deprecated (e.g. `v1beta3`).
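
As a minimal sketch, the flag value from the example above is passed directly on the API server command line; every other flag your deployment needs is omitted here, and the placeholder is hypothetical:

```shell
# Illustrative only: keep the v1 API and turn off everything else.
kube-apiserver --runtime-config=api/all=false,api/v1=true <other-flags...>
```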
|
||||||
|
|
||||||
|
### Switching your cluster's storage API version
|
||||||
|
|
||||||
|
The objects that are stored to disk for a cluster's internal representation of the Kubernetes resources active in the cluster are written using a particular version of the API.
|
||||||
|
When the supported API changes, these objects may need to be rewritten in the newer API. Failure to do this will eventually result in resources that are no longer decodable or usable
|
||||||
|
by the Kubernetes API server.
|
||||||
|
|
||||||
|
The `KUBE_API_VERSIONS` environment variable for the `kube-apiserver` binary controls the API versions that are supported in the cluster. The first version in the list is used as the cluster's storage version. Hence, to set a specific version as the storage version, bring it to the front of the list of versions in the value of `KUBE_API_VERSIONS`. You need to restart the `kube-apiserver` binary
|
||||||
|
for changes to this variable to take effect.
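
For example, a sketch of how this might look when starting the apiserver by hand; the exact list of versions is a placeholder and depends on your cluster and release:

```shell
# Illustrative only: v1 is listed first, so it becomes the storage version.
KUBE_API_VERSIONS=v1,extensions/v1beta1 kube-apiserver <other-flags...>
```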
|
||||||
|
|
||||||
|
### Switching your config files to a new API version
|
||||||
|
|
||||||
|
You can use the `kubectl convert` command to convert config files between different API versions.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
kubectl convert -f pod.yaml --output-version v1
|
||||||
|
```
|
||||||
|
|
||||||
|
For more options, please refer to the usage of [kubectl convert](/docs/user-guide/kubectl/kubectl_convert/) command.
|
|
@ -25,7 +25,7 @@ The kubelet has a single default network plugin, and a default network common to
|
||||||
|
|
||||||
## Network Plugin Requirements
|
|
||||||
|
|
||||||
Besides providing the [`NetworkPlugin` interface](https://github.com/kubernetes/kubernetes/tree/{{page.version}}/pkg/kubelet/network/plugins.go) to configure and clean up pod networking, the plugin may also need specific support for kube-proxy. The iptables proxy obviously depends on iptables, and the plugin may need to ensure that container traffic is made available to iptables. For example, if the plugin connects containers to a Linux bridge, the plugin must set the `net/bridge/bridge-nf-call-iptables` sysctl to `1` to ensure that the iptables proxy functions correctly. If the plugin does not use a Linux bridge (but instead something like Open vSwitch or some other mechanism) it should ensure container traffic is appropriately routed for the proxy.
|
Besides providing the [`NetworkPlugin` interface](https://github.com/kubernetes/kubernetes/tree/{{page.fullversion}}/pkg/kubelet/network/plugins.go) to configure and clean up pod networking, the plugin may also need specific support for kube-proxy. The iptables proxy obviously depends on iptables, and the plugin may need to ensure that container traffic is made available to iptables. For example, if the plugin connects containers to a Linux bridge, the plugin must set the `net/bridge/bridge-nf-call-iptables` sysctl to `1` to ensure that the iptables proxy functions correctly. If the plugin does not use a Linux bridge (but instead something like Open vSwitch or some other mechanism) it should ensure container traffic is appropriately routed for the proxy.
|
||||||
|
|
||||||
By default if no kubelet network plugin is specified, the `noop` plugin is used, which sets `net/bridge/bridge-nf-call-iptables=1` to ensure simple configurations (like docker with a bridge) work correctly with the iptables proxy.
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,795 @@
|
||||||
|
---
|
||||||
|
assignees:
|
||||||
|
- mikedanese
|
||||||
|
title: Secrets
|
||||||
|
---
|
||||||
|
|
||||||
|
Objects of type `secret` are intended to hold sensitive information, such as
|
||||||
|
passwords, OAuth tokens, and ssh keys. Putting this information in a `secret`
|
||||||
|
is safer and more flexible than putting it verbatim in a `pod` definition or in
|
||||||
|
a docker image. See [Secrets design document](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/secrets.md) for more information.
|
||||||
|
|
||||||
|
* TOC
|
||||||
|
{:toc}
|
||||||
|
|
||||||
|
## Overview of Secrets
|
||||||
|
|
||||||
|
A Secret is an object that contains a small amount of sensitive data such as
|
||||||
|
a password, a token, or a key. Such information might otherwise be put in a
|
||||||
|
Pod specification or in an image; putting it in a Secret object allows for
|
||||||
|
more control over how it is used, and reduces the risk of accidental exposure.
|
||||||
|
|
||||||
|
Users can create secrets, and the system also creates some secrets.
|
||||||
|
|
||||||
|
To use a secret, a pod needs to reference the secret.
|
||||||
|
A secret can be used with a pod in two ways: as files in a [volume](/docs/concepts/storage/volumes/) mounted on one or more of
|
||||||
|
its containers, or used by kubelet when pulling images for the pod.
|
||||||
|
|
||||||
|
### Built-in Secrets
|
||||||
|
|
||||||
|
#### Service Accounts Automatically Create and Attach Secrets with API Credentials
|
||||||
|
|
||||||
|
Kubernetes automatically creates secrets which contain credentials for
|
||||||
|
accessing the API and it automatically modifies your pods to use this type of
|
||||||
|
secret.
|
||||||
|
|
||||||
|
The automatic creation and use of API credentials can be disabled or overridden
|
||||||
|
if desired. However, if all you need to do is securely access the apiserver,
|
||||||
|
this is the recommended workflow.
|
||||||
|
|
||||||
|
See the [Service Account](/docs/user-guide/service-accounts) documentation for more
|
||||||
|
information on how Service Accounts work.
|
||||||
|
|
||||||
|
### Creating your own Secrets
|
||||||
|
|
||||||
|
#### Creating a Secret Using kubectl create secret
|
||||||
|
|
||||||
|
Say that some pods need to access a database. The
|
||||||
|
username and password that the pods should use are in the files
|
||||||
|
`./username.txt` and `./password.txt` on your local machine.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
# Create files needed for rest of example.
|
||||||
|
$ echo -n "admin" > ./username.txt
|
||||||
|
$ echo -n "1f2d1e2e67df" > ./password.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
The `kubectl create secret` command
|
||||||
|
packages these files into a Secret and creates
|
||||||
|
the object on the Apiserver.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl create secret generic db-user-pass --from-file=./username.txt --from-file=./password.txt
|
||||||
|
secret "db-user-pass" created
|
||||||
|
```
|
||||||
|
|
||||||
|
You can check that the secret was created like this:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get secrets
|
||||||
|
NAME TYPE DATA AGE
|
||||||
|
db-user-pass Opaque 2 51s
|
||||||
|
|
||||||
|
$ kubectl describe secrets/db-user-pass
|
||||||
|
Name: db-user-pass
|
||||||
|
Namespace: default
|
||||||
|
Labels: <none>
|
||||||
|
Annotations: <none>
|
||||||
|
|
||||||
|
Type: Opaque
|
||||||
|
|
||||||
|
Data
|
||||||
|
====
|
||||||
|
password.txt: 12 bytes
|
||||||
|
username.txt: 5 bytes
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that neither `get` nor `describe` shows the contents of the file by default.
|
||||||
|
This is to protect the secret from being accidentally exposed to an onlooker,
|
||||||
|
or from being stored in a terminal log.
|
||||||
|
|
||||||
|
See [decoding a secret](#decoding-a-secret) for how to see the contents.
|
||||||
|
|
||||||
|
#### Creating a Secret Manually
|
||||||
|
|
||||||
|
You can also create a secret object in a file first,
|
||||||
|
in json or yaml format, and then create that object.
|
||||||
|
|
||||||
|
Each item must be base64 encoded:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ echo -n "admin" | base64
|
||||||
|
YWRtaW4=
|
||||||
|
$ echo -n "1f2d1e2e67df" | base64
|
||||||
|
MWYyZDFlMmU2N2Rm
|
||||||
|
```
|
||||||
|
|
||||||
|
Now write a secret object that looks like this:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
name: mysecret
|
||||||
|
type: Opaque
|
||||||
|
data:
|
||||||
|
username: YWRtaW4=
|
||||||
|
password: MWYyZDFlMmU2N2Rm
|
||||||
|
```
|
||||||
|
|
||||||
|
The data field is a map. Its keys must match
|
||||||
|
[`DNS_SUBDOMAIN`](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/design/identifiers.md), except that leading dots are also
|
||||||
|
allowed. The values are arbitrary data, encoded using base64.
|
||||||
|
|
||||||
|
Create the secret using [`kubectl create`](/docs/user-guide/kubectl/kubectl_create/):
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl create -f ./secret.yaml
|
||||||
|
secret "mysecret" created
|
||||||
|
```
|
||||||
|
|
||||||
|
**Encoding Note:** The serialized JSON and YAML values of secret data are
|
||||||
|
encoded as base64 strings. Newlines are not valid within these strings and must
|
||||||
|
be omitted. When using the `base64` utility on Darwin/OS X, users should avoid
using the `-b` option to split long lines. Conversely, Linux users *should* add
|
||||||
|
the option `-w 0` to `base64` commands.
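
For example, with GNU coreutils `base64` on Linux:

```shell
# -w 0 disables line wrapping, so the encoded value contains no newlines.
echo -n "1f2d1e2e67df" | base64 -w 0
```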
|
||||||
|
|
||||||
|
#### Decoding a Secret
|
||||||
|
|
||||||
|
Get back the secret created in the previous section:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get secret mysecret -o yaml
|
||||||
|
apiVersion: v1
|
||||||
|
data:
|
||||||
|
username: YWRtaW4=
|
||||||
|
password: MWYyZDFlMmU2N2Rm
|
||||||
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
creationTimestamp: 2016-01-22T18:41:56Z
|
||||||
|
name: mysecret
|
||||||
|
namespace: default
|
||||||
|
resourceVersion: "164619"
|
||||||
|
selfLink: /api/v1/namespaces/default/secrets/mysecret
|
||||||
|
uid: cfee02d6-c137-11e5-8d73-42010af00002
|
||||||
|
type: Opaque
|
||||||
|
```
|
||||||
|
|
||||||
|
Decode the password field:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ echo "MWYyZDFlMmU2N2Rm" | base64 --decode
|
||||||
|
1f2d1e2e67df
|
||||||
|
```
|
||||||
|
|
||||||
|
### Using Secrets
|
||||||
|
|
||||||
|
Secrets can be mounted as data volumes or be exposed as environment variables to
|
||||||
|
be used by a container in a pod. They can also be used by other parts of the
|
||||||
|
system, without being directly exposed to the pod. For example, they can hold
|
||||||
|
credentials that other parts of the system should use to interact with external
|
||||||
|
systems on your behalf.
|
||||||
|
|
||||||
|
#### Using Secrets as Files from a Pod
|
||||||
|
|
||||||
|
To consume a Secret in a volume in a Pod:
|
||||||
|
|
||||||
|
1. Create a secret or use an existing one. Multiple pods can reference the same secret.
|
||||||
|
1. Modify your Pod definition to add a volume under `spec.volumes[]`. Name the volume anything, and have a `spec.volumes[].secret.secretName` field equal to the name of the secret object.
|
||||||
|
1. Add a `spec.containers[].volumeMounts[]` to each container that needs the secret. Specify `spec.containers[].volumeMounts[].readOnly = true` and `spec.containers[].volumeMounts[].mountPath` to an unused directory name where you would like the secrets to appear.
|
||||||
|
1. Modify your image and/or command line so that the program looks for files in that directory. Each key in the secret `data` map becomes the filename under `mountPath`.
|
||||||
|
|
||||||
|
This is an example of a pod that mounts a secret in a volume:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"apiVersion": "v1",
|
||||||
|
"kind": "Pod",
|
||||||
|
"metadata": {
|
||||||
|
"name": "mypod",
|
||||||
|
"namespace": "myns"
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"containers": [{
|
||||||
|
"name": "mypod",
|
||||||
|
"image": "redis",
|
||||||
|
"volumeMounts": [{
|
||||||
|
"name": "foo",
|
||||||
|
"mountPath": "/etc/foo",
|
||||||
|
"readOnly": true
|
||||||
|
}]
|
||||||
|
}],
|
||||||
|
"volumes": [{
|
||||||
|
"name": "foo",
|
||||||
|
"secret": {
|
||||||
|
"secretName": "mysecret"
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Each secret you want to use needs to be referred to in `spec.volumes`.
|
||||||
|
|
||||||
|
If there are multiple containers in the pod, then each container needs its
|
||||||
|
own `volumeMounts` block, but only one `spec.volumes` is needed per secret.
|
||||||
|
|
||||||
|
You can package many files into one secret, or use many secrets, whichever is convenient.
|
||||||
|
|
||||||
|
**Projection of secret keys to specific paths**
|
||||||
|
|
||||||
|
You can also control the paths within the volume where Secret keys are projected.
|
||||||
|
Use the `spec.volumes[].secret.items` field to change the target path of each key:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"apiVersion": "v1",
|
||||||
|
"kind": "Pod",
|
||||||
|
"metadata": {
|
||||||
|
"name": "mypod",
|
||||||
|
"namespace": "myns"
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"containers": [{
|
||||||
|
"name": "mypod",
|
||||||
|
"image": "redis",
|
||||||
|
"volumeMounts": [{
|
||||||
|
"name": "foo",
|
||||||
|
"mountPath": "/etc/foo",
|
||||||
|
"readOnly": true
|
||||||
|
}]
|
||||||
|
}],
|
||||||
|
"volumes": [{
|
||||||
|
"name": "foo",
|
||||||
|
"secret": {
|
||||||
|
"secretName": "mysecret",
|
||||||
|
"items": [{
|
||||||
|
"key": "username",
|
||||||
|
"path": "my-group/my-username"
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
What will happen:
|
||||||
|
|
||||||
|
* `username` secret is stored in the file `/etc/foo/my-group/my-username` instead of `/etc/foo/username`.
|
||||||
|
* `password` secret is not projected
|
||||||
|
|
||||||
|
If `spec.volumes[].secret.items` is used, only keys specified in `items` are projected.
|
||||||
|
To consume all keys from the secret, all of them must be listed in the `items` field.
|
||||||
|
All listed keys must exist in the corresponding secret. Otherwise, the volume is not created.
|
||||||
|
|
||||||
|
**Secret files permissions**
|
||||||
|
|
||||||
|
You can also specify the permission mode bits for the files that are part of a secret.
|
||||||
|
If you don't specify any, `0644` is used by default. You can specify a default
|
||||||
|
mode for the whole secret volume and override per key if needed.
|
||||||
|
|
||||||
|
For example, you can specify a default mode like this:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"apiVersion": "v1",
|
||||||
|
"kind": "Pod",
|
||||||
|
"metadata": {
|
||||||
|
"name": "mypod",
|
||||||
|
"namespace": "myns"
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"containers": [{
|
||||||
|
"name": "mypod",
|
||||||
|
"image": "redis",
|
||||||
|
"volumeMounts": [{
|
||||||
|
"name": "foo",
|
||||||
|
"mountPath": "/etc/foo"
|
||||||
|
}]
|
||||||
|
}],
|
||||||
|
"volumes": [{
|
||||||
|
"name": "foo",
|
||||||
|
"secret": {
|
||||||
|
"secretName": "mysecret",
|
||||||
|
"defaultMode": 256
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Then, the secret will be mounted on `/etc/foo` and all the files created by the
|
||||||
|
secret volume mount will have permission `0400`.
|
||||||
|
|
||||||
|
Note that the JSON spec doesn't support octal notation, so use the value 256 for
|
||||||
|
0400 permissions. If you use yaml instead of json for the pod, you can use octal
|
||||||
|
notation to specify permissions in a more natural way.
|
||||||
|
|
||||||
|
You can also use mapping, as in the previous example, and specify different
|
||||||
|
permission for different files like this:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"apiVersion": "v1",
|
||||||
|
"kind": "Pod",
|
||||||
|
"metadata": {
|
||||||
|
"name": "mypod",
|
||||||
|
"namespace": "myns"
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"containers": [{
|
||||||
|
"name": "mypod",
|
||||||
|
"image": "redis",
|
||||||
|
"volumeMounts": [{
|
||||||
|
"name": "foo",
|
||||||
|
"mountPath": "/etc/foo"
|
||||||
|
}]
|
||||||
|
}],
|
||||||
|
"volumes": [{
|
||||||
|
"name": "foo",
|
||||||
|
"secret": {
|
||||||
|
"secretName": "mysecret",
|
||||||
|
"items": [{
|
||||||
|
"key": "username",
|
||||||
|
"path": "my-group/my-username",
|
||||||
|
"mode": 511
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
In this case, the file written to `/etc/foo/my-group/my-username` will have a
|
||||||
|
permission value of `0777`. Owing to JSON limitations, you must specify the mode
|
||||||
|
in decimal notation.
|
||||||
|
|
||||||
|
Note that this permission value might be displayed in decimal notation if you
|
||||||
|
read it later.
|
||||||
|
|
||||||
|
**Consuming Secret Values from Volumes**
|
||||||
|
|
||||||
|
Inside the container that mounts a secret volume, the secret keys appear as
|
||||||
|
files and the secret values are base-64 decoded and stored inside these files.
|
||||||
|
This is the result of commands
|
||||||
|
executed inside the container from the example above:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ ls /etc/foo/
|
||||||
|
username
|
||||||
|
password
|
||||||
|
$ cat /etc/foo/username
|
||||||
|
admin
|
||||||
|
$ cat /etc/foo/password
|
||||||
|
1f2d1e2e67df
|
||||||
|
```
|
||||||
|
|
||||||
|
The program in a container is responsible for reading the secrets from the
|
||||||
|
files.
|
||||||
|
|
||||||
|
**Mounted Secrets are updated automatically**
|
||||||
|
|
||||||
|
When a secret that is already consumed in a volume is updated, the projected keys are eventually updated as well.
|
||||||
|
The kubelet checks whether the mounted secret is fresh on every periodic sync.
|
||||||
|
However, it uses its local TTL-based cache to get the current value of the secret.
|
||||||
|
As a result, the total delay from the moment when the secret is updated to the moment when new keys are
|
||||||
|
projected to the pod can be as long as the kubelet sync period plus the TTL of the secrets cache in the kubelet.
|
||||||
|
|
||||||
|
#### Using Secrets as Environment Variables
|
||||||
|
|
||||||
|
To use a secret in an environment variable in a pod:
|
||||||
|
|
||||||
|
1. Create a secret or use an existing one. Multiple pods can reference the same secret.
|
||||||
|
1. Modify your Pod definition in each container that you wish to consume the value of a secret key to add an environment variable for each secret key you wish to consume. The environment variable that consumes the secret key should populate the secret's name and key in `env[x].valueFrom.secretKeyRef`.
|
||||||
|
1. Modify your image and/or command line so that the program looks for values in the specified environment variables
|
||||||
|
|
||||||
|
This is an example of a pod that uses secrets from environment variables:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Pod
|
||||||
|
metadata:
|
||||||
|
name: secret-env-pod
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: mycontainer
|
||||||
|
image: redis
|
||||||
|
env:
|
||||||
|
- name: SECRET_USERNAME
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: mysecret
|
||||||
|
key: username
|
||||||
|
- name: SECRET_PASSWORD
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: mysecret
|
||||||
|
key: password
|
||||||
|
restartPolicy: Never
|
||||||
|
```
|
||||||
|
|
||||||
|
**Consuming Secret Values from Environment Variables**
|
||||||
|
|
||||||
|
Inside a container that consumes a secret via environment variables, the secret keys appear as
|
||||||
|
normal environment variables containing the base-64 decoded values of the secret data.
|
||||||
|
This is the result of commands executed inside the container from the example above:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ echo $SECRET_USERNAME
|
||||||
|
admin
|
||||||
|
$ echo $SECRET_PASSWORD
|
||||||
|
1f2d1e2e67df
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Using imagePullSecrets
|
||||||
|
|
||||||
|
An imagePullSecret is a way to pass a secret that contains a Docker (or other) image registry
|
||||||
|
password to the Kubelet so it can pull a private image on behalf of your Pod.
|
||||||
|
|
||||||
|
**Manually specifying an imagePullSecret**
|
||||||
|
|
||||||
|
Use of imagePullSecrets is described in the [images documentation](/docs/concepts/containers/images/#specifying-imagepullsecrets-on-a-pod)
|
||||||
|
|
||||||
|
### Arranging for imagePullSecrets to be Automatically Attached
|
||||||
|
|
||||||
|
You can manually create an imagePullSecret, and reference it from
|
||||||
|
a serviceAccount. Any pods created with that serviceAccount
|
||||||
|
or that default to use that serviceAccount, will get their imagePullSecret
|
||||||
|
field set to that of the service account.
|
||||||
|
See [here](/docs/user-guide/service-accounts/#adding-imagepullsecrets-to-a-service-account)
|
||||||
|
for a detailed explanation of that process.
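
As a sketch of that process (the secret name and credential values below are placeholders; the linked page is authoritative):

```shell
# Create a registry credential secret.
kubectl create secret docker-registry myregistrykey \
  --docker-server=DOCKER_REGISTRY_SERVER \
  --docker-username=DOCKER_USER \
  --docker-password=DOCKER_PASSWORD \
  --docker-email=DOCKER_EMAIL

# Reference it from the default service account so that pods using this
# service account get the imagePullSecret automatically.
kubectl patch serviceaccount default \
  -p '{"imagePullSecrets": [{"name": "myregistrykey"}]}'
```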
|
||||||
|
|
||||||
|
#### Automatic Mounting of Manually Created Secrets
|
||||||
|
|
||||||
|
We plan to extend the service account behavior so that manually created
|
||||||
|
secrets (e.g. one containing a token for accessing a github account)
|
||||||
|
can be automatically attached to pods based on their service account.
|
||||||
|
*This is not implemented yet. See [issue 9902](http://issue.k8s.io/9902).*
|
||||||
|
|
||||||
|
## Details
|
||||||
|
|
||||||
|
### Restrictions
|
||||||
|
|
||||||
|
Secret volume sources are validated to ensure that the specified object
|
||||||
|
reference actually points to an object of type `Secret`. Therefore, a secret
|
||||||
|
needs to be created before any pods that depend on it.
|
||||||
|
|
||||||
|
Secret API objects reside in a namespace. They can only be referenced by pods
|
||||||
|
in that same namespace.
|
||||||
|
|
||||||
|
Individual secrets are limited to 1MB in size. This is to discourage creation
|
||||||
|
of very large secrets which would exhaust apiserver and kubelet memory.
|
||||||
|
However, creation of many smaller secrets could also exhaust memory. More
|
||||||
|
comprehensive limits on memory usage due to secrets are a planned feature.
|
||||||
|
|
||||||
|
Kubelet only supports use of secrets for Pods it gets from the API server.
|
||||||
|
This includes any pods created using kubectl, or indirectly via a replication
|
||||||
|
controller. It does not include pods created via the kubelet's
|
||||||
|
`--manifest-url` flag, its `--config` flag, or its REST API (these are
|
||||||
|
not common ways to create pods).
|
||||||
|
|
||||||
|
### Secret and Pod Lifetime interaction
|
||||||
|
|
||||||
|
When a pod is created via the API, there is no check whether a referenced
|
||||||
|
secret exists. Once a pod is scheduled, the kubelet will try to fetch the
|
||||||
|
secret value. If the secret cannot be fetched because it does not exist or
|
||||||
|
because of a temporary lack of connection to the API server, kubelet will
|
||||||
|
periodically retry. It will report an event about the pod explaining the
|
||||||
|
reason it is not started yet. Once the secret is fetched, the kubelet will
|
||||||
|
create and mount a volume containing it. None of the pod's containers will
|
||||||
|
start until all the pod's volumes are mounted.
|
||||||
|
|
||||||
|
## Use cases
|
||||||
|
|
||||||
|
### Use-Case: Pod with ssh keys
|
||||||
|
|
||||||
|
Create a secret containing some ssh keys:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl create secret generic ssh-key-secret --from-file=ssh-privatekey=/path/to/.ssh/id_rsa --from-file=ssh-publickey=/path/to/.ssh/id_rsa.pub
|
||||||
|
```
|
||||||
|
|
||||||
|
**Security Note:** think carefully before sending your own ssh keys: other users of the cluster may have access to the secret. Use a service account that you are comfortable making accessible to all the users with whom you share the Kubernetes cluster, and that you can revoke if it is compromised.
|
||||||
|
|
||||||
|
|
||||||
|
Now we can create a pod which references the secret with the ssh key and
|
||||||
|
consumes it in a volume:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"kind": "Pod",
|
||||||
|
"apiVersion": "v1",
|
||||||
|
"metadata": {
|
||||||
|
"name": "secret-test-pod",
|
||||||
|
"labels": {
|
||||||
|
"name": "secret-test"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"volumes": [
|
||||||
|
{
|
||||||
|
"name": "secret-volume",
|
||||||
|
"secret": {
|
||||||
|
"secretName": "ssh-key-secret"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"containers": [
|
||||||
|
{
|
||||||
|
"name": "ssh-test-container",
|
||||||
|
"image": "mySshImage",
|
||||||
|
"volumeMounts": [
|
||||||
|
{
|
||||||
|
"name": "secret-volume",
|
||||||
|
"readOnly": true,
|
||||||
|
"mountPath": "/etc/secret-volume"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
When the container's command runs, the pieces of the key will be available in:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
/etc/secret-volume/ssh-publickey
|
||||||
|
/etc/secret-volume/ssh-privatekey
|
||||||
|
```
|
||||||
|
|
||||||
|
The container is then free to use the secret data to establish an ssh connection.
|
||||||
|
|
||||||
|
### Use-Case: Pods with prod / test credentials
|
||||||
|
|
||||||
|
This example illustrates a pod which consumes a secret containing prod
|
||||||
|
credentials and another pod which consumes a secret with test environment
|
||||||
|
credentials.
|
||||||
|
|
||||||
|
Make the secrets:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl create secret generic prod-db-secret --from-literal=username=produser --from-literal=password=Y4nys7f11
|
||||||
|
secret "prod-db-secret" created
|
||||||
|
$ kubectl create secret generic test-db-secret --from-literal=username=testuser --from-literal=password=iluvtests
|
||||||
|
secret "test-db-secret" created
|
||||||
|
```
|
||||||
|
|
||||||
|
Now make the pods:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"apiVersion": "v1",
|
||||||
|
"kind": "List",
|
||||||
|
"items":
|
||||||
|
[{
|
||||||
|
"kind": "Pod",
|
||||||
|
"apiVersion": "v1",
|
||||||
|
"metadata": {
|
||||||
|
"name": "prod-db-client-pod",
|
||||||
|
"labels": {
|
||||||
|
"name": "prod-db-client"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"volumes": [
|
||||||
|
{
|
||||||
|
"name": "secret-volume",
|
||||||
|
"secret": {
|
||||||
|
"secretName": "prod-db-secret"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"containers": [
|
||||||
|
{
|
||||||
|
"name": "db-client-container",
|
||||||
|
"image": "myClientImage",
|
||||||
|
"volumeMounts": [
|
||||||
|
{
|
||||||
|
"name": "secret-volume",
|
||||||
|
"readOnly": true,
|
||||||
|
"mountPath": "/etc/secret-volume"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "Pod",
|
||||||
|
"apiVersion": "v1",
|
||||||
|
"metadata": {
|
||||||
|
"name": "test-db-client-pod",
|
||||||
|
"labels": {
|
||||||
|
"name": "test-db-client"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"volumes": [
|
||||||
|
{
|
||||||
|
"name": "secret-volume",
|
||||||
|
"secret": {
|
||||||
|
"secretName": "test-db-secret"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"containers": [
|
||||||
|
{
|
||||||
|
"name": "db-client-container",
|
||||||
|
"image": "myClientImage",
|
||||||
|
"volumeMounts": [
|
||||||
|
{
|
||||||
|
"name": "secret-volume",
|
||||||
|
"readOnly": true,
|
||||||
|
"mountPath": "/etc/secret-volume"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Both containers will have the following files present on their filesystems:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
/etc/secret-volume/username
|
||||||
|
/etc/secret-volume/password
|
||||||
|
```
|
||||||
|
|
||||||
|
Note how the specs for the two pods differ only in one field; this facilitates
|
||||||
|
creating pods with different capabilities from a common pod config template.
|
||||||
|
|
||||||
|
You could further simplify the base pod specification by using two Service Accounts:
|
||||||
|
one called, say, `prod-user` with the `prod-db-secret`, and one called, say,
|
||||||
|
`test-user` with the `test-db-secret`. Then, the pod spec can be shortened to, for example:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"kind": "Pod",
|
||||||
|
"apiVersion": "v1",
|
||||||
|
"metadata": {
|
||||||
|
"name": "prod-db-client-pod",
|
||||||
|
"labels": {
|
||||||
|
"name": "prod-db-client"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"serviceAccount": "prod-db-client",
|
||||||
|
"containers": [
|
||||||
|
{
|
||||||
|
"name": "db-client-container",
|
||||||
|
"image": "myClientImage"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Use-case: Dotfiles in secret volume
|
||||||
|
|
||||||
|
In order to make a piece of data 'hidden' (i.e., in a file whose name begins with a dot character), simply
|
||||||
|
make that key begin with a dot. For example, when the following secret is mounted into a volume:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"kind": "Secret",
|
||||||
|
"apiVersion": "v1",
|
||||||
|
"metadata": {
|
||||||
|
"name": "dotfile-secret"
|
||||||
|
},
|
||||||
|
"data": {
|
||||||
|
".secret-file": "dmFsdWUtMg0KDQo="
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
"kind": "Pod",
|
||||||
|
"apiVersion": "v1",
|
||||||
|
"metadata": {
|
||||||
|
"name": "secret-dotfiles-pod"
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"volumes": [
|
||||||
|
{
|
||||||
|
"name": "secret-volume",
|
||||||
|
"secret": {
|
||||||
|
"secretName": "dotfile-secret"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"containers": [
|
||||||
|
{
|
||||||
|
"name": "dotfile-test-container",
|
||||||
|
"image": "gcr.io/google_containers/busybox",
|
||||||
|
"command": [ "ls", "-l", "/etc/secret-volume" ],
|
||||||
|
"volumeMounts": [
|
||||||
|
{
|
||||||
|
"name": "secret-volume",
|
||||||
|
"readOnly": true,
|
||||||
|
"mountPath": "/etc/secret-volume"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
The `secret-volume` will contain a single file, called `.secret-file`, and
|
||||||
|
the `dotfile-test-container` will have this file present at the path
|
||||||
|
`/etc/secret-volume/.secret-file`.
|
||||||
|
|
||||||
|
**NOTE**
|
||||||
|
|
||||||
|
Files beginning with dot characters are hidden from the output of `ls -l`;
|
||||||
|
you must use `ls -la` to see them when listing directory contents.
|
||||||
|
|
||||||
|
|
||||||
|
### Use-case: Secret visible to one container in a pod
|
||||||
|
|
||||||
|
Consider a program that needs to handle HTTP requests, do some complex business
|
||||||
|
logic, and then sign some messages with an HMAC. Because it has complex
|
||||||
|
application logic, there might be an unnoticed remote file reading exploit in
|
||||||
|
the server, which could expose the private key to an attacker.
|
||||||
|
|
||||||
|
This could be divided into two processes in two containers: a frontend container
|
||||||
|
which handles user interaction and business logic, but which cannot see the
|
||||||
|
private key; and a signer container that can see the private key, and responds
|
||||||
|
to simple signing requests from the frontend (e.g. over localhost networking).
|
||||||
|
|
||||||
|
With this partitioned approach, an attacker now has to trick the application
|
||||||
|
server into doing something rather arbitrary, which may be harder than getting
|
||||||
|
it to read a file.
|
||||||
|
|
||||||
|
<!-- TODO: explain how to do this while still using automation. -->
|
||||||
|
|
||||||
|
## Security Properties
|
||||||
|
|
||||||
|
### Protections
|
||||||
|
|
||||||
|
Because `secret` objects can be created independently of the `pods` that use
|
||||||
|
them, there is less risk of the secret being exposed during the workflow of
|
||||||
|
creating, viewing, and editing pods. The system can also take additional
|
||||||
|
precautions with `secret` objects, such as avoiding writing them to disk where
|
||||||
|
possible.
|
||||||
|
|
||||||
|
A secret is only sent to a node if a pod on that node requires it. It is not
|
||||||
|
written to disk. It is stored in a tmpfs. It is deleted once the pod that
|
||||||
|
depends on it is deleted.
|
||||||
|
|
||||||
|
On most Kubernetes-project-maintained distributions, communication from the user
to the apiserver, and from the apiserver to the kubelets, is protected by SSL/TLS.
|
||||||
|
Secrets are protected when transmitted over these channels.
|
||||||
|
|
||||||
|
Secret data on nodes is stored in tmpfs volumes and thus does not come to rest
|
||||||
|
on the node.
|
||||||
|
|
||||||
|
There may be secrets for several pods on the same node. However, only the
|
||||||
|
secrets that a pod requests are potentially visible within its containers.
|
||||||
|
Therefore, one Pod does not have access to the secrets of another pod.
|
||||||
|
|
||||||
|
There may be several containers in a pod. However, each container in a pod has
|
||||||
|
to request the secret volume in its `volumeMounts` for it to be visible within
|
||||||
|
the container. This can be used to construct useful [security partitions at the
|
||||||
|
Pod level](#use-case-secret-visible-to-one-container-in-a-pod).
|
||||||
|
|
||||||
|
### Risks
|
||||||
|
|
||||||
|
- In the API server secret data is stored as plaintext in etcd; therefore:
|
||||||
|
- Administrators should limit access to etcd to admin users
|
||||||
|
- Secret data in the API server is at rest on the disk that etcd uses; admins may want to wipe/shred disks
|
||||||
|
used by etcd when no longer in use
|
||||||
|
- Applications still need to protect the value of secret after reading it from the volume,
|
||||||
|
such as not accidentally logging it or transmitting it to an untrusted party.
|
||||||
|
- A user who can create a pod that uses a secret can also see the value of that secret. Even
|
||||||
|
if apiserver policy does not allow that user to read the secret object, the user could
|
||||||
|
run a pod which exposes the secret.
|
||||||
|
- If multiple replicas of etcd are run, then the secrets will be shared between them.
|
||||||
|
By default, etcd does not secure peer-to-peer communication with SSL/TLS, though this can be configured.
|
||||||
|
- Currently, anyone with root on any node can read any secret from the apiserver,
|
||||||
|
by impersonating the kubelet. It is a planned feature to only send secrets to
|
||||||
|
nodes that actually require them, to restrict the impact of a root exploit on a
|
||||||
|
single node.
|
|
@ -0,0 +1,259 @@
|
||||||
|
---
|
||||||
|
assignees:
|
||||||
|
- caesarxuchao
|
||||||
|
- dchen1107
|
||||||
|
title: Nodes
|
||||||
|
---
|
||||||
|
|
||||||
|
* TOC
|
||||||
|
{:toc}
|
||||||
|
|
||||||
|
## What is a node?
|
||||||
|
|
||||||
|
A `node` is a worker machine in Kubernetes, previously known as a `minion`. A node
|
||||||
|
may be a VM or physical machine, depending on the cluster. Each node has
|
||||||
|
the services necessary to run [pods](/docs/user-guide/pods) and is managed by the master
|
||||||
|
components. The services on a node include Docker, kubelet and kube-proxy. See
|
||||||
|
[The Kubernetes Node](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/architecture.md#the-kubernetes-node) section in the
|
||||||
|
architecture design doc for more details.
|
||||||
|
|
||||||
|
## Node Status
|
||||||
|
|
||||||
|
A node's status contains the following information:
|
||||||
|
|
||||||
|
* [Addresses](#addresses)
* ~~[Phase](#phase)~~ **deprecated**
* [Condition](#condition)
* [Capacity](#capacity)
* [Info](#info)
|
||||||
|
|
||||||
|
Each section is described in detail below.
|
||||||
|
|
||||||
|
### Addresses
|
||||||
|
|
||||||
|
The usage of these fields varies depending on your cloud provider or bare metal configuration.
|
||||||
|
|
||||||
|
* HostName: The hostname as reported by the node's kernel. Can be overridden via the kubelet `--hostname-override` parameter.
|
||||||
|
* ExternalIP: Typically the IP address of the node that is externally routable (available from outside the cluster).
|
||||||
|
* InternalIP: Typically the IP address of the node that is routable only within the cluster.
|
||||||
|
|
||||||
|
### Phase
|
||||||
|
|
||||||
|
Deprecated: node phase is no longer used.
|
||||||
|
|
||||||
|
### Condition
|
||||||
|
|
||||||
|
The `conditions` field describes the status of all `Running` nodes.
|
||||||
|
|
||||||
|
| Node Condition | Description |
|
||||||
|
|----------------|-------------|
|
||||||
|
| `OutOfDisk` | `True` if there is insufficient free space on the node for adding new pods, otherwise `False` |
|
||||||
|
| `Ready` | `True` if the node is healthy and ready to accept pods, `False` if the node is not healthy and is not accepting pods, and `Unknown` if the node controller has not heard from the node in the last 40 seconds |
|
||||||
|
|
||||||
|
The node condition is represented as a JSON object. For example, the following response describes a healthy node.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"conditions": [
|
||||||
|
{
|
||||||
|
"kind": "Ready",
|
||||||
|
"status": "True"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
If the Status of the Ready condition is "Unknown" or "False" for longer than the `pod-eviction-timeout`, an argument passed to the [kube-controller-manager](/docs/admin/kube-controller-manager/), all of the Pods on the node are scheduled for deletion by the Node Controller. The default eviction timeout duration is **five minutes**. In some cases when the node is unreachable, the apiserver is unable to communicate with the kubelet on it. The decision to delete the pods cannot be communicated to the kubelet until it re-establishes communication with the apiserver. In the meantime, the pods which are scheduled for deletion may continue to run on the partitioned node.
|
||||||
|
|
||||||
|
In versions of Kubernetes prior to 1.5, the node controller would [force delete](/docs/user-guide/pods/#force-deletion-of-pods) these unreachable pods from the apiserver. However, in 1.5 and higher, the node controller does not force delete pods until it is confirmed that they have stopped running in the cluster. One can see these pods which may be running on an unreachable node as being in the "Terminating" or "Unknown" states. In cases where Kubernetes cannot deduce from the underlying infrastructure if a node has permanently left a cluster, the cluster administrator may need to delete the node object by hand. Deleting the node object from Kubernetes causes all the Pod objects running on it to be deleted from the apiserver, freeing up their names.
|
||||||
|
|
||||||
|
### Capacity
|
||||||
|
|
||||||
|
Describes the resources available on the node: CPU, memory and the maximum
|
||||||
|
number of pods that can be scheduled onto the node.
|
||||||
|
|
||||||
|
### Info
|
||||||
|
|
||||||
|
General information about the node, such as kernel version, Kubernetes version
|
||||||
|
(kubelet and kube-proxy version), Docker version (if used), OS name.
|
||||||
|
The information is gathered by Kubelet from the node.
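
You can see all of this status information for a node with `kubectl` (the node name is a placeholder):

```shell
# Human-readable summary: addresses, conditions, capacity and system info.
kubectl describe node <node-name>

# Raw object, including the status fields described above.
kubectl get node <node-name> -o yaml
```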
|
||||||
|
|
||||||
|
## Management
|
||||||
|
|
||||||
|
Unlike [pods](/docs/user-guide/pods) and [services](/docs/user-guide/services),
|
||||||
|
a node is not inherently created by Kubernetes: it is created externally by cloud
|
||||||
|
providers like Google Compute Engine, or exists in your pool of physical or virtual
|
||||||
|
machines. What this means is that when Kubernetes creates a node, it is really
|
||||||
|
just creating an object that represents the node. After creation, Kubernetes
|
||||||
|
will check whether the node is valid or not. For example, if you try to create
|
||||||
|
a node from the following content:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"kind": "Node",
|
||||||
|
"apiVersion": "v1",
|
||||||
|
"metadata": {
|
||||||
|
"name": "10.240.79.157",
|
||||||
|
"labels": {
|
||||||
|
"name": "my-first-k8s-node"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Kubernetes will create a node object internally (the representation), and
|
||||||
|
validate the node by health checking based on the `metadata.name` field (we
|
||||||
|
assume `metadata.name` can be resolved). If the node is valid, i.e. all necessary
|
||||||
|
services are running, it is eligible to run a pod; otherwise, it will be
|
||||||
|
ignored for any cluster activity until it becomes valid. Note that Kubernetes
|
||||||
|
will keep the object for the invalid node unless it is explicitly deleted by
|
||||||
|
the client, and it will keep checking to see if it becomes valid.
|
||||||
|
|
||||||
|
Currently, there are three components that interact with the Kubernetes node
|
||||||
|
interface: node controller, kubelet, and kubectl.
|
||||||
|
|
||||||
|
### Node Controller
|
||||||
|
|
||||||
|
The node controller is a Kubernetes master component which manages various
|
||||||
|
aspects of nodes.
|
||||||
|
|
||||||
|
The node controller has multiple roles in a node's life. The first is assigning a
|
||||||
|
CIDR block to the node when it is registered (if CIDR assignment is turned on).
|
||||||
|
|
||||||
|
The second is keeping the node controller's internal list of nodes up to date with
|
||||||
|
the cloud provider's list of available machines. When running in a cloud
|
||||||
|
environment, whenever a node is unhealthy the node controller asks the cloud
|
||||||
|
provider if the VM for that node is still available. If not, the node
|
||||||
|
controller deletes the node from its list of nodes.
|
||||||
|
|
||||||
|
The third is monitoring the nodes' health. The node controller is
|
||||||
|
responsible for updating the NodeReady condition of NodeStatus to
|
||||||
|
ConditionUnknown when a node becomes unreachable (i.e. the node controller stops
|
||||||
|
receiving heartbeats for some reason, e.g. due to the node being down), and then later evicting
|
||||||
|
all the pods from the node (using graceful termination) if the node continues
|
||||||
|
to be unreachable. (The default timeouts are 40s to start reporting
|
||||||
|
ConditionUnknown and 5m after that to start evicting pods.) The node controller
|
||||||
|
checks the state of each node every `--node-monitor-period` seconds.
|
||||||
|
|
||||||
|
In Kubernetes 1.4, we updated the logic of the node controller to better handle
|
||||||
|
cases when a large number of nodes have problems reaching the master
|
||||||
|
(e.g. because the master has a networking problem). Starting with 1.4, the node
|
||||||
|
controller will look at the state of all nodes in the cluster when making a
|
||||||
|
decision about pod eviction.
|
||||||
|
|
||||||
|
In most cases, the node controller limits the eviction rate to
|
||||||
|
`--node-eviction-rate` (default 0.1) per second, meaning it won't evict pods
|
||||||
|
from more than 1 node per 10 seconds.
|
||||||
|
|
||||||
|
The node eviction behavior changes when a node in a given availability zone
|
||||||
|
becomes unhealthy. The node controller checks what percentage of nodes in the zone
|
||||||
|
are unhealthy (NodeReady condition is ConditionUnknown or ConditionFalse) at
|
||||||
|
the same time. If the fraction of unhealthy nodes is at least
|
||||||
|
`--unhealthy-zone-threshold` (default 0.55) then the eviction rate is reduced:
|
||||||
|
if the cluster is small (i.e. has less than or equal to
|
||||||
|
`--large-cluster-size-threshold` nodes - default 50) then evictions are
|
||||||
|
stopped, otherwise the eviction rate is reduced to
|
||||||
|
`--secondary-node-eviction-rate` (default 0.01) per second. The reason these
|
||||||
|
policies are implemented per availability zone is because one availability zone
|
||||||
|
might become partitioned from the master while the others remain connected. If
|
||||||
|
your cluster does not span multiple cloud provider availability zones, then
|
||||||
|
there is only one availability zone (the whole cluster).
|
||||||
|
|
||||||
|
A key reason for spreading your nodes across availability zones is so that the
|
||||||
|
workload can be shifted to healthy zones when one entire zone goes down.
|
||||||
|
Therefore, if all nodes in a zone are unhealthy, then the node controller evicts at
|
||||||
|
the normal rate `--node-eviction-rate`. The corner case is when all zones are
|
||||||
|
completely unhealthy (i.e. there are no healthy nodes in the cluster). In such a
|
||||||
|
case, the node controller assumes that there's some problem with master
|
||||||
|
connectivity and stops all evictions until some connectivity is restored.
|
||||||
|
|
||||||
|
Starting in Kubernetes 1.6, the NodeController is also responsible for evicting
|
||||||
|
pods that are running on nodes with `NoExecute` taints, when the pods do not tolerate
|
||||||
|
the taints. Additionally, as an alpha feature that is disabled by default, the
|
||||||
|
NodeController is responsible for adding taints corresponding to node problems like
|
||||||
|
node unreachable or not ready. See [this documentation](/docs/user-guide/node-selection/index.md#taints-and-tolerations-beta-feature)
|
||||||
|
for details about `NoExecute` taints and the alpha feature.
|
||||||
|
|
||||||
|
### Self-Registration of Nodes
|
||||||
|
|
||||||
|
When the kubelet flag `--register-node` is true (the default), the kubelet will attempt to
|
||||||
|
register itself with the API server. This is the preferred pattern, used by most distros.
|
||||||
|
|
||||||
|
For self-registration, the kubelet is started with the following options:
|
||||||
|
|
||||||
|
- `--api-servers` - Location of the apiservers.
|
||||||
|
- `--kubeconfig` - Path to credentials to authenticate itself to the apiserver.
|
||||||
|
- `--cloud-provider` - How to talk to a cloud provider to read metadata about itself.
|
||||||
|
- `--register-node` - Automatically register with the API server.
|
||||||
|
- `--register-with-taints` - Register the node with the given list of taints (comma-separated `<key>=<value>:<effect>`). No-op if `register-node` is false.
|
||||||
|
- `--node-ip` - IP address of the node.
|
||||||
|
- `--node-labels` - Labels to add when registering the node in the cluster.
|
||||||
|
- `--node-status-update-frequency` - Specifies how often kubelet posts node status to master.
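
Put together, a self-registering kubelet might be started roughly as follows; the paths, addresses and labels are placeholders, and the exact set of flags varies by distribution:

```shell
# Illustrative sketch of a self-registering kubelet invocation.
kubelet \
  --api-servers=https://<master-ip>:6443 \
  --kubeconfig=/var/lib/kubelet/kubeconfig \
  --cloud-provider=gce \
  --register-node=true \
  --node-labels=role=worker \
  --node-status-update-frequency=10s
```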
|
||||||
|
|
||||||
|
Currently, any kubelet is authorized to create/modify any node resource, but in practice it only creates/modifies
|
||||||
|
its own. (In the future, we plan to only allow a kubelet to modify its own node resource.)
|
||||||
|
|
||||||
|
#### Manual Node Administration
|
||||||
|
|
||||||
|
A cluster administrator can create and modify node objects.
|
||||||
|
|
||||||
|
If the administrator wishes to create node objects manually, set the kubelet flag
|
||||||
|
`--register-node=false`.
|
||||||
|
|
||||||
|
The administrator can modify node resources (regardless of the setting of `--register-node`).
|
||||||
|
Modifications include setting labels on the node and marking it unschedulable.
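
For example, adding a label to a node looks like this (the node name and label are placeholders):

```shell
# Attach a label to an existing node object.
kubectl label nodes <node-name> disktype=ssd
```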
|
||||||
|
|
||||||
|
Labels on nodes can be used in conjunction with node selectors on pods to control scheduling,
|
||||||
|
e.g. to constrain a pod to only be eligible to run on a subset of the nodes.
|
||||||
|
|
||||||
|
Marking a node as unschedulable will prevent new pods from being scheduled to that
|
||||||
|
node, but will not affect any existing pods on the node. This is useful as a
|
||||||
|
preparatory step before a node reboot, etc. For example, to mark a node
|
||||||
|
unschedulable, run this command:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
kubectl cordon $NODENAME
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that pods which are created by a daemonSet controller bypass the Kubernetes scheduler,
|
||||||
|
and do not respect the unschedulable attribute on a node. The assumption is that daemons belong on
|
||||||
|
the machine even if it is being drained of applications in preparation for a reboot.
|
||||||
|
|
||||||
|
### Node capacity
|
||||||
|
|
||||||
|
The capacity of the node (number of cpus and amount of memory) is part of the node object.
|
||||||
|
Normally, nodes register themselves and report their capacity when creating the node object. If
|
||||||
|
you are doing [manual node administration](#manual-node-administration), then you need to set node
|
||||||
|
capacity when adding a node.
|
||||||
|
|
||||||
|
The Kubernetes scheduler ensures that there are enough resources for all the pods on a node. It
|
||||||
|
checks that the sum of the limits of containers on the node is no greater than the node capacity. It
|
||||||
|
includes all containers started by the kubelet, but not containers started directly by Docker nor
|
||||||
|
processes not in containers.
|
||||||
|
|
||||||
|
If you want to explicitly reserve resources for non-pod processes, you can create a placeholder
|
||||||
|
pod. Use the following template:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Pod
|
||||||
|
metadata:
|
||||||
|
name: resource-reserver
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: sleep-forever
|
||||||
|
image: gcr.io/google_containers/pause:0.8.0
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 100Mi
|
||||||
|
```
|
||||||
|
|
||||||
|
Set the `cpu` and `memory` values to the amount of resources you want to reserve.
|
||||||
|
Place the file in the manifest directory (`--config=DIR` flag of kubelet). Do this
|
||||||
|
on each kubelet where you want to reserve resources.
|
||||||
|
|
||||||
|
|
||||||
|
## API Object
|
||||||
|
|
||||||
|
Node is a top-level resource in the Kubernetes REST API. More details about the
|
||||||
|
API object can be found at: [Node API
|
||||||
|
object](/docs/api-reference/v1.6/#node-v1-core).
|
|
@ -28,9 +28,7 @@ What constitutes a compatible change and how to change the API are detailed by t
|
||||||
|
|
||||||
Complete API details are documented using [Swagger v1.2](http://swagger.io/) and [OpenAPI](https://www.openapis.org/). The Kubernetes apiserver (aka "master") exposes an API that can be used to retrieve the Swagger v1.2 Kubernetes API spec located at `/swaggerapi`. You can also enable a UI to browse the API documentation at `/swagger-ui` by passing the `--enable-swagger-ui=true` flag to apiserver.
|
|
||||||
|
|
||||||
We also host a version of the [latest v1.2 API documentation UI](http://kubernetes.io/kubernetes/third_party/swagger-ui/). This is updated with the latest release, so if you are using a different version of Kubernetes you will want to use the spec from your apiserver.
|
Starting with kubernetes 1.4, OpenAPI spec is also available at [`/swagger.json`](https://github.com/kubernetes/kubernetes/blob/master/api/openapi-spec/swagger.json). While we are transitioning from Swagger v1.2 to OpenAPI (aka Swagger v2.0), some of the tools such as kubectl and swagger-ui are still using v1.2 spec. OpenAPI spec is in Beta as of Kubernetes 1.5.
|
||||||
|
|
||||||
Starting with kubernetes 1.4, OpenAPI spec is also available at `/swagger.json`. While we are transitioning from Swagger v1.2 to OpenAPI (aka Swagger v2.0), some of the tools such as kubectl and swagger-ui are still using v1.2 spec. OpenAPI spec is in Beta as of Kubernetes 1.5.
|
|
||||||
|
|
||||||
Kubernetes implements an alternative Protobuf based serialization format for the API that is primarily intended for intra-cluster communication, documented in the [design proposal](https://github.com/kubernetes/kubernetes/blob/{{ page.githubbranch }}/docs/proposals/protobuf.md) and the IDL files for each schema are located in the Go packages that define the API objects.
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,18 @@

---
assignees:
- mikedanese
- thockin
title: Names
---

All objects in the Kubernetes REST API are unambiguously identified by a Name and a UID.

For non-unique user-provided attributes, Kubernetes provides [labels](/docs/user-guide/labels) and [annotations](/docs/user-guide/annotations).

## Names

Names are generally client-provided. Only one object of a given kind can have a given name at a time (i.e., they are spatially unique). But if you delete an object, you can make a new object with the same name. Names are used to refer to an object in a resource URL, such as `/api/v1/pods/some-name`. By convention, the names of Kubernetes resources should have a maximum length of 253 characters and consist of lower case alphanumeric characters, `-`, and `.`, but certain resources have more specific restrictions. See the [identifiers design doc](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/identifiers.md) for the precise syntax rules for names.

## UIDs

UIDs are generated by Kubernetes. Every object created over the whole lifetime of a Kubernetes cluster has a distinct UID (i.e., they are spatially and temporally unique).
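For example, assuming a pod named `some-name` already exists in your cluster, you can read both identifiers back from the object's metadata (a minimal sketch; the pod name is a placeholder):

```shell
# Print the client-provided name and the system-generated UID of a pod.
# "some-name" is a hypothetical pod; substitute any object you have created.
$ kubectl get pod some-name -o jsonpath='{.metadata.name}{"\n"}{.metadata.uid}{"\n"}'
```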
@ -0,0 +1,88 @@

---
assignees:
- derekwaynecarr
- mikedanese
- thockin
title: Namespaces
---

Kubernetes supports multiple virtual clusters backed by the same physical cluster.
These virtual clusters are called namespaces.

## When to Use Multiple Namespaces

Namespaces are intended for use in environments with many users spread across multiple
teams or projects. For clusters with a few to tens of users, you should not
need to create or think about namespaces at all. Start using namespaces when you
need the features they provide.

Namespaces provide a scope for names. Names of resources need to be unique within a namespace, but not across namespaces.

Namespaces are a way to divide cluster resources between multiple users (via [resource quota](/docs/admin/resourcequota/)).

In future versions of Kubernetes, objects in the same namespace will have the same
access control policies by default.

It is not necessary to use multiple namespaces just to separate slightly different
resources, such as different versions of the same software: use [labels](/docs/user-guide/labels) to distinguish
resources within the same namespace.

## Working with Namespaces

Creation and deletion of namespaces is described in the [Admin Guide documentation
for namespaces](/docs/admin/namespaces).
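As a quick sketch (the namespace name `my-team` is only an example), a namespace can also be created directly with kubectl:

```shell
# Create a namespace and confirm it exists.
$ kubectl create namespace my-team
$ kubectl get namespace my-team
```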
### Viewing namespaces

You can list the current namespaces in a cluster using:

```shell
$ kubectl get namespaces
NAME          LABELS    STATUS
default       <none>    Active
kube-system   <none>    Active
```

Kubernetes starts with two initial namespaces:

* `default` The default namespace for objects with no other namespace
* `kube-system` The namespace for objects created by the Kubernetes system
### Setting the namespace for a request

To temporarily set the namespace for a request, use the `--namespace` flag.

For example:

```shell
$ kubectl --namespace=<insert-namespace-name-here> run nginx --image=nginx
$ kubectl --namespace=<insert-namespace-name-here> get pods
```

### Setting the namespace preference

You can permanently save the namespace for all subsequent kubectl commands in that
context.

```shell
$ kubectl config set-context $(kubectl config current-context) --namespace=<insert-namespace-name-here>
# Validate it
$ kubectl config view | grep namespace:
```
## Namespaces and DNS

When you create a [Service](/docs/user-guide/services), it creates a corresponding [DNS entry](/docs/admin/dns).
This entry is of the form `<service-name>.<namespace-name>.svc.cluster.local`, which means
that if a container just uses `<service-name>`, it will resolve to the service which
is local to a namespace. This is useful for using the same configuration across
multiple namespaces such as Development, Staging and Production. If you want to reach
across namespaces, you need to use the fully qualified domain name (FQDN).
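For example, assuming a Service named `redis` exists in a `staging` namespace and cluster DNS is set up as described above, a pod in another namespace could look it up by its FQDN (a hedged illustration; the service, namespace, and pod names are placeholders):

```shell
# The short name "redis" resolves only within the same namespace;
# the fully qualified name works from any namespace.
# Assumes the pod image ships nslookup (e.g. busybox).
$ kubectl exec -it <some-pod> -- nslookup redis.staging.svc.cluster.local
```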
## Not All Objects are in a Namespace

Most Kubernetes resources (e.g. pods, services, replication controllers, and others) are
in some namespace. However, namespace resources are not themselves in a namespace.
And low-level resources, such as [nodes](/docs/admin/node) and
persistentVolumes, are not in any namespace. Events are an exception: they may or may not
have a namespace, depending on the object the event is about.
@ -0,0 +1,294 @@

---
assignees:
- bprashanth
title: Ingress Resources
---

* TOC
{:toc}

__Terminology__

Throughout this doc you will see a few terms that are sometimes used interchangeably elsewhere and that might cause confusion. This section attempts to clarify them.

* Node: A single virtual or physical machine in a Kubernetes cluster.
* Cluster: A group of nodes firewalled from the internet, that are the primary compute resources managed by Kubernetes.
* Edge router: A router that enforces the firewall policy for your cluster. This could be a gateway managed by a cloudprovider or a physical piece of hardware.
* Cluster network: A set of links, logical or physical, that facilitate communication within a cluster according to the [Kubernetes networking model](/docs/admin/networking/). Examples of a Cluster network include Overlays such as [flannel](https://github.com/coreos/flannel#flannel) or SDNs such as [OVS](/docs/admin/ovs-networking/).
* Service: A Kubernetes [Service](/docs/user-guide/services/) that identifies a set of pods using label selectors. Unless mentioned otherwise, Services are assumed to have virtual IPs only routable within the cluster network.
## What is Ingress?

Typically, services and pods have IPs only routable by the cluster network. All traffic that ends up at an edge router is either dropped or forwarded elsewhere. Conceptually, this might look like:

```
    internet
        |
  ------------
  [ Services ]
```

An Ingress is a collection of rules that allow inbound connections to reach the cluster services.

```
    internet
        |
   [ Ingress ]
   --|-----|--
   [ Services ]
```

It can be configured to give services externally-reachable URLs, load balance traffic, terminate SSL, offer name-based virtual hosting etc. Users request ingress by POSTing the Ingress resource to the API server. An [Ingress controller](#ingress-controllers) is responsible for fulfilling the Ingress, usually with a loadbalancer, though it may also configure your edge router or additional frontends to help handle the traffic in an HA manner.
## Prerequisites

Before you start using the Ingress resource, there are a few things you should understand. The Ingress is a beta resource, not available in any Kubernetes release prior to 1.1. You need an Ingress controller to satisfy an Ingress; simply creating the resource will have no effect.

GCE/GKE deploys an ingress controller on the master. You can deploy any number of custom ingress controllers in a pod. You must annotate each ingress with the appropriate class, as indicated [here](https://github.com/kubernetes/ingress/tree/master/controllers/nginx#running-multiple-ingress-controllers) and [here](https://github.com/kubernetes/ingress/blob/master/controllers/gce/BETA_LIMITATIONS.md#disabling-glbc).

Make sure you review the [beta limitations](https://github.com/kubernetes/ingress/blob/master/controllers/gce/BETA_LIMITATIONS.md) of this controller. In environments other than GCE/GKE, you need to [deploy a controller](https://github.com/kubernetes/ingress/tree/master/controllers) as a pod.
## The Ingress Resource

A minimal Ingress might look like:

```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: test-ingress
spec:
  rules:
  - http:
      paths:
      - path: /testpath
        backend:
          serviceName: test
          servicePort: 80
```

*POSTing this to the API server will have no effect if you have not configured an [Ingress controller](#ingress-controllers).*

__Lines 1-4__: As with all other Kubernetes config, an Ingress needs `apiVersion`, `kind`, and `metadata` fields. For general information about working with config files, see [here](/docs/user-guide/deploying-applications), [here](/docs/user-guide/configuring-containers), and [here](/docs/user-guide/working-with-resources).

__Lines 5-7__: Ingress [spec](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status) has all the information needed to configure a loadbalancer or proxy server. Most importantly, it contains a list of rules matched against all incoming requests. Currently the Ingress resource only supports http rules.

__Lines 8-9__: Each http rule contains the following information: a host (e.g.: foo.bar.com, defaults to * in this example), a list of paths (e.g.: /testpath), each of which has an associated backend (test:80). Both the host and path must match the content of an incoming request before the loadbalancer directs traffic to the backend.

__Lines 10-12__: A backend is a service:port combination as described in the [services doc](/docs/user-guide/services). Ingress traffic is typically sent directly to the endpoints matching a backend.

__Global Parameters__: For the sake of simplicity the example Ingress has no global parameters; see the [api-reference](https://releases.k8s.io/{{page.githubbranch}}/pkg/apis/extensions/v1beta1/types.go) for a full definition of the resource. One can specify a global default backend in the absence of which requests that don't match a path in the spec are sent to the default backend of the Ingress controller.
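As a hedged usage sketch, assuming the manifest above is saved as `test-ingress.yaml` and an Ingress controller is running in the cluster, you can POST it via kubectl and then inspect it:

```shell
# Create the Ingress and list it; the ADDRESS column stays empty until
# an Ingress controller has satisfied the resource.
$ kubectl create -f test-ingress.yaml
$ kubectl get ing test-ingress
```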
## Ingress controllers

In order for the Ingress resource to work, the cluster must have an Ingress controller running. This is unlike other types of controllers, which typically run as part of the `kube-controller-manager` binary, and which are typically started automatically as part of cluster creation. You need to choose the ingress controller implementation that is the best fit for your cluster, or implement one. Examples and instructions can be found [here](https://github.com/kubernetes/ingress/tree/master/controllers).

## Before you begin

The following document describes a set of cross platform features exposed through the Ingress resource. Ideally, all Ingress controllers should fulfill this specification, but we're not there yet. The docs for the GCE and nginx controllers are [here](https://github.com/kubernetes/ingress/blob/master/controllers/gce/README.md) and [here](https://github.com/kubernetes/ingress/blob/master/controllers/nginx/README.md) respectively. **Make sure you review controller specific docs so you understand the caveats of each one**.
## Types of Ingress

### Single Service Ingress

There are existing Kubernetes concepts that allow you to expose a single service (see [alternatives](#alternatives)), however you can do so through an Ingress as well, by specifying a *default backend* with no rules.

{% include code.html language="yaml" file="ingress.yaml" ghlink="/docs/concepts/services-networking/ingress.yaml" %}

If you create it using `kubectl create -f` you should see:

```shell
$ kubectl get ing
NAME                RULE          BACKEND        ADDRESS
test-ingress        -             testsvc:80     107.178.254.228
```

Where `107.178.254.228` is the IP allocated by the Ingress controller to satisfy this Ingress. The `RULE` column shows that all traffic sent to the IP is directed to the Kubernetes Service listed under `BACKEND`.
### Simple fanout

As described previously, pods within Kubernetes have IPs only visible on the cluster network, so we need something at the edge accepting ingress traffic and proxying it to the right endpoints. This component is usually a highly available loadbalancer. An Ingress allows you to keep the number of loadbalancers down to a minimum, for example, a setup like:

```shell
foo.bar.com -> 178.91.123.132 -> / foo    s1:80
                                 / bar    s2:80
```

would require an Ingress such as:

```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: test
spec:
  rules:
  - host: foo.bar.com
    http:
      paths:
      - path: /foo
        backend:
          serviceName: s1
          servicePort: 80
      - path: /bar
        backend:
          serviceName: s2
          servicePort: 80
```

When you create the Ingress with `kubectl create -f`:

```shell
$ kubectl get ing
NAME      RULE          BACKEND   ADDRESS
test      -
          foo.bar.com
          /foo          s1:80
          /bar          s2:80
```

The Ingress controller will provision an implementation specific loadbalancer that satisfies the Ingress, as long as the services (s1, s2) exist. When it has done so, you will see the address of the loadbalancer under the last column of the Ingress.
### Name based virtual hosting

Name-based virtual hosts use multiple host names for the same IP address.

```
foo.bar.com --|                 |-> foo.bar.com s1:80
              | 178.91.123.132  |
bar.foo.com --|                 |-> bar.foo.com s2:80
```

The following Ingress tells the backing loadbalancer to route requests based on the [Host header](https://tools.ietf.org/html/rfc7230#section-5.4).

```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: test
spec:
  rules:
  - host: foo.bar.com
    http:
      paths:
      - backend:
          serviceName: s1
          servicePort: 80
  - host: bar.foo.com
    http:
      paths:
      - backend:
          serviceName: s2
          servicePort: 80
```

__Default Backends__: An Ingress with no rules, like the one shown in the previous section, sends all traffic to a single default backend. You can use the same technique to tell a loadbalancer where to find your website's 404 page, by specifying a set of rules *and* a default backend. Traffic is routed to your default backend if none of the Hosts in your Ingress match the Host in the request header, and/or none of the paths match the url of the request.
### TLS

You can secure an Ingress by specifying a [secret](/docs/user-guide/secrets) that contains a TLS private key and certificate. Currently the Ingress only supports a single TLS port, 443, and assumes TLS termination. If the TLS configuration section in an Ingress specifies different hosts, they will be multiplexed on the same port according to the hostname specified through the SNI TLS extension (provided the Ingress controller supports SNI). The TLS secret must contain keys named `tls.crt` and `tls.key` that contain the certificate and private key to use for TLS, e.g.:

```yaml
apiVersion: v1
data:
  tls.crt: base64 encoded cert
  tls.key: base64 encoded key
kind: Secret
metadata:
  name: testsecret
  namespace: default
type: Opaque
```
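Such a secret can also be generated from existing certificate and key files on disk, for example (a sketch; `path/to/tls.crt` and `path/to/tls.key` are placeholders for your own files):

```shell
# Creates a secret carrying the required tls.crt and tls.key entries.
# Note: this produces a secret of type kubernetes.io/tls rather than Opaque.
$ kubectl create secret tls testsecret --cert=path/to/tls.crt --key=path/to/tls.key --namespace=default
```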
Referencing this secret in an Ingress will tell the Ingress controller to secure the channel from the client to the loadbalancer using TLS:

```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: no-rules-map
spec:
  tls:
  - secretName: testsecret
  backend:
    serviceName: s1
    servicePort: 80
```

Note that there is a gap between TLS features supported by various Ingress controllers. Please refer to documentation on [nginx](https://github.com/kubernetes/ingress/blob/master/controllers/nginx/README.md#https), [GCE](https://github.com/kubernetes/ingress/blob/master/controllers/gce/README.md#tls), or any other platform specific Ingress controller to understand how TLS works in your environment.
### Loadbalancing

An Ingress controller is bootstrapped with some loadbalancing policy settings that it applies to all Ingress, such as the loadbalancing algorithm, backend weight scheme etc. More advanced loadbalancing concepts (e.g.: persistent sessions, dynamic weights) are not yet exposed through the Ingress. You can still get these features through the [service loadbalancer](https://github.com/kubernetes/contrib/tree/master/service-loadbalancer). With time, we plan to distill loadbalancing patterns that are applicable cross platform into the Ingress resource.

It's also worth noting that even though health checks are not exposed directly through the Ingress, there exist parallel concepts in Kubernetes such as [readiness probes](/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/) which allow you to achieve the same end result. Please review the controller specific docs to see how they handle health checks ([nginx](https://github.com/kubernetes/ingress/blob/master/controllers/nginx/README.md), [GCE](https://github.com/kubernetes/ingress/blob/master/controllers/gce/README.md#health-checks)).
## Updating an Ingress

Say you'd like to add a new Host to an existing Ingress; you can update it by editing the resource:

```shell
$ kubectl get ing
NAME      RULE          BACKEND   ADDRESS
test      -                       178.91.123.132
          foo.bar.com
          /foo          s1:80
$ kubectl edit ing test
```

This should pop up an editor with the existing yaml. Modify it to include the new Host:

```yaml
spec:
  rules:
  - host: foo.bar.com
    http:
      paths:
      - backend:
          serviceName: s1
          servicePort: 80
        path: /foo
  - host: bar.baz.com
    http:
      paths:
      - backend:
          serviceName: s2
          servicePort: 80
        path: /foo
..
```

Saving it will update the resource in the API server, which should tell the Ingress controller to reconfigure the loadbalancer.

```shell
$ kubectl get ing
NAME      RULE          BACKEND   ADDRESS
test      -                       178.91.123.132
          foo.bar.com
          /foo          s1:80
          bar.baz.com
          /foo          s2:80
```

You can achieve the same by invoking `kubectl replace -f` on a modified Ingress yaml file.
## Failing across availability zones

Techniques for spreading traffic across failure domains differ between cloud providers. Please check the documentation of the relevant Ingress controller for details. Please refer to the federation [doc](/docs/user-guide/federation/) for details on deploying Ingress in a federated cluster.

## Future Work

* Various modes of HTTPS/TLS support (e.g.: SNI, re-encryption)
* Requesting an IP or Hostname via claims
* Combining L4 and L7 Ingress
* More Ingress controllers

Please track the [L7 and Ingress proposal](https://github.com/kubernetes/kubernetes/pull/12827) for more details on the evolution of the resource, and the [Ingress repository](https://github.com/kubernetes/ingress/tree/master) for more details on the evolution of various Ingress controllers.

## Alternatives

You can expose a Service in multiple ways that don't directly involve the Ingress resource:

* Use [Service.Type=LoadBalancer](/docs/user-guide/services/#type-loadbalancer)
* Use [Service.Type=NodePort](/docs/user-guide/services/#type-nodeport)
* Use a [Port Proxy](https://github.com/kubernetes/contrib/tree/master/for-demos/proxy-to-service)
* Deploy the [Service loadbalancer](https://github.com/kubernetes/contrib/tree/master/service-loadbalancer). This allows you to share a single IP among multiple Services and achieve more advanced loadbalancing through Service Annotations.
@ -0,0 +1,9 @@

apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: test-ingress
spec:
  backend:
    serviceName: testsvc
    servicePort: 80
@ -0,0 +1,98 @@

---
assignees:
- thockin
- caseydavenport
title: Network Policies
---

* TOC
{:toc}

A network policy is a specification of how selections of pods are allowed to communicate with each other and other network endpoints.

`NetworkPolicy` resources use labels to select pods and define whitelist rules which allow traffic to the selected pods in addition to what is allowed by the isolation policy for a given namespace.

## Prerequisites

You must enable the `extensions/v1beta1/networkpolicies` runtime config in your apiserver to enable this resource.

You must also be using a networking solution which supports `NetworkPolicy`; simply creating the
resource without a controller to implement it will have no effect.
## Configuring Namespace Isolation Policy

Isolation can be configured on a per-namespace basis. Once isolation is configured on a namespace, it will be applied to all pods in that namespace. Currently, only isolation policy on inbound traffic (ingress) can be defined.

The following ingress isolation types are supported:

- `DefaultDeny`: Pods in the namespace will be inaccessible from any source except the pod's local node.

Ingress isolation can be enabled using an annotation on the Namespace.

```yaml
kind: Namespace
apiVersion: v1
metadata:
  annotations:
    net.beta.kubernetes.io/network-policy: |
      {
        "ingress": {
          "isolation": "DefaultDeny"
        }
      }
```

To configure the annotation via `kubectl`:

```shell
{% raw %}
kubectl annotate ns <namespace> "net.beta.kubernetes.io/network-policy={\"ingress\": {\"isolation\": \"DefaultDeny\"}}"
{% endraw %}
```

See the [NetworkPolicy getting started guide](/docs/getting-started-guides/network-policy/walkthrough) for an example.
## The `NetworkPolicy` Resource

See the [api-reference](/docs/api-reference/extensions/v1beta1/definitions/#_v1beta1_networkpolicy) for a full definition of the resource.

A minimal `NetworkPolicy` might look like this:

```yaml
apiVersion: extensions/v1beta1
kind: NetworkPolicy
metadata:
  name: test-network-policy
  namespace: default
spec:
  podSelector:
    matchLabels:
      role: db
  ingress:
  - from:
    - namespaceSelector:
        matchLabels:
          project: myproject
    - podSelector:
        matchLabels:
          role: frontend
    ports:
    - protocol: tcp
      port: 6379
```

*POSTing this to the API server will have no effect unless your chosen networking solution supports network policy.*

__Mandatory Fields__: As with all other Kubernetes config, a `NetworkPolicy` needs `apiVersion`, `kind`, and `metadata` fields. For general information about working with config files, see [here](/docs/user-guide/simple-yaml), [here](/docs/user-guide/configuring-containers), and [here](/docs/user-guide/working-with-resources).

__spec__: `NetworkPolicy` [spec](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status) has all the information needed to define a network isolation policy in the deployed controller.

__podSelector__: Each `NetworkPolicy` includes a `podSelector` which selects the grouping of pods to which the `ingress` rules in the policy apply.

__ingress__: Each `NetworkPolicy` includes a list of whitelist `ingress` rules. Each rule allows traffic which matches both the `from` and `ports` sections.

This example NetworkPolicy has the following characteristics:

1. applies to all pods in the default namespace with the label "role=db"
2. allows tcp/6379 ingress traffic to the "role=db" pods from any pod in the current namespace with the label "role=frontend" (due to the podSelector list element)
3. allows tcp/6379 ingress traffic to the "role=db" pods from any pod in a namespace labeled "project=myproject" (due to the namespaceSelector list element)
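As a hedged usage sketch, assuming the manifest above is saved as `test-network-policy.yaml` and your network plugin implements `NetworkPolicy`, you can create and then list the policy with kubectl:

```shell
# Create the policy and confirm the API server accepted it.
$ kubectl create -f test-network-policy.yaml
$ kubectl get networkpolicies --namespace=default
```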
@ -0,0 +1,679 @@

---
assignees:
- jsafrane
- mikedanese
- saad-ali
- thockin
title: Persistent Volumes
---

This document describes the current state of `PersistentVolumes` in Kubernetes. Familiarity with [volumes](/docs/concepts/storage/volumes/) is suggested.

* TOC
{:toc}
## Introduction

Managing storage is a distinct problem from managing compute. The `PersistentVolume` subsystem provides an API for users and administrators that abstracts details of how storage is provided from how it is consumed. To do this we introduce two new API resources: `PersistentVolume` and `PersistentVolumeClaim`.

A `PersistentVolume` (PV) is a piece of networked storage in the cluster that has been provisioned by an administrator. It is a resource in the cluster just like a node is a cluster resource. PVs are volume plugins like Volumes, but have a lifecycle independent of any individual pod that uses the PV. This API object captures the details of the implementation of the storage, be that NFS, iSCSI, or a cloud-provider-specific storage system.

A `PersistentVolumeClaim` (PVC) is a request for storage by a user. It is similar to a pod. Pods consume node resources and PVCs consume PV resources. Pods can request specific levels of resources (CPU and Memory). Claims can request specific size and access modes (e.g., can be mounted once read/write or many times read-only).

While `PersistentVolumeClaims` allow a user to consume abstract storage
resources, it is common that users need `PersistentVolumes` with varying
properties, such as performance, for different problems. Cluster administrators
need to be able to offer a variety of `PersistentVolumes` that differ in more
ways than just size and access modes, without exposing users to the details of
how those volumes are implemented. For these needs there is the `StorageClass`
resource.

A `StorageClass` provides a way for administrators to describe the "classes" of
storage they offer. Different classes might map to quality-of-service levels,
or to backup policies, or to arbitrary policies determined by the cluster
administrators. Kubernetes itself is unopinionated about what classes
represent. This concept is sometimes called "profiles" in other storage
systems.

Please see the [detailed walkthrough with working examples](/docs/user-guide/persistent-volumes/walkthrough/).
## Lifecycle of a volume and claim

PVs are resources in the cluster. PVCs are requests for those resources and also act as claim checks to the resource. The interaction between PVs and PVCs follows this lifecycle:

### Provisioning

There are two ways PVs may be provisioned: statically or dynamically.

#### Static

A cluster administrator creates a number of PVs. They carry the details of the real storage which is available for use by cluster users. They exist in the Kubernetes API and are available for consumption.

#### Dynamic

When none of the static PVs the administrator created matches a user's `PersistentVolumeClaim`, the cluster may try to dynamically provision a volume specially for the PVC. This provisioning is based on `StorageClasses`: the PVC must request a class and the administrator must have created and configured that class in order for dynamic provisioning to occur. Claims that request the class `""` effectively disable dynamic provisioning for themselves.

### Binding

A user creates, or has already created in the case of dynamic provisioning, a `PersistentVolumeClaim` with a specific amount of storage requested and with certain access modes. A control loop in the master watches for new PVCs, finds a matching PV (if possible), and binds them together. If a PV was dynamically provisioned for a new PVC, the loop will always bind that PV to the PVC. Otherwise, the user will always get at least what they asked for, but the volume may be in excess of what was requested. Once bound, `PersistentVolumeClaim` binds are exclusive, regardless of the mode used to bind them.

Claims will remain unbound indefinitely if a matching volume does not exist. Claims will be bound as matching volumes become available. For example, a cluster provisioned with many 50Gi PVs would not match a PVC requesting 100Gi. The PVC can be bound when a 100Gi PV is added to the cluster.
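To watch binding happen, a small kubectl sketch: list both resources and check the `STATUS` column, which shows `Available`/`Bound` for PVs and `Pending`/`Bound` for PVCs.

```shell
# Inspect volumes and claims; a claim stays Pending until a matching PV exists.
$ kubectl get pv
$ kubectl get pvc --namespace=default
```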
### Using

Pods use claims as volumes. The cluster inspects the claim to find the bound volume and mounts that volume for a pod. For volumes which support multiple access modes, the user specifies which mode is desired when using their claim as a volume in a pod.

Once a user has a claim and that claim is bound, the bound PV belongs to the user for as long as they need it. Users schedule Pods and access their claimed PVs by including a persistentVolumeClaim in their Pod's volumes block. [See below for syntax details](#claims-as-volumes).

### Releasing

When a user is done with their volume, they can delete the PVC objects from the API which allows reclamation of the resource. The volume is considered "released" when the claim is deleted, but it is not yet available for another claim. The previous claimant's data remains on the volume which must be handled according to policy.
### Reclaiming

The reclaim policy for a `PersistentVolume` tells the cluster what to do with the volume after it has been released of its claim. Currently, volumes can either be Retained, Recycled or Deleted. Retention allows for manual reclamation of the resource. For those volume plugins that support it, deletion removes both the `PersistentVolume` object from Kubernetes, as well as deleting the associated storage asset in external infrastructure (such as an AWS EBS, GCE PD, Azure Disk, or Cinder volume). Volumes that were dynamically provisioned are always deleted.

#### Recycling

If supported by the appropriate volume plugin, recycling performs a basic scrub (`rm -rf /thevolume/*`) on the volume and makes it available again for a new claim.

However, an administrator can configure a custom recycler pod template using the Kubernetes controller manager command line arguments as described [here](/docs/admin/kube-controller-manager/). The custom recycler pod template must contain a `volumes` specification, as shown in the example below:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: pv-recycler-
  namespace: default
spec:
  restartPolicy: Never
  volumes:
  - name: vol
    hostPath:
      path: /any/path/it/will/be/replaced
  containers:
  - name: pv-recycler
    image: "gcr.io/google_containers/busybox"
    command: ["/bin/sh", "-c", "test -e /scrub && rm -rf /scrub/..?* /scrub/.[!.]* /scrub/* && test -z \"$(ls -A /scrub)\" || exit 1"]
    volumeMounts:
    - name: vol
      mountPath: /scrub
```

Note that the particular path specified in the custom recycler pod template in the `volumes` part is replaced with the particular path of the volume that is being recycled.
## Types of Persistent Volumes

`PersistentVolume` types are implemented as plugins. Kubernetes currently supports the following plugins:

* GCEPersistentDisk
* AWSElasticBlockStore
* AzureFile
* AzureDisk
* FC (Fibre Channel)
* Flocker
* NFS
* iSCSI
* RBD (Ceph Block Device)
* CephFS
* Cinder (OpenStack block storage)
* Glusterfs
* VsphereVolume
* Quobyte Volumes
* HostPath (single node testing only -- local storage is not supported in any way and WILL NOT WORK in a multi-node cluster)
* VMware Photon
* Portworx Volumes
* ScaleIO Volumes
## Persistent Volumes

Each PV contains a spec and status, which is the specification and status of the volume.

```yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv0003
spec:
  capacity:
    storage: 5Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  storageClassName: slow
  nfs:
    path: /tmp
    server: 172.17.0.2
```
### Capacity

Generally, a PV will have a specific storage capacity. This is set using the PV's `capacity` attribute. See the Kubernetes [Resource Model](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/resources.md) to understand the units expected by `capacity`.

Currently, storage size is the only resource that can be set or requested. Future attributes may include IOPS, throughput, etc.
### Access Modes

A `PersistentVolume` can be mounted on a host in any way supported by the resource provider. As shown in the table below, providers will have different capabilities and each PV's access modes are set to the specific modes supported by that particular volume. For example, NFS can support multiple read/write clients, but a specific NFS PV might be exported on the server as read-only. Each PV gets its own set of access modes describing that specific PV's capabilities.

The access modes are:

* ReadWriteOnce -- the volume can be mounted as read-write by a single node
* ReadOnlyMany -- the volume can be mounted read-only by many nodes
* ReadWriteMany -- the volume can be mounted as read-write by many nodes

In the CLI, the access modes are abbreviated to:

* RWO - ReadWriteOnce
* ROX - ReadOnlyMany
* RWX - ReadWriteMany

> __Important!__ A volume can only be mounted using one access mode at a time, even if it supports many. For example, a GCEPersistentDisk can be mounted as ReadWriteOnce by a single node or ReadOnlyMany by many nodes, but not at the same time.

| Volume Plugin | ReadWriteOnce | ReadOnlyMany | ReadWriteMany |
| :--- | :---: | :---: | :---: |
| AWSElasticBlockStore | ✓ | - | - |
| AzureFile | ✓ | ✓ | ✓ |
| AzureDisk | ✓ | - | - |
| CephFS | ✓ | ✓ | ✓ |
| Cinder | ✓ | - | - |
| FC | ✓ | ✓ | - |
| FlexVolume | ✓ | ✓ | - |
| Flocker | ✓ | - | - |
| GCEPersistentDisk | ✓ | ✓ | - |
| Glusterfs | ✓ | ✓ | ✓ |
| HostPath | ✓ | - | - |
| iSCSI | ✓ | ✓ | - |
| PhotonPersistentDisk | ✓ | - | - |
| Quobyte | ✓ | ✓ | ✓ |
| NFS | ✓ | ✓ | ✓ |
| RBD | ✓ | ✓ | - |
| VsphereVolume | ✓ | - | - |
| PortworxVolume | ✓ | - | ✓ |
| ScaleIO | ✓ | ✓ | - |
### Class

A PV can have a class, which is specified by setting the
`storageClassName` attribute to the name of a
`StorageClass`. A PV of a particular class can only be bound to PVCs requesting
that class. A PV with no `storageClassName` has no class and can only be bound
to PVCs that request no particular class.

In the past, the annotation `volume.beta.kubernetes.io/storage-class` was used instead
of the `storageClassName` attribute. This annotation still works; however,
it will become fully deprecated in a future Kubernetes release.
### Reclaim Policy

Current reclaim policies are:

* Retain -- manual reclamation
* Recycle -- basic scrub ("rm -rf /thevolume/*")
* Delete -- associated storage asset such as AWS EBS, GCE PD, Azure Disk, or OpenStack Cinder volume is deleted

Currently, only NFS and HostPath support recycling. AWS EBS, GCE PD, Azure Disk, and Cinder volumes support deletion.
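As a hedged sketch of changing the policy on an existing volume (the PV name `pv0003` is just the example used earlier), you can patch the `persistentVolumeReclaimPolicy` field directly:

```shell
# Switch an existing PV to the Retain reclaim policy.
$ kubectl patch pv pv0003 -p '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}'
```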
### Phase

A volume will be in one of the following phases:

* Available -- a free resource that is not yet bound to a claim
* Bound -- the volume is bound to a claim
* Released -- the claim has been deleted, but the resource is not yet reclaimed by the cluster
* Failed -- the volume has failed its automatic reclamation

The CLI will show the name of the PVC bound to the PV.
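For example (an illustrative sketch using the `pv0003` volume from above), the phase and the bound claim are visible in the `STATUS` and `CLAIM` columns, or in the object's `status.phase` field:

```shell
# Show the phase and bound claim for one PV.
$ kubectl get pv pv0003
$ kubectl get pv pv0003 -o jsonpath='{.status.phase}{"\n"}'
```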
### Mount Options

A Kubernetes administrator can specify additional mount options for when a Persistent Volume is being mounted on a node.

You can specify a mount option by using the annotation `volume.beta.kubernetes.io/mount-options` on
your Persistent Volume.

For example:

```yaml
apiVersion: "v1"
kind: "PersistentVolume"
metadata:
  name: gce-disk-1
  annotations:
    volume.beta.kubernetes.io/mount-options: "discard"
spec:
  capacity:
    storage: "10Gi"
  accessModes:
    - "ReadWriteOnce"
  gcePersistentDisk:
    fsType: "ext4"
    pdName: "gce-disk-1"
```

A mount option is a string which will be cumulatively joined and used while mounting the volume to the disk.

Note that not all Persistent Volume types support mount options. In Kubernetes version 1.6, the following
volume types support mount options:

* GCEPersistentDisk
* AWSElasticBlockStore
* AzureFile
* AzureDisk
* NFS
* iSCSI
* RBD (Ceph Block Device)
* CephFS
* Cinder (OpenStack block storage)
* Glusterfs
* VsphereVolume
* Quobyte Volumes
* VMware Photon
## PersistentVolumeClaims

Each PVC contains a spec and status, which is the specification and status of the claim.

```yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: myclaim
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 8Gi
  storageClassName: slow
  selector:
    matchLabels:
      release: "stable"
    matchExpressions:
      - {key: environment, operator: In, values: [dev]}
```
### Access Modes

Claims use the same conventions as volumes when requesting storage with specific access modes.

### Resources

Claims, like pods, can request specific quantities of a resource. In this case, the request is for storage. The same [resource model](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/resources.md) applies to both volumes and claims.

### Selector

Claims can specify a [label selector](/docs/user-guide/labels/#label-selectors) to further filter the set of volumes. Only the volumes whose labels match the selector can be bound to the claim. The selector can consist of two fields:

* matchLabels - the volume must have a label with this value
* matchExpressions - a list of requirements made by specifying key, list of values, and operator that relates the key and values. Valid operators include In, NotIn, Exists, and DoesNotExist.

All of the requirements, from both `matchLabels` and `matchExpressions`, are ANDed together; they must all be satisfied in order to match.
### Class

A claim can request a particular class by specifying the name of a
`StorageClass` using the attribute `storageClassName`.
Only PVs of the requested class, ones with the same `storageClassName` as the PVC, can
be bound to the PVC.

PVCs don't necessarily have to request a class. A PVC with its `storageClassName` set
equal to `""` is always interpreted to be requesting a PV with no class, so it
can only be bound to PVs with no class (no annotation or one set equal to
`""`). A PVC with no `storageClassName` is not quite the same and is treated differently
by the cluster depending on whether the
[`DefaultStorageClass` admission plugin](/docs/admin/admission-controllers/#defaultstorageclass)
is turned on.

* If the admission plugin is turned on, the administrator may specify a
default `StorageClass`. All PVCs that have no `storageClassName` can be bound only to
PVs of that default. Specifying a default `StorageClass` is done by setting the
annotation `storageclass.kubernetes.io/is-default-class` equal to "true" in
a `StorageClass` object. If the administrator does not specify a default, the
cluster responds to PVC creation as if the admission plugin were turned off. If
more than one default is specified, the admission plugin forbids the creation of
all PVCs.
* If the admission plugin is turned off, there is no notion of a default
`StorageClass`. All PVCs that have no `storageClassName` can be bound only to PVs that
have no class. In this case, the PVCs that have no `storageClassName` are treated the
same way as PVCs that have their `storageClassName` set to `""`.

Depending on the installation method, a default StorageClass may be deployed
to a Kubernetes cluster by the addon manager during installation.
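A hedged sketch of marking an existing class as the default (assuming a `StorageClass` named `standard` already exists) is to set the annotation mentioned above on it:

```shell
# Mark the "standard" StorageClass as the cluster default.
$ kubectl patch storageclass standard -p '{"metadata": {"annotations": {"storageclass.kubernetes.io/is-default-class": "true"}}}'
```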
When a PVC specifies a `selector` in addition to requesting a `StorageClass`,
the requirements are ANDed together: only a PV of the requested class and with
the requested labels may be bound to the PVC. Note that currently, a PVC with a
non-empty `selector` can't have a PV dynamically provisioned for it.

In the past, the annotation `volume.beta.kubernetes.io/storage-class` was used instead
of the `storageClassName` attribute. This annotation still works; however,
it won't be supported in a future Kubernetes release.
## Claims As Volumes

Pods access storage by using the claim as a volume. Claims must exist in the same namespace as the pod using the claim. The cluster finds the claim in the pod's namespace and uses it to get the `PersistentVolume` backing the claim. The volume is then mounted to the host and into the pod.

```yaml
kind: Pod
apiVersion: v1
metadata:
  name: mypod
spec:
  containers:
    - name: myfrontend
      image: dockerfile/nginx
      volumeMounts:
      - mountPath: "/var/www/html"
        name: mypd
  volumes:
    - name: mypd
      persistentVolumeClaim:
        claimName: myclaim
```

### A Note on Namespaces

`PersistentVolume` binds are exclusive, and since `PersistentVolumeClaims` are namespaced objects, mounting claims with "Many" modes (`ROX`, `RWX`) is only possible within one namespace.
## StorageClasses

Each `StorageClass` contains the fields `provisioner` and `parameters`, which
are used when a `PersistentVolume` belonging to the class needs to be
dynamically provisioned.

The name of a `StorageClass` object is significant, and is how users can
request a particular class. Administrators set the name and other parameters
of a class when first creating `StorageClass` objects, and the objects cannot
be updated once they are created.

Administrators can specify a default `StorageClass` just for PVCs that don't
request any particular class to bind to: see the
[`PersistentVolumeClaim` section](#persistentvolumeclaims)
for details.

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: standard
provisioner: kubernetes.io/aws-ebs
parameters:
  type: gp2
```
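As a short usage sketch (assuming the manifest above is saved as `standard-class.yaml`), classes are created and listed like any other non-namespaced resource:

```shell
# Create the class and list the classes known to the cluster.
$ kubectl create -f standard-class.yaml
$ kubectl get storageclasses
```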
### Provisioner

Storage classes have a provisioner that determines what volume plugin is used
for provisioning PVs. This field must be specified.

You are not restricted to specifying the "internal" provisioners
listed here (whose names are prefixed with "kubernetes.io" and shipped
alongside Kubernetes). You can also run and specify external provisioners,
which are independent programs that follow a [specification](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/volume-provisioning.md)
defined by Kubernetes. Authors of external provisioners have full discretion
over where their code lives, how the provisioner is shipped, how it needs to be
run, what volume plugin it uses (including Flex), etc. The repository [kubernetes-incubator/external-storage](https://github.com/kubernetes-incubator/external-storage)
houses a library for writing external provisioners that implements the bulk of
the specification plus various community-maintained external provisioners.
### Parameters

Storage classes have parameters that describe volumes belonging to the storage
class. Different parameters may be accepted depending on the `provisioner`. For
example, the value `io1`, for the parameter `type`, and the parameter
`iopsPerGB` are specific to EBS. When a parameter is omitted, some default is
used.

#### AWS

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: slow
provisioner: kubernetes.io/aws-ebs
parameters:
  type: io1
  zone: us-east-1d
  iopsPerGB: "10"
```

* `type`: `io1`, `gp2`, `sc1`, `st1`. See [AWS docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html) for details. Default: `gp2`.
* `zone`: AWS zone. If not specified, a random zone from those where the Kubernetes cluster has a node is chosen.
* `iopsPerGB`: only for `io1` volumes. I/O operations per second per GiB. The AWS volume plugin multiplies this with the size of the requested volume to compute IOPS of the volume and caps it at 20,000 IOPS (the maximum supported by AWS, see [AWS docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html)). A string is expected here, i.e. `"10"`, not `10`.
* `encrypted`: denotes whether the EBS volume should be encrypted or not. Valid values are `"true"` or `"false"`. A string is expected here, i.e. `"true"`, not `true`.
* `kmsKeyId`: optional. The full Amazon Resource Name of the key to use when encrypting the volume. If none is supplied but `encrypted` is true, a key is generated by AWS. See AWS docs for valid ARN values.
#### GCE

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: slow
provisioner: kubernetes.io/gce-pd
parameters:
  type: pd-standard
  zone: us-central1-a
```

* `type`: `pd-standard` or `pd-ssd`. Default: `pd-standard`
* `zone`: GCE zone. If not specified, a random zone in the same region as controller-manager will be chosen.
#### Glusterfs

```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: slow
provisioner: kubernetes.io/glusterfs
parameters:
  resturl: "http://127.0.0.1:8081"
  restauthenabled: "true"
  restuser: "admin"
  secretNamespace: "default"
  secretName: "heketi-secret"
```

* `resturl`: Gluster REST service/Heketi service url which provisions gluster volumes on demand. The general format should be `IPaddress:Port` and this is a mandatory parameter for the GlusterFS dynamic provisioner. If the Heketi service is exposed as a routable service in an openshift/kubernetes setup, this can have a format similar to
`http://heketi-storage-project.cloudapps.mystorage.com` where the fqdn is a resolvable heketi service url.
* `restauthenabled` : Gluster REST service authentication boolean that enables authentication to the REST server. If this value is 'true', `restuser` and `restuserkey` or `secretNamespace` + `secretName` have to be filled. This option is deprecated; authentication is enabled when any of `restuser`, `restuserkey`, `secretName` or `secretNamespace` is specified.
* `restuser` : Gluster REST service/Heketi user who has access to create volumes in the Gluster Trusted Pool.
* `restuserkey` : Gluster REST service/Heketi user's password which will be used for authentication to the REST server. This parameter is deprecated in favor of `secretNamespace` + `secretName`.
* `secretNamespace` + `secretName` : Identification of the Secret instance that contains the user password to use when talking to the Gluster REST service. These parameters are optional; an empty password will be used when both `secretNamespace` and `secretName` are omitted. The provided secret must have type "kubernetes.io/glusterfs", e.g. created in this way:

```
$ kubectl create secret generic heketi-secret --type="kubernetes.io/glusterfs" --from-literal=key='opensesame' --namespace=default
```
#### OpenStack Cinder

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: gold
provisioner: kubernetes.io/cinder
parameters:
  type: fast
  availability: nova
```

* `type`: [VolumeType](http://docs.openstack.org/admin-guide/dashboard-manage-volumes.html) created in Cinder. Default is empty.
* `availability`: Availability Zone. Default is empty.
#### vSphere

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: fast
provisioner: kubernetes.io/vsphere-volume
parameters:
  diskformat: zeroedthick
```

* `diskformat`: `thin`, `zeroedthick` and `eagerzeroedthick`. Default: `"thin"`.
#### Ceph RBD
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
apiVersion: storage.k8s.io/v1
|
||||||
|
kind: StorageClass
|
||||||
|
metadata:
|
||||||
|
name: fast
|
||||||
|
provisioner: kubernetes.io/rbd
|
||||||
|
parameters:
|
||||||
|
monitors: 10.16.153.105:6789
|
||||||
|
adminId: kube
|
||||||
|
adminSecretName: ceph-secret
|
||||||
|
adminSecretNamespace: kube-system
|
||||||
|
pool: kube
|
||||||
|
userId: kube
|
||||||
|
userSecretName: ceph-secret-user
|
||||||
|
```
|
||||||
|
|
||||||
|
* `monitors`: Ceph monitors, comma delimited. This parameter is required.
|
||||||
|
* `adminId`: Ceph client ID that is capable of creating images in the pool. Default is "admin".
|
||||||
|
* `adminSecretNamespace`: The namespace for `adminSecret`. Default is "default".
|
||||||
|
* `adminSecret`: Secret Name for `adminId`. This parameter is required. The provided secret must have type "kubernetes.io/rbd".
|
||||||
|
* `pool`: Ceph RBD pool. Default is "rbd".
|
||||||
|
* `userId`: Ceph client ID that is used to map the RBD image. Default is the same as `adminId`.
|
||||||
|
* `userSecretName`: The name of Ceph Secret for `userId` to map RBD image. It must exist in the same namespace as PVCs. This parameter is required. The provided secret must have type "kubernetes.io/rbd", e.g. created in this way:
|
||||||
|
```
$ kubectl create secret generic ceph-secret --type="kubernetes.io/rbd" --from-literal=key='QVFEQ1pMdFhPUnQrSmhBQUFYaERWNHJsZ3BsMmNjcDR6RFZST0E9PQ==' --namespace=kube-system
```
|
||||||
|
|
||||||
|
#### Quobyte
|
||||||
|
|
||||||
|
```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: slow
provisioner: kubernetes.io/quobyte
parameters:
  quobyteAPIServer: "http://138.68.74.142:7860"
  registry: "138.68.74.142:7861"
  adminSecretName: "quobyte-admin-secret"
  adminSecretNamespace: "kube-system"
  user: "root"
  group: "root"
  quobyteConfig: "BASE"
  quobyteTenant: "DEFAULT"
```
|
||||||
|
|
||||||
|
* `quobyteAPIServer`: API Server of Quobyte in the format `http(s)://api-server:7860`
|
||||||
|
* `registry`: Quobyte registry to use to mount the volume. You can specify the registry as a single ``<host>:<port>`` pair, or list multiple registries separated by commas, e.g. ``<host1>:<port>,<host2>:<port>,<host3>:<port>``. The host can be an IP address, or a DNS name if you have a working DNS.
|
||||||
|
* `adminSecretNamespace`: The namespace for `adminSecretName`. Default is "default".
|
||||||
|
* `adminSecretName`: secret that holds information about the Quobyte user and the password to authenticate against the API server. The provided secret must have type "kubernetes.io/quobyte", e.g. created in this way:
|
||||||
|
```
$ kubectl create secret generic quobyte-admin-secret --type="kubernetes.io/quobyte" --from-literal=key='opensesame' --namespace=kube-system
```
|
||||||
|
* `user`: maps all access to this user. Default is "root".
|
||||||
|
* `group`: maps all access to this group. Default is "nfsnobody".
|
||||||
|
* `quobyteConfig`: use the specified configuration to create the volume. You can create a new configuration or modify an existing one with the Web console or the quobyte CLI. Default is "BASE".
|
||||||
|
* `quobyteTenant`: use the specified tenant ID to create/delete the volume. This Quobyte tenant has to be already present in Quobyte. Default is "DEFAULT".
|
||||||
|
|
||||||
|
#### Azure Disk
|
||||||
|
|
||||||
|
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: slow
provisioner: kubernetes.io/azure-disk
parameters:
  skuName: Standard_LRS
  location: eastus
  storageAccount: azure_storage_account_name
```
|
||||||
|
|
||||||
|
* `skuName`: Azure storage account Sku tier. Default is empty.
|
||||||
|
* `location`: Azure storage account location. Default is empty.
|
||||||
|
* `storageAccount`: Azure storage account name. If storage account is not provided, all storage accounts associated with the resource group are searched to find one that matches `skuName` and `location`. If storage account is provided, it must reside in the same resource group as the cluster, and `skuName` and `location` are ignored.
|
||||||
|
|
||||||
|
#### Portworx Volume
|
||||||
|
|
||||||
|
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: portworx-io-priority-high
provisioner: kubernetes.io/portworx-volume
parameters:
  repl: "1"
  snap_interval: "70"
  io_priority: "high"
```
|
||||||
|
|
||||||
|
* `fs`: filesystem to be laid out: [none/xfs/ext4] (default: `ext4`).
|
||||||
|
* `block_size`: block size in Kbytes (default: `32`).
|
||||||
|
* `repl`: number of synchronous replicas to be provided in the form of a replication factor [1..3] (default: `1`). A string is expected here, i.e. `"1"` and not `1`.

* `io_priority`: determines whether the volume will be created from higher performance or lower priority storage [high/medium/low] (default: `low`).

* `snap_interval`: clock/time interval in minutes at which to trigger snapshots. Snapshots are incremental based on the difference from the prior snapshot; 0 disables snapshots (default: `0`). A string is expected here, i.e. `"70"` and not `70`.

* `aggregation_level`: specifies the number of chunks the volume is distributed into; 0 indicates a non-aggregated volume (default: `0`). A string is expected here, i.e. `"0"` and not `0`.

* `ephemeral`: specifies whether the volume should be cleaned up after unmount or should be persistent. The `emptyDir` use case can set this value to true, while persistent volume use cases, such as databases like Cassandra, should set it to false [true/false] (default: `false`). A string is expected here, i.e. `"true"` and not `true`.
|
||||||
|
|
||||||
|
#### ScaleIO
|
||||||
|
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: slow
provisioner: kubernetes.io/scaleio
parameters:
  gateway: https://192.168.99.200:443/api
  system: scaleio
  protectionDomain: default
  storagePool: default
  storageMode: ThinProvisioned
  secretRef: sio-secret
  readOnly: false
  fsType: xfs
```
|
||||||
|
|
||||||
|
* `provisioner`: attribute is set to `kubernetes.io/scaleio`
|
||||||
|
* `gateway`: address to a ScaleIO API gateway (required)
|
||||||
|
* `system`: the name of the ScaleIO system (required)
|
||||||
|
* `protectionDomain`: the name of the ScaleIO protection domain
|
||||||
|
* `storagePool`: the name of the volume storage pool
|
||||||
|
* `storageMode`: the storage provisioning mode: `ThinProvisioned` (default) or `ThickProvisioned`

* `secretRef`: reference to a configured Secret object (required, see details below)
|
||||||
|
* `readOnly`: specifies the access mode to the mounted volume
|
||||||
|
* `fsType`: the file system to use for the volume
|
||||||
|
|
||||||
|
The ScaleIO Kubernetes volume plugin requires a configured Secret object.
The secret must be created with type `kubernetes.io/scaleio` and use the same namespace value as that of the PVC where it is referenced,
as shown in the following command:
|
||||||
|
|
||||||
|
```
$ kubectl create secret generic sio-secret --type="kubernetes.io/scaleio" --from-literal=username=sioadmin --from-literal=password=d2NABDNjMA== --namespace=default
```
|
||||||
|
|
||||||
|
## Writing Portable Configuration
|
||||||
|
|
||||||
|
If you're writing configuration templates or examples that run on a wide range of clusters
|
||||||
|
and need persistent storage, we recommend that you use the following pattern:
|
||||||
|
|
||||||
|
- Do include PersistentVolumeClaim objects in your bundle of config (alongside Deployments, ConfigMaps, etc).
|
||||||
|
- Do not include PersistentVolume objects in the config, since the user instantiating the config may not have
|
||||||
|
permission to create PersistentVolumes.
|
||||||
|
- Give the user the option of providing a storage class name when instantiating the template.
|
||||||
|
- If the user provides a storage class name, and the cluster is version 1.4 or newer, put that value into the `volume.beta.kubernetes.io/storage-class` annotation of the PVC.
  This will cause the PVC to match the right storage class if the cluster has StorageClasses enabled by the admin (see the example PVC after this list).
|
||||||
|
- If the user does not provide a storage class name or the cluster is version 1.3, then instead put a `volume.alpha.kubernetes.io/storage-class: default` annotation on the PVC.
|
||||||
|
- This will cause a PV to be automatically provisioned for the user with sane default characteristics on some clusters.
|
||||||
|
- Despite the word `alpha` in the name, the code behind this annotation has `beta` level support.
|
||||||
|
- Do not use `volume.beta.kubernetes.io/storage-class:` with any value including the empty string since it will prevent DefaultStorageClass admission controller
|
||||||
|
from running if enabled.
|
||||||
|
- In your tooling, watch for PVCs that do not get bound after some time and surface this to the user. It may indicate that the cluster has no dynamic
  storage support (in which case the user should create a matching PV), or that the cluster has no storage system at all (in which case the user cannot deploy
  config that requires PVCs).
|
||||||
|
- In the future, we expect most clusters to have `DefaultStorageClass` enabled and some form of storage available. However, there may not be any
  storage class names that work on all clusters, so continue to leave the storage class unset by default.
  At some point, the alpha annotation will cease to have meaning, but the unset `storageClass` field on the PVC
  will have the desired effect.
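
To make the pattern above concrete, here is a minimal PVC sketch that follows these recommendations. The claim name, size, and the `STORAGE_CLASS` placeholder are illustrative and not part of any particular template:

```yaml
# Hypothetical PVC following the portable-configuration pattern above.
# "STORAGE_CLASS" is a placeholder for the user-provided class name; if the user
# gives none, use the alpha "default" annotation described above instead.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: app-data
  annotations:
    volume.beta.kubernetes.io/storage-class: STORAGE_CLASS
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 8Gi
```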
|
|
@ -0,0 +1,173 @@
|
||||||
|
---
|
||||||
|
assignees:
|
||||||
|
- erictune
|
||||||
|
title: Daemon Sets
|
||||||
|
---
|
||||||
|
|
||||||
|
* TOC
|
||||||
|
{:toc}
|
||||||
|
|
||||||
|
## What is a DaemonSet?
|
||||||
|
|
||||||
|
A _DaemonSet_ ensures that all (or some) nodes run a copy of a pod. As nodes are added to the
|
||||||
|
cluster, pods are added to them. As nodes are removed from the cluster, those pods are garbage
|
||||||
|
collected. Deleting a DaemonSet will clean up the pods it created.
|
||||||
|
|
||||||
|
Some typical uses of a DaemonSet are:
|
||||||
|
|
||||||
|
- running a cluster storage daemon, such as `glusterd` or `ceph`, on each node.
|
||||||
|
- running a logs collection daemon on every node, such as `fluentd` or `logstash`.
|
||||||
|
- running a node monitoring daemon on every node, such as [Prometheus Node Exporter](
|
||||||
|
https://github.com/prometheus/node_exporter), `collectd`, New Relic agent, or Ganglia `gmond`.
|
||||||
|
|
||||||
|
In a simple case, one DaemonSet, covering all nodes, would be used for each type of daemon.
|
||||||
|
A more complex setup might use multiple DaemonSets for a single type of daemon, but with
|
||||||
|
different flags and/or different memory and cpu requests for different hardware types.
|
||||||
|
|
||||||
|
## Writing a DaemonSet Spec
|
||||||
|
|
||||||
|
### Required Fields
|
||||||
|
|
||||||
|
As with all other Kubernetes config, a DaemonSet needs `apiVersion`, `kind`, and `metadata` fields. For
|
||||||
|
general information about working with config files, see [deploying applications](/docs/user-guide/deploying-applications/),
|
||||||
|
[configuring containers](/docs/user-guide/configuring-containers/), and [working with resources](/docs/user-guide/working-with-resources/) documents.
|
||||||
|
|
||||||
|
A DaemonSet also needs a [`.spec`](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status) section.
|
||||||
|
|
||||||
|
### Pod Template
|
||||||
|
|
||||||
|
The `.spec.template` is the only required field of the `.spec`.
|
||||||
|
|
||||||
|
The `.spec.template` is a [pod template](/docs/user-guide/replication-controller/#pod-template).
|
||||||
|
It has exactly the same schema as a [pod](/docs/user-guide/pods), except
|
||||||
|
it is nested and does not have an `apiVersion` or `kind`.
|
||||||
|
|
||||||
|
In addition to required fields for a pod, a pod template in a DaemonSet has to specify appropriate
|
||||||
|
labels (see [pod selector](#pod-selector)).
|
||||||
|
|
||||||
|
A pod template in a DaemonSet must have a [`RestartPolicy`](/docs/user-guide/pod-states)
|
||||||
|
equal to `Always`, or be unspecified, which defaults to `Always`.
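
As a minimal sketch of the requirements above (the name, labels, and image are illustrative, not taken from this page), a DaemonSet manifest might look like this:

```yaml
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: example-logging-agent
spec:
  template:
    metadata:
      labels:
        app: example-logging-agent        # appropriate labels, see the pod selector section below
    spec:
      containers:
      - name: agent
        image: example/logging-agent:1.0  # hypothetical image
      # restartPolicy is omitted, so it defaults to Always (the only allowed value)
```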
|
||||||
|
|
||||||
|
### Pod Selector
|
||||||
|
|
||||||
|
The `.spec.selector` field is a pod selector. It works the same as the `.spec.selector` of
a [Job](/docs/concepts/jobs/run-to-completion-finite-workloads/) and other newer resources.
|
||||||
|
|
||||||
|
The `.spec.selector` is an object consisting of two fields:

* `matchLabels` - works the same as the `.spec.selector` of a [ReplicationController](/docs/user-guide/replication-controller/)
* `matchExpressions` - allows you to build more sophisticated selectors by specifying a key,
  a list of values, and an operator that relates the key and values.
|
||||||
|
|
||||||
|
When the two are specified the result is ANDed.
|
||||||
|
|
||||||
|
If the `.spec.selector` is specified, it must match the `.spec.template.metadata.labels`. If not
|
||||||
|
specified, they are defaulted to be equal. Config with these not matching will be rejected by the API.
|
||||||
|
|
||||||
|
Also, you should not normally create any pods whose labels match this selector, either directly, via
another DaemonSet, or via another controller such as a ReplicationController. Otherwise, the DaemonSet
controller will think that those pods were created by it. Kubernetes will not stop you from doing
this. One case where you might want to do this is to manually create a pod with a different value on
a node for testing.
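
For example, a selector that satisfies the matching rule above might look like the following sketch (the label values are illustrative):

```yaml
spec:
  selector:
    matchLabels:
      app: example-logging-agent
  template:
    metadata:
      labels:
        app: example-logging-agent   # must match .spec.selector
```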
|
||||||
|
|
||||||
|
### Running Pods on Only Some Nodes
|
||||||
|
|
||||||
|
If you specify a `.spec.template.spec.nodeSelector`, then the DaemonSet controller will
create pods on nodes which match that [node selector](/docs/user-guide/node-selection/), as sketched below. Likewise, if you specify a `.spec.template.spec.affinity`,
then the DaemonSet controller will create pods on nodes which match that [node affinity](/docs/user-guide/node-selection/).
If you do not specify either, then the DaemonSet controller will create pods on all nodes.
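
A minimal node selector sketch, assuming a hypothetical `disktype=ssd` node label:

```yaml
spec:
  template:
    spec:
      nodeSelector:
        disktype: ssd   # illustrative node label; pods are created only on nodes carrying it
```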
|
||||||
|
|
||||||
|
## How Daemon Pods are Scheduled
|
||||||
|
|
||||||
|
Normally, the machine that a pod runs on is selected by the Kubernetes scheduler. However, pods
|
||||||
|
created by the Daemon controller have the machine already selected (`.spec.nodeName` is specified
|
||||||
|
when the pod is created, so it is ignored by the scheduler). Therefore:
|
||||||
|
|
||||||
|
- the [`unschedulable`](/docs/admin/node/#manual-node-administration) field of a node is not respected
|
||||||
|
by the DaemonSet controller.
|
||||||
|
- the DaemonSet controller can create pods even when the scheduler has not been started, which can help cluster
  bootstrap.
|
||||||
|
|
||||||
|
Daemon pods do respect [taints and tolerations](/docs/user-guide/node-selection/index.md#taints-and-tolerations-beta-feature), but they are
|
||||||
|
created with `NoExecute` tolerations for the `node.alpha.kubernetes.io/notReady` and `node.alpha.kubernetes.io/unreachable`
|
||||||
|
taints with no `tolerationSeconds`. This ensures that when the `TaintBasedEvictions` alpha feature is enabled,
|
||||||
|
they will not be evicted when there are node problems such as a network partition. (When the
|
||||||
|
`TaintBasedEvictions` feature is not enabled, they are also not evicted in these scenarios, but
|
||||||
|
due to hard-coded behavior of the NodeController rather than due to tolerations).
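
For reference, the tolerations described above would appear in the created pods roughly as follows; this is a sketch based only on the taint keys named in this section:

```yaml
tolerations:
- key: node.alpha.kubernetes.io/notReady
  operator: Exists
  effect: NoExecute
- key: node.alpha.kubernetes.io/unreachable
  operator: Exists
  effect: NoExecute
# no tolerationSeconds is set, so the pods tolerate these taints indefinitely
```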
|
||||||
|
|
||||||
|
|
||||||
|
## Communicating with Daemon Pods
|
||||||
|
|
||||||
|
Some possible patterns for communicating with pods in a DaemonSet are:
|
||||||
|
|
||||||
|
- **Push**: Pods in the DaemonSet are configured to send updates to another service, such
|
||||||
|
as a stats database. They do not have clients.
|
||||||
|
- **NodeIP and Known Port**: Pods in the DaemonSet use a `hostPort`, so that the pods are reachable via the node IPs. Clients are expected to know the list of node IPs somehow, and to know the port by convention.
|
||||||
|
- **DNS**: Create a [headless service](/docs/user-guide/services/#headless-services) with the same pod selector,
  and then discover DaemonSets using the `endpoints` resource or retrieve multiple A records from
  DNS (a minimal headless Service sketch follows this list).
- **Service**: Create a service with the same pod selector, and use the service to reach a
  daemon on a random node. (There is no way to reach a specific node this way.)
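
A minimal headless Service for the DNS pattern above could look like this sketch; the selector is assumed to match the DaemonSet's pod labels, and the port is illustrative:

```yaml
kind: Service
apiVersion: v1
metadata:
  name: example-logging-agent
spec:
  clusterIP: None              # headless: DNS returns the daemon pods' A records directly
  selector:
    app: example-logging-agent # assumed to match the DaemonSet's pod labels
  ports:
  - port: 9200                 # illustrative daemon port
```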
|
||||||
|
|
||||||
|
## Updating a DaemonSet
|
||||||
|
|
||||||
|
If node labels are changed, the DaemonSet will promptly add pods to newly matching nodes and delete
|
||||||
|
pods from newly not-matching nodes.
|
||||||
|
|
||||||
|
You can modify the pods that a DaemonSet creates. However, pods do not allow all
|
||||||
|
fields to be updated. Also, the DaemonSet controller will use the original template the next
|
||||||
|
time a node (even with the same name) is created.
|
||||||
|
|
||||||
|
|
||||||
|
You can delete a DaemonSet. If you specify `--cascade=false` with `kubectl`, then the pods
will be left on the nodes. You can then create a new DaemonSet with a different template.
The new DaemonSet with the different template will recognize all the existing pods as having
matching labels. It will not modify or delete them despite the mismatch in the pod template.
You will need to force new pod creation by deleting the pod or deleting the node.
|
||||||
|
|
||||||
|
In Kubernetes version 1.6 and later, you can [perform a rolling update](/docs/tasks/manage-daemon/update-daemon-set/) on a DaemonSet.
|
||||||
|
|
||||||
|
Future releases of Kubernetes will support controlled updating of nodes.
|
||||||
|
|
||||||
|
## Alternatives to DaemonSet
|
||||||
|
|
||||||
|
### Init Scripts
|
||||||
|
|
||||||
|
It is certainly possible to run daemon processes by directly starting them on a node (e.g. using
|
||||||
|
`init`, `upstartd`, or `systemd`). This is perfectly fine. However, there are several advantages to
|
||||||
|
running such processes via a DaemonSet:
|
||||||
|
|
||||||
|
- Ability to monitor and manage logs for daemons in the same way as applications.
|
||||||
|
- Same config language and tools (e.g. pod templates, `kubectl`) for daemons and applications.
|
||||||
|
- Future versions of Kubernetes will likely support integration between DaemonSet-created
|
||||||
|
pods and node upgrade workflows.
|
||||||
|
- Running daemons in containers with resource limits increases the isolation of daemons from app
  containers. However, this can also be accomplished by running the daemons in a container but not in a pod
  (e.g. starting them directly via Docker).
|
||||||
|
|
||||||
|
### Bare Pods
|
||||||
|
|
||||||
|
It is possible to create pods directly which specify a particular node to run on. However,
|
||||||
|
a DaemonSet replaces pods that are deleted or terminated for any reason, such as in the case of
|
||||||
|
node failure or disruptive node maintenance, such as a kernel upgrade. For this reason, you should
|
||||||
|
use a DaemonSet rather than creating individual pods.
|
||||||
|
|
||||||
|
### Static Pods
|
||||||
|
|
||||||
|
It is possible to create pods by writing a file to a certain directory watched by Kubelet. These
|
||||||
|
are called [static pods](/docs/admin/static-pods/).
|
||||||
|
Unlike DaemonSet, static pods cannot be managed with kubectl
|
||||||
|
or other Kubernetes API clients. Static pods do not depend on the apiserver, making them useful
|
||||||
|
in cluster bootstrapping cases. Also, static pods may be deprecated in the future.
|
||||||
|
|
||||||
|
### Replication Controller
|
||||||
|
|
||||||
|
DaemonSets are similar to [Replication Controllers](/docs/user-guide/replication-controller) in that
they both create pods, and those pods have processes which are not expected to terminate (e.g. web servers,
storage servers).
|
||||||
|
|
||||||
|
Use a replication controller for stateless services, like frontends, where scaling up and down the
|
||||||
|
number of replicas and rolling out updates are more important than controlling exactly which host
|
||||||
|
the pod runs on. Use a Daemon Controller when it is important that a copy of a pod always run on
|
||||||
|
all or certain hosts, and when it needs to start before other pods.
|
|
@ -0,0 +1,808 @@
|
||||||
|
---
|
||||||
|
assignees:
|
||||||
|
- bgrant0607
|
||||||
|
- janetkuo
|
||||||
|
title: Deployments
|
||||||
|
---
|
||||||
|
|
||||||
|
{:toc}
|
||||||
|
|
||||||
|
## What is a Deployment?
|
||||||
|
|
||||||
|
A _Deployment_ provides declarative updates for [Pods](/docs/user-guide/pods/) and [Replica Sets](/docs/user-guide/replicasets/) (the next-generation Replication Controller).
|
||||||
|
You only need to describe the desired state in a Deployment object, and the Deployment
|
||||||
|
controller will change the actual state to the desired state at a controlled rate for you.
|
||||||
|
You can define Deployments to create new resources, or replace existing ones
with new ones.
|
||||||
|
|
||||||
|
A typical use case is:
|
||||||
|
|
||||||
|
* Create a Deployment to bring up a Replica Set and Pods.
|
||||||
|
* Check the status of a Deployment to see if it succeeds or not.
|
||||||
|
* Later, update that Deployment to recreate the Pods (for example, to use a new image).
|
||||||
|
* Roll back to an earlier Deployment revision if the current Deployment isn't stable.
|
||||||
|
* Pause and resume a Deployment.
|
||||||
|
|
||||||
|
## Creating a Deployment
|
||||||
|
|
||||||
|
Here is an example Deployment. It creates a Replica Set to
|
||||||
|
bring up 3 nginx Pods.
|
||||||
|
|
||||||
|
{% include code.html language="yaml" file="nginx-deployment.yaml" ghlink="/docs/concepts/workloads/controllers/nginx-deployment.yaml" %}
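
If you cannot follow the include above, the referenced manifest looks roughly like this sketch, matching the three `nginx:1.7.9` replicas used throughout this page (the exact file contents may differ slightly):

```yaml
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: nginx-deployment
spec:
  replicas: 3
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: nginx:1.7.9
        ports:
        - containerPort: 80
```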
|
||||||
|
|
||||||
|
Run the example by downloading the example file and then running this command:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl create -f docs/user-guide/nginx-deployment.yaml --record
|
||||||
|
deployment "nginx-deployment" created
|
||||||
|
```
|
||||||
|
|
||||||
|
Setting the kubectl flag `--record` to `true` records the current command in the annotations of the resources being created or updated. This is useful for future introspection; for example, to see the commands executed in each Deployment revision.
|
||||||
|
|
||||||
|
Then running `get` immediately will give:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get deployments
|
||||||
|
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
|
||||||
|
nginx-deployment 3 0 0 0 1s
|
||||||
|
```
|
||||||
|
|
||||||
|
This indicates that the Deployment's number of desired replicas is 3 (according to deployment's `.spec.replicas`), the number of current replicas (`.status.replicas`) is 0, the number of up-to-date replicas (`.status.updatedReplicas`) is 0, and the number of available replicas (`.status.availableReplicas`) is also 0.
|
||||||
|
|
||||||
|
Running `get` again a few seconds later should give:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get deployments
|
||||||
|
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
|
||||||
|
nginx-deployment 3 3 3 3 18s
|
||||||
|
```
|
||||||
|
|
||||||
|
This indicates that the Deployment has created all three replicas, and all replicas are up-to-date (contains the latest pod template) and available (pod status is ready for at least Deployment's `.spec.minReadySeconds`). Running `kubectl get rs` and `kubectl get pods` will show the Replica Set (RS) and Pods created.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get rs
|
||||||
|
NAME DESIRED CURRENT READY AGE
|
||||||
|
nginx-deployment-2035384211 3 3 0 18s
|
||||||
|
```
|
||||||
|
|
||||||
|
You may notice that the name of the Replica Set is always `<the name of the Deployment>-<hash value of the pod template>`.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get pods --show-labels
|
||||||
|
NAME READY STATUS RESTARTS AGE LABELS
|
||||||
|
nginx-deployment-2035384211-7ci7o 1/1 Running 0 18s app=nginx,pod-template-hash=2035384211
|
||||||
|
nginx-deployment-2035384211-kzszj 1/1 Running 0 18s app=nginx,pod-template-hash=2035384211
|
||||||
|
nginx-deployment-2035384211-qqcnn 1/1 Running 0 18s app=nginx,pod-template-hash=2035384211
|
||||||
|
```
|
||||||
|
|
||||||
|
The created Replica Set will ensure that there are three nginx Pods at all times.
|
||||||
|
|
||||||
|
**Note:** You must specify an appropriate selector and pod template labels in a Deployment (in this case, `app = nginx`), i.e. ones that don't overlap with other controllers (including Deployments, Replica Sets, Replication Controllers, etc.). Kubernetes won't stop you from doing otherwise, and if you end up with multiple controllers that have overlapping selectors, those controllers will fight with each other and won't behave correctly.
|
||||||
|
|
||||||
|
|
||||||
|
## Updating a Deployment
|
||||||
|
|
||||||
|
**Note:** a Deployment's rollout is triggered if and only if the Deployment's pod template (i.e. `.spec.template`) is changed,
|
||||||
|
e.g. updating labels or container images of the template. Other updates, such as scaling the Deployment, will not trigger a rollout.
|
||||||
|
|
||||||
|
Suppose that we now want to update the nginx Pods to start using the `nginx:1.9.1` image
|
||||||
|
instead of the `nginx:1.7.9` image.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
|
||||||
|
deployment "nginx-deployment" image updated
|
||||||
|
```
|
||||||
|
|
||||||
|
Alternatively, we can `edit` the Deployment and change `.spec.template.spec.containers[0].image` from `nginx:1.7.9` to `nginx:1.9.1`:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl edit deployment/nginx-deployment
|
||||||
|
deployment "nginx-deployment" edited
|
||||||
|
```
|
||||||
|
|
||||||
|
To see its rollout status, simply run:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl rollout status deployment/nginx-deployment
|
||||||
|
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
|
||||||
|
deployment "nginx-deployment" successfully rolled out
|
||||||
|
```
|
||||||
|
|
||||||
|
After the rollout succeeds, you may want to `get` the Deployment:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get deployments
|
||||||
|
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
|
||||||
|
nginx-deployment 3 3 3 3 36s
|
||||||
|
```
|
||||||
|
|
||||||
|
The number of up-to-date replicas indicates that the Deployment has updated the replicas to the latest configuration.
The number of current replicas indicates the total number of replicas this Deployment manages, and the number of available replicas
indicates how many of the current replicas are available.
|
||||||
|
|
||||||
|
We can run `kubectl get rs` to see that the Deployment updated the Pods by creating a new Replica Set and scaling it up to 3 replicas, as well as scaling down the old Replica Set to 0 replicas.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get rs
|
||||||
|
NAME DESIRED CURRENT READY AGE
|
||||||
|
nginx-deployment-1564180365 3 3 0 6s
|
||||||
|
nginx-deployment-2035384211 0 0 0 36s
|
||||||
|
```
|
||||||
|
|
||||||
|
Running `get pods` should now show only the new Pods:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get pods
|
||||||
|
NAME READY STATUS RESTARTS AGE
|
||||||
|
nginx-deployment-1564180365-khku8 1/1 Running 0 14s
|
||||||
|
nginx-deployment-1564180365-nacti 1/1 Running 0 14s
|
||||||
|
nginx-deployment-1564180365-z9gth 1/1 Running 0 14s
|
||||||
|
```
|
||||||
|
|
||||||
|
Next time we want to update these Pods, we only need to update the Deployment's pod template again.
|
||||||
|
|
||||||
|
Deployment can ensure that only a certain number of Pods may be down while they are being updated. By
|
||||||
|
default, it ensures that at least 25% less than the desired number of Pods are
|
||||||
|
up (25% max unavailable).
|
||||||
|
|
||||||
|
Deployment can also ensure that only a certain number of Pods may be created above the desired number of Pods. By default, it ensures that at most 25% more than the desired number of Pods are up (25% max surge).
|
||||||
|
|
||||||
|
For example, if you look at the above Deployment closely, you will see that
|
||||||
|
it first created a new Pod, then deleted some old Pods and created new ones. It
|
||||||
|
does not kill old Pods until a sufficient number of new Pods have come up, and does not create new Pods until a sufficient number of old Pods have been killed. It makes sure that number of available Pods is at least 2 and the number of total Pods is at most 4.
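
These bounds come from the rolling update strategy. Written out explicitly in the Deployment spec, the defaults described above correspond to a sketch like:

```yaml
spec:
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%   # at most 25% of the desired Pods may be unavailable
      maxSurge: 25%         # at most 25% extra Pods may exist above the desired count
```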
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl describe deployments
|
||||||
|
Name: nginx-deployment
|
||||||
|
Namespace: default
|
||||||
|
CreationTimestamp: Tue, 15 Mar 2016 12:01:06 -0700
|
||||||
|
Labels: app=nginx
|
||||||
|
Selector: app=nginx
|
||||||
|
Replicas: 3 updated | 3 total | 3 available | 0 unavailable
|
||||||
|
StrategyType: RollingUpdate
|
||||||
|
MinReadySeconds: 0
|
||||||
|
RollingUpdateStrategy: 1 max unavailable, 1 max surge
|
||||||
|
OldReplicaSets: <none>
|
||||||
|
NewReplicaSet: nginx-deployment-1564180365 (3/3 replicas created)
|
||||||
|
Events:
|
||||||
|
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
|
||||||
|
--------- -------- ----- ---- ------------- -------- ------ -------
|
||||||
|
36s 36s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-2035384211 to 3
|
||||||
|
23s 23s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 1
|
||||||
|
23s 23s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 2
|
||||||
|
23s 23s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 2
|
||||||
|
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 0
|
||||||
|
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 3
|
||||||
|
```
|
||||||
|
|
||||||
|
Here we see that when we first created the Deployment, it created a Replica Set (nginx-deployment-2035384211) and scaled it up to 3 replicas directly.
|
||||||
|
When we updated the Deployment, it created a new Replica Set (nginx-deployment-1564180365) and scaled it up to 1 and then scaled down the old Replica Set to 2, so that at least 2 Pods were available and at most 4 Pods were created at all times.
|
||||||
|
It then continued scaling up and down the new and the old Replica Set, with the same rolling update strategy. Finally, we'll have 3 available replicas in the new Replica Set, and the old Replica Set is scaled down to 0.
|
||||||
|
|
||||||
|
### Multiple Updates
|
||||||
|
|
||||||
|
Each time a new deployment object is observed by the deployment controller, a Replica Set is
|
||||||
|
created to bring up the desired Pods if there is no existing Replica Set doing so.
|
||||||
|
Existing Replica Sets controlling Pods whose labels match `.spec.selector`, but whose
template does not match `.spec.template`, are scaled down.
|
||||||
|
Eventually, the new Replica Set will be scaled to `.spec.replicas` and all old Replica Sets will
|
||||||
|
be scaled to 0.
|
||||||
|
|
||||||
|
If you update a Deployment while an existing deployment is in progress,
|
||||||
|
the Deployment will create a new Replica Set as per the update and start scaling that up, and
it will roll over the Replica Set that it was scaling up previously -- it will add it to its list of old Replica Sets and will
start scaling it down.
|
||||||
|
|
||||||
|
For example, suppose you create a Deployment to create 5 replicas of `nginx:1.7.9`,
but then update the Deployment to create 5 replicas of `nginx:1.9.1`, when only 3
replicas of `nginx:1.7.9` have been created. In that case, the Deployment will immediately start
|
||||||
|
killing the 3 `nginx:1.7.9` Pods that it had created, and will start creating
|
||||||
|
`nginx:1.9.1` Pods. It will not wait for 5 replicas of `nginx:1.7.9` to be created
|
||||||
|
before changing course.
|
||||||
|
|
||||||
|
## Rolling Back a Deployment
|
||||||
|
|
||||||
|
Sometimes you may want to roll back a Deployment; for example, when the Deployment is not stable, such as when it is crash looping.
By default, the two most recent revisions of a Deployment's rollout history are kept in the system so that you can roll back at any time
(you can change that by modifying the [revision history limit](/docs/user-guide/deployments/#revision-history-limit)).
|
||||||
|
|
||||||
|
**Note:** a Deployment's revision is created when a Deployment's rollout is triggered. This means that the new revision is created
|
||||||
|
if and only if the Deployment's pod template (i.e. `.spec.template`) is changed, e.g. updating labels or container images of the template.
|
||||||
|
Other updates, such as scaling the Deployment, will not create a Deployment revision -- so that we can facilitate simultaneous manual- or
|
||||||
|
auto-scaling. This implies that when you rollback to an earlier revision, only the Deployment's pod template part will be rolled back.
|
||||||
|
|
||||||
|
Suppose that we made a typo while updating the Deployment, by putting the image name as `nginx:1.91` instead of `nginx:1.9.1`:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.91
|
||||||
|
deployment "nginx-deployment" image updated
|
||||||
|
```
|
||||||
|
|
||||||
|
The rollout will be stuck.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl rollout status deployments nginx-deployment
|
||||||
|
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
|
||||||
|
```
|
||||||
|
|
||||||
|
Press Ctrl-C to stop the above rollout status watch. For more information on stuck rollouts, [read more here](#deployment-status).
|
||||||
|
|
||||||
|
You will also see that both the number of old replicas (nginx-deployment-1564180365 and nginx-deployment-2035384211) and new replicas (nginx-deployment-3066724191) are 2.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get rs
|
||||||
|
NAME DESIRED CURRENT READY AGE
|
||||||
|
nginx-deployment-1564180365 2 2 0 25s
|
||||||
|
nginx-deployment-2035384211 0 0 0 36s
|
||||||
|
nginx-deployment-3066724191 2 2 2 6s
|
||||||
|
```
|
||||||
|
|
||||||
|
Looking at the Pods created, you will see that the 2 Pods created by new Replica Set are stuck in an image pull loop.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get pods
|
||||||
|
NAME READY STATUS RESTARTS AGE
|
||||||
|
nginx-deployment-1564180365-70iae 1/1 Running 0 25s
|
||||||
|
nginx-deployment-1564180365-jbqqo 1/1 Running 0 25s
|
||||||
|
nginx-deployment-3066724191-08mng 0/1 ImagePullBackOff 0 6s
|
||||||
|
nginx-deployment-3066724191-eocby 0/1 ImagePullBackOff 0 6s
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that the Deployment controller will stop the bad rollout automatically, and will stop scaling up the new Replica Set.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl describe deployment
|
||||||
|
Name: nginx-deployment
|
||||||
|
Namespace: default
|
||||||
|
CreationTimestamp: Tue, 15 Mar 2016 14:48:04 -0700
|
||||||
|
Labels: app=nginx
|
||||||
|
Selector: app=nginx
|
||||||
|
Replicas: 2 updated | 3 total | 2 available | 2 unavailable
|
||||||
|
StrategyType: RollingUpdate
|
||||||
|
MinReadySeconds: 0
|
||||||
|
RollingUpdateStrategy: 1 max unavailable, 1 max surge
|
||||||
|
OldReplicaSets: nginx-deployment-1564180365 (2/2 replicas created)
|
||||||
|
NewReplicaSet: nginx-deployment-3066724191 (2/2 replicas created)
|
||||||
|
Events:
|
||||||
|
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
|
||||||
|
--------- -------- ----- ---- ------------- -------- ------ -------
|
||||||
|
1m 1m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-2035384211 to 3
|
||||||
|
22s 22s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 1
|
||||||
|
22s 22s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 2
|
||||||
|
22s 22s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 2
|
||||||
|
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 0
|
||||||
|
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 3
|
||||||
|
13s 13s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 1
|
||||||
|
13s 13s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-1564180365 to 2
|
||||||
|
13s 13s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 2
|
||||||
|
```
|
||||||
|
|
||||||
|
To fix this, we need to roll back to a previous revision of the Deployment that is stable.
|
||||||
|
|
||||||
|
### Checking Rollout History of a Deployment
|
||||||
|
|
||||||
|
First, check the revisions of this deployment:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl rollout history deployment/nginx-deployment
|
||||||
|
deployments "nginx-deployment":
|
||||||
|
REVISION CHANGE-CAUSE
|
||||||
|
1 kubectl create -f docs/user-guide/nginx-deployment.yaml --record
|
||||||
|
2 kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
|
||||||
|
3 kubectl set image deployment/nginx-deployment nginx=nginx:1.91
|
||||||
|
```
|
||||||
|
|
||||||
|
Because we recorded the command while creating this Deployment using `--record`, we can easily see the changes we made in each revision.
|
||||||
|
|
||||||
|
To further see the details of each revision, run:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl rollout history deployment/nginx-deployment --revision=2
|
||||||
|
deployments "nginx-deployment" revision 2
|
||||||
|
Labels: app=nginx
|
||||||
|
pod-template-hash=1159050644
|
||||||
|
Annotations: kubernetes.io/change-cause=kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
|
||||||
|
Containers:
|
||||||
|
nginx:
|
||||||
|
Image: nginx:1.9.1
|
||||||
|
Port: 80/TCP
|
||||||
|
QoS Tier:
|
||||||
|
cpu: BestEffort
|
||||||
|
memory: BestEffort
|
||||||
|
Environment Variables: <none>
|
||||||
|
No volumes.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Rolling Back to a Previous Revision
|
||||||
|
|
||||||
|
Now we've decided to undo the current rollout and roll back to the previous revision:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl rollout undo deployment/nginx-deployment
|
||||||
|
deployment "nginx-deployment" rolled back
|
||||||
|
```
|
||||||
|
|
||||||
|
Alternatively, you can roll back to a specific revision by specifying it with `--to-revision`:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl rollout undo deployment/nginx-deployment --to-revision=2
|
||||||
|
deployment "nginx-deployment" rolled back
|
||||||
|
```
|
||||||
|
|
||||||
|
For more details about rollout related commands, read [`kubectl rollout`](/docs/user-guide/kubectl/kubectl_rollout/).
|
||||||
|
|
||||||
|
The Deployment is now rolled back to a previous stable revision. As you can see, a `DeploymentRollback` event for rolling back to revision 2 is generated from Deployment controller.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get deployment
|
||||||
|
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
|
||||||
|
nginx-deployment 3 3 3 3 30m
|
||||||
|
|
||||||
|
$ kubectl describe deployment
|
||||||
|
Name: nginx-deployment
|
||||||
|
Namespace: default
|
||||||
|
CreationTimestamp: Tue, 15 Mar 2016 14:48:04 -0700
|
||||||
|
Labels: app=nginx
|
||||||
|
Selector: app=nginx
|
||||||
|
Replicas: 3 updated | 3 total | 3 available | 0 unavailable
|
||||||
|
StrategyType: RollingUpdate
|
||||||
|
MinReadySeconds: 0
|
||||||
|
RollingUpdateStrategy: 1 max unavailable, 1 max surge
|
||||||
|
OldReplicaSets: <none>
|
||||||
|
NewReplicaSet: nginx-deployment-1564180365 (3/3 replicas created)
|
||||||
|
Events:
|
||||||
|
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
|
||||||
|
--------- -------- ----- ---- ------------- -------- ------ -------
|
||||||
|
30m 30m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-2035384211 to 3
|
||||||
|
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 1
|
||||||
|
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 2
|
||||||
|
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 2
|
||||||
|
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 0
|
||||||
|
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 2
|
||||||
|
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 1
|
||||||
|
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-1564180365 to 2
|
||||||
|
2m 2m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-3066724191 to 0
|
||||||
|
2m 2m 1 {deployment-controller } Normal DeploymentRollback Rolled back deployment "nginx-deployment" to revision 2
|
||||||
|
29m 2m 2 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 3
|
||||||
|
```
|
||||||
|
|
||||||
|
### Clean up Policy
|
||||||
|
|
||||||
|
You can set the `.spec.revisionHistoryLimit` field to specify how many old revisions of this Deployment you want to keep. By default,
all revision history will be kept; explicitly setting this field to `0` prevents the Deployment from being rolled back.
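
For example, to keep only a handful of old revisions, you could set the field directly in the Deployment spec (the value `5` is illustrative):

```yaml
spec:
  revisionHistoryLimit: 5   # retain at most 5 old revisions for rollback
```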
|
||||||
|
|
||||||
|
## Scaling a Deployment
|
||||||
|
|
||||||
|
You can scale a Deployment by using the following command:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl scale deployment nginx-deployment --replicas 10
|
||||||
|
deployment "nginx-deployment" scaled
|
||||||
|
```
|
||||||
|
|
||||||
|
Assuming [horizontal pod autoscaling](/docs/user-guide/horizontal-pod-autoscaling/walkthrough.md) is enabled
in your cluster, you can set up an autoscaler for your Deployment and choose the minimum and maximum number of
Pods you want to run based on the CPU utilization of your existing Pods.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl autoscale deployment nginx-deployment --min=10 --max=15 --cpu-percent=80
|
||||||
|
deployment "nginx-deployment" autoscaled
|
||||||
|
```
|
||||||
|
|
||||||
|
RollingUpdate Deployments support running multiple versions of an application at the same time. When you
|
||||||
|
or an autoscaler scales a RollingUpdate Deployment that is in the middle of a rollout (either in progress
|
||||||
|
or paused), then the Deployment controller will balance the additional replicas in the existing active
|
||||||
|
ReplicaSets (ReplicaSets with Pods) in order to mitigate risk. This is called *proportional scaling*.
|
||||||
|
|
||||||
|
For example, you are running a Deployment with 10 replicas, [maxSurge](#max-surge)=3, and [maxUnavailable](#max-unavailable)=2.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get deploy
|
||||||
|
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
|
||||||
|
nginx-deployment 10 10 10 10 50s
|
||||||
|
```
|
||||||
|
|
||||||
|
You update to a new image which happens to be unresolvable from inside the cluster.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl set image deploy/nginx-deployment nginx=nginx:sometag
|
||||||
|
deployment "nginx-deployment" image updated
|
||||||
|
```
|
||||||
|
|
||||||
|
The image update starts a new rollout with ReplicaSet nginx-deployment-1989198191 but it's blocked due to the
|
||||||
|
maxUnavailable requirement that we mentioned above.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get rs
|
||||||
|
NAME DESIRED CURRENT READY AGE
|
||||||
|
nginx-deployment-1989198191 5 5 0 9s
|
||||||
|
nginx-deployment-618515232 8 8 8 1m
|
||||||
|
```
|
||||||
|
|
||||||
|
Then a new scaling request for the Deployment comes along. The autoscaler increments the Deployment replicas
|
||||||
|
to 15. The Deployment controller needs to decide where to add these new 5 replicas. If we weren't using
|
||||||
|
proportional scaling, all 5 of them would be added in the new ReplicaSet. With proportional scaling, we
|
||||||
|
spread the additional replicas across all ReplicaSets. Bigger proportions go to the ReplicaSets with the
most replicas, and lower proportions go to ReplicaSets with fewer replicas. Any leftovers are added to the
ReplicaSet with the most replicas. ReplicaSets with zero replicas are not scaled up.
|
||||||
|
|
||||||
|
In our example above, 3 replicas will be added to the old ReplicaSet and 2 replicas will be added to the
|
||||||
|
new ReplicaSet. The rollout process should eventually move all replicas to the new ReplicaSet, assuming
|
||||||
|
the new replicas become healthy.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get deploy
|
||||||
|
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
|
||||||
|
nginx-deployment 15 18 7 8 7m
|
||||||
|
$ kubectl get rs
|
||||||
|
NAME DESIRED CURRENT READY AGE
|
||||||
|
nginx-deployment-1989198191 7 7 0 7m
|
||||||
|
nginx-deployment-618515232 11 11 11 7m
|
||||||
|
```
|
||||||
|
|
||||||
|
## Pausing and Resuming a Deployment
|
||||||
|
|
||||||
|
You can also pause a Deployment mid-way and then resume it. A use case is to support canary deployment.
|
||||||
|
|
||||||
|
Update the Deployment again and then pause the Deployment with `kubectl rollout pause`:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1; kubectl rollout pause deployment/nginx-deployment
|
||||||
|
deployment "nginx-deployment" image updated
|
||||||
|
deployment "nginx-deployment" paused
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that any current state of the Deployment will continue its function, but new updates to the Deployment will not have an effect as long as the Deployment is paused.
|
||||||
|
|
||||||
|
The Deployment was still in progress when we paused it, so the actions of scaling up and down Replica Sets are paused too.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get rs
|
||||||
|
NAME DESIRED CURRENT READY AGE
|
||||||
|
nginx-deployment-1564180365 2 2 2 1h
|
||||||
|
nginx-deployment-2035384211 2 2 0 1h
|
||||||
|
nginx-deployment-3066724191 0 0 0 1h
|
||||||
|
```
|
||||||
|
|
||||||
|
In a separate terminal, watch for rollout status changes and you'll see the rollout won't continue:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl rollout status deployment/nginx-deployment
|
||||||
|
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
|
||||||
|
```
|
||||||
|
|
||||||
|
To resume the Deployment, simply do `kubectl rollout resume`:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl rollout resume deployment/nginx-deployment
|
||||||
|
deployment "nginx-deployment" resumed
|
||||||
|
```
|
||||||
|
|
||||||
|
Then the Deployment will continue and finish the rollout:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl rollout status deployment/nginx-deployment
|
||||||
|
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
|
||||||
|
Waiting for deployment spec update to be observed...
|
||||||
|
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
|
||||||
|
deployment nginx-deployment successfully rolled out
|
||||||
|
```
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get rs
|
||||||
|
NAME DESIRED CURRENT READY AGE
|
||||||
|
nginx-deployment-1564180365 3 3 3 1h
|
||||||
|
nginx-deployment-2035384211 0 0 0 1h
|
||||||
|
nginx-deployment-3066724191 0 0 0 1h
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: You cannot roll back a paused Deployment until you resume it.
|
||||||
|
|
||||||
|
|
||||||
|
## Deployment status
|
||||||
|
|
||||||
|
A Deployment enters various states during its lifecycle. It can be [progressing](#progressing-deployment) while rolling out a new ReplicaSet,
|
||||||
|
it can be [complete](#complete-deployment), or it can [fail to progress](#failed-deployment).
|
||||||
|
|
||||||
|
### Progressing Deployment
|
||||||
|
|
||||||
|
Kubernetes marks a Deployment as _progressing_ when one of the following tasks is performed:
|
||||||
|
|
||||||
|
* The Deployment is in the process of creating a new ReplicaSet.
|
||||||
|
* The Deployment is scaling up an existing ReplicaSet.
|
||||||
|
* The Deployment is scaling down an existing ReplicaSet.
|
||||||
|
* New pods become available.
|
||||||
|
|
||||||
|
You can monitor the progress for a Deployment by using `kubectl rollout status`.
|
||||||
|
|
||||||
|
### Complete Deployment
|
||||||
|
|
||||||
|
Kubernetes marks a Deployment as _complete_ when it has the following characteristics:
|
||||||
|
|
||||||
|
* The Deployment has minimum availability. Minimum availability means that the Deployment's number of available replicas
|
||||||
|
equals or exceeds the number required by the Deployment strategy.
|
||||||
|
* All of the replicas associated with the Deployment have been updated to the latest version you've specified, meaning any
|
||||||
|
updates you've requested have been completed.
|
||||||
|
* No old pods for the Deployment are running.
|
||||||
|
|
||||||
|
You can check if a Deployment has completed by using `kubectl rollout status`. If the rollout completed successfully, `kubectl rollout status` returns a zero exit code.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl rollout status deploy/nginx
|
||||||
|
Waiting for rollout to finish: 2 of 3 updated replicas are available...
|
||||||
|
deployment "nginx" successfully rolled out
|
||||||
|
$ echo $?
|
||||||
|
0
|
||||||
|
```
|
||||||
|
|
||||||
|
### Failed Deployment
|
||||||
|
|
||||||
|
Your Deployment may get stuck trying to deploy its newest ReplicaSet without ever completing. This can occur due to some of the following factors:
|
||||||
|
|
||||||
|
* Insufficient quota
|
||||||
|
* Readiness probe failures
|
||||||
|
* Image pull errors
|
||||||
|
* Insufficient permissions
|
||||||
|
* Limit ranges
|
||||||
|
* Application runtime misconfiguration
|
||||||
|
|
||||||
|
One way you can detect this condition is to specify a deadline parameter in your Deployment spec: ([`spec.progressDeadlineSeconds`](#progress-deadline-seconds)). `spec.progressDeadlineSeconds` denotes the number of seconds the Deployment controller waits before indicating (via the Deployment status) that the Deployment progress has stalled.
|
||||||
|
|
||||||
|
The following `kubectl` command sets the spec with `progressDeadlineSeconds` to make the controller report lack of progress for a Deployment after 10 minutes:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl patch deployment/nginx-deployment -p '{"spec":{"progressDeadlineSeconds":600}}'
|
||||||
|
"nginx-deployment" patched
|
||||||
|
```
|
||||||
|
Once the deadline has been exceeded, the Deployment controller adds a DeploymentCondition with the following attributes to
|
||||||
|
the Deployment's `status.conditions`:
|
||||||
|
|
||||||
|
* Type=Progressing
|
||||||
|
* Status=False
|
||||||
|
* Reason=ProgressDeadlineExceeded
|
||||||
|
|
||||||
|
See the [Kubernetes API conventions](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#typical-status-properties) for more information on status conditions.
|
||||||
|
|
||||||
|
Note that in version 1.5, Kubernetes will take no action on a stalled Deployment other than to report a status condition with
|
||||||
|
`Reason=ProgressDeadlineExceeded`.
|
||||||
|
|
||||||
|
**Note:** If you pause a Deployment, Kubernetes does not check progress against your specified deadline. You can safely pause a Deployment in the middle of a rollout and resume without triggering the condition for exceeding the deadline.
|
||||||
|
|
||||||
|
You may experience transient errors with your Deployments, either due to a low timeout that you have set or due to any other kind
|
||||||
|
of error that can be treated as transient. For example, let's suppose you have insufficient quota. If you describe the Deployment
|
||||||
|
you will notice the following section:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl describe deployment nginx-deployment
|
||||||
|
<...>
|
||||||
|
Conditions:
|
||||||
|
Type Status Reason
|
||||||
|
---- ------ ------
|
||||||
|
Available True MinimumReplicasAvailable
|
||||||
|
Progressing True ReplicaSetUpdated
|
||||||
|
ReplicaFailure True FailedCreate
|
||||||
|
<...>
|
||||||
|
```
|
||||||
|
|
||||||
|
If you run `kubectl get deployment nginx-deployment -o yaml`, the Deployment status might look like this:
|
||||||
|
|
||||||
|
```
|
||||||
|
status:
|
||||||
|
availableReplicas: 2
|
||||||
|
conditions:
|
||||||
|
- lastTransitionTime: 2016-10-04T12:25:39Z
|
||||||
|
lastUpdateTime: 2016-10-04T12:25:39Z
|
||||||
|
message: Replica set "nginx-deployment-4262182780" is progressing.
|
||||||
|
reason: ReplicaSetUpdated
|
||||||
|
status: "True"
|
||||||
|
type: Progressing
|
||||||
|
- lastTransitionTime: 2016-10-04T12:25:42Z
|
||||||
|
lastUpdateTime: 2016-10-04T12:25:42Z
|
||||||
|
message: Deployment has minimum availability.
|
||||||
|
reason: MinimumReplicasAvailable
|
||||||
|
status: "True"
|
||||||
|
type: Available
|
||||||
|
- lastTransitionTime: 2016-10-04T12:25:39Z
|
||||||
|
lastUpdateTime: 2016-10-04T12:25:39Z
|
||||||
|
message: 'Error creating: pods "nginx-deployment-4262182780-" is forbidden: exceeded quota:
|
||||||
|
object-counts, requested: pods=1, used: pods=3, limited: pods=2'
|
||||||
|
reason: FailedCreate
|
||||||
|
status: "True"
|
||||||
|
type: ReplicaFailure
|
||||||
|
observedGeneration: 3
|
||||||
|
replicas: 2
|
||||||
|
unavailableReplicas: 2
|
||||||
|
```
|
||||||
|
|
||||||
|
Eventually, once the Deployment progress deadline is exceeded, Kubernetes updates the status and the reason for the Progressing condition:
|
||||||
|
|
||||||
|
```
|
||||||
|
Conditions:
|
||||||
|
Type Status Reason
|
||||||
|
---- ------ ------
|
||||||
|
Available True MinimumReplicasAvailable
|
||||||
|
Progressing False ProgressDeadlineExceeded
|
||||||
|
ReplicaFailure True FailedCreate
|
||||||
|
```
|
||||||
|
|
||||||
|
You can address an issue of insufficient quota by scaling down your Deployment, by scaling down other controllers you may be running,
|
||||||
|
or by increasing quota in your namespace. If you satisfy the quota conditions and the Deployment controller then completes the Deployment
|
||||||
|
rollout, you'll see the Deployment's status update with a successful condition (`Status=True` and `Reason=NewReplicaSetAvailable`).
|
||||||
|
|
||||||
|
```
|
||||||
|
Conditions:
|
||||||
|
Type Status Reason
|
||||||
|
---- ------ ------
|
||||||
|
Available True MinimumReplicasAvailable
|
||||||
|
Progressing True NewReplicaSetAvailable
|
||||||
|
```
|
||||||
|
|
||||||
|
`Type=Available` with `Status=True` means that your Deployment has minimum availability. Minimum availability is dictated
|
||||||
|
by the parameters specified in the deployment strategy. `Type=Progressing` with `Status=True` means that your Deployment
|
||||||
|
is either in the middle of a rollout and it is progressing or that it has successfully completed its progress and the minimum
|
||||||
|
required new replicas are available (see the Reason of the condition for the particulars - in our case
|
||||||
|
`Reason=NewReplicaSetAvailable` means that the Deployment is complete).
|
||||||
|
|
||||||
|
You can check if a Deployment has failed to progress by using `kubectl rollout status`. `kubectl rollout status` returns a non-zero exit code if the Deployment has exceeded the progression deadline.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl rollout status deploy/nginx
|
||||||
|
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
|
||||||
|
error: deployment "nginx" exceeded its progress deadline
|
||||||
|
$ echo $?
|
||||||
|
1
|
||||||
|
```
|
||||||
|
|
||||||
|
### Operating on a failed deployment
|
||||||
|
|
||||||
|
All actions that apply to a complete Deployment also apply to a failed Deployment. You can scale it up/down, roll back
|
||||||
|
to a previous revision, or even pause it if you need to apply multiple tweaks in the Deployment pod template.
|
||||||
|
|
||||||
|
## Use Cases
|
||||||
|
|
||||||
|
### Canary Deployment
|
||||||
|
|
||||||
|
If you want to roll out releases to a subset of users or servers using the Deployment, you can create multiple Deployments, one for each release,
|
||||||
|
following the canary pattern described in [managing resources](/docs/concepts/cluster-administration/manage-deployment/#canary-deployments).
|
||||||
|
|
||||||
|
## Writing a Deployment Spec
|
||||||
|
|
||||||
|
As with all other Kubernetes configs, a Deployment needs `apiVersion`, `kind`, and
|
||||||
|
`metadata` fields. For general information about working with config files,
|
||||||
|
see [deploying applications](/docs/user-guide/deploying-applications), [configuring containers](/docs/user-guide/configuring-containers), and [using kubectl to manage resources](/docs/user-guide/working-with-resources) documents.
|
||||||
|
|
||||||
|
A Deployment also needs a [`.spec` section](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status).
|
||||||
|
|
||||||
|
### Pod Template
|
||||||
|
|
||||||
|
The `.spec.template` is the only required field of the `.spec`.
|
||||||
|
|
||||||
|
The `.spec.template` is a [pod template](/docs/user-guide/replication-controller/#pod-template). It has exactly
|
||||||
|
the same schema as a [Pod](/docs/user-guide/pods), except it is nested and does not have an
|
||||||
|
`apiVersion` or `kind`.
|
||||||
|
|
||||||
|
In addition to required fields for a Pod, a pod template in a Deployment must specify appropriate
|
||||||
|
labels (i.e. don't overlap with other controllers, see [selector](#selector)) and an appropriate restart policy.
|
||||||
|
|
||||||
|
Only a [`.spec.template.spec.restartPolicy`](/docs/user-guide/pod-states/) equal to `Always` is allowed, which is the default
|
||||||
|
if not specified.
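
For illustration, a pod template fragment that satisfies both rules might look like the following sketch (the `app: nginx` label is just an example):

```yaml
template:
  metadata:
    labels:
      app: nginx            # should not overlap with other controllers' selectors
  spec:
    restartPolicy: Always   # optional; Always is the only allowed value and the default
    containers:
    - name: nginx
      image: nginx:1.7.9
```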
|
||||||
|
|
||||||
|
### Replicas
|
||||||
|
|
||||||
|
`.spec.replicas` is an optional field that specifies the number of desired Pods. It defaults
|
||||||
|
to 1.
|
||||||
|
|
||||||
|
### Selector
|
||||||
|
|
||||||
|
`.spec.selector` is an optional field that specifies a [label selector](/docs/user-guide/labels/#label-selectors) for the Pods
|
||||||
|
targeted by this deployment.
|
||||||
|
|
||||||
|
If specified, `.spec.selector` must match `.spec.template.metadata.labels`, or it will
|
||||||
|
be rejected by the API. If `.spec.selector` is unspecified, `.spec.selector.matchLabels` will be defaulted to
|
||||||
|
`.spec.template.metadata.labels`.
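
As a minimal sketch of a selector that satisfies this rule (using the `app: nginx` label from the examples above):

```yaml
spec:
  selector:
    matchLabels:
      app: nginx   # must match the pod template labels below
  template:
    metadata:
      labels:
        app: nginx
```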
|
||||||
|
|
||||||
|
A Deployment may kill Pods whose labels match the selector if their
|
||||||
|
template is different from `.spec.template` or if the total number of such Pods
|
||||||
|
exceeds `.spec.replicas`. It will bring up new Pods with `.spec.template` if
|
||||||
|
the number of Pods is less than the desired number.
|
||||||
|
|
||||||
|
Note that you should not create other pods whose labels match this selector, either directly, via another Deployment, or via another controller such as a Replica Set or a Replication Controller. Otherwise, the Deployment will think that it created those pods. Kubernetes will not stop you from doing this.
|
||||||
|
|
||||||
|
If you have multiple controllers with overlapping selectors, they will fight with each other and won't behave correctly.
|
||||||
|
|
||||||
|
### Strategy
|
||||||
|
|
||||||
|
`.spec.strategy` specifies the strategy used to replace old Pods by new ones.
|
||||||
|
`.spec.strategy.type` can be "Recreate" or "RollingUpdate". "RollingUpdate" is
|
||||||
|
the default value.
|
||||||
|
|
||||||
|
#### Recreate Deployment
|
||||||
|
|
||||||
|
All existing Pods are killed before new ones are created when
|
||||||
|
`.spec.strategy.type==Recreate`.
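
A minimal sketch of this setting:

```yaml
spec:
  strategy:
    type: Recreate
```

Use this when your application cannot tolerate two versions running at the same time.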
|
||||||
|
|
||||||
|
#### Rolling Update Deployment
|
||||||
|
|
||||||
|
The Deployment updates Pods in a [rolling update](/docs/tasks/run-application/rolling-update-replication-controller/) fashion
|
||||||
|
when `.spec.strategy.type==RollingUpdate`.
|
||||||
|
You can specify `maxUnavailable` and `maxSurge` to control
|
||||||
|
the rolling update process.
|
||||||
|
|
||||||
|
##### Max Unavailable
|
||||||
|
|
||||||
|
`.spec.strategy.rollingUpdate.maxUnavailable` is an optional field that specifies the
|
||||||
|
maximum number of Pods that can be unavailable during the update process.
|
||||||
|
The value can be an absolute number (e.g. 5) or a percentage of desired Pods
|
||||||
|
(e.g. 10%).
|
||||||
|
The absolute number is calculated from the percentage by rounding down.
|
||||||
|
This cannot be 0 if `.spec.strategy.rollingUpdate.maxSurge` is 0.
|
||||||
|
By default, a fixed value of 1 is used.
|
||||||
|
|
||||||
|
For example, when this value is set to 30%, the old Replica Set can be scaled down to
|
||||||
|
70% of desired Pods immediately when the rolling update starts. Once new Pods are
|
||||||
|
ready, the old Replica Set can be scaled down further, followed by scaling up the new Replica Set,
|
||||||
|
ensuring that the total number of Pods available at all times during the
|
||||||
|
update is at least 70% of the desired Pods.
|
||||||
|
|
||||||
|
##### Max Surge
|
||||||
|
|
||||||
|
`.spec.strategy.rollingUpdate.maxSurge` is an optional field that specifies the
|
||||||
|
maximum number of Pods that can be created above the desired number of Pods.
|
||||||
|
Value can be an absolute number (e.g. 5) or a percentage of desired Pods
|
||||||
|
(e.g. 10%).
|
||||||
|
This cannot be 0 if `maxUnavailable` is 0.
|
||||||
|
The absolute number is calculated from percentage by rounding up.
|
||||||
|
By default, a value of 1 is used.
|
||||||
|
|
||||||
|
For example, when this value is set to 30%, the new Replica Set can be scaled up immediately when
|
||||||
|
the rolling update starts, such that the total number of old and new Pods does not exceed
|
||||||
|
130% of desired Pods. Once old Pods have been killed,
|
||||||
|
the new Replica Set can be scaled up further, ensuring that the total number of Pods running
|
||||||
|
at any time during the update is at most 130% of desired Pods.
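
Putting the two fields together, a strategy like the sketch below (the values are illustrative) keeps at least 70% of the desired Pods available and at most 130% of the desired Pods running at any point during an update:

```yaml
spec:
  replicas: 10
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 30%   # at least 7 of the 10 desired Pods stay available
      maxSurge: 30%         # at most 13 Pods exist at any point
```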
|
||||||
|
|
||||||
|
### Progress Deadline Seconds
|
||||||
|
|
||||||
|
`.spec.progressDeadlineSeconds` is an optional field that specifies the number of seconds you want
|
||||||
|
to wait for your Deployment to progress before the system reports back that the Deployment has
|
||||||
|
[failed progressing](#failed-deployment) - surfaced as a condition with `Type=Progressing`, `Status=False`,
|
||||||
|
and `Reason=ProgressDeadlineExceeded` in the status of the resource. The deployment controller will keep
|
||||||
|
retrying the Deployment. In the future, once automatic rollback is implemented, the deployment
|
||||||
|
controller will roll back a Deployment as soon as it observes such a condition.
|
||||||
|
|
||||||
|
If specified, this field needs to be greater than `.spec.minReadySeconds`.
|
||||||
|
|
||||||
|
### Min Ready Seconds
|
||||||
|
|
||||||
|
`.spec.minReadySeconds` is an optional field that specifies the
|
||||||
|
minimum number of seconds for which a newly created Pod should be ready
|
||||||
|
without any of its containers crashing, for it to be considered available.
|
||||||
|
This defaults to 0 (the Pod will be considered available as soon as it is ready).
|
||||||
|
To learn more about when a Pod is considered ready, see [Container Probes](/docs/user-guide/pod-states/#container-probes).
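
As a sketch, the two timing-related fields can be set together like this (the values are illustrative; note that `progressDeadlineSeconds` must be greater than `minReadySeconds`):

```yaml
spec:
  minReadySeconds: 10            # a new Pod must stay ready for 10s before it counts as available
  progressDeadlineSeconds: 600   # surface Progressing=False if no progress is made for 10 minutes
```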
|
||||||
|
|
||||||
|
### Rollback To
|
||||||
|
|
||||||
|
`.spec.rollbackTo` is an optional field with the configuration the Deployment is rolling back to. Setting this field will trigger a rollback, and this field will be cleared every time a rollback is done.
|
||||||
|
|
||||||
|
#### Revision
|
||||||
|
|
||||||
|
`.spec.rollbackTo.revision` is an optional field specifying the revision to roll back to. This defaults to 0, meaning roll back to the last revision in history.
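
For example, a sketch of a rollback request expressed in the spec (assuming a revision 2 exists in the Deployment's history):

```yaml
spec:
  rollbackTo:
    revision: 2   # 0 (the default) means the last revision
```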
|
||||||
|
|
||||||
|
### Revision History Limit
|
||||||
|
|
||||||
|
A deployment's revision history is stored in the replica sets it controls.
|
||||||
|
|
||||||
|
`.spec.revisionHistoryLimit` is an optional field that specifies the number of old Replica Sets to retain to allow rollback. Its ideal value depends on the frequency and stability of new Deployments. In `apps/v1beta1` it defaults to 2; in `extensions/v1beta1`, all old Replica Sets are kept if this field is not set, consuming resources in `etcd` and crowding the output of `kubectl get rs`. The configuration of each Deployment revision is stored in its Replica Sets; therefore, once an old Replica Set is deleted, you lose the ability to roll back to that revision of the Deployment.
|
||||||
|
|
||||||
|
More specifically, setting this field to zero means that all old Replica Sets with 0 replicas will be cleaned up.
|
||||||
|
In this case, a new deployment rollout cannot be undone, since its revision history is cleaned up.
|
||||||
|
|
||||||
|
### Paused
|
||||||
|
|
||||||
|
`.spec.paused` is an optional boolean field for pausing and resuming a Deployment. It defaults to false (a Deployment is not paused).
|
||||||
|
|
||||||
|
## Alternative to Deployments
|
||||||
|
|
||||||
|
### kubectl rolling-update
|
||||||
|
|
||||||
|
[`kubectl rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update) updates Pods and Replication Controllers in a similar fashion.
|
||||||
|
But Deployments are recommended, since they are declarative, server-side, and have additional features, such as rolling back to any previous revision even after the rolling update is done.
|
|
@ -0,0 +1,45 @@
|
||||||
|
apiVersion: extensions/v1beta1
|
||||||
|
kind: ReplicaSet
|
||||||
|
metadata:
|
||||||
|
name: frontend
|
||||||
|
# these labels can be applied automatically
|
||||||
|
# from the labels in the pod template if not set
|
||||||
|
# labels:
|
||||||
|
# app: guestbook
|
||||||
|
# tier: frontend
|
||||||
|
spec:
|
||||||
|
# replicas defaults to 1 if not specified;
|
||||||
|
# modify it according to your case
|
||||||
|
replicas: 3
|
||||||
|
# selector can be applied automatically
|
||||||
|
# from the labels in the pod template if not set,
|
||||||
|
# but we are specifying the selector here to
|
||||||
|
# demonstrate its usage.
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
tier: frontend
|
||||||
|
matchExpressions:
|
||||||
|
- {key: tier, operator: In, values: [frontend]}
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: guestbook
|
||||||
|
tier: frontend
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: php-redis
|
||||||
|
image: gcr.io/google_samples/gb-frontend:v3
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 100Mi
|
||||||
|
env:
|
||||||
|
- name: GET_HOSTS_FROM
|
||||||
|
value: dns
|
||||||
|
# If your cluster config does not include a dns service, then to
|
||||||
|
# instead access environment variables to find service host
|
||||||
|
# info, comment out the 'value: dns' line above, and uncomment the
|
||||||
|
# line below.
|
||||||
|
# value: env
|
||||||
|
ports:
|
||||||
|
- containerPort: 80
|
|
@ -0,0 +1,11 @@
|
||||||
|
apiVersion: autoscaling/v1
|
||||||
|
kind: HorizontalPodAutoscaler
|
||||||
|
metadata:
|
||||||
|
name: frontend-scaler
|
||||||
|
spec:
|
||||||
|
scaleTargetRef:
|
||||||
|
kind: ReplicaSet
|
||||||
|
name: frontend
|
||||||
|
minReplicas: 3
|
||||||
|
maxReplicas: 10
|
||||||
|
targetCPUUtilizationPercentage: 50
|
|
@ -0,0 +1,16 @@
|
||||||
|
apiVersion: apps/v1beta1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: nginx-deployment
|
||||||
|
spec:
|
||||||
|
replicas: 3
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: nginx
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: nginx
|
||||||
|
image: nginx:1.7.9
|
||||||
|
ports:
|
||||||
|
- containerPort: 80
|
|
@ -15,4 +15,430 @@ redirect_from:
|
||||||
|
|
||||||
|
__Warning:__ Starting in Kubernetes version 1.5, PetSet has been renamed to [StatefulSet](/docs/concepts/abstractions/controllers/statefulsets). To use (or continue to use) PetSet in Kubernetes 1.5, you _must_ [migrate](/docs/tasks/manage-stateful-set/upgrade-pet-set-to-stateful-set/) your existing PetSets to StatefulSets. For information on working with StatefulSet, see the tutorial on [how to run replicated stateful applications](/docs/tutorials/stateful-application/run-replicated-stateful-application).
|
||||||
|
|
||||||
|
__This document has been deprecated__, but can still apply if you're using
|
||||||
|
Kubernetes version 1.4 or earlier.
|
||||||
|
|
||||||
|
* TOC
|
||||||
|
{:toc}
|
||||||
|
|
||||||
|
__Terminology__
|
||||||
|
|
||||||
|
Throughout this doc you will see a few terms that are sometimes used interchangeably elsewhere, which might cause confusion. This section attempts to clarify them.
|
||||||
|
|
||||||
|
* Node: A single virtual or physical machine in a Kubernetes cluster.
|
||||||
|
* Cluster: A group of nodes in a single failure domain, unless mentioned otherwise.
|
||||||
|
* Persistent Volume Claim (PVC): A request for storage, typically a [persistent volume](/docs/user-guide/persistent-volumes/walkthrough/).
|
||||||
|
* Host name: The hostname attached to the UTS namespace of the pod, i.e. the output of `hostname` in the pod.
|
||||||
|
* DNS/Domain name: A *cluster local* domain name resolvable using standard methods (e.g.: [gethostbyname](http://linux.die.net/man/3/gethostbyname)).
|
||||||
|
* Ordinality: the property of being "ordinal", or occupying a position in a sequence.
|
||||||
|
* Pet: a single member of a PetSet; more generally, a stateful application.
|
||||||
|
* Peer: a process running a server, capable of communicating with other such processes.
|
||||||
|
|
||||||
|
__Prerequisites__
|
||||||
|
|
||||||
|
This doc assumes familiarity with the following Kubernetes concepts:
|
||||||
|
|
||||||
|
* [Pods](/docs/user-guide/pods/single-container/)
|
||||||
|
* [Cluster DNS](/docs/admin/dns/)
|
||||||
|
* [Headless Services](/docs/user-guide/services/#headless-services)
|
||||||
|
* [Persistent Volumes](/docs/concepts/storage/volumes/)
|
||||||
|
* [Persistent Volume Provisioning](http://releases.k8s.io/{{page.githubbranch}}/examples/persistent-volume-provisioning/README.md)
|
||||||
|
|
||||||
|
You need a working Kubernetes cluster at version >= 1.3, with a healthy DNS [cluster addon](http://releases.k8s.io/{{page.githubbranch}}/cluster/addons/README.md) at version >= 15. You cannot use PetSet on a hosted Kubernetes provider that has disabled `alpha` resources.
|
||||||
|
|
||||||
|
## What is a PetSet?
|
||||||
|
|
||||||
|
In Kubernetes, most pod management abstractions group pods into disposable units of work that compose a micro service. Replication controllers, for example, are designed with a weak guarantee - that there should be N replicas of a particular pod template. The pods are treated as stateless units: if one of them is unhealthy or superseded by a newer version, the system just disposes of it.
|
||||||
|
|
||||||
|
```
|
||||||
|
foo.default.svc.cluster.local
|
||||||
|
        |service|
|
||||||
|
       /         \
|
||||||
|
| pod-asdf | | pod-zxcv |
|
||||||
|
```
|
||||||
|
|
||||||
|
A PetSet, in contrast, is a group of stateful pods that require a stronger notion of identity. The document refers to these as "clustered applications".
|
||||||
|
|
||||||
|
```
|
||||||
|
*.foo.default.svc.cluster.local
|
||||||
|
| mysql-0 | <-> | mysql-1 |
|
||||||
|
  [pv 0]          [pv 1]
|
||||||
|
```
|
||||||
|
|
||||||
|
The coordinated deployment of clustered applications is notoriously hard. They require stronger notions of identity and membership, which they use in opaque internal protocols, and are especially prone to race conditions and deadlock. Traditionally, administrators have deployed these applications by leveraging nodes as stable, long-lived entities with persistent storage and static IPs.
|
||||||
|
|
||||||
|
The goal of PetSet is to decouple this dependency by assigning identities to individual instances of an application that are not anchored to the underlying physical infrastructure. For the rest of this document we will refer to these entities as "Pets". Our use of this term is predated by the "Pets vs Cattle" analogy.
|
||||||
|
|
||||||
|
__Relationship between Pets and Pods__: PetSet requires there be {0..N-1} Pets. Each Pet has a deterministic name - PetSetName-Ordinal, and a unique identity. Each Pet has at most one pod, and each PetSet has at most one Pet with a given identity.
|
||||||
|
|
||||||
|
## When to use PetSet?
|
||||||
|
|
||||||
|
A PetSet ensures that a specified number of "pets" with unique identities are running at any given time. The identity of a Pet consists of:
|
||||||
|
|
||||||
|
* a stable hostname, available in DNS
|
||||||
|
* an ordinal index
|
||||||
|
* stable storage: linked to the ordinal & hostname
|
||||||
|
|
||||||
|
These properties are useful in deploying stateful applications. However, most stateful applications are also clustered, meaning they form groups with strict membership requirements that rely on stored state. PetSet also helps with the 2 most common problems encountered when managing such clustered applications:
|
||||||
|
|
||||||
|
* discovery of peers for quorum
|
||||||
|
* startup/teardown ordering
|
||||||
|
|
||||||
|
Only use PetSet if your application requires some or all of these properties. Managing pods as stateless replicas is vastly easier.
|
||||||
|
|
||||||
|
Example workloads for PetSet:
|
||||||
|
|
||||||
|
* Databases like MySQL or PostgreSQL that require a single instance attached to an NFS persistent volume at any time
|
||||||
|
* Clustered software like Zookeeper, Etcd, or Elasticsearch that require stable membership.
|
||||||
|
|
||||||
|
## Alpha limitations
|
||||||
|
|
||||||
|
Before you start deploying applications as PetSets, there are a few limitations you should understand.
|
||||||
|
|
||||||
|
* PetSet is an *alpha* resource, not available in any Kubernetes release prior to 1.3.
|
||||||
|
* As with all alpha/beta resources, it can be disabled through the `--runtime-config` option passed to the apiserver, and in fact most likely will be disabled on hosted offerings of Kubernetes.
|
||||||
|
* The only updatable field on a PetSet is `replicas`
|
||||||
|
* The storage for a given pet must either be provisioned by a [persistent volume provisioner](http://releases.k8s.io/{{page.githubbranch}}/examples/persistent-volume-provisioning/README.md) based on the requested `storage class`, or pre-provisioned by an admin. Note that persistent volume provisioning is also currently in alpha.
|
||||||
|
* Deleting and/or scaling a PetSet down will *not* delete the volumes associated with the PetSet. This is done to ensure safety first: your data is more valuable than an automatic purge of all related PetSet resources. **Deleting the Persistent Volume Claims will result in a deletion of the associated volumes**.
|
||||||
|
* All PetSets currently require a "governing service", or a Service responsible for the network identity of the pets. The user is responsible for this Service.
|
||||||
|
* Updating an existing PetSet is currently a manual process, meaning you either need to deploy a new PetSet with the new image version, or orphan Pets one by one, update their image, and join them back to the cluster.
|
||||||
|
|
||||||
|
## Example PetSet
|
||||||
|
|
||||||
|
We'll create a basic PetSet to demonstrate how Pets are assigned unique and "sticky" identities.
|
||||||
|
|
||||||
|
{% include code.html language="yaml" file="petset.yaml" ghlink="/docs/concepts/workloads/controllers/petset.yaml" %}
|
||||||
|
|
||||||
|
Saving this config into `petset.yaml` and submitting it to a Kubernetes cluster should create the defined PetSet and Pets it manages:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl create -f petset.yaml
|
||||||
|
service "nginx" created
|
||||||
|
petset "nginx" created
|
||||||
|
```
|
||||||
|
|
||||||
|
## Pet Identity
|
||||||
|
|
||||||
|
The identity of a Pet sticks to it, regardless of which node it's (re)scheduled on. We can examine the identity of the pets we just created.
|
||||||
|
|
||||||
|
### Ordinal index
|
||||||
|
|
||||||
|
You should see 2 pods with predictable names formatted as `$(petset name)-$(ordinal index assigned by petset controller)`:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get po
|
||||||
|
NAME READY STATUS RESTARTS AGE
|
||||||
|
web-0 1/1 Running 0 10m
|
||||||
|
web-1 1/1 Running 0 10m
|
||||||
|
```
|
||||||
|
|
||||||
|
### Stable storage
|
||||||
|
|
||||||
|
There are 2 persistent volumes, one per pod. These are created automatically by the PetSet based on the `volumeClaimTemplates` field:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get pv
|
||||||
|
NAME CAPACITY ACCESSMODES STATUS CLAIM REASON AGE
|
||||||
|
pvc-90234946-3717-11e6-a46e-42010af00002 1Gi RWO Bound default/www-web-0 11m
|
||||||
|
pvc-902733c2-3717-11e6-a46e-42010af00002 1Gi RWO Bound default/www-web-1 11m
|
||||||
|
```
|
||||||
|
|
||||||
|
### Network identity
|
||||||
|
|
||||||
|
The network identity has 2 parts. First, we created a headless Service that controls the domain within which we create Pets. The domain managed by this Service takes the form: `$(service name).$(namespace).svc.cluster.local`, where "cluster.local" is the [cluster domain](http://releases.k8s.io/{{page.githubbranch}}/build/kube-dns/README.md#how-do-i-configure-it). As each pet is created, it gets a matching DNS subdomain, taking the form: `$(petname).$(governing service domain)`, where the governing service is defined by the `serviceName` field on the PetSet.
|
||||||
|
|
||||||
|
Here are some examples of choices for Cluster Domain, Service name, PetSet name, and how that affects the DNS names for the Pets and the hostnames in the Pet's pods:
|
||||||
|
|
||||||
|
Cluster Domain | Service (ns/name) | PetSet (ns/name) | PetSet Domain | Pet DNS | Pet Hostname |
|
||||||
|
-------------- | ----------------- | ----------------- | -------------- | ------- | ------------ |
|
||||||
|
cluster.local | default/nginx | default/web | nginx.default.svc.cluster.local | web-{0..N-1}.nginx.default.svc.cluster.local | web-{0..N-1} |
|
||||||
|
cluster.local | foo/nginx | foo/web | nginx.foo.svc.cluster.local | web-{0..N-1}.nginx.foo.svc.cluster.local | web-{0..N-1} |
|
||||||
|
kube.local | foo/nginx | foo/web | nginx.foo.svc.kube.local | web-{0..N-1}.nginx.foo.svc.kube.local | web-{0..N-1} |
|
||||||
|
|
||||||
|
Note that Cluster Domain will be set to `cluster.local` unless [otherwise configured](http://releases.k8s.io/{{page.githubbranch}}/build/kube-dns/README.md#how-do-i-configure-it).
|
||||||
|
|
||||||
|
Let's verify our assertion with a simple test.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get svc
|
||||||
|
NAME CLUSTER-IP EXTERNAL-IP PORT(S) AGE
|
||||||
|
nginx None <none> 80/TCP 12m
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
First, the PetSet provides a stable hostname:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ for i in 0 1; do kubectl exec web-$i -- sh -c 'hostname'; done
|
||||||
|
web-0
|
||||||
|
web-1
|
||||||
|
```
|
||||||
|
|
||||||
|
And the hostname is linked to the in-cluster DNS address:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl run -i --tty --image busybox dns-test --restart=Never /bin/sh
|
||||||
|
dns-test # nslookup web-0.nginx
|
||||||
|
Server: 10.0.0.10
|
||||||
|
Address 1: 10.0.0.10 kube-dns.kube-system.svc.cluster.local
|
||||||
|
|
||||||
|
Name: web-0.nginx
|
||||||
|
Address 1: 10.180.3.5
|
||||||
|
|
||||||
|
dns-test # nslookup web-1.nginx
|
||||||
|
Server: 10.0.0.10
|
||||||
|
Address 1: 10.0.0.10 kube-dns.kube-system.svc.cluster.local
|
||||||
|
|
||||||
|
Name: web-1.nginx
|
||||||
|
Address 1: 10.180.0.9
|
||||||
|
```
|
||||||
|
|
||||||
|
The containers are running nginx webservers, which by default will look for an index.html file in `/usr/share/nginx/html/index.html`. That directory is backed by a `PersistentVolume` created by the PetSet. So let's write our hostname there:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ for i in 0 1; do
|
||||||
|
kubectl exec web-$i -- sh -c 'echo $(hostname) > /usr/share/nginx/html/index.html';
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
And verify each webserver serves its own hostname:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ for i in 0 1; do kubectl exec -it web-$i -- curl localhost; done
|
||||||
|
web-0
|
||||||
|
web-1
|
||||||
|
```
|
||||||
|
|
||||||
|
Now delete all pods in the petset:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl delete po -l app=nginx
|
||||||
|
pod "web-0" deleted
|
||||||
|
pod "web-1" deleted
|
||||||
|
```
|
||||||
|
|
||||||
|
Wait for them to come back up, and try to retrieve the previously written hostname through the DNS name of the peer. They match, because the storage, DNS name, and hostname stick to the Pet no matter where it gets scheduled:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl exec -it web-1 -- curl web-0.nginx
|
||||||
|
web-0
|
||||||
|
$ kubectl exec -it web-0 -- curl web-1.nginx
|
||||||
|
web-1
|
||||||
|
```
|
||||||
|
|
||||||
|
## Peer discovery
|
||||||
|
|
||||||
|
A pet can piece together its own identity:
|
||||||
|
|
||||||
|
1. Use the [downward api](/docs/user-guide/downward-api/) to find its pod name
|
||||||
|
2. Run `hostname` to find its DNS name
|
||||||
|
3. Run `mount` or `df` to find its volumes (usually this is unnecessary)
|
||||||
|
|
||||||
|
It's not necessary to "discover" the governing Service of a PetSet; since it's known at creation time, you can simply pass it down through an [environment variable](/docs/user-guide/environment-guide).
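
For illustration, a container fragment along these lines (the `POD_NAME` and `GOVERNING_SERVICE` variable names are made up for this sketch) combines the downward API with a plain environment variable:

```yaml
containers:
- name: nginx
  image: gcr.io/google_containers/nginx-slim:0.8
  env:
  - name: POD_NAME                  # the pet's own name, e.g. web-0, via the downward API
    valueFrom:
      fieldRef:
        fieldPath: metadata.name
  - name: GOVERNING_SERVICE         # known at creation time, so just pass it down
    value: nginx
```

Inside the pod, `$POD_NAME.$GOVERNING_SERVICE` then resolves to the pet's stable DNS name, as shown in the nslookup example above.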
|
||||||
|
|
||||||
|
Usually pets also need to find their peers. In the previous nginx example, we just used `kubectl` to get the names of existing pods, and as humans, we could tell which ones belonged to a given PetSet. Another way to find peers is by contacting the API server, just like `kubectl`, but that has several disadvantages (you end up implementing a Kubernetes-specific init system that runs as pid 1 in your application container).
|
||||||
|
|
||||||
|
PetSet gives you a way to discover your peers using DNS records. To illustrate this we can use the previous example (note: one usually doesn't `apt-get` in a container).
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl exec -it web-0 /bin/sh
|
||||||
|
web-0 # apt-get update && apt-get install -y dnsutils
|
||||||
|
...
|
||||||
|
|
||||||
|
web-0 # nslookup -type=srv nginx.default
|
||||||
|
Server: 10.0.0.10
|
||||||
|
Address: 10.0.0.10#53
|
||||||
|
|
||||||
|
nginx.default.svc.cluster.local service = 10 50 0 web-1.nginx.default.svc.cluster.local.
|
||||||
|
nginx.default.svc.cluster.local service = 10 50 0 web-0.nginx.default.svc.cluster.local.
|
||||||
|
```
|
||||||
|
|
||||||
|
## Updating a PetSet
|
||||||
|
|
||||||
|
You cannot update any field of the PetSet except `spec.replicas` and the `containers` in the podTemplate. Updating `spec.replicas` will scale the PetSet; updating `containers` will not have any effect until a Pet is deleted, at which time it is recreated with the modified podTemplate.
|
||||||
|
|
||||||
|
## Scaling a PetSet
|
||||||
|
|
||||||
|
You can scale a PetSet by updating the "replicas" field. Note however that the controller will only:
|
||||||
|
|
||||||
|
1. Create one pet at a time, in order from {0..N-1}, and wait till each one is in [Running and Ready](/docs/user-guide/pod-states) before creating the next
|
||||||
|
2. Delete one pet at a time, in reverse order from {N-1..0}, and wait till each one is completely shutdown (past its [terminationGracePeriodSeconds](/docs/user-guide/pods/index#termination-of-pods)) before deleting the next
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get po
|
||||||
|
NAME READY STATUS RESTARTS AGE
|
||||||
|
web-0 1/1 Running 0 30s
|
||||||
|
web-1 1/1 Running 0 36s
|
||||||
|
|
||||||
|
$ kubectl patch petset web -p '{"spec":{"replicas":3}}'
|
||||||
|
"web" patched
|
||||||
|
|
||||||
|
$ kubectl get po
|
||||||
|
NAME READY STATUS RESTARTS AGE
|
||||||
|
web-0 1/1 Running 0 40s
|
||||||
|
web-1 1/1 Running 0 46s
|
||||||
|
web-2 1/1 Running 0 8s
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also use the `kubectl scale` command:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get petset
|
||||||
|
NAME DESIRED CURRENT AGE
|
||||||
|
web 3 3 24m
|
||||||
|
|
||||||
|
$ kubectl scale petset web --replicas=5
|
||||||
|
petset "web" scaled
|
||||||
|
|
||||||
|
$ kubectl get po --watch-only
|
||||||
|
NAME READY STATUS RESTARTS AGE
|
||||||
|
web-0 1/1 Running 0 10m
|
||||||
|
web-1 1/1 Running 0 27m
|
||||||
|
web-2 1/1 Running 0 10m
|
||||||
|
web-3 1/1 Running 0 3m
|
||||||
|
web-4 0/1 ContainerCreating 0 48s
|
||||||
|
|
||||||
|
$ kubectl get petset web
|
||||||
|
NAME DESIRED CURRENT AGE
|
||||||
|
web 5 5 30m
|
||||||
|
```
|
||||||
|
|
||||||
|
Note, however, that scaling up to N and back down to M *will not* delete the volumes of the N-M pets that were removed, as described in the section on [deletion](#deleting-a-petset); scaling back up to N creates new pets that reuse the same volumes. To see this in action, scale the PetSet back down to 3:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get po --watch-only
|
||||||
|
web-4 1/1 Terminating 0 4m
|
||||||
|
web-4 1/1 Terminating 0 4m
|
||||||
|
web-3 1/1 Terminating 0 6m
|
||||||
|
web-3 1/1 Terminating 0 6m
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that we still have 5 PVCs:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get pvc
|
||||||
|
NAME STATUS VOLUME CAPACITY ACCESSMODES AGE
|
||||||
|
www-web-0 Bound pvc-42ca5cef-8113-11e6-82f6-42010af00002 1Gi RWO 32m
|
||||||
|
www-web-1 Bound pvc-42de30af-8113-11e6-82f6-42010af00002 1Gi RWO 32m
|
||||||
|
www-web-2 Bound pvc-ba416413-8115-11e6-82f6-42010af00002 1Gi RWO 14m
|
||||||
|
www-web-3 Bound pvc-ba45f19c-8115-11e6-82f6-42010af00002 1Gi RWO 14m
|
||||||
|
www-web-4 Bound pvc-ba47674a-8115-11e6-82f6-42010af00002 1Gi RWO 14m
|
||||||
|
```
|
||||||
|
|
||||||
|
This allows you to upgrade the image of a petset and have it come back up with the same data, as described in the next section.
|
||||||
|
|
||||||
|
## Image upgrades
|
||||||
|
|
||||||
|
PetSet currently *does not* support automated image upgrades, as noted in the section on [limitations](#alpha-limitations). However, you can update the `image` field of any container in the podTemplate and delete Pets one by one; the PetSet controller will recreate each one with the new image.
|
||||||
|
|
||||||
|
Edit the image on the PetSet to `gcr.io/google_containers/nginx-slim:0.7` and delete 1 Pet:
|
||||||
|
|
||||||
|
```shell{% raw %}
|
||||||
|
$ for p in 0 1 2; do kubectl get po web-$p --template '{{range $i, $c := .spec.containers}}{{$c.image}}{{end}}'; echo; done
|
||||||
|
gcr.io/google_containers/nginx-slim:0.8
|
||||||
|
gcr.io/google_containers/nginx-slim:0.8
|
||||||
|
gcr.io/google_containers/nginx-slim:0.8
|
||||||
|
|
||||||
|
$ kubectl delete po web-0
|
||||||
|
pod "web-0" deleted
|
||||||
|
|
||||||
|
$ for p in 0 1 2; do kubectl get po web-$p --template '{{range $i, $c := .spec.containers}}{{$c.image}}{{end}}'; echo; done
|
||||||
|
gcr.io/google_containers/nginx-slim:0.7
|
||||||
|
gcr.io/google_containers/nginx-slim:0.8
|
||||||
|
gcr.io/google_containers/nginx-slim:0.8
|
||||||
|
{% endraw %}```
|
||||||
|
|
||||||
|
Delete the remaining 2:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl delete po web-1 web-2
|
||||||
|
pod "web-1" deleted
|
||||||
|
pod "web-2" deleted
|
||||||
|
```
|
||||||
|
|
||||||
|
Wait till the PetSet is stable and check the images:
|
||||||
|
|
||||||
|
```shell{% raw %}
|
||||||
|
$ for p in 0 1 2; do kubectl get po web-$p --template '{{range $i, $c := .spec.containers}}{{$c.image}}{{end}}'; echo; done
|
||||||
|
gcr.io/google_containers/nginx-slim:0.7
|
||||||
|
gcr.io/google_containers/nginx-slim:0.7
|
||||||
|
gcr.io/google_containers/nginx-slim:0.7
|
||||||
|
{% endraw %}```
|
||||||
|
|
||||||
|
## Deleting a PetSet
|
||||||
|
|
||||||
|
Deleting a PetSet through kubectl will scale it down to 0, thereby deleting all the Pets. If you wish to delete just the PetSet and not the Pets, use `--cascade=false`:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl delete -f petset.yaml --cascade=false
|
||||||
|
petset "web" deleted
|
||||||
|
|
||||||
|
$ kubectl get po -l app=nginx
|
||||||
|
NAME READY STATUS RESTARTS AGE
|
||||||
|
web-0 1/1 Running 0 21h
|
||||||
|
web-1 1/1 Running 0 21h
|
||||||
|
|
||||||
|
$ kubectl delete po -l app=nginx
|
||||||
|
pod "web-0" deleted
|
||||||
|
pod "web-1" deleted
|
||||||
|
```
|
||||||
|
|
||||||
|
Deleting the pods will *not* delete the volumes. Until we finalize the recycle policy for these volumes, they will have to be cleaned up by an admin. This is to ensure that you have the chance to copy data off the volume before deleting it. Simply deleting the PVC after the pods have left the [terminating state](/docs/user-guide/pods/index#termination-of-pods) should trigger deletion of the backing Persistent Volumes.
|
||||||
|
|
||||||
|
**Note: you will lose all your data once the PVC is deleted, so do this with caution.**
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl get po -l app=nginx
|
||||||
|
$ kubectl get pvc -l app=nginx
|
||||||
|
NAME STATUS VOLUME CAPACITY ACCESSMODES AGE
|
||||||
|
www-web-0 Bound pvc-62d271cd-3822-11e6-b1b7-42010af00002 0 21h
|
||||||
|
www-web-1 Bound pvc-62d6750e-3822-11e6-b1b7-42010af00002 0 21h
|
||||||
|
|
||||||
|
$ kubectl delete pvc -l app=nginx
|
||||||
|
$ kubectl get pv
|
||||||
|
```
|
||||||
|
|
||||||
|
If you simply want to clean everything:
|
||||||
|
|
||||||
|
```shell{% raw %}
|
||||||
|
$ grace=$(kubectl get po web-0 --template '{{.spec.terminationGracePeriodSeconds}}')
|
||||||
|
$ kubectl delete petset,po -l app=nginx
|
||||||
|
$ sleep $grace
|
||||||
|
$ kubectl delete pvc -l app=nginx
|
||||||
|
{% endraw %}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
You might have noticed an `annotations` field in all the PetSets shown above.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
annotations:
|
||||||
|
pod.alpha.kubernetes.io/initialized: "true"
|
||||||
|
```
|
||||||
|
|
||||||
|
This field is a debugging hook. It pauses any scale up/down operations on the entire PetSet. If you'd like to pause a petset after each pet, set it to `false` in the template, wait for each pet to come up, verify it has initialized correctly, and then set it to `true` using `kubectl edit` on the pet (setting it to `false` on *any pet* is enough to pause the PetSet). If you don't need it, create the PetSet with it set to `true` as shown. This is surprisingly useful in debugging bootstrapping race conditions.
|
||||||
|
|
||||||
|
## Future Work
|
||||||
|
|
||||||
|
There are a LOT of planned improvements since PetSet is still in alpha.
|
||||||
|
|
||||||
|
* Data gravity and local storage
|
||||||
|
* Richer notification events
|
||||||
|
* Public network identities
|
||||||
|
* WAN cluster deployments (multi-AZ/region/cloud provider)
|
||||||
|
* Image and node upgrades
|
||||||
|
|
||||||
|
This list goes on; if you have examples, ideas, or thoughts, please contribute.
|
||||||
|
|
||||||
|
## Alternatives
|
||||||
|
|
||||||
|
Deploying one RC of size 1 and one Service per pod is a popular alternative, as is simply deploying a DaemonSet that utilizes the identity of a Node.
|
||||||
|
|
||||||
|
## Next steps
|
||||||
|
|
||||||
|
* Learn about [StatefulSet](/docs/concepts/abstractions/controllers/statefulsets/),
|
||||||
|
the replacement for PetSet introduced in Kubernetes version 1.5.
|
||||||
|
* [Migrate your existing PetSets to StatefulSets](/docs/tasks/manage-stateful-set/upgrade-pet-set-to-stateful-set/)
|
||||||
|
when upgrading to Kubernetes version 1.5 or higher.
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
# A headless service to create DNS records
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: nginx
|
||||||
|
labels:
|
||||||
|
app: nginx
|
||||||
|
spec:
|
||||||
|
ports:
|
||||||
|
- port: 80
|
||||||
|
name: web
|
||||||
|
# *.nginx.default.svc.cluster.local
|
||||||
|
clusterIP: None
|
||||||
|
selector:
|
||||||
|
app: nginx
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1alpha1
|
||||||
|
kind: PetSet
|
||||||
|
metadata:
|
||||||
|
name: web
|
||||||
|
spec:
|
||||||
|
serviceName: "nginx"
|
||||||
|
replicas: 2
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: nginx
|
||||||
|
annotations:
|
||||||
|
pod.alpha.kubernetes.io/initialized: "true"
|
||||||
|
spec:
|
||||||
|
terminationGracePeriodSeconds: 0
|
||||||
|
containers:
|
||||||
|
- name: nginx
|
||||||
|
image: gcr.io/google_containers/nginx-slim:0.8
|
||||||
|
ports:
|
||||||
|
- containerPort: 80
|
||||||
|
name: web
|
||||||
|
volumeMounts:
|
||||||
|
- name: www
|
||||||
|
mountPath: /usr/share/nginx/html
|
||||||
|
volumeClaimTemplates:
|
||||||
|
- metadata:
|
||||||
|
name: www
|
||||||
|
annotations:
|
||||||
|
volume.alpha.kubernetes.io/storage-class: anything
|
||||||
|
spec:
|
||||||
|
accessModes: [ "ReadWriteOnce" ]
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 1Gi
|
||||||
|
|
|
@ -0,0 +1,102 @@
|
||||||
|
---
|
||||||
|
assignees:
|
||||||
|
- Kashomon
|
||||||
|
- bprashanth
|
||||||
|
- madhusudancs
|
||||||
|
title: Replica Sets
|
||||||
|
---
|
||||||
|
|
||||||
|
* TOC
|
||||||
|
{:toc}
|
||||||
|
|
||||||
|
## What is a ReplicaSet?
|
||||||
|
|
||||||
|
ReplicaSet is the next-generation Replication Controller. The only difference
|
||||||
|
between a _ReplicaSet_ and a
|
||||||
|
[_Replication Controller_](/docs/user-guide/replication-controller/) right now is
|
||||||
|
the selector support. ReplicaSet supports the new set-based selector requirements
|
||||||
|
as described in the [labels user guide](/docs/user-guide/labels/#label-selectors)
|
||||||
|
whereas a Replication Controller only supports equality-based selector requirements.
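
As a rough sketch of that difference (the labels are illustrative), compare the two selector forms:

```yaml
# ReplicationController .spec: equality-based selector only
selector:
  tier: frontend
---
# ReplicaSet .spec: set-based requirements are also supported
selector:
  matchLabels:
    tier: frontend
  matchExpressions:
  - {key: environment, operator: In, values: [prod, staging]}
```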
|
||||||
|
|
||||||
|
Most [`kubectl`](/docs/user-guide/kubectl/) commands that support
|
||||||
|
Replication Controllers also support ReplicaSets. One exception is the
|
||||||
|
[`rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update/) command. If
|
||||||
|
you want the rolling update functionality please consider using Deployments
|
||||||
|
instead. Also, the
|
||||||
|
[`rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update/) command is
|
||||||
|
imperative whereas Deployments are declarative, so we recommend using Deployments
|
||||||
|
through the [`rollout`](/docs/user-guide/kubectl/kubectl_rollout/) command.
|
||||||
|
|
||||||
|
While ReplicaSets can be used independently, today they are mainly used by
|
||||||
|
[Deployments](/docs/user-guide/deployments/) as a mechanism to orchestrate pod
|
||||||
|
creation, deletion and updates. When you use Deployments you don't have to worry
|
||||||
|
about managing the ReplicaSets that they create. Deployments own and manage
|
||||||
|
their ReplicaSets.
|
||||||
|
|
||||||
|
## When to use a ReplicaSet?
|
||||||
|
|
||||||
|
A ReplicaSet ensures that a specified number of pod “replicas” are running at any given
|
||||||
|
time. However, a Deployment is a higher-level concept that manages ReplicaSets and
|
||||||
|
provides declarative updates to pods along with a lot of other useful features.
|
||||||
|
Therefore, we recommend using Deployments instead of directly using ReplicaSets, unless
|
||||||
|
you require custom update orchestration or don't require updates at all.
|
||||||
|
|
||||||
|
This actually means that you may never need to manipulate ReplicaSet objects:
|
||||||
|
use a Deployment directly and define your application in the spec section.
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
{% include code.html language="yaml" file="frontend.yaml" ghlink="/docs/concepts/workloads/controllers/frontend.yaml" %}
|
||||||
|
|
||||||
|
Saving this config into `frontend.yaml` and submitting it to a Kubernetes cluster should
|
||||||
|
create the defined ReplicaSet and the pods that it manages.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl create -f frontend.yaml
|
||||||
|
replicaset "frontend" created
|
||||||
|
$ kubectl describe rs/frontend
|
||||||
|
Name: frontend
|
||||||
|
Namespace: default
|
||||||
|
Image(s): gcr.io/google_samples/gb-frontend:v3
|
||||||
|
Selector: tier=frontend,tier in (frontend)
|
||||||
|
Labels: app=guestbook,tier=frontend
|
||||||
|
Replicas: 3 current / 3 desired
|
||||||
|
Pods Status: 3 Running / 0 Waiting / 0 Succeeded / 0 Failed
|
||||||
|
No volumes.
|
||||||
|
Events:
|
||||||
|
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
|
||||||
|
--------- -------- ----- ---- ------------- -------- ------ -------
|
||||||
|
1m 1m 1 {replicaset-controller } Normal SuccessfulCreate Created pod: frontend-qhloh
|
||||||
|
1m 1m 1 {replicaset-controller } Normal SuccessfulCreate Created pod: frontend-dnjpy
|
||||||
|
1m 1m 1 {replicaset-controller } Normal SuccessfulCreate Created pod: frontend-9si5l
|
||||||
|
$ kubectl get pods
|
||||||
|
NAME READY STATUS RESTARTS AGE
|
||||||
|
frontend-9si5l 1/1 Running 0 1m
|
||||||
|
frontend-dnjpy 1/1 Running 0 1m
|
||||||
|
frontend-qhloh 1/1 Running 0 1m
|
||||||
|
```
|
||||||
|
|
||||||
|
## ReplicaSet as a Horizontal Pod Autoscaler target
|
||||||
|
|
||||||
|
A ReplicaSet can also be a target for
|
||||||
|
[Horizontal Pod Autoscalers (HPA)](/docs/user-guide/horizontal-pod-autoscaling/),
|
||||||
|
i.e. a ReplicaSet can be auto-scaled by an HPA. Here is an example HPA targeting
|
||||||
|
the ReplicaSet we created in the previous example.
|
||||||
|
|
||||||
|
{% include code.html language="yaml" file="hpa-rs.yaml" ghlink="/docs/concepts/workloads/controllers/hpa-rs.yaml" %}
|
||||||
|
|
||||||
|
|
||||||
|
Saving this config into `hpa-rs.yaml` and submitting it to a Kubernetes cluster should
|
||||||
|
create the defined HPA that autoscales the target ReplicaSet depending on the CPU usage
|
||||||
|
of the replicated pods.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
kubectl create -f hpa-rs.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
Alternatively, you can just use the `kubectl autoscale` command to accomplish the same
|
||||||
|
(and it's easier!)
|
||||||
|
|
||||||
|
```shell
|
||||||
|
kubectl autoscale rs frontend --min=3 --max=10 --cpu-percent=50
|
||||||
|
```
|
|
@ -0,0 +1,19 @@
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ReplicationController
|
||||||
|
metadata:
|
||||||
|
name: nginx
|
||||||
|
spec:
|
||||||
|
replicas: 3
|
||||||
|
selector:
|
||||||
|
app: nginx
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
name: nginx
|
||||||
|
labels:
|
||||||
|
app: nginx
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: nginx
|
||||||
|
image: nginx
|
||||||
|
ports:
|
||||||
|
- containerPort: 80
|
|
@ -0,0 +1,261 @@
|
||||||
|
---
|
||||||
|
assignees:
|
||||||
|
- bprashanth
|
||||||
|
- janetkuo
|
||||||
|
title: Replication Controller
|
||||||
|
---
|
||||||
|
|
||||||
|
* TOC
|
||||||
|
{:toc}
|
||||||
|
|
||||||
|
## What is a ReplicationController?
|
||||||
|
|
||||||
|
A _ReplicationController_ ensures that a specified number of pod "replicas" are running at any one
|
||||||
|
time. In other words, a ReplicationController makes sure that a pod or homogeneous set of pods are
|
||||||
|
always up and available.
|
||||||
|
If there are too many pods, it will kill some. If there are too few, the
|
||||||
|
ReplicationController will start more. Unlike manually created pods, the pods maintained by a
|
||||||
|
ReplicationController are automatically replaced if they fail, get deleted, or are terminated.
|
||||||
|
For example, your pods get re-created on a node after disruptive maintenance such as a kernel upgrade.
|
||||||
|
For this reason, we recommend that you use a ReplicationController even if your application requires
|
||||||
|
only a single pod. You can think of a ReplicationController as something similar to a process supervisor,
|
||||||
|
but rather than individual processes on a single node, the ReplicationController supervises multiple pods
|
||||||
|
across multiple nodes.
|
||||||
|
|
||||||
|
ReplicationController is often abbreviated to "rc" or "rcs" in discussion, and as a shortcut in
|
||||||
|
kubectl commands.
|
||||||
|
|
||||||
|
A simple case is to create 1 ReplicationController object in order to reliably run one instance of
|
||||||
|
a Pod indefinitely. A more complex use case is to run several identical replicas of a replicated
|
||||||
|
service, such as web servers.
|
||||||
|
|
||||||
|
## Running an example ReplicationController
|
||||||
|
|
||||||
|
Here is an example ReplicationController config. It runs 3 copies of the nginx web server.
|
||||||
|
|
||||||
|
{% include code.html language="yaml" file="replication.yaml" ghlink="/docs/concepts/workloads/controllers/replication.yaml" %}
|
||||||
|
|
||||||
|
Run the example ReplicationController by downloading the example file and then running this command:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl create -f ./replication.yaml
|
||||||
|
replicationcontroller "nginx" created
|
||||||
|
```
|
||||||
|
|
||||||
|
Check on the status of the ReplicationController using this command:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ kubectl describe replicationcontrollers/nginx
|
||||||
|
Name: nginx
|
||||||
|
Namespace: default
|
||||||
|
Image(s): nginx
|
||||||
|
Selector: app=nginx
|
||||||
|
Labels: app=nginx
|
||||||
|
Replicas: 3 current / 3 desired
|
||||||
|
Pods Status: 0 Running / 3 Waiting / 0 Succeeded / 0 Failed
|
||||||
|
Events:
|
||||||
|
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
|
||||||
|
--------- -------- ----- ---- ------------- ---- ------ -------
|
||||||
|
20s 20s 1 {replication-controller } Normal SuccessfulCreate Created pod: nginx-qrm3m
|
||||||
|
20s 20s 1 {replication-controller } Normal SuccessfulCreate Created pod: nginx-3ntk0
|
||||||
|
20s 20s 1 {replication-controller } Normal SuccessfulCreate Created pod: nginx-4ok8v
|
||||||
|
```
|
||||||
|
|
||||||
|
Here, 3 pods have been made, but none are running yet, perhaps because the image is being pulled.
|
||||||
|
A little later, the same command may show:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
Pods Status: 3 Running / 0 Waiting / 0 Succeeded / 0 Failed
|
||||||
|
```
|
||||||
|
|
||||||
|
To list all the pods that belong to the rc in a machine-readable form, you can use a command like this:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ pods=$(kubectl get pods --selector=app=nginx --output=jsonpath={.items..metadata.name})
|
||||||
|
echo $pods
|
||||||
|
nginx-3ntk0 nginx-4ok8v nginx-qrm3m
|
||||||
|
```
|
||||||
|
|
||||||
|
Here, the selector is the same as the selector for the ReplicationController (seen in the
|
||||||
|
`kubectl describe` output, and in a different form in `replication.yaml`). The `--output=jsonpath` option
|
||||||
|
specifies an expression that just gets the name from each pod in the returned list.
|
||||||
|
|
||||||
|
|
||||||
|
## Writing a ReplicationController Spec
|
||||||
|
|
||||||
|
As with all other Kubernetes config, a ReplicationController needs `apiVersion`, `kind`, and `metadata` fields. For
|
||||||
|
general information about working with config files, see [here](/docs/user-guide/simple-yaml/),
|
||||||
|
[here](/docs/user-guide/configuring-containers/), and [here](/docs/user-guide/working-with-resources/).
|
||||||
|
|
||||||
|
A ReplicationController also needs a [`.spec` section](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status).
|
||||||
|
|
||||||
|
### Pod Template
|
||||||
|
|
||||||
|
The `.spec.template` is the only required field of the `.spec`.
|
||||||
|
|
||||||
|
The `.spec.template` is a [pod template](#pod-template). It has exactly
|
||||||
|
the same schema as a [pod](/docs/user-guide/pods/), except it is nested and does not have an `apiVersion` or
|
||||||
|
`kind`.
|
||||||
|
|
||||||
|
In addition to required fields for a Pod, a pod template in a ReplicationController must specify appropriate
|
||||||
|
labels (i.e. don't overlap with other controllers, see [pod selector](#pod-selector)) and an appropriate restart policy.
|
||||||
|
|
||||||
|
Only a [`.spec.template.spec.restartPolicy`](/docs/user-guide/pod-states/) equal to `Always` is allowed, which is the default
|
||||||
|
if not specified.
|
||||||
|
|
||||||
|
For local container restarts, ReplicationControllers delegate to an agent on the node,
|
||||||
|
for example the [Kubelet](/docs/admin/kubelet/) or Docker.
|
||||||
|
|
||||||
|
### Labels on the ReplicationController
|
||||||
|
|
||||||
|
The ReplicationController can itself have labels (`.metadata.labels`). Typically, you
|
||||||
|
would set these the same as the `.spec.template.metadata.labels`; if `.metadata.labels` is not specified
|
||||||
|
then it is defaulted to `.spec.template.metadata.labels`. However, they are allowed to be
|
||||||
|
different, and the `.metadata.labels` do not affect the behavior of the ReplicationController.
|
||||||
|
|
||||||
|
### Pod Selector
|
||||||
|
|
||||||
|
The `.spec.selector` field is a [label selector](/docs/user-guide/labels/#label-selectors). A replication
|
||||||
|
controller manages all the pods with labels which match the selector. It does not distinguish
|
||||||
|
between pods which it created or deleted versus pods which some other person or process created or
|
||||||
|
deleted. This allows the ReplicationController to be replaced without affecting the running pods.
|
||||||
|
|
||||||
|
If specified, the `.spec.template.metadata.labels` must be equal to the `.spec.selector`, or it will
|
||||||
|
be rejected by the API. If `.spec.selector` is unspecified, it will be defaulted to
|
||||||
|
`.spec.template.metadata.labels`.
|
||||||
|
|
||||||
|
Also, you should not normally create any pods whose labels match this selector, either directly, via
|
||||||
|
another ReplicationController or via another controller such as Job. Otherwise, the
|
||||||
|
ReplicationController will think that those pods were created by it. Kubernetes will not stop you
|
||||||
|
from doing this.
|
||||||
|
|
||||||
|
If you do end up with multiple controllers that have overlapping selectors, you
|
||||||
|
will have to manage the deletion yourself (see [below](#updating-a-replication-controller)).
|
||||||
|
|
||||||
|
### Multiple Replicas
|
||||||
|
|
||||||
|
You can specify how many pods should run concurrently by setting `.spec.replicas` to the number
|
||||||
|
of pods you would like to have running concurrently. The number running at any time may be higher
|
||||||
|
or lower, such as if the replica count was just increased or decreased, or if a pod is gracefully
|
||||||
|
shut down and a replacement starts early.
|
||||||
|
|
||||||
|
If you do not specify `.spec.replicas`, then it defaults to 1.
|
||||||
|
|
||||||
|
## Working with ReplicationControllers
|
||||||
|
|
||||||
|
### Deleting a ReplicationController and its Pods
|
||||||
|
|
||||||
|
To delete a ReplicationController and all its pods, use [`kubectl
|
||||||
|
delete`](/docs/user-guide/kubectl/kubectl_delete/). Kubectl will scale the ReplicationController to zero and wait
|
||||||
|
for it to delete each pod before deleting the ReplicationController itself. If this kubectl
|
||||||
|
command is interrupted, it can be restarted.
|
||||||
|
|
||||||
|
When using the REST API or go client library, you need to do the steps explicitly (scale replicas to
|
||||||
|
0, wait for pod deletions, then delete the ReplicationController).
|
||||||
|
|
||||||
|
### Deleting just a ReplicationController
|
||||||
|
|
||||||
|
You can delete a ReplicationController without affecting any of its pods.
|
||||||
|
|
||||||
|
Using kubectl, specify the `--cascade=false` option to [`kubectl delete`](/docs/user-guide/kubectl/kubectl_delete/).
|
||||||
|
|
||||||
|
When using the REST API or go client library, simply delete the ReplicationController object.
|
||||||
|
|
||||||
|
Once the original is deleted, you can create a new ReplicationController to replace it. As long
|
||||||
|
as the old and new `.spec.selector` are the same, then the new one will adopt the old pods.
|
||||||
|
However, it will not make any effort to make existing pods match a new, different pod template.
|
||||||
|
To update pods to a new spec in a controlled way, use a [rolling update](#rolling-updates).
|
||||||
|
|
||||||
|
### Isolating pods from a ReplicationController
|
||||||
|
|
||||||
|
Pods may be removed from a ReplicationController's target set by changing their labels. This technique may be used to remove pods from service for debugging, data recovery, etc. Pods that are removed in this way will be replaced automatically (assuming that the number of replicas is not also changed).
|
||||||
|
|
||||||
|
## Common usage patterns
|
||||||
|
|
||||||
|
### Rescheduling
|
||||||
|
|
||||||
|
As mentioned above, whether you have 1 pod you want to keep running, or 1000, a ReplicationController will ensure that the specified number of pods exists, even in the event of node failure or pod termination (e.g., due to an action by another control agent).
|
||||||
|
|
||||||
|
### Scaling
|
||||||
|
|
||||||
|
The ReplicationController makes it easy to scale the number of replicas up or down, either manually or by an auto-scaling control agent, by simply updating the `replicas` field.
|
||||||
|
|
||||||
|
### Rolling updates
|
||||||
|
|
||||||
|
The ReplicationController is designed to facilitate rolling updates to a service by replacing pods one-by-one.
|
||||||
|
|
||||||
|
As explained in [#1353](http://issue.k8s.io/1353), the recommended approach is to create a new ReplicationController with 1 replica, scale the new (+1) and old (-1) controllers one by one, and then delete the old controller after it reaches 0 replicas. This predictably updates the set of pods regardless of unexpected failures.
|
||||||
|
|
||||||
|
Ideally, the rolling update controller would take application readiness into account, and would ensure that a sufficient number of pods were productively serving at any given time.
|
||||||
|
|
||||||
|
The two ReplicationControllers would need to create pods with at least one differentiating label, such as the image tag of the primary container of the pod, since it is typically image updates that motivate rolling updates.
|
||||||
|
|
||||||
|
Rolling update is implemented in the client tool
|
||||||
|
[`kubectl rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update). Visit [`kubectl rolling-update` task](/docs/tasks/run-application/rolling-update-replication-controller/) for more concrete examples.
|
||||||
|
|
||||||
|
### Multiple release tracks
|
||||||
|
|
||||||
|
In addition to running multiple releases of an application while a rolling update is in progress, it's common to run multiple releases for an extended period of time, or even continuously, using multiple release tracks. The tracks would be differentiated by labels.
|
||||||
|
|
||||||
|
For instance, a service might target all pods with `tier in (frontend), environment in (prod)`. Now say you have 10 replicated pods that make up this tier. But you want to be able to 'canary' a new version of this component. You could set up a ReplicationController with `replicas` set to 9 for the bulk of the replicas, with labels `tier=frontend, environment=prod, track=stable`, and another ReplicationController with `replicas` set to 1 for the canary, with labels `tier=frontend, environment=prod, track=canary`. Now the service is covering both the canary and non-canary pods. But you can mess with the ReplicationControllers separately to test things out, monitor the results, etc.
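
A minimal sketch of that setup (the image tags and exact object names are illustrative, not part of the guestbook example) might look like:

```yaml
apiVersion: v1
kind: ReplicationController
metadata:
  name: frontend-stable
spec:
  replicas: 9
  selector:
    tier: frontend
    environment: prod
    track: stable
  template:
    metadata:
      labels:
        tier: frontend
        environment: prod
        track: stable
    spec:
      containers:
      - name: frontend
        image: gcr.io/google_samples/gb-frontend:v3
---
apiVersion: v1
kind: ReplicationController
metadata:
  name: frontend-canary
spec:
  replicas: 1
  selector:
    tier: frontend
    environment: prod
    track: canary
  template:
    metadata:
      labels:
        tier: frontend
        environment: prod
        track: canary
    spec:
      containers:
      - name: frontend
        image: gcr.io/google_samples/gb-frontend:v4   # hypothetical newer tag
```

A Service whose selector is just `tier: frontend, environment: prod` then covers both tracks.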
|
||||||
|
|
||||||
|
### Using ReplicationControllers with Services
|
||||||
|
|
||||||
|
Multiple ReplicationControllers can sit behind a single service, so that, for example, some traffic
|
||||||
|
goes to the old version, and some goes to the new version.
|
||||||
|
|
||||||
|
A ReplicationController will never terminate on its own, but it isn't expected to be as long-lived as services. Services may be composed of pods controlled by multiple ReplicationControllers, and it is expected that many ReplicationControllers may be created and destroyed over the lifetime of a service (for instance, to perform an update of pods that run the service). Both services themselves and their clients should remain oblivious to the ReplicationControllers that maintain the pods of the services.
|
||||||
|
|
||||||
|
## Writing programs for Replication
|
||||||
|
|
||||||
|
Pods created by a ReplicationController are intended to be fungible and semantically identical, though their configurations may become heterogeneous over time. This is an obvious fit for replicated stateless servers, but ReplicationControllers can also be used to maintain availability of master-elected, sharded, and worker-pool applications. Such applications should use dynamic work assignment mechanisms, such as the [etcd lock module](https://coreos.com/docs/distributed-configuration/etcd-modules/) or [RabbitMQ work queues](https://www.rabbitmq.com/tutorials/tutorial-two-python.html), as opposed to static/one-time customization of the configuration of each pod, which is considered an anti-pattern. Any pod customization performed, such as vertical auto-sizing of resources (e.g., cpu or memory), should be performed by another online controller process, not unlike the ReplicationController itself.
|
||||||
|
|
||||||
|
## Responsibilities of the ReplicationController
|
||||||
|
|
||||||
|
The ReplicationController simply ensures that the desired number of pods matches its label selector and are operational. Currently, only terminated pods are excluded from its count. In the future, [readiness](http://issue.k8s.io/620) and other information available from the system may be taken into account, we may add more controls over the replacement policy, and we plan to emit events that could be used by external clients to implement arbitrarily sophisticated replacement and/or scale-down policies.
|
||||||
|
|
||||||
|
The ReplicationController is forever constrained to this narrow responsibility. It itself will not perform readiness nor liveness probes. Rather than performing auto-scaling, it is intended to be controlled by an external auto-scaler (as discussed in [#492](http://issue.k8s.io/492)), which would change its `replicas` field. We will not add scheduling policies (e.g., [spreading](http://issue.k8s.io/367#issuecomment-48428019)) to the ReplicationController. Nor should it verify that the pods controlled match the currently specified template, as that would obstruct auto-sizing and other automated processes. Similarly, completion deadlines, ordering dependencies, configuration expansion, and other features belong elsewhere. We even plan to factor out the mechanism for bulk pod creation ([#170](http://issue.k8s.io/170)).
|
||||||
|
|
||||||
|
The ReplicationController is intended to be a composable building-block primitive. We expect higher-level APIs and/or tools to be built on top of it and other complementary primitives for user convenience in the future. The "macro" operations currently supported by kubectl (run, stop, scale, rolling-update) are proof-of-concept examples of this. For instance, we could imagine something like [Asgard](http://techblog.netflix.com/2012/06/asgard-web-based-cloud-management-and.html) managing ReplicationControllers, auto-scalers, services, scheduling policies, canaries, etc.
|
||||||
|
|
||||||
|
|
||||||
|
## API Object
|
||||||
|
|
||||||
|
Replication controller is a top-level resource in the Kubernetes REST API. More details about the
|
||||||
|
API object can be found at: [ReplicationController API
|
||||||
|
object](/docs/api-reference/v1.6/#replicationcontroller-v1-core).
|
||||||
|
|
||||||
|
## Alternatives to ReplicationController
|
||||||
|
|
||||||
|
### ReplicaSet
|
||||||
|
|
||||||
|
[`ReplicaSet`](/docs/user-guide/replicasets/) is the next-generation ReplicationController that supports the new [set-based label selector](/docs/user-guide/labels/#set-based-requirement).
|
||||||
|
It’s mainly used by [`Deployment`](/docs/user-guide/deployments/) as a mechanism to orchestrate pod creation, deletion and updates.
|
||||||
|
Note that we recommend using Deployments instead of directly using Replica Sets, unless you require custom update orchestration or don't require updates at all.
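
For reference, a set-based selector on a ReplicaSet looks roughly like this (a sketch; the `extensions/v1beta1` API group and the omitted pod template are assumptions based on the v1.6 API):

```yaml
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
  name: frontend
spec:
  replicas: 3
  selector:
    matchLabels:
      tier: frontend
    matchExpressions:
    - {key: environment, operator: In, values: [prod, staging]}
  # template: ... (pod template whose labels satisfy the selector omitted)
```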
|
||||||
|
|
||||||
|
|
||||||
|
### Deployment (Recommended)
|
||||||
|
|
||||||
|
[`Deployment`](/docs/user-guide/deployments/) is a higher-level API object that updates its underlying Replica Sets and their Pods
|
||||||
|
in a similar fashion as `kubectl rolling-update`. Deployments are recommended if you want this rolling update functionality,
|
||||||
|
because unlike `kubectl rolling-update`, they are declarative, server-side, and have additional features.
|
||||||
|
|
||||||
|
### Bare Pods
|
||||||
|
|
||||||
|
Unlike in the case where a user directly created pods, a ReplicationController replaces pods that are deleted or terminated for any reason, such as in the case of node failure or disruptive node maintenance, such as a kernel upgrade. For this reason, we recommend that you use a ReplicationController even if your application requires only a single pod. Think of it similarly to a process supervisor, only it supervises multiple pods across multiple nodes instead of individual processes on a single node. A ReplicationController delegates local container restarts to some agent on the node (e.g., Kubelet or Docker).
|
||||||
|
|
||||||
|
### Job
|
||||||
|
|
||||||
|
Use a [`Job`](/docs/concepts/jobs/run-to-completion-finite-workloads/) instead of a ReplicationController for pods that are expected to terminate on their own
|
||||||
|
(i.e. batch jobs).
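
A minimal Job sketch for such a workload might look like the following; the image and command are illustrative only:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: pi
spec:
  template:
    metadata:
      name: pi
    spec:
      containers:
      - name: pi
        image: perl
        command: ["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"]
      # Never restart the container; the Job is done once it exits successfully.
      restartPolicy: Never
```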
|
||||||
|
|
||||||
|
### DaemonSet
|
||||||
|
|
||||||
|
Use a [`DaemonSet`](/docs/admin/daemons/) instead of a ReplicationController for pods that provide a
|
||||||
|
machine-level function, such as machine monitoring or machine logging. These pods have a lifetime that is tied
|
||||||
|
to a machine lifetime: the pod needs to be running on the machine before other pods start, and is
|
||||||
|
safe to terminate when the machine is otherwise ready to be rebooted or shut down.
|
||||||
|
|
||||||
|
## For more information
|
||||||
|
|
||||||
|
Read [Run a Stateless Application Using a Replication Controller](/docs/tutorials/stateless-application/run-stateless-ap-replication-controller/).
|
|
@ -0,0 +1,196 @@
|
||||||
|
---
|
||||||
|
assignees:
|
||||||
|
title: Pods
|
||||||
|
---
|
||||||
|
|
||||||
|
* TOC
|
||||||
|
{:toc}
|
||||||
|
|
||||||
|
|
||||||
|
_Pods_ are the smallest deployable units of computing that can be created and
|
||||||
|
managed in Kubernetes.
|
||||||
|
|
||||||
|
## What is a Pod?
|
||||||
|
|
||||||
|
A _pod_ (as in a pod of whales or pea pod) is a group of one or more containers
|
||||||
|
(such as Docker containers), the shared storage for those containers, and
|
||||||
|
options about how to run the containers. Pods are always co-located and
|
||||||
|
co-scheduled, and run in a shared context. A pod models an
|
||||||
|
application-specific "logical host" - it contains one or more application
|
||||||
|
containers which are relatively tightly coupled — in a pre-container
|
||||||
|
world, they would have executed on the same physical or virtual machine.
|
||||||
|
|
||||||
|
While Kubernetes supports more container runtimes than just Docker, Docker is
|
||||||
|
the most commonly known runtime, and it helps to describe pods in Docker terms.
|
||||||
|
|
||||||
|
The shared context of a pod is a set of Linux namespaces, cgroups, and
|
||||||
|
potentially other facets of isolation - the same things that isolate a Docker
|
||||||
|
container. Within a pod's context, the individual applications may have
|
||||||
|
further sub-isolations applied.
|
||||||
|
|
||||||
|
Containers within a pod share an IP address and port space, and
|
||||||
|
can find each other via `localhost`. They can also communicate with each
|
||||||
|
other using standard inter-process communications like SystemV semaphores or
|
||||||
|
POSIX shared memory. Containers in different pods have distinct IP addresses
|
||||||
|
and cannot communicate by IPC.
|
||||||
|
|
||||||
|
Applications within a pod also have access to shared volumes, which are defined
|
||||||
|
as part of a pod and are made available to be mounted into each application's
|
||||||
|
filesystem.
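
As an illustration (the pod name, image names, and paths are placeholders), a two-container pod sharing an `emptyDir` volume might look like this; both containers also share the pod's network namespace, so they could reach each other on `localhost`:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: two-containers
spec:
  volumes:
  # A scratch volume that lives as long as the pod does.
  - name: shared-data
    emptyDir: {}
  containers:
  - name: web
    image: nginx
    volumeMounts:
    - name: shared-data
      mountPath: /usr/share/nginx/html
  - name: content-puller
    image: debian
    command: ["/bin/sh", "-c", "echo hello > /pod-data/index.html && sleep 3600"]
    volumeMounts:
    - name: shared-data
      mountPath: /pod-data
```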
|
||||||
|
|
||||||
|
In terms of [Docker](https://www.docker.com/) constructs, a pod is modelled as
|
||||||
|
a group of Docker containers with shared namespaces and shared
|
||||||
|
[volumes](/docs/concepts/storage/volumes/). PID namespace sharing is not yet implemented in Docker.
|
||||||
|
|
||||||
|
Like individual application containers, pods are considered to be relatively
|
||||||
|
ephemeral (rather than durable) entities. As discussed in [life of a
|
||||||
|
pod](/docs/user-guide/pod-states/), pods are created, assigned a unique ID (UID), and
|
||||||
|
scheduled to nodes where they remain until termination (according to restart
|
||||||
|
policy) or deletion. If a node dies, the pods scheduled to that node are
|
||||||
|
scheduled for deletion, after a timeout period. A given pod (as defined by a UID) is not
|
||||||
|
"rescheduled" to a new node; instead, it can be replaced by an identical pod,
|
||||||
|
with even the same name if desired, but with a new UID (see [replication
|
||||||
|
controller](/docs/user-guide/replication-controller/) for more details). (In the future, a
|
||||||
|
higher-level API may support pod migration.)
|
||||||
|
|
||||||
|
When something is said to have the same lifetime as a pod, such as a volume,
|
||||||
|
that means that it exists as long as that pod (with that UID) exists. If that
|
||||||
|
pod is deleted for any reason, even if an identical replacement is created, the
|
||||||
|
related thing (e.g. volume) is also destroyed and created anew.
|
||||||
|
|
||||||
|
![pod diagram](/images/docs/pod.svg){: style="max-width: 50%" }
|
||||||
|
|
||||||
|
*A multi-container pod that contains a file puller and a
|
||||||
|
web server that uses a persistent volume for shared storage between the containers.*
|
||||||
|
|
||||||
|
## Motivation for pods
|
||||||
|
|
||||||
|
### Management
|
||||||
|
|
||||||
|
Pods are a model of the pattern of multiple cooperating processes which form a
|
||||||
|
cohesive unit of service. They simplify application deployment and management
|
||||||
|
by providing a higher-level abstraction than the set of their constituent
|
||||||
|
applications. Pods serve as units of deployment, horizontal scaling, and
|
||||||
|
replication. Colocation (co-scheduling), shared fate (e.g. termination),
|
||||||
|
coordinated replication, resource sharing, and dependency management are
|
||||||
|
handled automatically for containers in a pod.
|
||||||
|
|
||||||
|
### Resource sharing and communication
|
||||||
|
|
||||||
|
Pods enable data sharing and communication among their constituents.
|
||||||
|
|
||||||
|
The applications in a pod all use the same network namespace (same IP and port
|
||||||
|
space), and can thus "find" each other and communicate using `localhost`.
|
||||||
|
Because of this, applications in a pod must coordinate their usage of ports.
|
||||||
|
Each pod has an IP address in a flat shared networking space that has full
|
||||||
|
communication with other physical computers and pods across the network.
|
||||||
|
|
||||||
|
The hostname is set to the pod's name for the application containers within the
|
||||||
|
pod. [More details on networking](/docs/admin/networking/).
|
||||||
|
|
||||||
|
In addition to defining the application containers that run in the pod, the pod
|
||||||
|
specifies a set of shared storage volumes. Volumes enable data to survive
|
||||||
|
container restarts and to be shared among the applications within the pod.
|
||||||
|
|
||||||
|
## Uses of pods
|
||||||
|
|
||||||
|
Pods can be used to host vertically integrated application stacks (e.g. LAMP),
|
||||||
|
but their primary motivation is to support co-located, co-managed helper
|
||||||
|
programs, such as:
|
||||||
|
|
||||||
|
* content management systems, file and data loaders, local cache managers, etc.
|
||||||
|
* log and checkpoint backup, compression, rotation, snapshotting, etc.
|
||||||
|
* data change watchers, log tailers, logging and monitoring adapters, event publishers, etc.
|
||||||
|
* proxies, bridges, and adapters
|
||||||
|
* controllers, managers, configurators, and updaters
|
||||||
|
|
||||||
|
Individual pods are not intended to run multiple instances of the same
|
||||||
|
application, in general.
|
||||||
|
|
||||||
|
For a longer explanation, see [The Distributed System ToolKit: Patterns for
|
||||||
|
Composite
|
||||||
|
Containers](http://blog.kubernetes.io/2015/06/the-distributed-system-toolkit-patterns.html).
|
||||||
|
|
||||||
|
## Alternatives considered
|
||||||
|
|
||||||
|
_Why not just run multiple programs in a single (Docker) container?_
|
||||||
|
|
||||||
|
1. Transparency. Making the containers within the pod visible to the
|
||||||
|
infrastructure enables the infrastructure to provide services to those
|
||||||
|
containers, such as process management and resource monitoring. This
|
||||||
|
facilitates a number of conveniences for users.
|
||||||
|
2. Decoupling software dependencies. The individual containers may be
|
||||||
|
versioned, rebuilt and redeployed independently. Kubernetes may even support
|
||||||
|
live updates of individual containers someday.
|
||||||
|
3. Ease of use. Users don't need to run their own process managers, worry about
|
||||||
|
signal and exit-code propagation, etc.
|
||||||
|
4. Efficiency. Because the infrastructure takes on more responsibility,
|
||||||
|
containers can be lighter weight.
|
||||||
|
|
||||||
|
_Why not support affinity-based co-scheduling of containers?_
|
||||||
|
|
||||||
|
That approach would provide co-location, but would not provide most of the
|
||||||
|
benefits of pods, such as resource sharing, IPC, guaranteed fate sharing, and
|
||||||
|
simplified management.
|
||||||
|
|
||||||
|
## Durability of pods (or lack thereof)
|
||||||
|
|
||||||
|
Pods aren't intended to be treated as durable entities. They won't survive scheduling failures, node failures, or other evictions, such as due to lack of resources, or in the case of node maintenance.
|
||||||
|
|
||||||
|
In general, users shouldn't need to create pods directly. They should almost always use controllers (e.g., [Deployments](/docs/user-guide/deployments/)), even for singletons. Controllers provide self-healing with a cluster scope, as well as replication and rollout management.
|
||||||
|
|
||||||
|
The use of collective APIs as the primary user-facing primitive is relatively common among cluster scheduling systems, including [Borg](https://research.google.com/pubs/pub43438.html), [Marathon](https://mesosphere.github.io/marathon/docs/rest-api.html), [Aurora](http://aurora.apache.org/documentation/latest/configuration-reference/#job-schema), and [Tupperware](http://www.slideshare.net/Docker/aravindnarayanan-facebook140613153626phpapp02-37588997).
|
||||||
|
|
||||||
|
Pod is exposed as a primitive in order to facilitate:
|
||||||
|
|
||||||
|
* scheduler and controller pluggability
|
||||||
|
* support for pod-level operations without the need to "proxy" them via controller APIs
|
||||||
|
* decoupling of pod lifetime from controller lifetime, such as for bootstrapping
|
||||||
|
* decoupling of controllers and services — the endpoint controller just watches pods
|
||||||
|
* clean composition of Kubelet-level functionality with cluster-level functionality — Kubelet is effectively the "pod controller"
|
||||||
|
* high-availability applications, which will expect pods to be replaced in advance of their termination and certainly in advance of deletion, such as in the case of planned evictions, image prefetching, or live pod migration [#3949](http://issue.k8s.io/3949)
|
||||||
|
|
||||||
|
There is new first-class support for stateful pods with the [StatefulSet](/docs/concepts/abstractions/controllers/statefulsets/) controller (currently in beta). The feature was alpha in 1.4 and was called [PetSet](/docs/user-guide/petset/). For prior versions of Kubernetes, best practice for having stateful pods is to create a replication controller with `replicas` equal to `1` and a corresponding service; see [this MySQL deployment example](/docs/tutorials/stateful-application/run-stateful-application/).
|
||||||
|
|
||||||
|
## Termination of Pods
|
||||||
|
|
||||||
|
Because pods represent running processes on nodes in the cluster, it is important to allow those processes to gracefully terminate when they are no longer needed (vs being violently killed with a KILL signal and having no chance to clean up). Users should be able to request deletion and know when processes terminate, but also be able to ensure that deletes eventually complete. When a user requests deletion of a pod the system records the intended grace period before the pod is allowed to be forcefully killed, and a TERM signal is sent to the main process in each container. Once the grace period has expired the KILL signal is sent to those processes and the pod is then deleted from the API server. If the Kubelet or the container manager is restarted while waiting for processes to terminate, the termination will be retried with the full grace period.
|
||||||
|
|
||||||
|
An example flow:
|
||||||
|
|
||||||
|
1. User sends command to delete Pod, with default grace period (30s)
|
||||||
|
2. The Pod in the API server is updated with the time beyond which the Pod is considered "dead" along with the grace period.
|
||||||
|
3. Pod shows up as "Terminating" when listed in client commands
|
||||||
|
4. (simultaneous with 3) When the Kubelet sees that a Pod has been marked as terminating because the time in 2 has been set, it begins the pod shutdown process.
|
||||||
|
1. If the pod has defined a [preStop hook](/docs/concepts/containers/container-lifecycle-hooks/#hook-details), it is invoked inside of the pod. If the `preStop` hook is still running after the grace period expires, step 2 is then invoked with a small (2 second) extended grace period.
|
||||||
|
2. The processes in the Pod are sent the TERM signal.
|
||||||
|
5. (simultaneous with 3) The Pod is removed from the endpoints list of its services, and is no longer considered part of the set of running pods for replication controllers. Pods that shut down slowly can continue to serve traffic while load balancers (like the service proxy) remove them from their rotations.
|
||||||
|
6. When the grace period expires, any processes still running in the Pod are killed with SIGKILL.
|
||||||
|
7. The Kubelet will finish deleting the Pod on the API server by setting grace period 0 (immediate deletion). The Pod disappears from the API and is no longer visible from the client.
|
||||||
|
|
||||||
|
By default, all deletes are graceful within 30 seconds. The `kubectl delete` command supports the `--grace-period=<seconds>` option which allows a user to override the default and specify their own value. The value `0` [force deletes](/docs/user-guide/pods/#force-termination-of-pods) the pod. In kubectl version >= 1.5, you must specify an additional flag `--force` along with `--grace-period=0` in order to perform force deletions.
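
For example (the pod name is a placeholder):

```shell
# Allow up to 60 seconds for the pod's processes to exit cleanly.
$ kubectl delete pod my-pod --grace-period=60

# Force deletion (kubectl >= 1.5): do not wait for kubelet confirmation.
$ kubectl delete pod my-pod --grace-period=0 --force
```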
|
||||||
|
|
||||||
|
### Force deletion of pods
|
||||||
|
|
||||||
|
Force deletion of a pod is defined as the immediate deletion of the pod from the cluster state and etcd. When a force deletion is performed, the API server does not wait for confirmation from the kubelet that the pod has been terminated on the node it was running on. It removes the pod from the API immediately so a new pod can be created with the same name. On the node, pods that are set to terminate immediately will still be given a small grace period before being force killed.
|
||||||
|
|
||||||
|
Force deletions can be potentially dangerous for some pods and should be performed with caution. In case of StatefulSet pods, please refer to the task documentation for [deleting Pods from a StatefulSet](/docs/tasks/manage-stateful-set/delete-pods/#deleting-pods).
|
||||||
|
|
||||||
|
## Privileged mode for pod containers
|
||||||
|
|
||||||
|
From Kubernetes v1.1, any container in a pod can enable privileged mode, using the `privileged` flag on the `SecurityContext` of the container spec. This is useful for containers that want to use Linux capabilities like manipulating the network stack and accessing devices. Processes within the container get almost the same privileges that are available to processes outside a container. With privileged mode, it should be easier to write network and volume plugins as separate pods that don't need to be compiled into the kubelet.
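
A minimal sketch of a pod spec that requests privileged mode (the pod name and image are placeholders):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: privileged-pod
spec:
  containers:
  - name: privileged-container
    image: nginx
    securityContext:
      # Grants the container almost the same privileges as processes on the host.
      privileged: true
```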
|
||||||
|
|
||||||
|
If the master is running Kubernetes v1.1 or higher, and the nodes are running a version lower than v1.1, then new privileged pods will be accepted by the API server, but will not be launched. They will remain in a pending state.
|
||||||
|
If the user calls `kubectl describe pod FooPodName`, they can see the reason why the pod is in the pending state. The events table in the describe command output will say:
|
||||||
|
`Error validating pod "FooPodName"."FooPodNamespace" from api, ignoring: spec.containers[0].securityContext.privileged: forbidden '<*>(0xc2089d3248)true'`
|
||||||
|
|
||||||
|
|
||||||
|
If the master is running a version lower than v1.1, then privileged pods cannot be created. If a user attempts to create a pod that has a privileged container, the user will get the following error:
|
||||||
|
`The Pod "FooPodName" is invalid.
|
||||||
|
spec.containers[0].securityContext.privileged: forbidden '<*>(0xc20b222db0)true'`
|
||||||
|
|
||||||
|
## API Object
|
||||||
|
|
||||||
|
Pod is a top-level resource in the Kubernetes REST API. More details about the
|
||||||
|
API object can be found at: [Pod API
|
||||||
|
object](/docs/api-reference/v1.6/#pod-v1-core).
|
|
@ -9,7 +9,7 @@ title: Running Kubernetes on AWS EC2
|
||||||
{:toc}
|
{:toc}
|
||||||
|
|
||||||
|
|
||||||
## Supported Production Grade Tools with High Availability Options
|
## Supported Production Grade Tools
|
||||||
|
|
||||||
* [Kubernetes Operations](https://github.com/kubernetes/kops) - Production Grade K8s Installation, Upgrades, and Management. Supports running Debian, Ubuntu, CentOS, and RHEL in AWS.
|
* [Kubernetes Operations](https://github.com/kubernetes/kops) - Production Grade K8s Installation, Upgrades, and Management. Supports running Debian, Ubuntu, CentOS, and RHEL in AWS.
|
||||||
|
|
||||||
|
@ -17,16 +17,17 @@ title: Running Kubernetes on AWS EC2
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## kube-up bash script
|
## kube-up is no longer supported in kubernetes 1.6
|
||||||
|
|
||||||
> `kube-up.sh` is a legacy tool that is an easy way to spin up a cluster. This tool is being deprecated, and does not create a production ready environment.
|
> `kube-up.sh` is a legacy tool for launching clusters. It is deprecated, and removed entirely from kubernetes 1.6.
|
||||||
|
|
||||||
|
|
||||||
### Prerequisites
|
### Prerequisites
|
||||||
|
|
||||||
1. You need an AWS account. Visit [http://aws.amazon.com](http://aws.amazon.com) to get started
|
1. This is only supported for kubernetes 1.5 and earlier. Consider switching to one of the supported options.
|
||||||
2. Install and configure the [AWS Command Line Interface](http://aws.amazon.com/cli)
|
2. You need an AWS account. Visit [http://aws.amazon.com](http://aws.amazon.com) to get started
|
||||||
3. We recommend installing using an account which has full access to the AWS APIs.
|
3. Install and configure the [AWS Command Line Interface](http://aws.amazon.com/cli)
|
||||||
|
4. We recommend installing using an account which has full access to the AWS APIs.
|
||||||
|
|
||||||
NOTE: This script uses the 'default' AWS profile by default.
|
NOTE: This script uses the 'default' AWS profile by default.
|
||||||
You may explicitly set the AWS profile to use using the `AWS_DEFAULT_PROFILE` environment variable:
|
You may explicitly set the AWS profile to use using the `AWS_DEFAULT_PROFILE` environment variable:
|
||||||
|
@ -161,7 +162,6 @@ cluster/kube-down.sh
|
||||||
|
|
||||||
IaaS Provider | Config. Mgmt | OS | Networking | Docs | Conforms | Support Level
|
IaaS Provider | Config. Mgmt | OS | Networking | Docs | Conforms | Support Level
|
||||||
-------------------- | ------------ | ------------- | ---------- | --------------------------------------------- | ---------| ----------------------------
|
-------------------- | ------------ | ------------- | ---------- | --------------------------------------------- | ---------| ----------------------------
|
||||||
AWS | Saltstack | Debian/Ubuntu | k8s (VPC) | [docs](/docs/getting-started-guides/aws) | | Community ([@justinsb](https://github.com/justinsb))
|
|
||||||
AWS | kops | Debian | k8s (VPC) | [docs](https://github.com/kubernetes/kops) | | Community ([@justinsb](https://github.com/justinsb))
|
AWS | kops | Debian | k8s (VPC) | [docs](https://github.com/kubernetes/kops) | | Community ([@justinsb](https://github.com/justinsb))
|
||||||
AWS | CoreOS | CoreOS | flannel | [docs](/docs/getting-started-guides/aws) | | Community
|
AWS | CoreOS | CoreOS | flannel | [docs](/docs/getting-started-guides/aws) | | Community
|
||||||
|
|
||||||
|
|
|
@ -450,7 +450,7 @@ You can find the docs at [Kubernetes Dashboard](https://github.com/kubernetes/da
|
||||||
|
|
||||||
## Launch other Services With Calico-Kubernetes
|
## Launch other Services With Calico-Kubernetes
|
||||||
|
|
||||||
At this point, you have a fully functioning cluster running on Kubernetes with a master and two nodes networked with Calico. You can now follow any of the [standard documentation](https://github.com/kubernetes/kubernetes/tree/{{page.version}}.0/examples/) to set up other services on your cluster.
|
At this point, you have a fully functioning cluster running on Kubernetes with a master and two nodes networked with Calico. You can now follow any of the [standard documentation](https://github.com/kubernetes/kubernetes/tree/{{page.fullversion}}/examples/) to set up other services on your cluster.
|
||||||
|
|
||||||
## Connectivity to outside the cluster
|
## Connectivity to outside the cluster
|
||||||
|
|
||||||
|
|
|
@ -28,4 +28,4 @@ Explore the glossary of essential Kubernetes concepts. Some good starting points
|
||||||
|
|
||||||
## Design Docs
|
## Design Docs
|
||||||
|
|
||||||
An archive of the design docs for Kubernetes functionality. Good starting points are [Kubernetes Architecture](https://github.com/kubernetes/kubernetes/blob/{{page.version}}/docs/design/architecture.md) and [Kubernetes Design Overview](https://github.com/kubernetes/kubernetes/tree/{{page.version}}/docs/design).
|
An archive of the design docs for Kubernetes functionality. Good starting points are [Kubernetes Architecture](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/architecture.md) and [Kubernetes Design Overview](https://github.com/kubernetes/kubernetes/tree/{{page.fullversion}}/docs/design).
|
||||||
|
|
|
@ -87,7 +87,7 @@ than what you expect to use.
|
||||||
|
|
||||||
If you specify a request, a Pod is guaranteed to be able to use that much
|
If you specify a request, a Pod is guaranteed to be able to use that much
|
||||||
of the resource. See
|
of the resource. See
|
||||||
[Resource QoS](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/resource-qos.md) for the difference between resource limits and requests.
|
[Resource QoS](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/resource-qos.md) for the difference between resource limits and requests.
|
||||||
|
|
||||||
## If you don't specify limits or requests
|
## If you don't specify limits or requests
|
||||||
|
|
||||||
|
|
|
@ -29,7 +29,7 @@ When prompted, enter your Docker username and password.
|
||||||
The login process creates or updates a `config.json` file that holds an
|
The login process creates or updates a `config.json` file that holds an
|
||||||
authorization token.
|
authorization token.
|
||||||
|
|
||||||
View the `configfile.json` file:
|
View the `config.json` file:
|
||||||
|
|
||||||
cat ~/.docker/config.json
|
cat ~/.docker/config.json
|
||||||
|
|
||||||
|
|
|
@ -5,27 +5,6 @@ assignees:
|
||||||
title: Annotations
|
title: Annotations
|
||||||
---
|
---
|
||||||
|
|
||||||
We have [labels](/docs/user-guide/labels) for identifying metadata.
|
{% include user-guide-content-moved.md %}
|
||||||
|
|
||||||
It is also useful to be able to attach arbitrary non-identifying metadata, for retrieval by API clients such as tools, libraries, etc. This information may be large, may be structured or unstructured, may include characters not permitted by labels, etc. Such information would not be used for object selection and therefore doesn't belong in labels.
|
[Annotations](/docs/concepts/overview/working-with-objects/annotations/)
|
||||||
|
|
||||||
Like labels, annotations are key-value maps.
|
|
||||||
|
|
||||||
```json
|
|
||||||
"annotations": {
|
|
||||||
"key1" : "value1",
|
|
||||||
"key2" : "value2"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Possible information that could be recorded in annotations:
|
|
||||||
|
|
||||||
* fields managed by a declarative configuration layer, to distinguish them from client- and/or server-set default values and other auto-generated fields, fields set by auto-sizing/auto-scaling systems, etc., in order to facilitate merging
|
|
||||||
* build/release/image information (timestamps, release ids, git branch, PR numbers, image hashes, registry address, etc.)
|
|
||||||
* pointers to logging/monitoring/analytics/audit repos
|
|
||||||
* client library/tool information (e.g. for debugging purposes -- name, version, build info)
|
|
||||||
* other user and/or tool/system provenance info, such as URLs of related objects from other ecosystem components
|
|
||||||
* lightweight rollout tool metadata (config and/or checkpoints)
|
|
||||||
* phone/pager number(s) of person(s) responsible, or directory entry where that info could be found, such as a team website
|
|
||||||
|
|
||||||
Yes, this information could be stored in an external database or directory, but that would make it much harder to produce shared client libraries and tools for deployment, management, introspection, etc.
|
|
||||||
|
|
|
@ -5,805 +5,6 @@ assignees:
|
||||||
title: Deployments
|
title: Deployments
|
||||||
---
|
---
|
||||||
|
|
||||||
* TOC
|
{% include user-guide-content-moved.md %}
|
||||||
{:toc}
|
|
||||||
|
|
||||||
## What is a Deployment?
|
[Deployments](/docs/concepts/workloads/controllers/deployment/)
|
||||||
|
|
||||||
A _Deployment_ provides declarative updates for [Pods](/docs/user-guide/pods/) and [Replica Sets](/docs/user-guide/replicasets/) (the next-generation Replication Controller).
|
|
||||||
You only need to describe the desired state in a Deployment object, and the Deployment
|
|
||||||
controller will change the actual state to the desired state at a controlled rate for you.
|
|
||||||
You can define Deployments to create new resources, or replace existing ones
|
|
||||||
by new ones.
|
|
||||||
|
|
||||||
A typical use case is:
|
|
||||||
|
|
||||||
* Create a Deployment to bring up a Replica Set and Pods.
|
|
||||||
* Check the status of a Deployment to see if it succeeds or not.
|
|
||||||
* Later, update that Deployment to recreate the Pods (for example, to use a new image).
|
|
||||||
* Rollback to an earlier Deployment revision if the current Deployment isn't stable.
|
|
||||||
* Pause and resume a Deployment.
|
|
||||||
|
|
||||||
## Creating a Deployment
|
|
||||||
|
|
||||||
Here is an example Deployment. It creates a Replica Set to
|
|
||||||
bring up 3 nginx Pods.
|
|
||||||
|
|
||||||
{% include code.html language="yaml" file="nginx-deployment.yaml" ghlink="/docs/user-guide/nginx-deployment.yaml" %}
|
|
||||||
|
|
||||||
Run the example by downloading the example file and then running this command:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl create -f docs/user-guide/nginx-deployment.yaml --record
|
|
||||||
deployment "nginx-deployment" created
|
|
||||||
```
|
|
||||||
|
|
||||||
Setting the kubectl flag `--record` to `true` allows you to record the current command in the annotations of the resources being created or updated. This is useful for future introspection; for example, to see the commands executed in each Deployment revision.
|
|
||||||
|
|
||||||
Then running `get` immediately will give:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get deployments
|
|
||||||
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
|
|
||||||
nginx-deployment 3 0 0 0 1s
|
|
||||||
```
|
|
||||||
|
|
||||||
This indicates that the Deployment's number of desired replicas is 3 (according to deployment's `.spec.replicas`), the number of current replicas (`.status.replicas`) is 0, the number of up-to-date replicas (`.status.updatedReplicas`) is 0, and the number of available replicas (`.status.availableReplicas`) is also 0.
|
|
||||||
|
|
||||||
Running the `get` again a few seconds later, should give:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get deployments
|
|
||||||
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
|
|
||||||
nginx-deployment 3 3 3 3 18s
|
|
||||||
```
|
|
||||||
|
|
||||||
This indicates that the Deployment has created all three replicas, and all replicas are up-to-date (contains the latest pod template) and available (pod status is ready for at least Deployment's `.spec.minReadySeconds`). Running `kubectl get rs` and `kubectl get pods` will show the Replica Set (RS) and Pods created.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get rs
|
|
||||||
NAME DESIRED CURRENT READY AGE
|
|
||||||
nginx-deployment-2035384211 3 3 0 18s
|
|
||||||
```
|
|
||||||
|
|
||||||
You may notice that the name of the Replica Set is always `<the name of the Deployment>-<hash value of the pod template>`.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get pods --show-labels
|
|
||||||
NAME READY STATUS RESTARTS AGE LABELS
|
|
||||||
nginx-deployment-2035384211-7ci7o 1/1 Running 0 18s app=nginx,pod-template-hash=2035384211
|
|
||||||
nginx-deployment-2035384211-kzszj 1/1 Running 0 18s app=nginx,pod-template-hash=2035384211
|
|
||||||
nginx-deployment-2035384211-qqcnn 1/1 Running 0 18s app=nginx,pod-template-hash=2035384211
|
|
||||||
```
|
|
||||||
|
|
||||||
The created Replica Set will ensure that there are three nginx Pods at all times.
|
|
||||||
|
|
||||||
**Note:** You must specify an appropriate selector and pod template labels for a Deployment (in this case, `app=nginx`), i.e. they must not overlap with those of other controllers (including other Deployments, Replica Sets, Replication Controllers, etc.). Kubernetes won't stop you from doing that, but if you end up with multiple controllers that have overlapping selectors, those controllers will fight with each other and won't behave correctly.
|
|
||||||
|
|
||||||
|
|
||||||
## Updating a Deployment
|
|
||||||
|
|
||||||
**Note:** a Deployment's rollout is triggered if and only if the Deployment's pod template (i.e. `.spec.template`) is changed,
|
|
||||||
e.g. updating labels or container images of the template. Other updates, such as scaling the Deployment, will not trigger a rollout.
|
|
||||||
|
|
||||||
Suppose that we now want to update the nginx Pods to start using the `nginx:1.9.1` image
|
|
||||||
instead of the `nginx:1.7.9` image.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
|
|
||||||
deployment "nginx-deployment" image updated
|
|
||||||
```
|
|
||||||
|
|
||||||
Alternatively, we can `edit` the Deployment and change `.spec.template.spec.containers[0].image` from `nginx:1.7.9` to `nginx:1.9.1`:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl edit deployment/nginx-deployment
|
|
||||||
deployment "nginx-deployment" edited
|
|
||||||
```
|
|
||||||
|
|
||||||
To see its rollout status, simply run:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl rollout status deployment/nginx-deployment
|
|
||||||
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
|
|
||||||
deployment "nginx-deployment" successfully rolled out
|
|
||||||
```
|
|
||||||
|
|
||||||
After the rollout succeeds, you may want to `get` the Deployment:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get deployments
|
|
||||||
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
|
|
||||||
nginx-deployment 3 3 3 3 36s
|
|
||||||
```
|
|
||||||
|
|
||||||
The number of up-to-date replicas indicates that the Deployment has updated the replicas to the latest configuration.
|
|
||||||
The current replicas indicates the total replicas this Deployment manages, and the available replicas indicates the
|
|
||||||
number of current replicas that are available.
|
|
||||||
|
|
||||||
We can run `kubectl get rs` to see that the Deployment updated the Pods by creating a new Replica Set and scaling it up to 3 replicas, as well as scaling down the old Replica Set to 0 replicas.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get rs
|
|
||||||
NAME DESIRED CURRENT READY AGE
|
|
||||||
nginx-deployment-1564180365 3 3 0 6s
|
|
||||||
nginx-deployment-2035384211 0 0 0 36s
|
|
||||||
```
|
|
||||||
|
|
||||||
Running `get pods` should now show only the new Pods:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get pods
|
|
||||||
NAME READY STATUS RESTARTS AGE
|
|
||||||
nginx-deployment-1564180365-khku8 1/1 Running 0 14s
|
|
||||||
nginx-deployment-1564180365-nacti 1/1 Running 0 14s
|
|
||||||
nginx-deployment-1564180365-z9gth 1/1 Running 0 14s
|
|
||||||
```
|
|
||||||
|
|
||||||
Next time we want to update these Pods, we only need to update the Deployment's pod template again.
|
|
||||||
|
|
||||||
Deployment can ensure that only a certain number of Pods may be down while they are being updated. By
|
|
||||||
default, it ensures that at least 25% less than the desired number of Pods are
|
|
||||||
up (25% max unavailable).
|
|
||||||
|
|
||||||
Deployment can also ensure that only a certain number of Pods may be created above the desired number of Pods. By default, it ensures that at most 25% more than the desired number of Pods are up (25% max surge).
|
|
||||||
|
|
||||||
For example, if you look at the above Deployment closely, you will see that
|
|
||||||
it first created a new Pod, then deleted some old Pods and created new ones. It
|
|
||||||
does not kill old Pods until a sufficient number of new Pods have come up, and does not create new Pods until a sufficient number of old Pods have been killed. It makes sure that the number of available Pods is at least 2 and the number of total Pods is at most 4.
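
These bounds are configured under the Deployment's `.spec.strategy`. As a sketch (the percentages shown are the documented defaults, while the `kubectl describe` output below reports absolute values of 1):

```yaml
spec:
  strategy:
    type: RollingUpdate
    rollingUpdate:
      # At most this many Pods may be unavailable during the update.
      maxUnavailable: 25%
      # At most this many Pods may be created above the desired count.
      maxSurge: 25%
```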
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl describe deployments
|
|
||||||
Name: nginx-deployment
|
|
||||||
Namespace: default
|
|
||||||
CreationTimestamp: Tue, 15 Mar 2016 12:01:06 -0700
|
|
||||||
Labels: app=nginx
|
|
||||||
Selector: app=nginx
|
|
||||||
Replicas: 3 updated | 3 total | 3 available | 0 unavailable
|
|
||||||
StrategyType: RollingUpdate
|
|
||||||
MinReadySeconds: 0
|
|
||||||
RollingUpdateStrategy: 1 max unavailable, 1 max surge
|
|
||||||
OldReplicaSets: <none>
|
|
||||||
NewReplicaSet: nginx-deployment-1564180365 (3/3 replicas created)
|
|
||||||
Events:
|
|
||||||
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
|
|
||||||
--------- -------- ----- ---- ------------- -------- ------ -------
|
|
||||||
36s 36s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-2035384211 to 3
|
|
||||||
23s 23s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 1
|
|
||||||
23s 23s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 2
|
|
||||||
23s 23s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 2
|
|
||||||
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 0
|
|
||||||
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 3
|
|
||||||
```
|
|
||||||
|
|
||||||
Here we see that when we first created the Deployment, it created a Replica Set (nginx-deployment-2035384211) and scaled it up to 3 replicas directly.
|
|
||||||
When we updated the Deployment, it created a new Replica Set (nginx-deployment-1564180365) and scaled it up to 1 and then scaled down the old Replica Set to 2, so that at least 2 Pods were available and at most 4 Pods were created at all times.
|
|
||||||
It then continued scaling up and down the new and the old Replica Set, with the same rolling update strategy. Finally, we'll have 3 available replicas in the new Replica Set, and the old Replica Set is scaled down to 0.
|
|
||||||
|
|
||||||
### Multiple Updates
|
|
||||||
|
|
||||||
Each time a new deployment object is observed by the deployment controller, a Replica Set is
|
|
||||||
created to bring up the desired Pods if there is no existing Replica Set doing so.
|
|
||||||
Existing Replica Sets controlling Pods whose labels match `.spec.selector` but whose
|
|
||||||
template does not match `.spec.template` are scaled down.
|
|
||||||
Eventually, the new Replica Set will be scaled to `.spec.replicas` and all old Replica Sets will
|
|
||||||
be scaled to 0.
|
|
||||||
|
|
||||||
If you update a Deployment while an existing deployment is in progress,
|
|
||||||
the Deployment will create a new Replica Set as per the update and start scaling that up, and
|
|
||||||
will roll over the Replica Set that it was scaling up previously -- it will add it to its list of old Replica Sets and will
|
|
||||||
start scaling it down.
|
|
||||||
|
|
||||||
For example, suppose you create a Deployment to create 5 replicas of `nginx:1.7.9`,
|
|
||||||
but then update the Deployment to create 5 replicas of `nginx:1.9.1`, when only 3
|
|
||||||
replicas of `nginx:1.7.9` had been created. In that case, the Deployment will immediately start
|
|
||||||
killing the 3 `nginx:1.7.9` Pods that it had created, and will start creating
|
|
||||||
`nginx:1.9.1` Pods. It will not wait for 5 replicas of `nginx:1.7.9` to be created
|
|
||||||
before changing course.
|
|
||||||
|
|
||||||
## Rolling Back a Deployment
|
|
||||||
|
|
||||||
Sometimes you may want to roll back a Deployment; for example, when the Deployment is not stable, such as when it is crash looping.
|
|
||||||
By default, the two previous revisions of a Deployment's rollout history are kept in the system so that you can roll back anytime you want
|
|
||||||
(you can change that by modifying [revision history limit](/docs/user-guide/deployments/#revision-history-limit)).
|
|
||||||
|
|
||||||
**Note:** a Deployment's revision is created when a Deployment's rollout is triggered. This means that the new revision is created
|
|
||||||
if and only if the Deployment's pod template (i.e. `.spec.template`) is changed, e.g. updating labels or container images of the template.
|
|
||||||
Other updates, such as scaling the Deployment, will not create a Deployment revision -- so that we can facilitate simultaneous manual- or
|
|
||||||
auto-scaling. This implies that when you roll back to an earlier revision, only the Deployment's pod template part will be rolled back.
|
|
||||||
|
|
||||||
Suppose that we made a typo while updating the Deployment, by putting the image name as `nginx:1.91` instead of `nginx:1.9.1`:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.91
|
|
||||||
deployment "nginx-deployment" image updated
|
|
||||||
```
|
|
||||||
|
|
||||||
The rollout will be stuck.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl rollout status deployments nginx-deployment
|
|
||||||
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
|
|
||||||
```
|
|
||||||
|
|
||||||
Press Ctrl-C to stop the above rollout status watch. For more information on stuck rollouts, [read more here](#deployment-status).
|
|
||||||
|
|
||||||
You will also see that both the number of old replicas (nginx-deployment-1564180365 and nginx-deployment-2035384211) and new replicas (nginx-deployment-3066724191) are 2.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get rs
|
|
||||||
NAME DESIRED CURRENT READY AGE
|
|
||||||
nginx-deployment-1564180365 2 2 0 25s
|
|
||||||
nginx-deployment-2035384211 0 0 0 36s
|
|
||||||
nginx-deployment-3066724191 2 2 2 6s
|
|
||||||
```
|
|
||||||
|
|
||||||
Looking at the Pods created, you will see that the 2 Pods created by new Replica Set are stuck in an image pull loop.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get pods
|
|
||||||
NAME READY STATUS RESTARTS AGE
|
|
||||||
nginx-deployment-1564180365-70iae 1/1 Running 0 25s
|
|
||||||
nginx-deployment-1564180365-jbqqo 1/1 Running 0 25s
|
|
||||||
nginx-deployment-3066724191-08mng 0/1 ImagePullBackOff 0 6s
|
|
||||||
nginx-deployment-3066724191-eocby 0/1 ImagePullBackOff 0 6s
|
|
||||||
```
|
|
||||||
|
|
||||||
Note that the Deployment controller will stop the bad rollout automatically, and will stop scaling up the new Replica Set.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl describe deployment
|
|
||||||
Name: nginx-deployment
|
|
||||||
Namespace: default
|
|
||||||
CreationTimestamp: Tue, 15 Mar 2016 14:48:04 -0700
|
|
||||||
Labels: app=nginx
|
|
||||||
Selector: app=nginx
|
|
||||||
Replicas: 2 updated | 3 total | 2 available | 2 unavailable
|
|
||||||
StrategyType: RollingUpdate
|
|
||||||
MinReadySeconds: 0
|
|
||||||
RollingUpdateStrategy: 1 max unavailable, 1 max surge
|
|
||||||
OldReplicaSets: nginx-deployment-1564180365 (2/2 replicas created)
|
|
||||||
NewReplicaSet: nginx-deployment-3066724191 (2/2 replicas created)
|
|
||||||
Events:
|
|
||||||
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
|
|
||||||
--------- -------- ----- ---- ------------- -------- ------ -------
|
|
||||||
1m 1m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-2035384211 to 3
|
|
||||||
22s 22s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 1
|
|
||||||
22s 22s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 2
|
|
||||||
22s 22s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 2
|
|
||||||
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 0
|
|
||||||
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 3
|
|
||||||
13s 13s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 1
|
|
||||||
13s 13s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-1564180365 to 2
|
|
||||||
13s 13s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 2
|
|
||||||
```
|
|
||||||
|
|
||||||
To fix this, we need to rollback to a previous revision of Deployment that is stable.
|
|
||||||
|
|
||||||
### Checking Rollout History of a Deployment
|
|
||||||
|
|
||||||
First, check the revisions of this deployment:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl rollout history deployment/nginx-deployment
|
|
||||||
deployments "nginx-deployment":
|
|
||||||
REVISION CHANGE-CAUSE
|
|
||||||
1 kubectl create -f docs/user-guide/nginx-deployment.yaml --record
|
|
||||||
2 kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
|
|
||||||
3 kubectl set image deployment/nginx-deployment nginx=nginx:1.91
|
|
||||||
```
|
|
||||||
|
|
||||||
Because we recorded the command while creating this Deployment using `--record`, we can easily see the changes we made in each revision.
|
|
||||||
|
|
||||||
To further see the details of each revision, run:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl rollout history deployment/nginx-deployment --revision=2
|
|
||||||
deployments "nginx-deployment" revision 2
|
|
||||||
Labels: app=nginx
|
|
||||||
pod-template-hash=1159050644
|
|
||||||
Annotations: kubernetes.io/change-cause=kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
|
|
||||||
Containers:
|
|
||||||
nginx:
|
|
||||||
Image: nginx:1.9.1
|
|
||||||
Port: 80/TCP
|
|
||||||
QoS Tier:
|
|
||||||
cpu: BestEffort
|
|
||||||
memory: BestEffort
|
|
||||||
Environment Variables: <none>
|
|
||||||
No volumes.
|
|
||||||
```
|
|
||||||
|
|
||||||
### Rolling Back to a Previous Revision
|
|
||||||
|
|
||||||
Now we've decided to undo the current rollout and rollback to the previous revision:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl rollout undo deployment/nginx-deployment
|
|
||||||
deployment "nginx-deployment" rolled back
|
|
||||||
```
|
|
||||||
|
|
||||||
Alternatively, you can roll back to a specific revision by specifying it with `--to-revision`:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl rollout undo deployment/nginx-deployment --to-revision=2
|
|
||||||
deployment "nginx-deployment" rolled back
|
|
||||||
```
|
|
||||||
|
|
||||||
For more details about rollout related commands, read [`kubectl rollout`](/docs/user-guide/kubectl/kubectl_rollout/).
|
|
||||||
|
|
||||||
The Deployment is now rolled back to a previous stable revision. As you can see, a `DeploymentRollback` event for rolling back to revision 2 is generated by the Deployment controller.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get deployment
|
|
||||||
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
|
|
||||||
nginx-deployment 3 3 3 3 30m
|
|
||||||
|
|
||||||
$ kubectl describe deployment
|
|
||||||
Name: nginx-deployment
|
|
||||||
Namespace: default
|
|
||||||
CreationTimestamp: Tue, 15 Mar 2016 14:48:04 -0700
|
|
||||||
Labels: app=nginx
|
|
||||||
Selector: app=nginx
|
|
||||||
Replicas: 3 updated | 3 total | 3 available | 0 unavailable
|
|
||||||
StrategyType: RollingUpdate
|
|
||||||
MinReadySeconds: 0
|
|
||||||
RollingUpdateStrategy: 1 max unavailable, 1 max surge
|
|
||||||
OldReplicaSets: <none>
|
|
||||||
NewReplicaSet: nginx-deployment-1564180365 (3/3 replicas created)
|
|
||||||
Events:
|
|
||||||
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
|
|
||||||
--------- -------- ----- ---- ------------- -------- ------ -------
|
|
||||||
30m 30m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-2035384211 to 3
|
|
||||||
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 1
|
|
||||||
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 2
|
|
||||||
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 2
|
|
||||||
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 0
|
|
||||||
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 2
|
|
||||||
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 1
|
|
||||||
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-1564180365 to 2
|
|
||||||
2m 2m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-3066724191 to 0
|
|
||||||
2m 2m 1 {deployment-controller } Normal DeploymentRollback Rolled back deployment "nginx-deployment" to revision 2
|
|
||||||
29m 2m 2 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 3
|
|
||||||
```
|
|
||||||
|
|
||||||
### Clean up Policy
|
|
||||||
|
|
||||||
You can set the `.spec.revisionHistoryLimit` field to specify how much revision history of this Deployment you want to keep. By default,
|
|
||||||
all revision history will be kept; explicitly setting this field to `0` prevents the Deployment from being rolled back.
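
As a sketch, the field sits directly under the Deployment's `spec`:

```yaml
spec:
  # Keep only the two most recent old Replica Sets for rollback.
  revisionHistoryLimit: 2
```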
|
|
||||||
|
|
||||||
## Scaling a Deployment
|
|
||||||
|
|
||||||
You can scale a Deployment by using the following command:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl scale deployment nginx-deployment --replicas 10
|
|
||||||
deployment "nginx-deployment" scaled
|
|
||||||
```
|
|
||||||
|
|
||||||
Assuming [horizontal pod autoscaling](/docs/user-guide/horizontal-pod-autoscaling/walkthrough.md) is enabled
|
|
||||||
in your cluster, you can set up an autoscaler for your Deployment and choose the minimum and maximum number of
|
|
||||||
Pods you want to run based on the CPU utilization of your existing Pods.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl autoscale deployment nginx-deployment --min=10 --max=15 --cpu-percent=80
|
|
||||||
deployment "nginx-deployment" autoscaled
|
|
||||||
```
|
|
||||||
|
|
||||||
RollingUpdate Deployments support running multiple versions of an application at the same time. When you
|
|
||||||
or an autoscaler scales a RollingUpdate Deployment that is in the middle of a rollout (either in progress
|
|
||||||
or paused), then the Deployment controller will balance the additional replicas in the existing active
|
|
||||||
ReplicaSets (ReplicaSets with Pods) in order to mitigate risk. This is called *proportional scaling*.
|
|
||||||
|
|
||||||
For example, you are running a Deployment with 10 replicas, [maxSurge](#max-surge)=3, and [maxUnavailable](#max-unavailable)=2.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get deploy
|
|
||||||
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
|
|
||||||
nginx-deployment 10 10 10 10 50s
|
|
||||||
```
|
|
||||||
|
|
||||||
You update to a new image which happens to be unresolvable from inside the cluster.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl set image deploy/nginx-deployment nginx=nginx:sometag
|
|
||||||
deployment "nginx-deployment" image updated
|
|
||||||
```
|
|
||||||
|
|
||||||
The image update starts a new rollout with ReplicaSet nginx-deployment-1989198191 but it's blocked due to the
|
|
||||||
maxUnavailable requirement that we mentioned above.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get rs
|
|
||||||
NAME DESIRED CURRENT READY AGE
|
|
||||||
nginx-deployment-1989198191 5 5 0 9s
|
|
||||||
nginx-deployment-618515232 8 8 8 1m
|
|
||||||
```
|
|
||||||
|
|
||||||
Then a new scaling request for the Deployment comes along. The autoscaler increments the Deployment replicas
|
|
||||||
to 15. The Deployment controller needs to decide where to add these new 5 replicas. If we weren't using
|
|
||||||
proportional scaling, all 5 of them would be added in the new ReplicaSet. With proportional scaling, we
|
|
||||||
spread the additional replicas across all ReplicaSets. Bigger proportions go to the ReplicaSets with the
|
|
||||||
most replicas and lower proportions go to ReplicaSets with fewer replicas. Any leftovers are added to the
|
|
||||||
ReplicaSet with the most replicas. ReplicaSets with zero replicas are not scaled up.
|
|
||||||
|
|
||||||
In our example above, 3 replicas will be added to the old ReplicaSet and 2 replicas will be added to the
|
|
||||||
new ReplicaSet. The rollout process should eventually move all replicas to the new ReplicaSet, assuming
|
|
||||||
the new replicas become healthy.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get deploy
|
|
||||||
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
|
|
||||||
nginx-deployment 15 18 7 8 7m
|
|
||||||
$ kubectl get rs
|
|
||||||
NAME DESIRED CURRENT READY AGE
|
|
||||||
nginx-deployment-1989198191 7 7 0 7m
|
|
||||||
nginx-deployment-618515232 11 11 11 7m
|
|
||||||
```
|
|
||||||
|
|
||||||
## Pausing and Resuming a Deployment
|
|
||||||
|
|
||||||
You can also pause a Deployment mid-way and then resume it. A use case is to support canary deployment.
|
|
||||||
|
|
||||||
Update the Deployment again and then pause the Deployment with `kubectl rollout pause`:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1; kubectl rollout pause deployment/nginx-deployment
|
|
||||||
deployment "nginx-deployment" image updated
|
|
||||||
deployment "nginx-deployment" paused
|
|
||||||
```
|
|
||||||
|
|
||||||
Note that the Deployment will continue to operate in its current state, but new updates to the Deployment will not have any effect as long as the Deployment is paused.
|
|
||||||
|
|
||||||
The Deployment was still in progress when we paused it, so the actions of scaling up and down Replica Sets are paused too.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get rs
|
|
||||||
NAME DESIRED CURRENT READY AGE
|
|
||||||
nginx-deployment-1564180365 2 2 2 1h
|
|
||||||
nginx-deployment-2035384211 2 2 0 1h
|
|
||||||
nginx-deployment-3066724191 0 0 0 1h
|
|
||||||
```
|
|
||||||
|
|
||||||
In a separate terminal, watch for rollout status changes and you'll see the rollout won't continue:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl rollout status deployment/nginx-deployment
|
|
||||||
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
|
|
||||||
```
|
|
||||||
|
|
||||||
To resume the Deployment, simply do `kubectl rollout resume`:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl rollout resume deployment/nginx-deployment
|
|
||||||
deployment "nginx-deployment" resumed
|
|
||||||
```
|
|
||||||
|
|
||||||
Then the Deployment will continue and finish the rollout:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl rollout status deployment/nginx-deployment
|
|
||||||
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
|
|
||||||
Waiting for deployment spec update to be observed...
|
|
||||||
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
|
|
||||||
deployment nginx-deployment successfully rolled out
|
|
||||||
```
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get rs
|
|
||||||
NAME DESIRED CURRENT READY AGE
|
|
||||||
nginx-deployment-1564180365 3 3 3 1h
|
|
||||||
nginx-deployment-2035384211 0 0 0 1h
|
|
||||||
nginx-deployment-3066724191 0 0 0 1h
|
|
||||||
```
|
|
||||||
|
|
||||||
Note: You cannot roll back a paused Deployment until you resume it.
|
|
||||||
|
|
||||||
|
|
||||||
## Deployment status
|
|
||||||
|
|
||||||
A Deployment enters various states during its lifecycle. It can be [progressing](#progressing-deployment) while rolling out a new ReplicaSet,
|
|
||||||
it can be [complete](#complete-deployment), or it can [fail to progress](#failed-deployment).
|
|
||||||
|
|
||||||
### Progressing Deployment
|
|
||||||
|
|
||||||
Kubernetes marks a Deployment as _progressing_ when one of the following tasks is performed:
|
|
||||||
|
|
||||||
* The Deployment is in the process of creating a new ReplicaSet.
|
|
||||||
* The Deployment is scaling up an existing ReplicaSet.
|
|
||||||
* The Deployment is scaling down an existing ReplicaSet.
|
|
||||||
* New pods become available.
|
|
||||||
|
|
||||||
You can monitor the progress for a Deployment by using `kubectl rollout status`.
|
|
||||||
|
|
||||||
### Complete Deployment
|
|
||||||
|
|
||||||
Kubernetes marks a Deployment as _complete_ when it has the following characteristics:
|
|
||||||
|
|
||||||
* The Deployment has minimum availability. Minimum availability means that the Deployment's number of available replicas
|
|
||||||
equals or exceeds the number required by the Deployment strategy.
|
|
||||||
* All of the replicas associated with the Deployment have been updated to the latest version you've specified, meaning any
|
|
||||||
updates you've requested have been completed.
|
|
||||||
* No old pods for the Deployment are running.
|
|
||||||
|
|
||||||
You can check if a Deployment has completed by using `kubectl rollout status`. If the rollout completed successfully, `kubectl rollout status` returns a zero exit code.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl rollout status deploy/nginx
|
|
||||||
Waiting for rollout to finish: 2 of 3 updated replicas are available...
|
|
||||||
deployment "nginx" successfully rolled out
|
|
||||||
$ echo $?
|
|
||||||
0
|
|
||||||
```
|
|
||||||
|
|
||||||
### Failed Deployment
|
|
||||||
|
|
||||||
Your Deployment may get stuck trying to deploy its newest ReplicaSet without ever completing. This can occur due to some of the following factors:
|
|
||||||
|
|
||||||
* Insufficient quota
|
|
||||||
* Readiness probe failures
|
|
||||||
* Image pull errors
|
|
||||||
* Insufficient permissions
|
|
||||||
* Limit ranges
|
|
||||||
* Application runtime misconfiguration
|
|
||||||
|
|
||||||
One way you can detect this condition is to specify a deadline parameter in your Deployment spec: ([`spec.progressDeadlineSeconds`](#progress-deadline-seconds)). `spec.progressDeadlineSeconds` denotes the number of seconds the Deployment controller waits before indicating (via the Deployment status) that the Deployment progress has stalled.
|
|
||||||
|
|
||||||
The following `kubectl` command sets the spec with `progressDeadlineSeconds` to make the controller report lack of progress for a Deployment after 10 minutes:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl patch deployment/nginx-deployment -p '{"spec":{"progressDeadlineSeconds":600}}'
|
|
||||||
"nginx-deployment" patched
|
|
||||||
```
|
|
||||||
Once the deadline has been exceeded, the Deployment controller adds a DeploymentCondition with the following attributes to
|
|
||||||
the Deployment's `status.conditions`:
|
|
||||||
|
|
||||||
* Type=Progressing
|
|
||||||
* Status=False
|
|
||||||
* Reason=ProgressDeadlineExceeded
|
|
||||||
|
|
||||||
See the [Kubernetes API conventions](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#typical-status-properties) for more information on status conditions.
|
|
||||||
|
|
||||||
Note that in version 1.5, Kubernetes will take no action on a stalled Deployment other than to report a status condition with
|
|
||||||
`Reason=ProgressDeadlineExceeded`.
|
|
||||||
|
|
||||||
**Note:** If you pause a Deployment, Kubernetes does not check progress against your specified deadline. You can safely pause a Deployment in the middle of a rollout and resume without triggering the condition for exceeding the deadline.
|
|
||||||
|
|
||||||
You may experience transient errors with your Deployments, either due to a low timeout that you have set or due to any other kind
|
|
||||||
of error that can be treated as transient. For example, let's suppose you have insufficient quota. If you describe the Deployment
|
|
||||||
you will notice the following section:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl describe deployment nginx-deployment
|
|
||||||
<...>
|
|
||||||
Conditions:
|
|
||||||
Type Status Reason
|
|
||||||
---- ------ ------
|
|
||||||
Available True MinimumReplicasAvailable
|
|
||||||
Progressing True ReplicaSetUpdated
|
|
||||||
ReplicaFailure True FailedCreate
|
|
||||||
<...>
|
|
||||||
```
|
|
||||||
|
|
||||||
If you run `kubectl get deployment nginx-deployment -o yaml`, the Deployment status might look like this:
|
|
||||||
|
|
||||||
```
|
|
||||||
status:
|
|
||||||
availableReplicas: 2
|
|
||||||
conditions:
|
|
||||||
- lastTransitionTime: 2016-10-04T12:25:39Z
|
|
||||||
lastUpdateTime: 2016-10-04T12:25:39Z
|
|
||||||
message: Replica set "nginx-deployment-4262182780" is progressing.
|
|
||||||
reason: ReplicaSetUpdated
|
|
||||||
status: "True"
|
|
||||||
type: Progressing
|
|
||||||
- lastTransitionTime: 2016-10-04T12:25:42Z
|
|
||||||
lastUpdateTime: 2016-10-04T12:25:42Z
|
|
||||||
message: Deployment has minimum availability.
|
|
||||||
reason: MinimumReplicasAvailable
|
|
||||||
status: "True"
|
|
||||||
type: Available
|
|
||||||
- lastTransitionTime: 2016-10-04T12:25:39Z
|
|
||||||
lastUpdateTime: 2016-10-04T12:25:39Z
|
|
||||||
message: 'Error creating: pods "nginx-deployment-4262182780-" is forbidden: exceeded quota:
|
|
||||||
object-counts, requested: pods=1, used: pods=3, limited: pods=2'
|
|
||||||
reason: FailedCreate
|
|
||||||
status: "True"
|
|
||||||
type: ReplicaFailure
|
|
||||||
observedGeneration: 3
|
|
||||||
replicas: 2
|
|
||||||
unavailableReplicas: 2
|
|
||||||
```
|
|
||||||
|
|
||||||
Eventually, once the Deployment progress deadline is exceeded, Kubernetes updates the status and the reason for the Progressing condition:
|
|
||||||
|
|
||||||
```
|
|
||||||
Conditions:
|
|
||||||
Type Status Reason
|
|
||||||
---- ------ ------
|
|
||||||
Available True MinimumReplicasAvailable
|
|
||||||
Progressing False ProgressDeadlineExceeded
|
|
||||||
ReplicaFailure True FailedCreate
|
|
||||||
```
|
|
||||||
|
|
||||||
You can address an issue of insufficient quota by scaling down your Deployment, by scaling down other controllers you may be running,
|
|
||||||
or by increasing quota in your namespace. If you satisfy the quota conditions and the Deployment controller then completes the Deployment
|
|
||||||
rollout, you'll see the Deployment's status update with a successful condition (`Status=True` and `Reason=NewReplicaSetAvailable`).
|
|
||||||
|
|
||||||
```
|
|
||||||
Conditions:
|
|
||||||
Type Status Reason
|
|
||||||
---- ------ ------
|
|
||||||
Available True MinimumReplicasAvailable
|
|
||||||
Progressing True NewReplicaSetAvailable
|
|
||||||
```
|
|
||||||
|
|
||||||
`Type=Available` with `Status=True` means that your Deployment has minimum availability. Minimum availability is dictated
|
|
||||||
by the parameters specified in the deployment strategy. `Type=Progressing` with `Status=True` means that your Deployment
|
|
||||||
is either in the middle of a rollout and it is progressing or that it has successfully completed its progress and the minimum
|
|
||||||
required new replicas are available (see the Reason of the condition for the particulars - in our case
|
|
||||||
`Reason=NewReplicaSetAvailable` means that the Deployment is complete).
|
|
||||||
|
|
||||||
You can check if a Deployment has failed to progress by using `kubectl rollout status`. `kubectl rollout status` returns a non-zero exit code if the Deployment has exceeded the progression deadline.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl rollout status deploy/nginx
|
|
||||||
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
|
|
||||||
error: deployment "nginx" exceeded its progress deadline
|
|
||||||
$ echo $?
|
|
||||||
1
|
|
||||||
```
|
|
||||||
|
|
||||||
### Operating on a failed deployment
|
|
||||||
|
|
||||||
All actions that apply to a complete Deployment also apply to a failed Deployment. You can scale it up/down, roll back
|
|
||||||
to a previous revision, or even pause it if you need to apply multiple tweaks in the Deployment pod template.
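
For example (using the nginx-deployment from earlier; these are the ordinary commands, not special recovery operations, and the exact output may differ), a sketch:

```shell
# Scale the stuck Deployment down while you investigate.
$ kubectl scale deployment/nginx-deployment --replicas=5
deployment "nginx-deployment" scaled

# Or roll back to the previous revision.
$ kubectl rollout undo deployment/nginx-deployment
deployment "nginx-deployment" rolled back
```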
|
|
||||||
|
|
||||||
## Use Cases
|
|
||||||
|
|
||||||
### Canary Deployment
|
|
||||||
|
|
||||||
If you want to roll out releases to a subset of users or servers using the Deployment, you can create multiple Deployments, one for each release,
|
|
||||||
following the canary pattern described in [managing resources](/docs/concepts/cluster-administration/manage-deployment/#canary-deployments).
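
As a rough sketch of that pattern (the `track` label convention and the image tags are assumptions taken from the linked page, not from this one, and the `apiVersion` is whichever API group your cluster serves Deployments under), you might run a stable and a canary Deployment whose Pods are selected by the same Service:

```yaml
# Stable track: most replicas run the current release.
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: frontend-stable
spec:
  replicas: 3
  template:
    metadata:
      labels:
        app: guestbook
        track: stable
    spec:
      containers:
      - name: frontend
        image: gb-frontend:v3
---
# Canary track: a single replica runs the new release for a subset of traffic.
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: frontend-canary
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: guestbook
        track: canary
    spec:
      containers:
      - name: frontend
        image: gb-frontend:v4
```

A Service selecting only `app: guestbook` would then spread traffic across both tracks.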
|
|
||||||
|
|
||||||
## Writing a Deployment Spec
|
|
||||||
|
|
||||||
As with all other Kubernetes configs, a Deployment needs `apiVersion`, `kind`, and
|
|
||||||
`metadata` fields. For general information about working with config files,
|
|
||||||
see [deploying applications](/docs/user-guide/deploying-applications), [configuring containers](/docs/user-guide/configuring-containers), and [using kubectl to manage resources](/docs/user-guide/working-with-resources) documents.
|
|
||||||
|
|
||||||
A Deployment also needs a [`.spec` section](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status).
|
|
||||||
|
|
||||||
### Pod Template
|
|
||||||
|
|
||||||
The `.spec.template` is the only required field of the `.spec`.
|
|
||||||
|
|
||||||
The `.spec.template` is a [pod template](/docs/user-guide/replication-controller/#pod-template). It has exactly
|
|
||||||
the same schema as a [Pod](/docs/user-guide/pods), except it is nested and does not have an
|
|
||||||
`apiVersion` or `kind`.
|
|
||||||
|
|
||||||
In addition to required fields for a Pod, a pod template in a Deployment must specify appropriate
|
|
||||||
labels (i.e., they should not overlap with the labels of other controllers; see [selector](#selector)) and an appropriate restart policy.
|
|
||||||
|
|
||||||
Only a [`.spec.template.spec.restartPolicy`](/docs/user-guide/pod-states/) equal to `Always` is allowed, which is the default
|
|
||||||
if not specified.
|
|
||||||
|
|
||||||
### Replicas
|
|
||||||
|
|
||||||
`.spec.replicas` is an optional field that specifies the number of desired Pods. It defaults
|
|
||||||
to 1.
|
|
||||||
|
|
||||||
### Selector
|
|
||||||
|
|
||||||
`.spec.selector` is an optional field that specifies a [label selector](/docs/user-guide/labels/#label-selectors) for the Pods
|
|
||||||
targeted by this deployment.
|
|
||||||
|
|
||||||
If specified, `.spec.selector` must match `.spec.template.metadata.labels`, or it will
|
|
||||||
be rejected by the API. If `.spec.selector` is unspecified, `.spec.selector.matchLabels` will be defaulted to
|
|
||||||
`.spec.template.metadata.labels`.
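
For illustration (the `app: nginx` label and image tag below are placeholders, not values from this page), a fragment in which the selector and the pod template labels agree might look like:

```yaml
spec:
  selector:
    matchLabels:
      app: nginx        # must match the labels in .spec.template.metadata.labels
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: nginx:1.7.9
```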
|
|
||||||
|
|
||||||
A Deployment may kill Pods whose labels match the selector, in the case that their
|
|
||||||
template is different from `.spec.template` or if the total number of such Pods
|
|
||||||
exceeds `.spec.replicas`. It will bring up new Pods with `.spec.template` if
|
|
||||||
the number of Pods is less than the desired number.
|
|
||||||
|
|
||||||
Note that you should not create other pods whose labels match this selector, either directly, via another Deployment or via another controller such as Replica Sets or Replication Controllers. Otherwise, the Deployment will think that those pods were created by it. Kubernetes will not stop you from doing this.
|
|
||||||
|
|
||||||
If you have multiple controllers with overlapping selectors, the controllers will fight with each other and won't behave correctly.
|
|
||||||
|
|
||||||
### Strategy
|
|
||||||
|
|
||||||
`.spec.strategy` specifies the strategy used to replace old Pods by new ones.
|
|
||||||
`.spec.strategy.type` can be "Recreate" or "RollingUpdate". "RollingUpdate" is
|
|
||||||
the default value.
|
|
||||||
|
|
||||||
#### Recreate Deployment
|
|
||||||
|
|
||||||
All existing Pods are killed before new ones are created when
|
|
||||||
`.spec.strategy.type==Recreate`.
|
|
||||||
|
|
||||||
#### Rolling Update Deployment
|
|
||||||
|
|
||||||
The Deployment updates Pods in a [rolling update](/docs/tasks/run-application/rolling-update-replication-controller/) fashion
|
|
||||||
when `.spec.strategy.type==RollingUpdate`.
|
|
||||||
You can specify `maxUnavailable` and `maxSurge` to control
|
|
||||||
the rolling update process.
|
|
||||||
|
|
||||||
##### Max Unavailable
|
|
||||||
|
|
||||||
`.spec.strategy.rollingUpdate.maxUnavailable` is an optional field that specifies the
|
|
||||||
maximum number of Pods that can be unavailable during the update process.
|
|
||||||
The value can be an absolute number (e.g. 5) or a percentage of desired Pods
|
|
||||||
(e.g. 10%).
|
|
||||||
The absolute number is calculated from the percentage by rounding down.
|
|
||||||
This cannot be 0 if `.spec.strategy.rollingUpdate.maxSurge` is 0.
|
|
||||||
By default, a fixed value of 1 is used.
|
|
||||||
|
|
||||||
For example, when this value is set to 30%, the old Replica Set can be scaled down to
|
|
||||||
70% of desired Pods immediately when the rolling update starts. Once new Pods are
|
|
||||||
ready, the old Replica Set can be scaled down further, followed by scaling up the new Replica Set,
|
|
||||||
ensuring that the total number of Pods available at all times during the
|
|
||||||
update is at least 70% of the desired Pods.
|
|
||||||
|
|
||||||
##### Max Surge
|
|
||||||
|
|
||||||
`.spec.strategy.rollingUpdate.maxSurge` is an optional field that specifies the
|
|
||||||
maximum number of Pods that can be created above the desired number of Pods.
|
|
||||||
Value can be an absolute number (e.g. 5) or a percentage of desired Pods
|
|
||||||
(e.g. 10%).
|
|
||||||
This cannot be 0 if `.spec.strategy.rollingUpdate.maxUnavailable` is 0.
|
|
||||||
The absolute number is calculated from percentage by rounding up.
|
|
||||||
By default, a value of 1 is used.
|
|
||||||
|
|
||||||
For example, when this value is set to 30%, the new Replica Set can be scaled up immediately when
|
|
||||||
the rolling update starts, such that the total number of old and new Pods does not exceed
|
|
||||||
130% of desired Pods. Once old Pods have been killed,
|
|
||||||
the new Replica Set can be scaled up further, ensuring that the total number of Pods running
|
|
||||||
at any time during the update is at most 130% of desired Pods.
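
Putting both fields together, a `RollingUpdate` strategy block might look like the following sketch (the 25% and 30% values are only illustrative):

```yaml
spec:
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%   # at most 25% of the desired Pods may be unavailable during the update
      maxSurge: 30%         # at most 30% extra Pods may be created above the desired count
```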
|
|
||||||
|
|
||||||
### Progress Deadline Seconds
|
|
||||||
|
|
||||||
`.spec.progressDeadlineSeconds` is an optional field that specifies the number of seconds you want
|
|
||||||
to wait for your Deployment to progress before the system reports back that the Deployment has
|
|
||||||
[failed progressing](#failed-deployment) - surfaced as a condition with `Type=Progressing`, `Status=False`,
|
|
||||||
and `Reason=ProgressDeadlineExceeded` in the status of the resource. The Deployment controller will keep
|
|
||||||
retrying the Deployment. In the future, once automatic rollback is implemented, the Deployment
|
|
||||||
controller will roll back a Deployment as soon as it observes such a condition.
|
|
||||||
|
|
||||||
If specified, this field needs to be greater than `.spec.minReadySeconds`.
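
Besides the `kubectl patch` command shown earlier, you can set the deadline declaratively in the Deployment spec; a minimal sketch (600 is just an example value):

```yaml
spec:
  progressDeadlineSeconds: 600   # report lack of progress after 10 minutes
```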
|
|
||||||
|
|
||||||
### Min Ready Seconds
|
|
||||||
|
|
||||||
`.spec.minReadySeconds` is an optional field that specifies the
|
|
||||||
minimum number of seconds for which a newly created Pod should be ready
|
|
||||||
without any of its containers crashing, for it to be considered available.
|
|
||||||
This defaults to 0 (the Pod will be considered available as soon as it is ready).
|
|
||||||
To learn more about when a Pod is considered ready, see [Container Probes](/docs/user-guide/pod-states/#container-probes).
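
For instance, a Deployment that requires 10 seconds of sustained readiness before counting a Pod as available might combine `minReadySeconds` with a readiness probe, roughly like this fragment (the probe path, port, and timings are assumptions):

```yaml
spec:
  minReadySeconds: 10
  template:
    spec:
      containers:
      - name: nginx
        image: nginx:1.7.9
        readinessProbe:
          httpGet:
            path: /
            port: 80
          periodSeconds: 5
```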
|
|
||||||
|
|
||||||
### Rollback To
|
|
||||||
|
|
||||||
`.spec.rollbackTo` is an optional field with the configuration the Deployment is rolling back to. Setting this field will trigger a rollback, and this field will be cleared every time a rollback is done.
|
|
||||||
|
|
||||||
#### Revision
|
|
||||||
|
|
||||||
`.spec.rollbackTo.revision` is an optional field specifying the revision to roll back to. This defaults to 0, meaning roll back to the last revision in history.
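
In practice you usually trigger a rollback with `kubectl rollout undo`, which sets this field for you; a sketch (the revision number is illustrative and the exact output may differ):

```shell
$ kubectl rollout undo deployment/nginx-deployment --to-revision=2
deployment "nginx-deployment" rolled back
```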
|
|
||||||
|
|
||||||
### Revision History Limit
|
|
||||||
|
|
||||||
A deployment's revision history is stored in the replica sets it controls.
|
|
||||||
|
|
||||||
`.spec.revisionHistoryLimit` is an optional field that specifies the number of old Replica Sets to retain to allow rollback. Its ideal value depends on the frequency and stability of new deployments. If this field is not set, all old Replica Sets will be kept by default, consuming resources in `etcd` and crowding the output of `kubectl get rs`. The configuration of each Deployment revision is stored in its Replica Sets; therefore, once an old Replica Set is deleted, you lose the ability to roll back to that revision of the Deployment.
|
|
||||||
|
|
||||||
More specifically, setting this field to zero means that all old Replica Sets with 0 replicas will be cleaned up.
|
|
||||||
In this case, a new deployment rollout cannot be undone, since its revision history is cleaned up.
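
A sketch of setting the limit explicitly (the value 10 is only an example):

```yaml
spec:
  revisionHistoryLimit: 10   # keep the ten most recent old Replica Sets for rollback
```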
|
|
||||||
|
|
||||||
### Paused
|
|
||||||
|
|
||||||
`.spec.paused` is an optional boolean field for pausing and resuming a Deployment. It defaults to false (a Deployment is not paused).
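
Besides `kubectl rollout pause` and `kubectl rollout resume` shown earlier, you can set the field directly; a sketch using `kubectl patch` (the exact output may differ):

```shell
$ kubectl patch deployment/nginx-deployment -p '{"spec":{"paused":true}}'
"nginx-deployment" patched
```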
|
|
||||||
|
|
||||||
## Alternative to Deployments
|
|
||||||
|
|
||||||
### kubectl rolling update
|
|
||||||
|
|
||||||
[Kubectl rolling update](/docs/user-guide/kubectl/kubectl_rolling-update) updates Pods and Replication Controllers in a similar fashion.
|
|
||||||
But Deployments are recommended, since they are declarative, server-side, and have additional features, such as rolling back to any previous revision even after the rolling update is done.
|
|
||||||
|
|
|
@ -5,14 +5,6 @@ assignees:
|
||||||
title: Names
|
title: Names
|
||||||
---
|
---
|
||||||
|
|
||||||
All objects in the Kubernetes REST API are unambiguously identified by a Name and a UID.
|
{% include user-guide-content-moved.md %}
|
||||||
|
|
||||||
For non-unique user-provided attributes, Kubernetes provides [labels](/docs/user-guide/labels) and [annotations](/docs/user-guide/annotations).
|
[Names](/docs/concepts/overview/working-with-objects/names/)
|
||||||
|
|
||||||
## Names
|
|
||||||
|
|
||||||
Names are generally client-provided. Only one object of a given kind can have a given name at a time (i.e., they are spatially unique). But if you delete an object, you can make a new object with the same name. Names are used to refer to an object in a resource URL, such as `/api/v1/pods/some-name`. By convention, the names of Kubernetes resources should be up to maximum length of 253 characters and consist of lower case alphanumeric characters, `-`, and `.`, but certain resources have more specific restrictions. See the [identifiers design doc](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/identifiers.md) for the precise syntax rules for names.
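
For example (the resource and name below are purely illustrative), a valid name appears in the object's `metadata`:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: nginx-demo   # lower case alphanumerics, '-' and '.', at most 253 characters
```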
|
|
||||||
|
|
||||||
## UIDs
|
|
||||||
|
|
||||||
UID are generated by Kubernetes. Every object created over the whole lifetime of a Kubernetes cluster has a distinct UID (i.e., they are spatially and temporally unique).
|
|
||||||
|
|
|
@ -4,291 +4,6 @@ assignees:
|
||||||
title: Ingress Resources
|
title: Ingress Resources
|
||||||
---
|
---
|
||||||
|
|
||||||
* TOC
|
{% include user-guide-content-moved.md %}
|
||||||
{:toc}
|
|
||||||
|
|
||||||
__Terminology__
|
[Ingress Resources](/docs/concepts/services-networking/ingress/)
|
||||||
|
|
||||||
Throughout this doc you will see a few terms that are sometimes used interchangeably elsewhere and that might cause confusion. This section attempts to clarify them.
|
|
||||||
|
|
||||||
* Node: A single virtual or physical machine in a Kubernetes cluster.
|
|
||||||
* Cluster: A group of nodes firewalled from the internet, that are the primary compute resources managed by Kubernetes.
|
|
||||||
* Edge router: A router that enforces the firewall policy for your cluster. This could be a gateway managed by a cloudprovider or a physical piece of hardware.
|
|
||||||
* Cluster network: A set of links, logical or physical, that facilitate communication within a cluster according to the [Kubernetes networking model](/docs/admin/networking/). Examples of a Cluster network include Overlays such as [flannel](https://github.com/coreos/flannel#flannel) or SDNs such as [OVS](/docs/admin/ovs-networking/).
|
|
||||||
* Service: A Kubernetes [Service](/docs/user-guide/services/) that identifies a set of pods using label selectors. Unless mentioned otherwise, Services are assumed to have virtual IPs only routable within the cluster network.
|
|
||||||
|
|
||||||
## What is Ingress?
|
|
||||||
|
|
||||||
Typically, services and pods have IPs only routable by the cluster network. All traffic that ends up at an edge router is either dropped or forwarded elsewhere. Conceptually, this might look like:
|
|
||||||
|
|
||||||
```
|
|
||||||
internet
|
|
||||||
|
|
|
||||||
------------
|
|
||||||
[ Services ]
|
|
||||||
```
|
|
||||||
|
|
||||||
An Ingress is a collection of rules that allow inbound connections to reach the cluster services.
|
|
||||||
|
|
||||||
```
|
|
||||||
internet
|
|
||||||
|
|
|
||||||
[ Ingress ]
|
|
||||||
--|-----|--
|
|
||||||
[ Services ]
|
|
||||||
```
|
|
||||||
|
|
||||||
It can be configured to give services externally-reachable urls, load balance traffic, terminate SSL, offer name based virtual hosting etc. Users request ingress by POSTing the Ingress resource to the API server. An [Ingress controller](#ingress-controllers) is responsible for fulfilling the Ingress, usually with a loadbalancer, though it may also configure your edge router or additional frontends to help handle the traffic in an HA manner.
|
|
||||||
|
|
||||||
## Prerequisites
|
|
||||||
|
|
||||||
Before you start using the Ingress resource, there are a few things you should understand. The Ingress is a beta resource, not available in any Kubernetes release prior to 1.1. You need an Ingress controller to satisfy an Ingress; simply creating the resource will have no effect.
|
|
||||||
|
|
||||||
GCE/GKE deploys an ingress controller on the master. You can deploy any number of custom ingress controllers in a pod. You must annotate each ingress with the appropriate class, as indicated [here](https://github.com/kubernetes/ingress/tree/master/controllers/nginx#running-multiple-ingress-controllers) and [here](https://github.com/kubernetes/ingress/blob/master/controllers/gce/BETA_LIMITATIONS.md#disabling-glbc).
|
|
||||||
|
|
||||||
Make sure you review the [beta limitations](https://github.com/kubernetes/ingress/blob/master/controllers/gce/BETA_LIMITATIONS.md) of this controller. In environments other than GCE/GKE, you need to [deploy a controller](https://github.com/kubernetes/ingress/tree/master/controllers) as a pod.
|
|
||||||
|
|
||||||
## The Ingress Resource
|
|
||||||
|
|
||||||
A minimal Ingress might look like:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: extensions/v1beta1
|
|
||||||
kind: Ingress
|
|
||||||
metadata:
|
|
||||||
name: test-ingress
|
|
||||||
spec:
|
|
||||||
rules:
|
|
||||||
- http:
|
|
||||||
paths:
|
|
||||||
- path: /testpath
|
|
||||||
backend:
|
|
||||||
serviceName: test
|
|
||||||
servicePort: 80
|
|
||||||
```
|
|
||||||
|
|
||||||
*POSTing this to the API server will have no effect if you have not configured an [Ingress controller](#ingress-controllers).*
|
|
||||||
|
|
||||||
__Lines 1-4__: As with all other Kubernetes config, an Ingress needs `apiVersion`, `kind`, and `metadata` fields. For general information about working with config files, see [here](/docs/user-guide/deploying-applications), [here](/docs/user-guide/configuring-containers), and [here](/docs/user-guide/working-with-resources).
|
|
||||||
|
|
||||||
__Lines 5-7__: Ingress [spec](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status) has all the information needed to configure a loadbalancer or proxy server. Most importantly, it contains a list of rules matched against all incoming requests. Currently the Ingress resource only supports http rules.
|
|
||||||
|
|
||||||
__Lines 8-9__: Each http rule contains the following information: A host (e.g.: foo.bar.com, defaults to * in this example), a list of paths (e.g.: /testpath) each of which has an associated backend (test:80). Both the host and path must match the content of an incoming request before the loadbalancer directs traffic to the backend.
|
|
||||||
|
|
||||||
__Lines 10-12__: A backend is a service:port combination as described in the [services doc](/docs/user-guide/services). Ingress traffic is typically sent directly to the endpoints matching a backend.
|
|
||||||
|
|
||||||
__Global Parameters__: For the sake of simplicity the example Ingress has no global parameters, see the [api-reference](https://releases.k8s.io/{{page.githubbranch}}/pkg/apis/extensions/v1beta1/types.go) for a full definition of the resource. One can specify a global default backend in the absence of which requests that don't match a path in the spec are sent to the default backend of the Ingress controller.
|
|
||||||
|
|
||||||
## Ingress controllers
|
|
||||||
|
|
||||||
In order for the Ingress resource to work, the cluster must have an Ingress controller running. This is unlike other types of controllers, which typically run as part of the `kube-controller-manager` binary, and which are typically started automatically as part of cluster creation. You need to choose the ingress controller implementation that is the best fit for your cluster, or implement one. Examples and instructions can be found [here](https://github.com/kubernetes/ingress/tree/master/controllers).
|
|
||||||
|
|
||||||
## Before you begin
|
|
||||||
|
|
||||||
The following document describes a set of cross platform features exposed through the Ingress resource. Ideally, all Ingress controllers should fulfill this specification, but we're not there yet. The docs for the GCE and nginx controllers are [here](https://github.com/kubernetes/ingress/blob/master/controllers/gce/README.md) and [here](https://github.com/kubernetes/ingress/blob/master/controllers/nginx/README.md) respectively. **Make sure you review controller specific docs so you understand the caveats of each one**.
|
|
||||||
|
|
||||||
## Types of Ingress
|
|
||||||
|
|
||||||
### Single Service Ingress
|
|
||||||
|
|
||||||
There are existing Kubernetes concepts that allow you to expose a single service (see [alternatives](#alternatives)), however you can do so through an Ingress as well, by specifying a *default backend* with no rules.
|
|
||||||
|
|
||||||
{% include code.html language="yaml" file="ingress.yaml" ghlink="/docs/user-guide/ingress.yaml" %}
|
|
||||||
|
|
||||||
If you create it using `kubectl create -f` you should see:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get ing
|
|
||||||
NAME RULE BACKEND ADDRESS
|
|
||||||
test-ingress - testsvc:80 107.178.254.228
|
|
||||||
```
|
|
||||||
|
|
||||||
Where `107.178.254.228` is the IP allocated by the Ingress controller to satisfy this Ingress. The `RULE` column shows that all traffic sent to the IP is directed to the Kubernetes Service listed under `BACKEND`.
|
|
||||||
|
|
||||||
### Simple fanout
|
|
||||||
|
|
||||||
As described previously, pods within Kubernetes have IPs only visible on the cluster network, so we need something at the edge accepting ingress traffic and proxying it to the right endpoints. This component is usually a highly available loadbalancer. An Ingress allows you to keep the number of loadbalancers down to a minimum, for example, a setup like:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
foo.bar.com -> 178.91.123.132 -> / foo s1:80
|
|
||||||
/ bar s2:80
|
|
||||||
```
|
|
||||||
|
|
||||||
would require an Ingress such as:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: extensions/v1beta1
|
|
||||||
kind: Ingress
|
|
||||||
metadata:
|
|
||||||
name: test
|
|
||||||
spec:
|
|
||||||
rules:
|
|
||||||
- host: foo.bar.com
|
|
||||||
http:
|
|
||||||
paths:
|
|
||||||
- path: /foo
|
|
||||||
backend:
|
|
||||||
serviceName: s1
|
|
||||||
servicePort: 80
|
|
||||||
- path: /bar
|
|
||||||
backend:
|
|
||||||
serviceName: s2
|
|
||||||
servicePort: 80
|
|
||||||
```
|
|
||||||
|
|
||||||
When you create the Ingress with `kubectl create -f`:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get ing
|
|
||||||
NAME RULE BACKEND ADDRESS
|
|
||||||
test -
|
|
||||||
foo.bar.com
|
|
||||||
/foo s1:80
|
|
||||||
/bar s2:80
|
|
||||||
```
|
|
||||||
The Ingress controller will provision an implementation-specific loadbalancer that satisfies the Ingress, as long as the services (s1, s2) exist. When it has done so, you will see the address of the loadbalancer under the last column of the Ingress.
|
|
||||||
|
|
||||||
### Name based virtual hosting
|
|
||||||
|
|
||||||
Name-based virtual hosts use multiple host names for the same IP address.
|
|
||||||
|
|
||||||
```
|
|
||||||
foo.bar.com --| |-> foo.bar.com s1:80
|
|
||||||
| 178.91.123.132 |
|
|
||||||
bar.foo.com --| |-> bar.foo.com s2:80
|
|
||||||
```
|
|
||||||
|
|
||||||
The following Ingress tells the backing loadbalancer to route requests based on the [Host header](https://tools.ietf.org/html/rfc7230#section-5.4).
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: extensions/v1beta1
|
|
||||||
kind: Ingress
|
|
||||||
metadata:
|
|
||||||
name: test
|
|
||||||
spec:
|
|
||||||
rules:
|
|
||||||
- host: foo.bar.com
|
|
||||||
http:
|
|
||||||
paths:
|
|
||||||
- backend:
|
|
||||||
serviceName: s1
|
|
||||||
servicePort: 80
|
|
||||||
- host: bar.foo.com
|
|
||||||
http:
|
|
||||||
paths:
|
|
||||||
- backend:
|
|
||||||
serviceName: s2
|
|
||||||
servicePort: 80
|
|
||||||
```
|
|
||||||
|
|
||||||
__Default Backends__: An Ingress with no rules, like the one shown in the previous section, sends all traffic to a single default backend. You can use the same technique to tell a loadbalancer where to find your website's 404 page, by specifying a set of rules *and* a default backend. Traffic is routed to your default backend if none of the Hosts in your Ingress match the Host in the request header, and/or none of the paths match the url of the request.
|
|
||||||
|
|
||||||
### TLS
|
|
||||||
|
|
||||||
You can secure an Ingress by specifying a [secret](/docs/user-guide/secrets) that contains a TLS private key and certificate. Currently the Ingress only supports a single TLS port, 443, and assumes TLS termination. If the TLS configuration section in an Ingress specifies different hosts, they will be multiplexed on the same port according to the hostname specified through the SNI TLS extension (provided the Ingress controller supports SNI). The TLS secret must contain keys named `tls.crt` and `tls.key` that contain the certificate and private key to use for TLS, e.g.:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: v1
|
|
||||||
data:
|
|
||||||
tls.crt: base64 encoded cert
|
|
||||||
tls.key: base64 encoded key
|
|
||||||
kind: Secret
|
|
||||||
metadata:
|
|
||||||
name: testsecret
|
|
||||||
namespace: default
|
|
||||||
type: Opaque
|
|
||||||
```
|
|
||||||
|
|
||||||
Referencing this secret in an Ingress will tell the Ingress controller to secure the channel from the client to the loadbalancer using TLS:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: extensions/v1beta1
|
|
||||||
kind: Ingress
|
|
||||||
metadata:
|
|
||||||
name: no-rules-map
|
|
||||||
spec:
|
|
||||||
tls:
|
|
||||||
- secretName: testsecret
|
|
||||||
backend:
|
|
||||||
serviceName: s1
|
|
||||||
servicePort: 80
|
|
||||||
```
|
|
||||||
|
|
||||||
Note that there is a gap between TLS features supported by various Ingress controllers. Please refer to documentation on [nginx](https://github.com/kubernetes/ingress/blob/master/controllers/nginx/README.md#https), [GCE](https://github.com/kubernetes/ingress/blob/master/controllers/gce/README.md#tls), or any other platform specific Ingress controller to understand how TLS works in your environment.
|
|
||||||
|
|
||||||
### Loadbalancing
|
|
||||||
|
|
||||||
An Ingress controller is bootstrapped with some loadbalancing policy settings that it applies to all Ingress, such as the loadbalancing algorithm, backend weight scheme etc. More advanced loadbalancing concepts (e.g.: persistent sessions, dynamic weights) are not yet exposed through the Ingress. You can still get these features through the [service loadbalancer](https://github.com/kubernetes/contrib/tree/master/service-loadbalancer). With time, we plan to distill loadbalancing patterns that are applicable cross platform into the Ingress resource.
|
|
||||||
|
|
||||||
It's also worth noting that even though health checks are not exposed directly through the Ingress, there exist parallel concepts in Kubernetes such as [readiness probes](/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/) which allow you to achieve the same end result. Please review the controller specific docs to see how they handle health checks ([nginx](https://github.com/kubernetes/ingress/blob/master/controllers/nginx/README.md), [GCE](https://github.com/kubernetes/ingress/blob/master/controllers/gce/README.md#health-checks)).
|
|
||||||
|
|
||||||
## Updating an Ingress
|
|
||||||
|
|
||||||
Say you'd like to add a new Host to an existing Ingress; you can update it by editing the resource:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get ing
|
|
||||||
NAME RULE BACKEND ADDRESS
|
|
||||||
test - 178.91.123.132
|
|
||||||
foo.bar.com
|
|
||||||
/foo s1:80
|
|
||||||
$ kubectl edit ing test
|
|
||||||
```
|
|
||||||
|
|
||||||
This should pop up an editor with the existing YAML. Modify it to include the new Host.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
spec:
|
|
||||||
rules:
|
|
||||||
- host: foo.bar.com
|
|
||||||
http:
|
|
||||||
paths:
|
|
||||||
- backend:
|
|
||||||
serviceName: s1
|
|
||||||
servicePort: 80
|
|
||||||
path: /foo
|
|
||||||
- host: bar.baz.com
|
|
||||||
http:
|
|
||||||
paths:
|
|
||||||
- backend:
|
|
||||||
serviceName: s2
|
|
||||||
servicePort: 80
|
|
||||||
path: /foo
|
|
||||||
..
|
|
||||||
```
|
|
||||||
|
|
||||||
Saving it will update the resource in the API server, which should tell the Ingress controller to reconfigure the loadbalancer.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get ing
|
|
||||||
NAME RULE BACKEND ADDRESS
|
|
||||||
test - 178.91.123.132
|
|
||||||
foo.bar.com
|
|
||||||
/foo s1:80
|
|
||||||
bar.baz.com
|
|
||||||
/foo s2:80
|
|
||||||
```
|
|
||||||
|
|
||||||
You can achieve the same by invoking `kubectl replace -f` on a modified Ingress yaml file.
|
|
||||||
|
|
||||||
## Failing across availability zones
|
|
||||||
|
|
||||||
Techniques for spreading traffic across failure domains differ between cloud providers. Please check the documentation of the relevant Ingress controller for details. Please refer to the federation [doc](/docs/user-guide/federation/) for details on deploying Ingress in a federated cluster.
|
|
||||||
|
|
||||||
## Future Work
|
|
||||||
|
|
||||||
* Various modes of HTTPS/TLS support (e.g.: SNI, re-encryption)
|
|
||||||
* Requesting an IP or Hostname via claims
|
|
||||||
* Combining L4 and L7 Ingress
|
|
||||||
* More Ingress controllers
|
|
||||||
|
|
||||||
Please track the [L7 and Ingress proposal](https://github.com/kubernetes/kubernetes/pull/12827) for more details on the evolution of the resource, and the [Ingress repository](https://github.com/kubernetes/ingress/tree/master) for more details on the evolution of various Ingress controllers.
|
|
||||||
|
|
||||||
## Alternatives
|
|
||||||
|
|
||||||
You can expose a Service in multiple ways that don't directly involve the Ingress resource:
|
|
||||||
|
|
||||||
* Use [Service.Type=LoadBalancer](/docs/user-guide/services/#type-loadbalancer)
|
|
||||||
* Use [Service.Type=NodePort](/docs/user-guide/services/#type-nodeport)
|
|
||||||
* Use a [Port Proxy](https://github.com/kubernetes/contrib/tree/master/for-demos/proxy-to-service)
|
|
||||||
* Deploy the [Service loadbalancer](https://github.com/kubernetes/contrib/tree/master/service-loadbalancer). This allows you to share a single IP among multiple Services and achieve more advanced loadbalancing through Service Annotations.
|
|
||||||
|
|
|
@ -6,83 +6,6 @@ assignees:
|
||||||
title: Namespaces
|
title: Namespaces
|
||||||
---
|
---
|
||||||
|
|
||||||
Kubernetes supports multiple virtual clusters backed by the same physical cluster.
|
{% include user-guide-content-moved.md %}
|
||||||
These virtual clusters are called namespaces.
|
|
||||||
|
|
||||||
## When to Use Multiple Namespaces
|
[Namespaces](/docs/concepts/overview/working-with-objects/namespaces/)
|
||||||
|
|
||||||
Namespaces are intended for use in environments with many users spread across multiple
|
|
||||||
teams, or projects. For clusters with a few to tens of users, you should not
|
|
||||||
need to create or think about namespaces at all. Start using namespaces when you
|
|
||||||
need the features they provide.
|
|
||||||
|
|
||||||
Namespaces provide a scope for names. Names of resources need to be unique within a namespace, but not across namespaces.
|
|
||||||
|
|
||||||
Namespaces are a way to divide cluster resources between multiple users (via [resource quota](/docs/admin/resourcequota/)).
|
|
||||||
|
|
||||||
In future versions of Kubernetes, objects in the same namespace will have the same
|
|
||||||
access control policies by default.
|
|
||||||
|
|
||||||
It is not necessary to use multiple namespaces just to separate slightly different
|
|
||||||
resources, such as different versions of the same software: use [labels](/docs/user-guide/labels) to distinguish
|
|
||||||
resources within the same namespace.
|
|
||||||
|
|
||||||
## Working with Namespaces
|
|
||||||
|
|
||||||
Creation and deletion of namespaces is described in the [Admin Guide documentation
|
|
||||||
for namespaces](/docs/admin/namespaces).
|
|
||||||
|
|
||||||
### Viewing namespaces
|
|
||||||
|
|
||||||
You can list the current namespaces in a cluster using:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl get namespaces
|
|
||||||
NAME LABELS STATUS
|
|
||||||
default <none> Active
|
|
||||||
kube-system <none> Active
|
|
||||||
```
|
|
||||||
|
|
||||||
Kubernetes starts with two initial namespaces:
|
|
||||||
|
|
||||||
* `default` The default namespace for objects with no other namespace
|
|
||||||
* `kube-system` The namespace for objects created by the Kubernetes system
|
|
||||||
|
|
||||||
### Setting the namespace for a request
|
|
||||||
|
|
||||||
To temporarily set the namespace for a request, use the `--namespace` flag.
|
|
||||||
|
|
||||||
For example:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl --namespace=<insert-namespace-name-here> run nginx --image=nginx
|
|
||||||
$ kubectl --namespace=<insert-namespace-name-here> get pods
|
|
||||||
```
|
|
||||||
|
|
||||||
### Setting the namespace preference
|
|
||||||
|
|
||||||
You can permanently save the namespace for all subsequent kubectl commands in that
|
|
||||||
context.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
$ kubectl config set-context $(kubectl config current-context) --namespace=<insert-namespace-name-here>
|
|
||||||
# Validate it
|
|
||||||
$ kubectl config view | grep namespace:
|
|
||||||
```
|
|
||||||
|
|
||||||
## Namespaces and DNS
|
|
||||||
|
|
||||||
When you create a [Service](/docs/user-guide/services), it creates a corresponding [DNS entry](/docs/admin/dns).
|
|
||||||
This entry is of the form `<service-name>.<namespace-name>.svc.cluster.local`, which means
|
|
||||||
that if a container just uses `<service-name>` it will resolve to the service which
|
|
||||||
is local to a namespace. This is useful for using the same configuration across
|
|
||||||
multiple namespaces such as Development, Staging and Production. If you want to reach
|
|
||||||
across namespaces, you need to use the fully qualified domain name (FQDN).
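
For example (the Service and namespace names here are hypothetical), from a Pod running in a `development` namespace:

```shell
# Resolves to the Service named my-service in the Pod's own namespace (development).
$ curl http://my-service

# Reaches the Service of the same name in the production namespace via its FQDN.
$ curl http://my-service.production.svc.cluster.local
```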
|
|
||||||
|
|
||||||
## Not All Objects are in a Namespace
|
|
||||||
|
|
||||||
Most Kubernetes resources (e.g. pods, services, replication controllers, and others) are
|
|
||||||
in some namespace. However, namespace resources are not themselves in a namespace.
|
|
||||||
And low-level resources, such as [nodes](/docs/admin/node) and
|
|
||||||
persistentVolumes, are not in any namespace. Events are an exception: they may or may not
|
|
||||||
have a namespace, depending on the object the event is about.
|
|
||||||
|
|
|
@ -5,94 +5,6 @@ assignees:
|
||||||
title: Network Policies
|
title: Network Policies
|
||||||
---
|
---
|
||||||
|
|
||||||
* TOC
|
{% include user-guide-content-moved.md %}
|
||||||
{:toc}
|
|
||||||
|
|
||||||
A network policy is a specification of how selections of pods are allowed to communicate with each other and other network endpoints.
|
[Network Policies](/docs/concepts/services-networking/networkpolicies/)
|
||||||
|
|
||||||
`NetworkPolicy` resources use labels to select pods and define whitelist rules which allow traffic to the selected pods in addition to what is allowed by the isolation policy for a given namespace.
|
|
||||||
|
|
||||||
## Prerequisites
|
|
||||||
|
|
||||||
You must enable the `extensions/v1beta1/networkpolicies` runtime config in your apiserver to enable this resource.
|
|
||||||
|
|
||||||
You must also be using a networking solution which supports `NetworkPolicy` - simply creating the
|
|
||||||
resource without a controller to implement it will have no effect.
|
|
||||||
|
|
||||||
## Configuring Namespace Isolation Policy
|
|
||||||
|
|
||||||
Isolation can be configured on a per-namespace basis. Once isolation is configured on a namespace it will be applied to all pods in that namespace. Currently, only isolation policy on inbound traffic (ingress) can be defined.
|
|
||||||
|
|
||||||
The following ingress isolation types are supported:
|
|
||||||
|
|
||||||
- `DefaultDeny`: Pods in the namespace will be inaccessible from any source except the pod's local node.
|
|
||||||
|
|
||||||
Ingress isolation can be enabled using an annotation on the Namespace.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
kind: Namespace
|
|
||||||
apiVersion: v1
|
|
||||||
metadata:
|
|
||||||
annotations:
|
|
||||||
net.beta.kubernetes.io/network-policy: |
|
|
||||||
{
|
|
||||||
"ingress": {
|
|
||||||
"isolation": "DefaultDeny"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
To configure the annotation via `kubectl`:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
{% raw %}
|
|
||||||
kubectl annotate ns <namespace> "net.beta.kubernetes.io/network-policy={\"ingress\": {\"isolation\": \"DefaultDeny\"}}"
|
|
||||||
{% endraw %}
|
|
||||||
```
|
|
||||||
|
|
||||||
See the [NetworkPolicy getting started guide](/docs/getting-started-guides/network-policy/walkthrough) for an example.
|
|
||||||
|
|
||||||
## The `NetworkPolicy` Resource
|
|
||||||
|
|
||||||
See the [api-reference](/docs/api-reference/extensions/v1beta1/definitions/#_v1beta1_networkpolicy) for a full definition of the resource.
|
|
||||||
|
|
||||||
A minimal `NetworkPolicy` might look like this:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: extensions/v1beta1
|
|
||||||
kind: NetworkPolicy
|
|
||||||
metadata:
|
|
||||||
name: test-network-policy
|
|
||||||
namespace: default
|
|
||||||
spec:
|
|
||||||
podSelector:
|
|
||||||
matchLabels:
|
|
||||||
role: db
|
|
||||||
ingress:
|
|
||||||
- from:
|
|
||||||
- namespaceSelector:
|
|
||||||
matchLabels:
|
|
||||||
project: myproject
|
|
||||||
- podSelector:
|
|
||||||
matchLabels:
|
|
||||||
role: frontend
|
|
||||||
ports:
|
|
||||||
- protocol: tcp
|
|
||||||
port: 6379
|
|
||||||
```
|
|
||||||
|
|
||||||
*POSTing this to the API server will have no effect unless your chosen networking solution supports network policy.*
|
|
||||||
|
|
||||||
__Mandatory Fields__: As with all other Kubernetes config, a `NetworkPolicy` needs `apiVersion`, `kind`, and `metadata` fields. For general information about working with config files, see [here](/docs/user-guide/simple-yaml), [here](/docs/user-guide/configuring-containers), and [here](/docs/user-guide/working-with-resources).
|
|
||||||
|
|
||||||
__spec__: `NetworkPolicy` [spec](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status) has all the information needed to define a network isolation policy in the deployed controller.
|
|
||||||
|
|
||||||
__podSelector__: Each `NetworkPolicy` includes a `podSelector` which selects the grouping of pods to which the `ingress` rules in the policy apply.
|
|
||||||
|
|
||||||
__ingress__: Each `NetworkPolicy` includes a list of whitelist `ingress` rules. Each rule allows traffic which matches both the `from` and `ports` sections.
|
|
||||||
|
|
||||||
This example NetworkPolicy has the following characteristics:
|
|
||||||
|
|
||||||
1. applies to all pods in the default namespace with the label "role=db"
|
|
||||||
2. allows tcp/6379 ingress traffic to the "role=db" pods from any pod in the current namespace with the label "role=frontend" (due to the podSelector list element)
|
|
||||||
3. allows tcp/6379 ingress traffic to the "role=db" pods from any pod in the namespace "myproject" (due to the namespaceSelector list element)
|
|
||||||
|
|
|
@ -7,673 +7,6 @@ assignees:
|
||||||
title: Persistent Volumes
|
title: Persistent Volumes
|
||||||
---
|
---
|
||||||
|
|
||||||
This document describes the current state of `PersistentVolumes` in Kubernetes. Familiarity with [volumes](/docs/concepts/storage/volumes/) is suggested.
|
{% include user-guide-content-moved.md %}
|
||||||
|
|
||||||
* TOC
|
[Persistent Volumes](/docs/concepts/storage/persistent-volumes/)
|
||||||
{:toc}
|
|
||||||
|
|
||||||
## Introduction
|
|
||||||
|
|
||||||
Managing storage is a distinct problem from managing compute. The `PersistentVolume` subsystem provides an API for users and administrators that abstracts details of how storage is provided from how it is consumed. To do this we introduce two new API resources: `PersistentVolume` and `PersistentVolumeClaim`.
|
|
||||||
|
|
||||||
A `PersistentVolume` (PV) is a piece of networked storage in the cluster that has been provisioned by an administrator. It is a resource in the cluster just like a node is a cluster resource. PVs are volume plugins like Volumes, but have a lifecycle independent of any individual pod that uses the PV. This API object captures the details of the implementation of the storage, be that NFS, iSCSI, or a cloud-provider-specific storage system.
|
|
||||||
|
|
||||||
A `PersistentVolumeClaim` (PVC) is a request for storage by a user. It is similar to a pod. Pods consume node resources and PVCs consume PV resources. Pods can request specific levels of resources (CPU and Memory). Claims can request specific size and access modes (e.g., can be mounted once read/write or many times read-only).
|
|
||||||
|
|
||||||
While `PersistentVolumeClaims` allow a user to consume abstract storage
|
|
||||||
resources, it is common that users need `PersistentVolumes` with varying
|
|
||||||
properties, such as performance, for different problems. Cluster administrators
|
|
||||||
need to be able to offer a variety of `PersistentVolumes` that differ in more
|
|
||||||
ways than just size and access modes, without exposing users to the details of
|
|
||||||
how those volumes are implemented. For these needs there is the `StorageClass`
|
|
||||||
resource.
|
|
||||||
|
|
||||||
A `StorageClass` provides a way for administrators to describe the "classes" of
|
|
||||||
storage they offer. Different classes might map to quality-of-service levels,
|
|
||||||
or to backup policies, or to arbitrary policies determined by the cluster
|
|
||||||
administrators. Kubernetes itself is unopinionated about what classes
|
|
||||||
represent. This concept is sometimes called "profiles" in other storage
|
|
||||||
systems.
|
|
||||||
|
|
||||||
Please see the [detailed walkthrough with working examples](/docs/user-guide/persistent-volumes/walkthrough/).
|
|
||||||
|
|
||||||
|
|
||||||
## Lifecycle of a volume and claim
|
|
||||||
|
|
||||||
PVs are resources in the cluster. PVCs are requests for those resources and also act as claim checks to the resource. The interaction between PVs and PVCs follows this lifecycle:
|
|
||||||
|
|
||||||
### Provisioning
|
|
||||||
|
|
||||||
There are two ways PVs may be provisioned: statically or dynamically.
|
|
||||||
|
|
||||||
#### Static
|
|
||||||
A cluster administrator creates a number of PVs. They carry the details of the real storage which is available for use by cluster users. They exist in the Kubernetes API and are available for consumption.
|
|
||||||
|
|
||||||
#### Dynamic
|
|
||||||
When none of the static PVs the administrator created matches a user's `PersistentVolumeClaim`, the cluster may try to dynamically provision a volume specially for the PVC. This provisioning is based on `StorageClasses`: the PVC must request a class and the administrator must have created and configured that class in order for dynamic provisioning to occur. Claims that request the class `""` effectively disable dynamic provisioning for themselves.
|
|
||||||
|
|
||||||
### Binding
|
|
||||||
|
|
||||||
A user creates, or has already created in the case of dynamic provisioning, a `PersistentVolumeClaim` with a specific amount of storage requested and with certain access modes. A control loop in the master watches for new PVCs, finds a matching PV (if possible), and binds them together. If a PV was dynamically provisioned for a new PVC, the loop will always bind that PV to the PVC. Otherwise, the user will always get at least what they asked for, but the volume may be in excess of what was requested. Once bound, `PersistentVolumeClaim` binds are exclusive, regardless of the mode used to bind them.
|
|
||||||
|
|
||||||
Claims will remain unbound indefinitely if a matching volume does not exist. Claims will be bound as matching volumes become available. For example, a cluster provisioned with many 50Gi PVs would not match a PVC requesting 100Gi. The PVC can be bound when a 100Gi PV is added to the cluster.
|
|
||||||
|
|
||||||
### Using
|
|
||||||
|
|
||||||
Pods use claims as volumes. The cluster inspects the claim to find the bound volume and mounts that volume for a pod. For volumes which support multiple access modes, the user specifies which mode desired when using their claim as a volume in a pod.
|
|
||||||
|
|
||||||
Once a user has a claim and that claim is bound, the bound PV belongs to the user for as long as they need it. Users schedule Pods and access their claimed PVs by including a persistentVolumeClaim in their Pod's volumes block. [See below for syntax details](#claims-as-volumes).
|
|
||||||
|
|
||||||
### Releasing
|
|
||||||
|
|
||||||
When a user is done with their volume, they can delete the PVC objects from the API which allows reclamation of the resource. The volume is considered "released" when the claim is deleted, but it is not yet available for another claim. The previous claimant's data remains on the volume which must be handled according to policy.
|
|
||||||
|
|
||||||
### Reclaiming
|
|
||||||
|
|
||||||
The reclaim policy for a `PersistentVolume` tells the cluster what to do with the volume after it has been released of its claim. Currently, volumes can either be Retained, Recycled or Deleted. Retention allows for manual reclamation of the resource. For those volume plugins that support it, deletion removes both the `PersistentVolume` object from Kubernetes, as well as deleting the associated storage asset in external infrastructure (such as an AWS EBS, GCE PD, Azure Disk, or Cinder volume). Volumes that were dynamically provisioned are always deleted.
|
|
||||||
|
|
||||||
#### Recycling
|
|
||||||
|
|
||||||
If supported by the appropriate volume plugin, recycling performs a basic scrub (`rm -rf /thevolume/*`) on the volume and makes it available again for a new claim.
|
|
||||||
|
|
||||||
However, an administrator can configure a custom recycler pod template using the Kubernetes controller manager command line arguments, as described [here](/docs/admin/kube-controller-manager/). The custom recycler pod template must contain a `volumes` specification, as shown in the example below:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Pod
|
|
||||||
metadata:
|
|
||||||
name: pv-recycler-
|
|
||||||
namespace: default
|
|
||||||
spec:
|
|
||||||
restartPolicy: Never
|
|
||||||
volumes:
|
|
||||||
- name: vol
|
|
||||||
hostPath:
|
|
||||||
path: /any/path/it/will/be/replaced
|
|
||||||
containers:
|
|
||||||
- name: pv-recycler
|
|
||||||
image: "gcr.io/google_containers/busybox"
|
|
||||||
command: ["/bin/sh", "-c", "test -e /scrub && rm -rf /scrub/..?* /scrub/.[!.]* /scrub/* && test -z \"$(ls -A /scrub)\" || exit 1"]
|
|
||||||
volumeMounts:
|
|
||||||
- name: vol
|
|
||||||
mountPath: /scrub
|
|
||||||
```
|
|
||||||
|
|
||||||
Note that the particular path specified in the custom recycler pod template's `volumes` part is replaced with the particular path of the volume that is being recycled.
|
|
||||||
|
|
||||||
## Types of Persistent Volumes
|
|
||||||
|
|
||||||
`PersistentVolume` types are implemented as plugins. Kubernetes currently supports the following plugins:
|
|
||||||
|
|
||||||
* GCEPersistentDisk
|
|
||||||
* AWSElasticBlockStore
|
|
||||||
* AzureFile
|
|
||||||
* AzureDisk
|
|
||||||
* FC (Fibre Channel)
|
|
||||||
* Flocker
|
|
||||||
* NFS
|
|
||||||
* iSCSI
|
|
||||||
* RBD (Ceph Block Device)
|
|
||||||
* CephFS
|
|
||||||
* Cinder (OpenStack block storage)
|
|
||||||
* Glusterfs
|
|
||||||
* VsphereVolume
|
|
||||||
* Quobyte Volumes
|
|
||||||
* HostPath (single node testing only -- local storage is not supported in any way and WILL NOT WORK in a multi-node cluster)
|
|
||||||
* VMware Photon
|
|
||||||
* Portworx Volumes
|
|
||||||
* ScaleIO Volumes
|
|
||||||
|
|
||||||
## Persistent Volumes
|
|
||||||
|
|
||||||
Each PV contains a spec and status, which is the specification and status of the volume.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolume
|
|
||||||
metadata:
|
|
||||||
name: pv0003
|
|
||||||
spec:
|
|
||||||
capacity:
|
|
||||||
storage: 5Gi
|
|
||||||
accessModes:
|
|
||||||
- ReadWriteOnce
|
|
||||||
persistentVolumeReclaimPolicy: Recycle
|
|
||||||
storageClassName: slow
|
|
||||||
nfs:
|
|
||||||
path: /tmp
|
|
||||||
server: 172.17.0.2
|
|
||||||
```
|
|
||||||
|
|
||||||
### Capacity

Generally, a PV will have a specific storage capacity. This is set using the PV's `capacity` attribute. See the Kubernetes [Resource Model](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/resources.md) to understand the units expected by `capacity`.

Currently, storage size is the only resource that can be set or requested. Future attributes may include IOPS, throughput, etc.

### Access Modes

A `PersistentVolume` can be mounted on a host in any way supported by the resource provider. As shown in the table below, providers will have different capabilities and each PV's access modes are set to the specific modes supported by that particular volume. For example, NFS can support multiple read/write clients, but a specific NFS PV might be exported on the server as read-only. Each PV gets its own set of access modes describing that specific PV's capabilities.

The access modes are:

* ReadWriteOnce -- the volume can be mounted as read-write by a single node
* ReadOnlyMany -- the volume can be mounted read-only by many nodes
* ReadWriteMany -- the volume can be mounted as read-write by many nodes

In the CLI, the access modes are abbreviated to:

* RWO - ReadWriteOnce
* ROX - ReadOnlyMany
* RWX - ReadWriteMany

> __Important!__ A volume can only be mounted using one access mode at a time, even if it supports many. For example, a GCEPersistentDisk can be mounted as ReadWriteOnce by a single node or ReadOnlyMany by many nodes, but not at the same time.

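
A single PV may list several access modes when its backing storage supports them, and a claim then uses one of them at a time. As a minimal sketch (the name, server address, and export path are illustrative), an NFS PV that advertises all three modes could be declared like this:

```yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-nfs-shared          # illustrative name
spec:
  capacity:
    storage: 5Gi
  accessModes:                 # the set of modes this PV is capable of
    - ReadWriteOnce
    - ReadOnlyMany
    - ReadWriteMany
  nfs:
    server: 172.17.0.2         # illustrative NFS server
    path: /exports/shared      # illustrative export path
```
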
| Volume Plugin | ReadWriteOnce| ReadOnlyMany| ReadWriteMany|
| :--- | :---: | :---: | :---: |
| AWSElasticBlockStore | ✓ | - | - |
| AzureFile | ✓ | ✓ | ✓ |
| AzureDisk | ✓ | - | - |
| CephFS | ✓ | ✓ | ✓ |
| Cinder | ✓ | - | - |
| FC | ✓ | ✓ | - |
| FlexVolume | ✓ | ✓ | - |
| Flocker | ✓ | - | - |
| GCEPersistentDisk | ✓ | ✓ | - |
| Glusterfs | ✓ | ✓ | ✓ |
| HostPath | ✓ | - | - |
| iSCSI | ✓ | ✓ | - |
| PhotonPersistentDisk | ✓ | - | - |
| Quobyte | ✓ | ✓ | ✓ |
| NFS | ✓ | ✓ | ✓ |
| RBD | ✓ | ✓ | - |
| VsphereVolume | ✓ | - | - |
| PortworxVolume | ✓ | - | ✓ |
| ScaleIO | ✓ | ✓ | - |

### Class

A PV can have a class, which is specified by setting the
`storageClassName` attribute to the name of a
`StorageClass`. A PV of a particular class can only be bound to PVCs requesting
that class. A PV with no `storageClassName` has no class and can only be bound
to PVCs that request no particular class.

In the past, the annotation `volume.beta.kubernetes.io/storage-class` was used instead
of the `storageClassName` attribute. This annotation still works; however,
it will be fully deprecated in a future Kubernetes release.

### Reclaim Policy

Current reclaim policies are:

* Retain -- manual reclamation
* Recycle -- basic scrub (`rm -rf /thevolume/*`)
* Delete -- associated storage asset such as AWS EBS, GCE PD, Azure Disk, or OpenStack Cinder volume is deleted

Currently, only NFS and HostPath support recycling. AWS EBS, GCE PD, Azure Disk, and Cinder volumes support deletion.

### Phase

A volume will be in one of the following phases:

* Available -- a free resource that is not yet bound to a claim
* Bound -- the volume is bound to a claim
* Released -- the claim has been deleted, but the resource is not yet reclaimed by the cluster
* Failed -- the volume has failed its automatic reclamation

The CLI will show the name of the PVC bound to the PV.

### Mount Options

A Kubernetes administrator can specify additional mount options for when a Persistent Volume is being mounted on a node.

You can specify a mount option by using the annotation `volume.beta.kubernetes.io/mount-options` on
your Persistent Volume.

For example:

```yaml
apiVersion: "v1"
kind: "PersistentVolume"
metadata:
  name: gce-disk-1
  annotations:
    volume.beta.kubernetes.io/mount-options: "discard"
spec:
  capacity:
    storage: "10Gi"
  accessModes:
    - "ReadWriteOnce"
  gcePersistentDisk:
    fsType: "ext4"
    pdName: "gce-disk-1"
```

A mount option is a string which will be cumulatively joined with the other mount options and used while mounting the volume to the disk.

Note that not all Persistent Volume types support mount options. In Kubernetes version 1.6, the following
volume types support mount options:

* GCEPersistentDisk
* AWSElasticBlockStore
* AzureFile
* AzureDisk
* NFS
* iSCSI
* RBD (Ceph Block Device)
* CephFS
* Cinder (OpenStack block storage)
* Glusterfs
* VsphereVolume
* Quobyte Volumes
* VMware Photon

## PersistentVolumeClaims

Each PVC contains a spec and status, which is the specification and status of the claim.

```yaml
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
apiVersion: v1
|
|
||||||
metadata:
|
|
||||||
name: myclaim
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- ReadWriteOnce
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: 8Gi
|
|
||||||
storageClassName: slow
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
release: "stable"
|
|
||||||
matchExpressions:
|
|
||||||
- {key: environment, operator: In, values: [dev]}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Access Modes

Claims use the same conventions as volumes when requesting storage with specific access modes.

### Resources

Claims, like pods, can request specific quantities of a resource. In this case, the request is for storage. The same [resource model](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/resources.md) applies to both volumes and claims.

### Selector

Claims can specify a [label selector](/docs/user-guide/labels/#label-selectors) to further filter the set of volumes. Only the volumes whose labels match the selector can be bound to the claim. The selector can consist of two fields:

* matchLabels - the volume must have a label with this value
* matchExpressions - a list of requirements made by specifying key, list of values, and operator that relates the key and values. Valid operators include In, NotIn, Exists, and DoesNotExist.

All of the requirements, from both `matchLabels` and `matchExpressions`, are ANDed together; they must all be satisfied in order to match.

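
To make the selector in the `myclaim` example above concrete, a PV that it could bind to would carry matching labels. The sketch below is illustrative; only the labels (and the matching class) matter for binding:

```yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-labeled              # illustrative name
  labels:
    release: "stable"           # satisfies the claim's matchLabels
    environment: "dev"          # satisfies the matchExpressions In requirement
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  storageClassName: slow        # must also match the claim's requested class
  nfs:                          # illustrative backing store
    path: /exports/data
    server: 172.17.0.2
```
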
### Class

A claim can request a particular class by specifying the name of a
`StorageClass` using the attribute `storageClassName`.
Only PVs of the requested class, ones with the same `storageClassName` as the PVC, can
be bound to the PVC.

PVCs don't necessarily have to request a class. A PVC with its `storageClassName` set
equal to `""` is always interpreted to be requesting a PV with no class, so it
can only be bound to PVs with no class (no annotation or one set equal to
`""`). A PVC with no `storageClassName` is not quite the same and is treated differently
by the cluster depending on whether the
[`DefaultStorageClass` admission plugin](/docs/admin/admission-controllers/#defaultstorageclass)
is turned on.

* If the admission plugin is turned on, the administrator may specify a
  default `StorageClass`. All PVCs that have no `storageClassName` can be bound only to
  PVs of that default. Specifying a default `StorageClass` is done by setting the
  annotation `storageclass.kubernetes.io/is-default-class` equal to "true" on
  a `StorageClass` object (see the sketch after this list). If the administrator does not specify a default, the
  cluster responds to PVC creation as if the admission plugin were turned off. If
  more than one default is specified, the admission plugin forbids the creation of
  all PVCs.
* If the admission plugin is turned off, there is no notion of a default
  `StorageClass`. All PVCs that have no `storageClassName` can be bound only to PVs that
  have no class. In this case, the PVCs that have no `storageClassName` are treated the
  same way as PVCs that have their `storageClassName` set to `""`.

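
As a sketch of what such a default class could look like, using the annotation named above (the provisioner and parameters here simply reuse the AWS example from the StorageClasses section below; any provisioner works):

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: standard
  annotations:
    # Marks this class as the cluster default for PVCs that set no storageClassName.
    storageclass.kubernetes.io/is-default-class: "true"
provisioner: kubernetes.io/aws-ebs
parameters:
  type: gp2
```
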
Depending on the installation method, a default StorageClass may be deployed
to the Kubernetes cluster by the addon manager during installation.

When a PVC specifies a `selector` in addition to requesting a `StorageClass`,
the requirements are ANDed together: only a PV of the requested class and with
the requested labels may be bound to the PVC. Note that currently, a PVC with a
non-empty `selector` can't have a PV dynamically provisioned for it.

In the past, the annotation `volume.beta.kubernetes.io/storage-class` was used instead
of the `storageClassName` attribute. This annotation still works; however,
it won't be supported in a future Kubernetes release.

## Claims As Volumes

Pods access storage by using the claim as a volume. Claims must exist in the same namespace as the pod using the claim. The cluster finds the claim in the pod's namespace and uses it to get the `PersistentVolume` backing the claim. The volume is then mounted to the host and into the pod.

```yaml
kind: Pod
apiVersion: v1
metadata:
  name: mypod
spec:
  containers:
    - name: myfrontend
      image: dockerfile/nginx
      volumeMounts:
      - mountPath: "/var/www/html"
        name: mypd
  volumes:
    - name: mypd
      persistentVolumeClaim:
        claimName: myclaim
```

### A Note on Namespaces

`PersistentVolume` binds are exclusive, and since `PersistentVolumeClaims` are namespaced objects, mounting claims with "Many" modes (`ROX`, `RWX`) is only possible within one namespace.

## StorageClasses

Each `StorageClass` contains the fields `provisioner` and `parameters`, which
are used when a `PersistentVolume` belonging to the class needs to be
dynamically provisioned.

The name of a `StorageClass` object is significant, and is how users can
request a particular class. Administrators set the name and other parameters
of a class when first creating `StorageClass` objects, and the objects cannot
be updated once they are created.

Administrators can specify a default `StorageClass` just for PVCs that don't
request any particular class to bind to: see the
[`PersistentVolumeClaim` section](#persistentvolumeclaims)
for details.

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: standard
provisioner: kubernetes.io/aws-ebs
parameters:
  type: gp2
```

### Provisioner

Storage classes have a provisioner that determines what volume plugin is used
for provisioning PVs. This field must be specified.

You are not restricted to specifying the "internal" provisioners
listed here (whose names are prefixed with "kubernetes.io" and shipped
alongside Kubernetes). You can also run and specify external provisioners,
which are independent programs that follow a [specification](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/volume-provisioning.md)
defined by Kubernetes. Authors of external provisioners have full discretion
over where their code lives, how the provisioner is shipped, how it needs to be
run, what volume plugin it uses (including Flex), etc. The repository [kubernetes-incubator/external-storage](https://github.com/kubernetes-incubator/external-storage)
houses a library for writing external provisioners that implements the bulk of
the specification plus various community-maintained external provisioners.

### Parameters

Storage classes have parameters that describe volumes belonging to the storage
class. Different parameters may be accepted depending on the `provisioner`. For
example, the value `io1`, for the parameter `type`, and the parameter
`iopsPerGB` are specific to EBS. When a parameter is omitted, some default is
used.

#### AWS

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: slow
provisioner: kubernetes.io/aws-ebs
parameters:
  type: io1
  zone: us-east-1d
  iopsPerGB: "10"
```

* `type`: `io1`, `gp2`, `sc1`, `st1`. See [AWS docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html) for details. Default: `gp2`.
* `zone`: AWS zone. If not specified, a random zone from those where the Kubernetes cluster has a node is chosen.
* `iopsPerGB`: only for `io1` volumes. I/O operations per second per GiB. The AWS volume plugin multiplies this with the size of the requested volume to compute IOPS of the volume and caps it at 20,000 IOPS (the maximum supported by AWS, see [AWS docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html)). A string is expected here, i.e. `"10"`, not `10`.
* `encrypted`: denotes whether the EBS volume should be encrypted or not. Valid values are `"true"` or `"false"`. A string is expected here, i.e. `"true"`, not `true`.
* `kmsKeyId`: optional. The full Amazon Resource Name of the key to use when encrypting the volume. If none is supplied but `encrypted` is true, a key is generated by AWS. See AWS docs for the valid ARN value.

#### GCE

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: slow
provisioner: kubernetes.io/gce-pd
parameters:
  type: pd-standard
  zone: us-central1-a
```

* `type`: `pd-standard` or `pd-ssd`. Default: `pd-standard`
* `zone`: GCE zone. If not specified, a random zone in the same region as controller-manager will be chosen.

#### Glusterfs

```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: slow
provisioner: kubernetes.io/glusterfs
parameters:
  resturl: "http://127.0.0.1:8081"
  restauthenabled: "true"
  restuser: "admin"
  secretNamespace: "default"
  secretName: "heketi-secret"
```

* `resturl`: Gluster REST service/Heketi service URL which provisions gluster volumes on demand. The general format should be `IPaddress:Port` and this is a mandatory parameter for the GlusterFS dynamic provisioner. If the Heketi service is exposed as a routable service in the OpenShift/Kubernetes setup, this can have a format similar to
`http://heketi-storage-project.cloudapps.mystorage.com`, where the FQDN is a resolvable Heketi service URL.
* `restauthenabled` : Gluster REST service authentication boolean that enables authentication to the REST server. If this value is 'true', `restuser` and `restuserkey` or `secretNamespace` + `secretName` have to be filled. This option is deprecated; authentication is enabled when any of `restuser`, `restuserkey`, `secretName` or `secretNamespace` is specified.
* `restuser` : Gluster REST service/Heketi user who has access to create volumes in the Gluster Trusted Pool.
* `restuserkey` : Gluster REST service/Heketi user's password which will be used for authentication to the REST server. This parameter is deprecated in favor of `secretNamespace` + `secretName`.
* `secretNamespace` + `secretName` : Identification of the Secret instance that contains the user password to use when talking to the Gluster REST service. These parameters are optional; an empty password will be used when both `secretNamespace` and `secretName` are omitted. The provided secret must have type "kubernetes.io/glusterfs", e.g. created in this way:
```
$ kubectl create secret generic heketi-secret --type="kubernetes.io/glusterfs" --from-literal=key='opensesame' --namespace=default
```

#### OpenStack Cinder

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: gold
provisioner: kubernetes.io/cinder
parameters:
  type: fast
  availability: nova
```

* `type`: [VolumeType](http://docs.openstack.org/admin-guide/dashboard-manage-volumes.html) created in Cinder. Default is empty.
* `availability`: Availability Zone. Default is empty.

#### vSphere

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: fast
provisioner: kubernetes.io/vsphere-volume
parameters:
  diskformat: zeroedthick
```

* `diskformat`: `thin`, `zeroedthick` and `eagerzeroedthick`. Default: `"thin"`.

#### Ceph RBD

```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: fast
provisioner: kubernetes.io/rbd
parameters:
  monitors: 10.16.153.105:6789
  adminId: kube
  adminSecretName: ceph-secret
  adminSecretNamespace: kube-system
  pool: kube
  userId: kube
  userSecretName: ceph-secret-user
```

* `monitors`: Ceph monitors, comma delimited. This parameter is required.
* `adminId`: Ceph client ID that is capable of creating images in the pool. Default is "admin".
* `adminSecretNamespace`: The namespace for `adminSecretName`. Default is "default".
* `adminSecretName`: Secret Name for `adminId`. This parameter is required. The provided secret must have type "kubernetes.io/rbd".
* `pool`: Ceph RBD pool. Default is "rbd".
* `userId`: Ceph client ID that is used to map the RBD image. Default is the same as `adminId`.
* `userSecretName`: The name of the Ceph Secret for `userId` to map the RBD image. It must exist in the same namespace as the PVCs. This parameter is required. The provided secret must have type "kubernetes.io/rbd", e.g. created in this way:
```
$ kubectl create secret generic ceph-secret --type="kubernetes.io/rbd" --from-literal=key='QVFEQ1pMdFhPUnQrSmhBQUFYaERWNHJsZ3BsMmNjcDR6RFZST0E9PQ==' --namespace=kube-system
```

#### Quobyte

```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: slow
provisioner: kubernetes.io/quobyte
parameters:
  quobyteAPIServer: "http://138.68.74.142:7860"
  registry: "138.68.74.142:7861"
  adminSecretName: "quobyte-admin-secret"
  adminSecretNamespace: "kube-system"
  user: "root"
  group: "root"
  quobyteConfig: "BASE"
  quobyteTenant: "DEFAULT"
```

* `quobyteAPIServer`: API Server of Quobyte in the format `http(s)://api-server:7860`
* `registry`: Quobyte registry to use to mount the volume. You can specify the registry as a ``<host>:<port>`` pair, or if you want to specify multiple registries you just have to put a comma between them, e.g. ``<host1>:<port>,<host2>:<port>,<host3>:<port>``. The host can be an IP address, or if you have a working DNS you can also provide the DNS names.
* `adminSecretNamespace`: The namespace for `adminSecretName`. Default is "default".
* `adminSecretName`: secret that holds information about the Quobyte user and the password to authenticate against the API server. The provided secret must have type "kubernetes.io/quobyte", e.g. created in this way:
```
$ kubectl create secret generic quobyte-admin-secret --type="kubernetes.io/quobyte" --from-literal=key='opensesame' --namespace=kube-system
```
* `user`: maps all access to this user. Default is "root".
* `group`: maps all access to this group. Default is "nfsnobody".
* `quobyteConfig`: use the specified configuration to create the volume. You can create a new configuration or modify an existing one with the Web console or the Quobyte CLI. Default is "BASE".
* `quobyteTenant`: use the specified tenant ID to create/delete the volume. This Quobyte tenant has to be already present in Quobyte. Default is "DEFAULT".

#### Azure Disk

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: slow
provisioner: kubernetes.io/azure-disk
parameters:
  skuName: Standard_LRS
  location: eastus
  storageAccount: azure_storage_account_name
```

* `skuName`: Azure storage account Sku tier. Default is empty.
* `location`: Azure storage account location. Default is empty.
* `storageAccount`: Azure storage account name. If a storage account is not provided, all storage accounts associated with the resource group are searched to find one that matches `skuName` and `location`. If a storage account is provided, it must reside in the same resource group as the cluster, and `skuName` and `location` are ignored.

#### Portworx Volume

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: portworx-io-priority-high
provisioner: kubernetes.io/portworx-volume
parameters:
  repl: "1"
  snap_interval: "70"
  io_priority: "high"
```

* `fs`: filesystem to be laid out: [none/xfs/ext4] (default: `ext4`).
* `block_size`: block size in Kbytes (default: `32`).
* `repl`: number of synchronous replicas to be provided in the form of replication factor [1..3] (default: `1`). A string is expected here, i.e. `"1"` and not `1`.
* `io_priority`: determines whether the volume will be created from higher performance or lower priority storage [high/medium/low] (default: `low`).
* `snap_interval`: clock/time interval in minutes for when to trigger snapshots. Snapshots are incremental based on the difference with the prior snapshot; 0 disables snaps (default: `0`). A string is expected here, i.e. `"70"` and not `70`.
* `aggregation_level`: specifies the number of chunks the volume would be distributed into; 0 indicates a non-aggregated volume (default: `0`). A string is expected here, i.e. `"0"` and not `0`.
* `ephemeral`: specifies whether the volume should be cleaned up after unmount or should be persistent. The `emptyDir` use case can set this value to true, and the `persistent volumes` use case (such as for databases like Cassandra) should set it to false, [true/false] (default `false`). A string is expected here, i.e. `"true"` and not `true`.

#### ScaleIO

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: slow
provisioner: kubernetes.io/scaleio
parameters:
  gateway: https://192.168.99.200:443/api
  system: scaleio
  protectionDomain: default
  storagePool: default
  storageMode: ThinProvisioned
  secretRef: sio-secret
  readOnly: false
  fsType: xfs
```

* `provisioner`: attribute is set to `kubernetes.io/scaleio`
* `gateway`: address to a ScaleIO API gateway (required)
* `system`: the name of the ScaleIO system (required)
* `protectionDomain`: the name of the ScaleIO protection domain
* `storagePool`: the name of the volume storage pool
* `storageMode`: the storage provision mode: `ThinProvisioned` (default) or `ThickProvisioned`
* `secretRef`: reference to a configured Secret object (required, see detail below)
* `readOnly`: specifies the access mode to the mounted volume
* `fsType`: the file system to use for the volume

The ScaleIO Kubernetes volume plugin requires a configured Secret object.
The secret must be created with type `kubernetes.io/scaleio` and use the same namespace value as that of the PVC where it is referenced,
as shown in the following command:

```
$> kubectl create secret generic sio-secret --type="kubernetes.io/scaleio" --from-literal=username=sioadmin --from-literal=password=d2NABDNjMA== --namespace=default
```

## Writing Portable Configuration

If you're writing configuration templates or examples that run on a wide range of clusters
and need persistent storage, we recommend that you use the following pattern:

- Do include PersistentVolumeClaim objects in your bundle of config (alongside Deployments, ConfigMaps, etc).
- Do not include PersistentVolume objects in the config, since the user instantiating the config may not have
  permission to create PersistentVolumes.
- Give the user the option of providing a storage class name when instantiating the template.
- If the user provides a storage class name, and the cluster is version 1.4 or newer, put that value into the `volume.beta.kubernetes.io/storage-class` annotation of the PVC (see the sketch after this list).
  This will cause the PVC to match the right storage class if the cluster has StorageClasses enabled by the admin.
- If the user does not provide a storage class name, or the cluster is version 1.3, then instead put a `volume.alpha.kubernetes.io/storage-class: default` annotation on the PVC.
  - This will cause a PV to be automatically provisioned for the user with sane default characteristics on some clusters.
  - Despite the word `alpha` in the name, the code behind this annotation has `beta` level support.
  - Do not use `volume.beta.kubernetes.io/storage-class:` with any value, including the empty string, since it will prevent the DefaultStorageClass admission controller
    from running if enabled.
- In your tooling, do watch for PVCs that are not getting bound after some time and surface this to the user, as this may indicate that the cluster has no dynamic
  storage support (in which case the user should create a matching PV) or the cluster has no storage system (in which case the user cannot deploy config requiring
  PVCs).
- In the future, we expect most clusters to have `DefaultStorageClass` enabled, and to have some form of storage available. However, there may not be any
  storage class names which work on all clusters, so continue to not set one by default.
  At some point, the alpha annotation will cease to have meaning, but the unset `storageClass` field on the PVC
  will have the desired effect.

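
A rough sketch of a PVC written for this portable pattern follows. The claim name, size, and class value are illustrative; how the class value gets substituted from user input depends on your templating tool:

```yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: my-app-data                                   # illustrative claim name
  annotations:
    # Set from user input when a storage class name was supplied (1.4+ clusters);
    # otherwise replace this with: volume.alpha.kubernetes.io/storage-class: default
    volume.beta.kubernetes.io/storage-class: "slow"   # illustrative class name
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 8Gi
```
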
@ -10,435 +10,6 @@ assignees:
title: Pet Sets
---

__Warning:__ Starting in Kubernetes version 1.5, PetSet has been renamed to
[StatefulSet](/docs/concepts/abstractions/controllers/statefulsets/).
To use (or continue to use) PetSet in Kubernetes 1.5 or higher, you must
[migrate your existing PetSets to StatefulSets](/docs/tasks/manage-stateful-set/upgrade-pet-set-to-stateful-set/).

__This document has been deprecated__, but can still apply if you're using
Kubernetes version 1.4 or earlier.

* TOC
{:toc}

__Terminology__

Throughout this doc you will see a few terms that are sometimes used interchangeably elsewhere, which might cause confusion. This section attempts to clarify them.

* Node: A single virtual or physical machine in a Kubernetes cluster.
* Cluster: A group of nodes in a single failure domain, unless mentioned otherwise.
* Persistent Volume Claim (PVC): A request for storage, typically a [persistent volume](/docs/user-guide/persistent-volumes/walkthrough/).
* Host name: The hostname attached to the UTS namespace of the pod, i.e. the output of `hostname` in the pod.
* DNS/Domain name: A *cluster local* domain name resolvable using standard methods (e.g.: [gethostbyname](http://linux.die.net/man/3/gethostbyname)).
* Ordinality: the property of being "ordinal", or occupying a position in a sequence.
* Pet: a single member of a PetSet; more generally, a stateful application.
* Peer: a process running a server, capable of communicating with other such processes.

__Prerequisites__

This doc assumes familiarity with the following Kubernetes concepts:

* [Pods](/docs/user-guide/pods/single-container/)
* [Cluster DNS](/docs/admin/dns/)
* [Headless Services](/docs/user-guide/services/#headless-services)
* [Persistent Volumes](/docs/concepts/storage/volumes/)
* [Persistent Volume Provisioning](http://releases.k8s.io/{{page.githubbranch}}/examples/persistent-volume-provisioning/README.md)

You need a working Kubernetes cluster at version >= 1.3, with a healthy DNS [cluster addon](http://releases.k8s.io/{{page.githubbranch}}/cluster/addons/README.md) at version >= 15. You cannot use PetSet on a hosted Kubernetes provider that has disabled `alpha` resources.

## What is a PetSet?

In Kubernetes, most pod management abstractions group pods into disposable units of work that compose a micro service. Replication controllers, for example, are designed with a weak guarantee: that there should be N replicas of a particular pod template. The pods are treated as stateless units; if one of them is unhealthy or superseded by a newer version, the system just disposes of it.

```
     foo.default.svc.cluster.local
              |service|
             /         \
  | pod-asdf |         | pod-zxcv |
```

A PetSet, in contrast, is a group of stateful pods that require a stronger notion of identity. The document refers to these as "clustered applications".

```
   *.foo.default.svc.cluster.local
     | mysql-0 |  <->  | mysql-1 |
       [pv 0]            [pv 1]
```

The co-ordinated deployment of clustered applications is notoriously hard. They require stronger notions of identity and membership, which they use in opaque internal protocols, and are especially prone to race conditions and deadlock. Traditionally administrators have deployed these applications by leveraging nodes as stable, long-lived entities with persistent storage and static IPs.

The goal of PetSet is to decouple this dependency by assigning identities to individual instances of an application that are not anchored to the underlying physical infrastructure. For the rest of this document we will refer to these entities as "Pets". Our use of this term is predated by the "Pets vs Cattle" analogy.

__Relationship between Pets and Pods__: PetSet requires there be {0..N-1} Pets. Each Pet has a deterministic name, PetSetName-Ordinal, and a unique identity. Each Pet has at most one pod, and each PetSet has at most one Pet with a given identity.

## When to use PetSet?

A PetSet ensures that a specified number of "pets" with unique identities are running at any given time. The identity of a Pet is composed of:

* a stable hostname, available in DNS
* an ordinal index
* stable storage: linked to the ordinal & hostname

These properties are useful in deploying stateful applications. However, most stateful applications are also clustered, meaning they form groups with strict membership requirements that rely on stored state. PetSet also helps with the two most common problems encountered managing such clustered applications:

* discovery of peers for quorum
* startup/teardown ordering

Only use PetSet if your application requires some or all of these properties. Managing pods as stateless replicas is vastly easier.

Example workloads for PetSet:

* Databases like MySQL or PostgreSQL that require a single instance attached to an NFS persistent volume at any time
* Clustered software like Zookeeper, Etcd, or Elasticsearch that require stable membership.

## Alpha limitations

Before you start deploying applications as PetSets, there are a few limitations you should understand.

* PetSet is an *alpha* resource, not available in any Kubernetes release prior to 1.3.
* As with all alpha/beta resources, it can be disabled through the `--runtime-config` option passed to the apiserver, and in fact most likely will be disabled on hosted offerings of Kubernetes.
* The only updatable field on a PetSet is `replicas`.
* The storage for a given pet must either be provisioned by a [persistent volume provisioner](http://releases.k8s.io/{{page.githubbranch}}/examples/persistent-volume-provisioning/README.md) based on the requested `storage class`, or pre-provisioned by an admin. Note that persistent volume provisioning is also currently in alpha.
* Deleting and/or scaling a PetSet down will *not* delete the volumes associated with the PetSet. This is done to ensure safety first; your data is more valuable than an auto purge of all related PetSet resources. **Deleting the Persistent Volume Claims will result in a deletion of the associated volumes**.
* All PetSets currently require a "governing service", or a Service responsible for the network identity of the pets. The user is responsible for this Service.
* Updating an existing PetSet is currently a manual process, meaning you either need to deploy a new PetSet with the new image version, or orphan Pets one by one, update their image, and join them back to the cluster.

## Example PetSet

We'll create a basic PetSet to demonstrate how Pets are assigned unique and "sticky" identities.

{% include code.html language="yaml" file="petset.yaml" ghlink="/docs/user-guide/petset.yaml" %}

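
The manifest itself lives in the repository and is pulled in by the include above. If you can't view it there, the sketch below shows roughly what it defines, reconstructed from the names used in the rest of this walkthrough (the `web` PetSet, the `nginx` governing Service, and the `www` volume claim template); the real file may differ in details:

```yaml
apiVersion: apps/v1alpha1
kind: PetSet
metadata:
  name: web
spec:
  serviceName: "nginx"            # the governing (headless) Service
  replicas: 2
  template:
    metadata:
      labels:
        app: nginx
      annotations:
        pod.alpha.kubernetes.io/initialized: "true"   # debugging hook, see Troubleshooting below
    spec:
      containers:
      - name: nginx
        image: gcr.io/google_containers/nginx-slim:0.8
        ports:
        - containerPort: 80
          name: web
        volumeMounts:
        - name: www
          mountPath: /usr/share/nginx/html
  volumeClaimTemplates:
  - metadata:
      name: www                   # produces PVCs named www-web-0, www-web-1, ...
    spec:
      accessModes: [ "ReadWriteOnce" ]
      resources:
        requests:
          storage: 1Gi
```
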
Saving this config into `petset.yaml` and submitting it to a Kubernetes cluster should create the defined PetSet and Pets it manages:

```shell
$ kubectl create -f petset.yaml
service "nginx" created
petset "nginx" created
```

## Pet Identity

The identity of a Pet sticks to it, regardless of which node it's (re)scheduled on. We can examine the identity of the pets we just created.

### Ordinal index

You should see 2 pods with predictable names formatted thus: `$(petset name)-$(ordinal index assigned by petset controller)`

```shell
$ kubectl get po
NAME      READY     STATUS    RESTARTS   AGE
web-0     1/1       Running   0          10m
web-1     1/1       Running   0          10m
```

### Stable storage

Two persistent volumes are created, one per pod. These are created automatically by the PetSet based on the `volumeClaimTemplate` field:

```shell
$ kubectl get pv
NAME                                       CAPACITY   ACCESSMODES   STATUS    CLAIM               REASON    AGE
pvc-90234946-3717-11e6-a46e-42010af00002   1Gi        RWO           Bound     default/www-web-0             11m
pvc-902733c2-3717-11e6-a46e-42010af00002   1Gi        RWO           Bound     default/www-web-1             11m
```

### Network identity

The network identity has 2 parts. First, we created a headless Service that controls the domain within which we create Pets. The domain managed by this Service takes the form: `$(service name).$(namespace).svc.cluster.local`, where "cluster.local" is the [cluster domain](http://releases.k8s.io/{{page.githubbranch}}/build/kube-dns/README.md#how-do-i-configure-it). As each pet is created, it gets a matching DNS subdomain, taking the form: `$(petname).$(governing service domain)`, where the governing service is defined by the `serviceName` field on the PetSet.

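
A headless Service is simply a Service with `clusterIP: None`. As a sketch of the `nginx` governing Service used in this example (field values mirror the names used throughout this walkthrough; the included `petset.yaml` already defines the real one):

```yaml
apiVersion: v1
kind: Service
metadata:
  name: nginx
  labels:
    app: nginx
spec:
  clusterIP: None        # headless: no virtual IP, DNS resolves straight to the pets
  ports:
  - port: 80
    name: web
  selector:
    app: nginx           # selects the PetSet's pods
```
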
Here are some examples of choices for Cluster Domain, Service name, PetSet name, and how that affects the DNS names for the Pets and the hostnames in the Pet's pods:

Cluster Domain | Service (ns/name) | PetSet (ns/name) | PetSet Domain | Pet DNS | Pet Hostname |
-------------- | ----------------- | ----------------- | -------------- | ------- | ------------ |
cluster.local | default/nginx | default/web | nginx.default.svc.cluster.local | web-{0..N-1}.nginx.default.svc.cluster.local | web-{0..N-1} |
cluster.local | foo/nginx | foo/web | nginx.foo.svc.cluster.local | web-{0..N-1}.nginx.foo.svc.cluster.local | web-{0..N-1} |
kube.local | foo/nginx | foo/web | nginx.foo.svc.kube.local | web-{0..N-1}.nginx.foo.svc.kube.local | web-{0..N-1} |

Note that Cluster Domain will be set to `cluster.local` unless [otherwise configured](http://releases.k8s.io/{{page.githubbranch}}/build/kube-dns/README.md#how-do-i-configure-it).

Let's verify our assertion with a simple test.

```shell
$ kubectl get svc
NAME      CLUSTER-IP   EXTERNAL-IP   PORT(S)   AGE
nginx     None         <none>        80/TCP    12m
...
```

First, the PetSet provides a stable hostname:

```shell
$ for i in 0 1; do kubectl exec web-$i -- sh -c 'hostname'; done
web-0
web-1
```

And the hostname is linked to the in-cluster DNS address:

```shell
$ kubectl run -i --tty --image busybox dns-test --restart=Never /bin/sh
dns-test # nslookup web-0.nginx
Server:    10.0.0.10
Address 1: 10.0.0.10 kube-dns.kube-system.svc.cluster.local

Name:      web-0.nginx
Address 1: 10.180.3.5

dns-test # nslookup web-1.nginx
Server:    10.0.0.10
Address 1: 10.0.0.10 kube-dns.kube-system.svc.cluster.local

Name:      web-1.nginx
Address 1: 10.180.0.9
```

The containers are running nginx webservers, which by default will look for an index.html file in `/usr/share/nginx/html/index.html`. That directory is backed by a `PersistentVolume` created by the PetSet. So let's write our hostname there:

```shell
$ for i in 0 1; do
  kubectl exec web-$i -- sh -c 'echo $(hostname) > /usr/share/nginx/html/index.html';
done
```

And verify each webserver serves its own hostname:

```shell
$ for i in 0 1; do kubectl exec -it web-$i -- curl localhost; done
web-0
web-1
```

Now delete all pods in the petset:

```shell
$ kubectl delete po -l app=nginx
pod "web-0" deleted
pod "web-1" deleted
```

Wait for them to come back up, and try to retrieve the previously written hostname through the DNS name of the peer. They match, because the storage, DNS name, and hostname stick to the Pet no matter where it gets scheduled:

```shell
$ kubectl exec -it web-1 -- curl web-0.nginx
web-0
$ kubectl exec -it web-0 -- curl web-1.nginx
web-1
```

## Peer discovery

A pet can piece together its own identity:

1. Use the [downward api](/docs/user-guide/downward-api/) to find its pod name (see the sketch after this list)
2. Run `hostname` to find its DNS name
3. Run `mount` or `df` to find its volumes (usually this is unnecessary)

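
For step 1, a minimal sketch of exposing the pod name to the container through the downward API as an environment variable (the variable name `POD_NAME` is illustrative; this snippet would go into the PetSet's pod template):

```yaml
spec:
  containers:
  - name: nginx
    image: gcr.io/google_containers/nginx-slim:0.8
    env:
    - name: POD_NAME                  # illustrative variable name
      valueFrom:
        fieldRef:
          fieldPath: metadata.name    # resolves to e.g. web-0, web-1, ...
```
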
It's not necessary to "discover" the governing Service of a PetSet, since it's known at creation time; you can simply pass it down through an [environment variable](/docs/user-guide/environment-guide).

Usually pets also need to find their peers. In the previous nginx example, we just used `kubectl` to get the names of existing pods, and as humans, we could tell which ones belonged to a given PetSet. Another way to find peers is by contacting the API server, just like `kubectl`, but that has several disadvantages (you end up implementing a Kubernetes-specific init system that runs as pid 1 in your application container).

PetSet gives you a way to discover your peers using DNS records. To illustrate this we can use the previous example (note: one usually doesn't `apt-get` in a container).

```shell
$ kubectl exec -it web-0 /bin/sh
web-0 # apt-get update && apt-get install -y dnsutils
...

web-0 # nslookup -type=srv nginx.default
Server:        10.0.0.10
Address:       10.0.0.10#53

nginx.default.svc.cluster.local service = 10 50 0 web-1.ub.default.svc.cluster.local.
nginx.default.svc.cluster.local service = 10 50 0 web-0.ub.default.svc.cluster.local.
```

## Updating a PetSet

You cannot update any field of the PetSet except `spec.replicas` and the `containers` in the podTemplate. Updating `spec.replicas` will scale the PetSet; updating `containers` will not have any effect until a Pet is deleted, at which time it is recreated with the modified podTemplate.

## Scaling a PetSet

You can scale a PetSet by updating the "replicas" field. Note however that the controller will only:

1. Create one pet at a time, in order from {0..N-1}, and wait till each one is in [Running and Ready](/docs/user-guide/pod-states) before creating the next
2. Delete one pet at a time, in reverse order from {N-1..0}, and wait till each one is completely shut down (past its [terminationGracePeriodSeconds](/docs/user-guide/pods/index#termination-of-pods)) before deleting the next

```shell
$ kubectl get po
NAME      READY     STATUS    RESTARTS   AGE
web-0     1/1       Running   0          30s
web-1     1/1       Running   0          36s

$ kubectl patch petset web -p '{"spec":{"replicas":3}}'
"web" patched

$ kubectl get po
NAME      READY     STATUS    RESTARTS   AGE
web-0     1/1       Running   0          40s
web-1     1/1       Running   0          46s
web-2     1/1       Running   0          8s
```

You can also use the `kubectl scale` command:

```shell
$ kubectl get petset
NAME      DESIRED   CURRENT   AGE
web       3         3         24m

$ kubectl scale petset web --replicas=5
petset "web" scaled

$ kubectl get po --watch-only
NAME      READY     STATUS              RESTARTS   AGE
web-0     1/1       Running             0          10m
web-1     1/1       Running             0          27m
web-2     1/1       Running             0          10m
web-3     1/1       Running             0          3m
web-4     0/1       ContainerCreating   0          48s

$ kubectl get petset web
NAME      DESIRED   CURRENT   AGE
web       5         5         30m
```

Note, however, that scaling up to N and back down to M *will not* delete the volumes of the pets that were removed, as described in the section on [deletion](#deleting-a-petset), i.e. scaling back up again creates new pets that use the same volumes. To see this in action, scale the PetSet back down to 3:

```shell
$ kubectl get po --watch-only
web-4     1/1       Terminating   0         4m
web-4     1/1       Terminating   0         4m
web-3     1/1       Terminating   0         6m
web-3     1/1       Terminating   0         6m
```

Note that we still have 5 PVCs:

```shell
$ kubectl get pvc
NAME        STATUS    VOLUME                                     CAPACITY   ACCESSMODES   AGE
www-web-0   Bound     pvc-42ca5cef-8113-11e6-82f6-42010af00002   1Gi        RWO           32m
www-web-1   Bound     pvc-42de30af-8113-11e6-82f6-42010af00002   1Gi        RWO           32m
www-web-2   Bound     pvc-ba416413-8115-11e6-82f6-42010af00002   1Gi        RWO           14m
www-web-3   Bound     pvc-ba45f19c-8115-11e6-82f6-42010af00002   1Gi        RWO           14m
www-web-4   Bound     pvc-ba47674a-8115-11e6-82f6-42010af00002   1Gi        RWO           14m
```

This allows you to upgrade the image of a petset and have it come back up with the same data, as described in the next section.

## Image upgrades

PetSet currently *does not* support automated image upgrades, as noted in the section on [limitations](#alpha-limitations). However, you can update the `image` field of any container in the podTemplate and delete Pets one by one; the PetSet controller will recreate each one with the new image.

Edit the image on the PetSet to `gcr.io/google_containers/nginx-slim:0.7` and delete 1 Pet:

```shell{% raw %}
$ for p in 0 1 2; do kubectl get po web-$p --template '{{range $i, $c := .spec.containers}}{{$c.image}}{{end}}'; echo; done
gcr.io/google_containers/nginx-slim:0.8
gcr.io/google_containers/nginx-slim:0.8
gcr.io/google_containers/nginx-slim:0.8

$ kubectl delete po web-0
pod "web-0" deleted

$ for p in 0 1 2; do kubectl get po web-$p --template '{{range $i, $c := .spec.containers}}{{$c.image}}{{end}}'; echo; done
gcr.io/google_containers/nginx-slim:0.7
gcr.io/google_containers/nginx-slim:0.8
gcr.io/google_containers/nginx-slim:0.8
{% endraw %}```

Delete the remaining 2:

```shell
$ kubectl delete po web-1 web-2
pod "web-1" deleted
pod "web-2" deleted
```

Wait till the PetSet is stable and check the images:

```shell{% raw %}
$ for p in 0 1 2; do kubectl get po web-$p --template '{{range $i, $c := .spec.containers}}{{$c.image}}{{end}}'; echo; done
gcr.io/google_containers/nginx-slim:0.7
gcr.io/google_containers/nginx-slim:0.7
gcr.io/google_containers/nginx-slim:0.7
{% endraw %}```

## Deleting a PetSet

Deleting a PetSet through kubectl will scale it down to 0, thereby deleting all the Pets. If you wish to delete just the PetSet and not the Pets, use `--cascade=false`:

```shell
$ kubectl delete -f petset.yaml --cascade=false
petset "web" deleted

$ kubectl get po -l app=nginx
NAME      READY     STATUS    RESTARTS   AGE
web-0     1/1       Running   0          21h
web-1     1/1       Running   0          21h

$ kubectl delete po -l app=nginx
pod "web-0" deleted
pod "web-1" deleted
```

Deleting the pods will *not* delete the volumes. Until we finalize the recycle policy for these volumes they will have to get cleaned up by an admin. This is to ensure that you have the chance to copy data off the volume before deleting it. Simply deleting the PVC after the pods have left the [terminating state](/docs/user-guide/pods/index#termination-of-pods) should trigger deletion of the backing Persistent Volumes.

**Note: you will lose all your data once the PVC is deleted; do this with caution.**

```shell
$ kubectl get po -l app=nginx
$ kubectl get pvc -l app=nginx
NAME        STATUS    VOLUME                                     CAPACITY   ACCESSMODES   AGE
www-web-0   Bound     pvc-62d271cd-3822-11e6-b1b7-42010af00002   0                        21h
www-web-1   Bound     pvc-62d6750e-3822-11e6-b1b7-42010af00002   0                        21h

$ kubectl delete pvc -l app=nginx
$ kubectl get pv
```

If you simply want to clean everything:

```shell{% raw %}
$ grace=$(kubectl get po web-0 --template '{{.spec.terminationGracePeriodSeconds}}')
$ kubectl delete petset,po -l app=nginx
$ sleep $grace
$ kubectl delete pvc -l app=nginx
{% endraw %}
```

## Troubleshooting

You might have noticed an `annotations` field in all the PetSets shown above.

```yaml
annotations:
  pod.alpha.kubernetes.io/initialized: "true"
```

This field is a debugging hook. It pauses any scale up/down operations on the entire PetSet. If you'd like to pause a petset after each pet, set it to `false` in the template, wait for each pet to come up, verify it has initialized correctly, and then set it to `true` using `kubectl edit` on the pet (setting it to `false` on *any pet* is enough to pause the PetSet). If you don't need it, create the PetSet with it set to `true` as shown. This is surprisingly useful in debugging bootstrapping race conditions.

## Future Work

There are a LOT of planned improvements since PetSet is still in alpha.

* Data gravity and local storage
* Richer notification events
* Public network identities
* WAN cluster deployments (multi-AZ/region/cloud provider)
* Image and node upgrades

This list goes on; if you have examples, ideas or thoughts, please contribute.

## Alternatives

Deploying one RC of size 1 and one Service per pod is a popular alternative, as is simply deploying a DaemonSet that utilizes the identity of a Node.

## Next steps

* Learn about [StatefulSet](/docs/concepts/abstractions/controllers/statefulsets/),
  the replacement for PetSet introduced in Kubernetes version 1.5.
* [Migrate your existing PetSets to StatefulSets](/docs/tasks/manage-stateful-set/upgrade-pet-set-to-stateful-set/)
  when upgrading to Kubernetes version 1.5 or higher.

@ -3,194 +3,6 @@ assignees:
title: Pods
---

* TOC
{:toc}

_pods_ are the smallest deployable units of computing that can be created and
managed in Kubernetes.

## What is a Pod?

A _pod_ (as in a pod of whales or pea pod) is a group of one or more containers
(such as Docker containers), the shared storage for those containers, and
options about how to run the containers. Pods are always co-located and
co-scheduled, and run in a shared context. A pod models an
application-specific "logical host": it contains one or more application
containers which are relatively tightly coupled. In a pre-container
world, they would have executed on the same physical or virtual machine.

While Kubernetes supports more container runtimes than just Docker, Docker is
the most commonly known runtime, and it helps to describe pods in Docker terms.

The shared context of a pod is a set of Linux namespaces, cgroups, and
potentially other facets of isolation - the same things that isolate a Docker
container. Within a pod's context, the individual applications may have
further sub-isolations applied.

Containers within a pod share an IP address and port space, and
can find each other via `localhost`. They can also communicate with each
other using standard inter-process communications like SystemV semaphores or
POSIX shared memory. Containers in different pods have distinct IP addresses
and cannot communicate by IPC.

Applications within a pod also have access to shared volumes, which are defined
as part of a pod and are made available to be mounted into each application's
filesystem.

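
As a minimal sketch of the two sharing mechanisms just described (names and the image are illustrative), two containers in one pod can exchange data through a shared `emptyDir` volume, and could equally reach each other over `localhost`:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: shared-volume-example      # illustrative name
spec:
  volumes:
  - name: shared-data
    emptyDir: {}                   # scratch volume that lives as long as the pod
  containers:
  - name: writer
    image: busybox
    command: ["/bin/sh", "-c", "echo hello > /data/greeting && sleep 3600"]
    volumeMounts:
    - name: shared-data
      mountPath: /data
  - name: reader
    image: busybox
    command: ["/bin/sh", "-c", "sleep 5; cat /data/greeting && sleep 3600"]
    volumeMounts:
    - name: shared-data
      mountPath: /data
```
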
In terms of [Docker](https://www.docker.com/) constructs, a pod is modelled as
a group of Docker containers with shared namespaces and shared
[volumes](/docs/concepts/storage/volumes/). PID namespace sharing is not yet implemented in Docker.

Like individual application containers, pods are considered to be relatively
ephemeral (rather than durable) entities. As discussed in [life of a
pod](/docs/user-guide/pod-states/), pods are created, assigned a unique ID (UID), and
scheduled to nodes where they remain until termination (according to restart
policy) or deletion. If a node dies, the pods scheduled to that node are
scheduled for deletion, after a timeout period. A given pod (as defined by a UID) is not
"rescheduled" to a new node; instead, it can be replaced by an identical pod,
with even the same name if desired, but with a new UID (see [replication
controller](/docs/user-guide/replication-controller/) for more details). (In the future, a
higher-level API may support pod migration.)

When something is said to have the same lifetime as a pod, such as a volume,
that means that it exists as long as that pod (with that UID) exists. If that
pod is deleted for any reason, even if an identical replacement is created, the
related thing (e.g. volume) is also destroyed and created anew.

![pod diagram](/images/docs/pod.svg){: style="max-width: 50%" }
|
|
||||||
|
|
||||||
*A multi-container pod that contains a file puller and a
|
|
||||||
web server that uses a persistent volume for shared storage between the containers.*
|
|
||||||
|
|
||||||

## Motivation for pods

### Management

Pods are a model of the pattern of multiple cooperating processes which form a
cohesive unit of service. They simplify application deployment and management
by providing a higher-level abstraction than the set of their constituent
applications. Pods serve as units of deployment, horizontal scaling, and
replication. Colocation (co-scheduling), shared fate (e.g. termination),
coordinated replication, resource sharing, and dependency management are
handled automatically for containers in a pod.

### Resource sharing and communication

Pods enable data sharing and communication among their constituents.

The applications in a pod all use the same network namespace (same IP and port
space), and can thus "find" each other and communicate using `localhost`.
Because of this, applications in a pod must coordinate their usage of ports.
Each pod has an IP address in a flat shared networking space that has full
communication with other physical computers and pods across the network.

The hostname is set to the pod's name for the application containers within the
pod. [More details on networking](/docs/admin/networking/).

In addition to defining the application containers that run in the pod, the pod
specifies a set of shared storage volumes. Volumes enable data to survive
container restarts and to be shared among the applications within the pod.

## Uses of pods

Pods can be used to host vertically integrated application stacks (e.g. LAMP),
but their primary motivation is to support co-located, co-managed helper
programs, such as:

* content management systems, file and data loaders, local cache managers, etc.
* log and checkpoint backup, compression, rotation, snapshotting, etc.
* data change watchers, log tailers, logging and monitoring adapters, event publishers, etc.
* proxies, bridges, and adapters
* controllers, managers, configurators, and updaters

In general, individual pods are not intended to run multiple instances of the
same application.

For a longer explanation, see [The Distributed System ToolKit: Patterns for
Composite
Containers](http://blog.kubernetes.io/2015/06/the-distributed-system-toolkit-patterns.html).

## Alternatives considered

_Why not just run multiple programs in a single (Docker) container?_

1. Transparency. Making the containers within the pod visible to the
   infrastructure enables the infrastructure to provide services to those
   containers, such as process management and resource monitoring. This
   facilitates a number of conveniences for users.
2. Decoupling software dependencies. The individual containers may be
   versioned, rebuilt and redeployed independently. Kubernetes may even support
   live updates of individual containers someday.
3. Ease of use. Users don't need to run their own process managers, worry about
   signal and exit-code propagation, etc.
4. Efficiency. Because the infrastructure takes on more responsibility,
   containers can be lighter weight.

_Why not support affinity-based co-scheduling of containers?_

That approach would provide co-location, but would not provide most of the
benefits of pods, such as resource sharing, IPC, guaranteed fate sharing, and
simplified management.

## Durability of pods (or lack thereof)

Pods aren't intended to be treated as durable entities. They won't survive scheduling failures, node failures, or other evictions, such as those due to lack of resources or node maintenance.

In general, users shouldn't need to create pods directly. They should almost always use controllers (e.g., [Deployments](/docs/user-guide/deployments/)), even for singletons. Controllers provide self-healing with a cluster scope, as well as replication and rollout management.

The use of collective APIs as the primary user-facing primitive is relatively common among cluster scheduling systems, including [Borg](https://research.google.com/pubs/pub43438.html), [Marathon](https://mesosphere.github.io/marathon/docs/rest-api.html), [Aurora](http://aurora.apache.org/documentation/latest/configuration-reference/#job-schema), and [Tupperware](http://www.slideshare.net/Docker/aravindnarayanan-facebook140613153626phpapp02-37588997).

Pod is exposed as a primitive in order to facilitate:

* scheduler and controller pluggability
* support for pod-level operations without the need to "proxy" them via controller APIs
* decoupling of pod lifetime from controller lifetime, such as for bootstrapping
* decoupling of controllers and services — the endpoint controller just watches pods
* clean composition of Kubelet-level functionality with cluster-level functionality — Kubelet is effectively the "pod controller"
* high-availability applications, which will expect pods to be replaced in advance of their termination and certainly in advance of deletion, such as in the case of planned evictions, image prefetching, or live pod migration [#3949](http://issue.k8s.io/3949)

There is new first-class support for stateful pods with the [StatefulSet](/docs/concepts/abstractions/controllers/statefulsets/) controller (currently in beta). The feature was alpha in 1.4 and was called [PetSet](/docs/user-guide/petset/). For prior versions of Kubernetes, the best practice for stateful pods is to create a replication controller with `replicas` equal to `1` and a corresponding service; see [this MySQL deployment example](/docs/tutorials/stateful-application/run-stateful-application/).

## Termination of Pods

Because pods represent running processes on nodes in the cluster, it is important to allow those processes to gracefully terminate when they are no longer needed (rather than being violently killed with a KILL signal and having no chance to clean up). Users should be able to request deletion and know when processes terminate, but also be able to ensure that deletes eventually complete. When a user requests deletion of a pod, the system records the intended grace period before the pod is allowed to be forcefully killed, and a TERM signal is sent to the main process in each container. Once the grace period has expired, the KILL signal is sent to those processes, and the pod is then deleted from the API server. If the Kubelet or the container manager is restarted while waiting for processes to terminate, the termination will be retried with the full grace period.

An example flow:

1. User sends command to delete Pod, with default grace period (30s).
2. The Pod in the API server is updated with the time beyond which the Pod is considered "dead" along with the grace period.
3. Pod shows up as "Terminating" when listed in client commands.
4. (simultaneous with 3) When the Kubelet sees that a Pod has been marked as terminating because the time in 2 has been set, it begins the pod shutdown process.
   1. If the pod has defined a [preStop hook](/docs/concepts/containers/container-lifecycle-hooks/#hook-details), it is invoked inside of the pod. If the `preStop` hook is still running after the grace period expires, step 2 is then invoked with a small (2 second) extended grace period.
   2. The processes in the Pod are sent the TERM signal.
5. (simultaneous with 3) The Pod is removed from the endpoints list for the service, and is no longer considered part of the set of running pods for replication controllers. Pods that shut down slowly can continue to serve traffic as load balancers (like the service proxy) remove them from their rotations.
6. When the grace period expires, any processes still running in the Pod are killed with SIGKILL.
7. The Kubelet will finish deleting the Pod on the API server by setting grace period 0 (immediate deletion). The Pod disappears from the API and is no longer visible from the client.

By default, all deletes are graceful within 30 seconds. The `kubectl delete` command supports the `--grace-period=<seconds>` option which allows a user to override the default and specify their own value. The value `0` [force deletes](/docs/user-guide/pods/#force-termination-of-pods) the pod. In kubectl version >= 1.5, you must specify an additional flag `--force` along with `--grace-period=0` in order to perform force deletions.
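
For example, to give a pod a longer window to shut down cleanly (the pod name is illustrative):

```shell
# Allow up to 60 seconds for graceful shutdown instead of the default 30
$ kubectl delete pod mypod --grace-period=60
```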

### Force deletion of pods

Force deletion of a pod is defined as deletion of a pod from the cluster state and etcd immediately. When a force deletion is performed, the apiserver does not wait for confirmation from the kubelet that the pod has been terminated on the node it was running on. It removes the pod in the API immediately so a new pod can be created with the same name. On the node, pods that are set to terminate immediately will still be given a small grace period before being force killed.

Force deletions can be potentially dangerous for some pods and should be performed with caution. In the case of StatefulSet pods, please refer to the task documentation for [deleting Pods from a StatefulSet](/docs/tasks/manage-stateful-set/delete-pods/#deleting-pods).
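
With kubectl 1.5 or later, a force deletion looks like this (pod name illustrative):

```shell
# Remove the pod from the API server immediately; use with caution
$ kubectl delete pod mypod --grace-period=0 --force
```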

## Privileged mode for pod containers

From Kubernetes v1.1, any container in a pod can enable privileged mode, using the `privileged` flag on the `SecurityContext` of the container spec. This is useful for containers that want to use Linux capabilities such as manipulating the network stack and accessing devices. Processes within the container get almost the same privileges that are available to processes outside a container. With privileged mode, it should be easier to write network and volume plugins as separate pods that don't need to be compiled into the kubelet.
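
A hedged sketch of what that flag looks like in a container spec (the pod name, image, and command are placeholders):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: privileged-pod           # illustrative name
spec:
  containers:
  - name: privileged-container
    image: busybox
    command: ["sleep", "3600"]
    securityContext:
      privileged: true           # grants the container near host-level privileges
```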

If the master is running Kubernetes v1.1 or higher, and the nodes are running a version lower than v1.1, then new privileged pods will be accepted by the api-server, but will not be launched. They will remain in a pending state.
If a user calls `kubectl describe pod FooPodName`, they can see the reason why the pod is in a pending state. The events table in the describe command output will say:
`Error validating pod "FooPodName"."FooPodNamespace" from api, ignoring: spec.containers[0].securityContext.privileged: forbidden '<*>(0xc2089d3248)true'`

If the master is running a version lower than v1.1, then privileged pods cannot be created. If a user attempts to create a pod that has a privileged container, the user will get the following error:
`The Pod "FooPodName" is invalid.
spec.containers[0].securityContext.privileged: forbidden '<*>(0xc20b222db0)true'`

## API Object

Pod is a top-level resource in the Kubernetes REST API. More details about the
API object can be found at: [Pod API
object](/docs/api-reference/v1.6/#pod-v1-core).

@ -6,97 +6,6 @@ assignees:
title: Replica Sets
---

{% include user-guide-content-moved.md %}

[ReplicaSets](/docs/concepts/workloads/controllers/replicaset/)

* TOC
{:toc}

## What is a ReplicaSet?

ReplicaSet is the next-generation Replication Controller. The only difference
between a _ReplicaSet_ and a
[_Replication Controller_](/docs/user-guide/replication-controller/) right now is
the selector support. ReplicaSet supports the new set-based selector requirements
as described in the [labels user guide](/docs/user-guide/labels/#label-selectors),
whereas a Replication Controller only supports equality-based selector requirements.

Most [`kubectl`](/docs/user-guide/kubectl/) commands that support
Replication Controllers also support ReplicaSets. One exception is the
[`rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update/) command. If
you want the rolling update functionality, please consider using Deployments
instead. Also, the
[`rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update/) command is
imperative whereas Deployments are declarative, so we recommend using Deployments
through the [`rollout`](/docs/user-guide/kubectl/kubectl_rollout/) command.

While ReplicaSets can be used independently, today they are mainly used by
[Deployments](/docs/user-guide/deployments/) as a mechanism to orchestrate pod
creation, deletion and updates. When you use Deployments you don't have to worry
about managing the ReplicaSets that they create. Deployments own and manage
their ReplicaSets.

## When to use a ReplicaSet?

A ReplicaSet ensures that a specified number of pod “replicas” are running at any given
time. However, a Deployment is a higher-level concept that manages ReplicaSets and
provides declarative updates to pods along with a lot of other useful features.
Therefore, we recommend using Deployments instead of directly using ReplicaSets, unless
you require custom update orchestration or don't require updates at all.

This actually means that you may never need to manipulate ReplicaSet objects:
use a Deployment directly and define your application in the spec section.

## Example

{% include code.html language="yaml" file="replicasets/frontend.yaml" ghlink="/docs/user-guide/replicasets/frontend.yaml" %}

Saving this config into `frontend.yaml` and submitting it to a Kubernetes cluster should
create the defined ReplicaSet and the pods that it manages.

```shell
$ kubectl create -f frontend.yaml
replicaset "frontend" created
$ kubectl describe rs/frontend
Name:         frontend
Namespace:    default
Image(s):     gcr.io/google_samples/gb-frontend:v3
Selector:     tier=frontend,tier in (frontend)
Labels:       app=guestbook,tier=frontend
Replicas:     3 current / 3 desired
Pods Status:  3 Running / 0 Waiting / 0 Succeeded / 0 Failed
No volumes.
Events:
  FirstSeen  LastSeen  Count  From                     SubobjectPath  Type    Reason            Message
  ---------  --------  -----  ----                     -------------  ------  ------            -------
  1m         1m        1      {replicaset-controller }                Normal  SuccessfulCreate  Created pod: frontend-qhloh
  1m         1m        1      {replicaset-controller }                Normal  SuccessfulCreate  Created pod: frontend-dnjpy
  1m         1m        1      {replicaset-controller }                Normal  SuccessfulCreate  Created pod: frontend-9si5l
$ kubectl get pods
NAME             READY     STATUS    RESTARTS   AGE
frontend-9si5l   1/1       Running   0          1m
frontend-dnjpy   1/1       Running   0          1m
frontend-qhloh   1/1       Running   0          1m
```

## ReplicaSet as a Horizontal Pod Autoscaler target

A ReplicaSet can also be a target for
[Horizontal Pod Autoscalers (HPA)](/docs/user-guide/horizontal-pod-autoscaling/),
i.e. a ReplicaSet can be auto-scaled by an HPA. Here is an example HPA targeting
the ReplicaSet we created in the previous example.

{% include code.html language="yaml" file="replicasets/hpa-rs.yaml" ghlink="/docs/user-guide/replicasets/hpa-rs.yaml" %}

Saving this config into `hpa-rs.yaml` and submitting it to a Kubernetes cluster should
create the defined HPA that autoscales the target ReplicaSet depending on the CPU usage
of the replicated pods.

```shell
kubectl create -f hpa-rs.yaml
```

Alternatively, you can use the `kubectl autoscale` command to accomplish the same
thing (and it's easier!):

```shell
kubectl autoscale rs frontend --max=10
```

@ -5,257 +5,6 @@ assignees:
title: Replication Controller
---

{% include user-guide-content-moved.md %}

[ReplicationControllers](/docs/concepts/workloads/controllers/replicationcontroller/)

* TOC
{:toc}

## What is a ReplicationController?

A _ReplicationController_ ensures that a specified number of pod "replicas" are running at any one
time. In other words, a ReplicationController makes sure that a pod or a homogeneous set of pods is
always up and available.
If there are too many pods, it will kill some. If there are too few, the
ReplicationController will start more. Unlike manually created pods, the pods maintained by a
ReplicationController are automatically replaced if they fail, get deleted, or are terminated.
For example, your pods get re-created on a node after disruptive maintenance such as a kernel upgrade.
For this reason, we recommend that you use a ReplicationController even if your application requires
only a single pod. You can think of a ReplicationController as something similar to a process supervisor,
but rather than supervising individual processes on a single node, the ReplicationController supervises multiple pods
across multiple nodes.

ReplicationController is often abbreviated to "rc" or "rcs" in discussion, and as a shortcut in
kubectl commands.

A simple case is to create one ReplicationController object in order to reliably run one instance of
a Pod indefinitely. A more complex use case is to run several identical replicas of a replicated
service, such as web servers.

## Running an example ReplicationController

Here is an example ReplicationController config. It runs 3 copies of the nginx web server.

{% include code.html language="yaml" file="replication.yaml" ghlink="/docs/user-guide/replication.yaml" %}

Run the example by downloading the example file and then running this command:

```shell
$ kubectl create -f ./replication.yaml
replicationcontroller "nginx" created
```

Check on the status of the ReplicationController using this command:

```shell
$ kubectl describe replicationcontrollers/nginx
Name:         nginx
Namespace:    default
Image(s):     nginx
Selector:     app=nginx
Labels:       app=nginx
Replicas:     3 current / 3 desired
Pods Status:  0 Running / 3 Waiting / 0 Succeeded / 0 Failed
Events:
  FirstSeen  LastSeen  Count  From                        SubobjectPath  Type    Reason            Message
  ---------  --------  -----  ----                        -------------  ----    ------            -------
  20s        20s       1      {replication-controller }                  Normal  SuccessfulCreate  Created pod: nginx-qrm3m
  20s        20s       1      {replication-controller }                  Normal  SuccessfulCreate  Created pod: nginx-3ntk0
  20s        20s       1      {replication-controller }                  Normal  SuccessfulCreate  Created pod: nginx-4ok8v
```

Here, 3 pods have been made, but none are running yet, perhaps because the image is being pulled.
A little later, the same command may show:

```shell
Pods Status:    3 Running / 0 Waiting / 0 Succeeded / 0 Failed
```

To list all the pods that belong to the rc in a machine readable form, you can use a command like this:

```shell
$ pods=$(kubectl get pods --selector=app=nginx --output=jsonpath={.items..metadata.name})
echo $pods
nginx-3ntk0 nginx-4ok8v nginx-qrm3m
```

Here, the selector is the same as the selector for the ReplicationController (seen in the
`kubectl describe` output, and in a different form in `replication.yaml`). The `--output=jsonpath` option
specifies an expression that just gets the name from each pod in the returned list.

## Writing a ReplicationController Spec

As with all other Kubernetes config, a ReplicationController needs `apiVersion`, `kind`, and `metadata` fields. For
general information about working with config files, see [here](/docs/user-guide/simple-yaml/),
[here](/docs/user-guide/configuring-containers/), and [here](/docs/user-guide/working-with-resources/).

A ReplicationController also needs a [`.spec` section](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status).

### Pod Template

The `.spec.template` is the only required field of the `.spec`.

The `.spec.template` is a [pod template](#pod-template). It has exactly
the same schema as a [pod](/docs/user-guide/pods/), except it is nested and does not have an `apiVersion` or
`kind`.

In addition to required fields for a Pod, a pod template in a ReplicationController must specify appropriate
labels (i.e. don't overlap with other controllers, see [pod selector](#pod-selector)) and an appropriate restart policy.

Only a [`.spec.template.spec.restartPolicy`](/docs/user-guide/pod-states/) equal to `Always` is allowed, which is the default
if not specified.

For local container restarts, ReplicationControllers delegate to an agent on the node,
for example the [Kubelet](/docs/admin/kubelet/) or Docker.

### Labels on the ReplicationController

The ReplicationController can itself have labels (`.metadata.labels`). Typically, you
would set these the same as the `.spec.template.metadata.labels`; if `.metadata.labels` is not specified
then it is defaulted to `.spec.template.metadata.labels`. However, they are allowed to be
different, and the `.metadata.labels` do not affect the behavior of the ReplicationController.

### Pod Selector

The `.spec.selector` field is a [label selector](/docs/user-guide/labels/#label-selectors). A replication
controller manages all the pods with labels which match the selector. It does not distinguish
between pods which it created or deleted versus pods which some other person or process created or
deleted. This allows the ReplicationController to be replaced without affecting the running pods.

If specified, the `.spec.template.metadata.labels` must be equal to the `.spec.selector`, or it will
be rejected by the API. If `.spec.selector` is unspecified, it will be defaulted to
`.spec.template.metadata.labels`.

Also, you should not normally create any pods whose labels match this selector, either directly, via
another ReplicationController, or via another controller such as a Job. Otherwise, the
ReplicationController will think that those pods were created by it. Kubernetes will not stop you
from doing this.

If you do end up with multiple controllers that have overlapping selectors, you
will have to manage the deletion yourself (see [below](#updating-a-replication-controller)).

### Multiple Replicas

You can specify how many pods should run concurrently by setting `.spec.replicas` to the number
of pods you would like to have running concurrently. The number running at any time may be higher
or lower, such as if the replicas were just increased or decreased, or if a pod is gracefully
shut down and a replacement starts early.

If you do not specify `.spec.replicas`, then it defaults to 1.
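
Putting these fields together, a minimal ReplicationController manifest might look like the following sketch (the name and image are illustrative, not taken from `replication.yaml`):

```yaml
apiVersion: v1
kind: ReplicationController
metadata:
  name: nginx-rc                 # illustrative name
spec:
  replicas: 3                    # desired number of pods
  selector:
    app: nginx                   # must match the template labels below
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: nginx
        ports:
        - containerPort: 80
```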

## Working with ReplicationControllers

### Deleting a ReplicationController and its Pods

To delete a ReplicationController and all its pods, use [`kubectl
delete`](/docs/user-guide/kubectl/kubectl_delete/). Kubectl will scale the ReplicationController to zero and wait
for it to delete each pod before deleting the ReplicationController itself. If this kubectl
command is interrupted, it can be restarted.

When using the REST API or Go client library, you need to do the steps explicitly (scale replicas to
0, wait for pod deletions, then delete the ReplicationController).

### Deleting just a ReplicationController

You can delete a ReplicationController without affecting any of its pods.

Using kubectl, specify the `--cascade=false` option to [`kubectl delete`](/docs/user-guide/kubectl/kubectl_delete/).
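
For example, reusing the controller from the example above:

```shell
# Delete only the ReplicationController; its pods keep running
$ kubectl delete rc nginx --cascade=false
```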

When using the REST API or Go client library, simply delete the ReplicationController object.

Once the original is deleted, you can create a new ReplicationController to replace it. As long
as the old and new `.spec.selector` are the same, the new one will adopt the old pods.
However, it will not make any effort to make existing pods match a new, different pod template.
To update pods to a new spec in a controlled way, use a [rolling update](#rolling-updates).

### Isolating pods from a ReplicationController

Pods may be removed from a ReplicationController's target set by changing their labels. This technique may be used to remove pods from service for debugging, data recovery, etc. Pods that are removed in this way will be replaced automatically (assuming that the number of replicas is not also changed).
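
A hedged example of pulling a pod out of the RC's target set by relabeling it (the pod name comes from the earlier listing; the new label value is illustrative):

```shell
# The pod no longer matches the selector app=nginx, so the RC creates a
# replacement and the original pod can be inspected in isolation.
$ kubectl label pod nginx-qrm3m app=nginx-debug --overwrite
```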

## Common usage patterns

### Rescheduling

As mentioned above, whether you have 1 pod you want to keep running, or 1000, a ReplicationController will ensure that the specified number of pods exists, even in the event of node failure or pod termination (e.g., due to an action by another control agent).

### Scaling

The ReplicationController makes it easy to scale the number of replicas up or down, either manually or by an auto-scaling control agent, by simply updating the `replicas` field.

### Rolling updates

The ReplicationController is designed to facilitate rolling updates to a service by replacing pods one-by-one.

As explained in [#1353](http://issue.k8s.io/1353), the recommended approach is to create a new ReplicationController with 1 replica, scale the new (+1) and old (-1) controllers one by one, and then delete the old controller after it reaches 0 replicas. This predictably updates the set of pods regardless of unexpected failures.

Ideally, the rolling update controller would take application readiness into account, and would ensure that a sufficient number of pods were productively serving at any given time.

The two ReplicationControllers would need to create pods with at least one differentiating label, such as the image tag of the primary container of the pod, since it is typically image updates that motivate rolling updates.

Rolling update is implemented in the client tool
[`kubectl rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update). Visit the [`kubectl rolling-update` task](/docs/tasks/run-application/rolling-update-replication-controller/) for more concrete examples.
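
As a sketch, assuming an existing controller named `frontend-v1` and a new image tag (both names are illustrative):

```shell
# Replace frontend-v1 with frontend-v2, updating pods one at a time
$ kubectl rolling-update frontend-v1 frontend-v2 --image=gcr.io/google_samples/gb-frontend:v4
```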

### Multiple release tracks

In addition to running multiple releases of an application while a rolling update is in progress, it's common to run multiple releases for an extended period of time, or even continuously, using multiple release tracks. The tracks would be differentiated by labels.

For instance, a service might target all pods with `tier in (frontend), environment in (prod)`. Now say you have 10 replicated pods that make up this tier. But you want to be able to 'canary' a new version of this component. You could set up a ReplicationController with `replicas` set to 9 for the bulk of the replicas, with labels `tier=frontend, environment=prod, track=stable`, and another ReplicationController with `replicas` set to 1 for the canary, with labels `tier=frontend, environment=prod, track=canary`. Now the service is covering both the canary and non-canary pods. But you can mess with the ReplicationControllers separately to test things out, monitor the results, etc.

### Using ReplicationControllers with Services

Multiple ReplicationControllers can sit behind a single service, so that, for example, some traffic
goes to the old version, and some goes to the new version.

A ReplicationController will never terminate on its own, but it isn't expected to be as long-lived as services. Services may be composed of pods controlled by multiple ReplicationControllers, and it is expected that many ReplicationControllers may be created and destroyed over the lifetime of a service (for instance, to perform an update of pods that run the service). Both services themselves and their clients should remain oblivious to the ReplicationControllers that maintain the pods of the services.

## Writing programs for Replication

Pods created by a ReplicationController are intended to be fungible and semantically identical, though their configurations may become heterogeneous over time. This is an obvious fit for replicated stateless servers, but ReplicationControllers can also be used to maintain availability of master-elected, sharded, and worker-pool applications. Such applications should use dynamic work assignment mechanisms, such as the [etcd lock module](https://coreos.com/docs/distributed-configuration/etcd-modules/) or [RabbitMQ work queues](https://www.rabbitmq.com/tutorials/tutorial-two-python.html), as opposed to static/one-time customization of the configuration of each pod, which is considered an anti-pattern. Any pod customization performed, such as vertical auto-sizing of resources (e.g., cpu or memory), should be performed by another online controller process, not unlike the ReplicationController itself.

## Responsibilities of the ReplicationController

The ReplicationController simply ensures that the desired number of pods match its label selector and are operational. Currently, only terminated pods are excluded from its count. In the future, [readiness](http://issue.k8s.io/620) and other information available from the system may be taken into account, we may add more controls over the replacement policy, and we plan to emit events that could be used by external clients to implement arbitrarily sophisticated replacement and/or scale-down policies.

The ReplicationController is forever constrained to this narrow responsibility. It itself will not perform readiness or liveness probes. Rather than performing auto-scaling, it is intended to be controlled by an external auto-scaler (as discussed in [#492](http://issue.k8s.io/492)), which would change its `replicas` field. We will not add scheduling policies (e.g., [spreading](http://issue.k8s.io/367#issuecomment-48428019)) to the ReplicationController. Nor should it verify that the pods controlled match the currently specified template, as that would obstruct auto-sizing and other automated processes. Similarly, completion deadlines, ordering dependencies, configuration expansion, and other features belong elsewhere. We even plan to factor out the mechanism for bulk pod creation ([#170](http://issue.k8s.io/170)).

The ReplicationController is intended to be a composable building-block primitive. We expect higher-level APIs and/or tools to be built on top of it and other complementary primitives for user convenience in the future. The "macro" operations currently supported by kubectl (run, stop, scale, rolling-update) are proof-of-concept examples of this. For instance, we could imagine something like [Asgard](http://techblog.netflix.com/2012/06/asgard-web-based-cloud-management-and.html) managing ReplicationControllers, auto-scalers, services, scheduling policies, canaries, etc.

## API Object

Replication controller is a top-level resource in the Kubernetes REST API. More details about the
API object can be found at: [ReplicationController API
object](/docs/api-reference/v1.6/#replicationcontroller-v1-core).

## Alternatives to ReplicationController

### ReplicaSet

[`ReplicaSet`](/docs/user-guide/replicasets/) is the next-generation ReplicationController that supports the new [set-based label selector](/docs/user-guide/labels/#set-based-requirement).
It's mainly used by [`Deployment`](/docs/user-guide/deployments/) as a mechanism to orchestrate pod creation, deletion and updates.
Note that we recommend using Deployments instead of directly using Replica Sets, unless you require custom update orchestration or don't require updates at all.

### Deployment (Recommended)

[`Deployment`](/docs/user-guide/deployments/) is a higher-level API object that updates its underlying Replica Sets and their Pods
in a similar fashion as `kubectl rolling-update`. Deployments are recommended if you want this rolling update functionality,
because unlike `kubectl rolling-update`, they are declarative, server-side, and have additional features.

### Bare Pods

Unlike in the case where a user directly created pods, a ReplicationController replaces pods that are deleted or terminated for any reason, such as in the case of node failure or disruptive node maintenance, such as a kernel upgrade. For this reason, we recommend that you use a ReplicationController even if your application requires only a single pod. Think of it similarly to a process supervisor, only it supervises multiple pods across multiple nodes instead of individual processes on a single node. A ReplicationController delegates local container restarts to some agent on the node (e.g., Kubelet or Docker).

### Job

Use a [`Job`](/docs/concepts/jobs/run-to-completion-finite-workloads/) instead of a ReplicationController for pods that are expected to terminate on their own
(i.e. batch jobs).

### DaemonSet

Use a [`DaemonSet`](/docs/admin/daemons/) instead of a ReplicationController for pods that provide a
machine-level function, such as machine monitoring or machine logging. These pods have a lifetime that is tied
to a machine lifetime: the pod needs to be running on the machine before other pods start, and is
safe to terminate when the machine is otherwise ready to be rebooted/shutdown.

## For more information

Read [Run Stateless AP Replication Controller](/docs/tutorials/stateless-application/run-stateless-ap-replication-controller/).

@ -4,792 +4,6 @@ assignees:
title: Secrets
---

{% include user-guide-content-moved.md %}

[Secrets](/docs/concepts/configuration/secret/)

Objects of type `secret` are intended to hold sensitive information, such as
passwords, OAuth tokens, and ssh keys. Putting this information in a `secret`
is safer and more flexible than putting it verbatim in a `pod` definition or in
a Docker image. See the [Secrets design document](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/secrets.md) for more information.

* TOC
{:toc}

## Overview of Secrets

A Secret is an object that contains a small amount of sensitive data such as
a password, a token, or a key. Such information might otherwise be put in a
Pod specification or in an image; putting it in a Secret object allows for
more control over how it is used, and reduces the risk of accidental exposure.

Users can create secrets, and the system also creates some secrets.

To use a secret, a pod needs to reference the secret.
A secret can be used with a pod in two ways: as files in a [volume](/docs/concepts/storage/volumes/) mounted on one or more of
its containers, or used by kubelet when pulling images for the pod.

### Built-in Secrets

#### Service Accounts Automatically Create and Attach Secrets with API Credentials

Kubernetes automatically creates secrets which contain credentials for
accessing the API and it automatically modifies your pods to use this type of
secret.

The automatic creation and use of API credentials can be disabled or overridden
if desired. However, if all you need to do is securely access the apiserver,
this is the recommended workflow.

See the [Service Account](/docs/user-guide/service-accounts) documentation for more
information on how Service Accounts work.

### Creating your own Secrets

#### Creating a Secret Using kubectl create secret

Say that some pods need to access a database. The
username and password that the pods should use are in the files
`./username.txt` and `./password.txt` on your local machine.

```shell
# Create files needed for rest of example.
$ echo -n "admin" > ./username.txt
$ echo -n "1f2d1e2e67df" > ./password.txt
```

The `kubectl create secret` command
packages these files into a Secret and creates
the object on the Apiserver.

```shell
$ kubectl create secret generic db-user-pass --from-file=./username.txt --from-file=./password.txt
secret "db-user-pass" created
```

You can check that the secret was created like this:

```shell
$ kubectl get secrets
NAME                  TYPE                                  DATA      AGE
db-user-pass          Opaque                                2         51s

$ kubectl describe secrets/db-user-pass
Name:            db-user-pass
Namespace:       default
Labels:          <none>
Annotations:     <none>

Type:            Opaque

Data
====
password.txt:    12 bytes
username.txt:    5 bytes
```

Note that neither `get` nor `describe` shows the contents of the file by default.
This is to protect the secret from being exposed accidentally to an onlooker,
or from being stored in a terminal log.

See [decoding a secret](#decoding-a-secret) for how to see the contents.

#### Creating a Secret Manually

You can also create a secret object in a file first,
in json or yaml format, and then create that object.

Each item must be base64 encoded:

```shell
$ echo -n "admin" | base64
YWRtaW4=
$ echo -n "1f2d1e2e67df" | base64
MWYyZDFlMmU2N2Rm
```

Now write a secret object that looks like this:

```yaml
apiVersion: v1
kind: Secret
metadata:
  name: mysecret
type: Opaque
data:
  username: YWRtaW4=
  password: MWYyZDFlMmU2N2Rm
```

The data field is a map. Its keys must match
[`DNS_SUBDOMAIN`](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/design/identifiers.md), except that leading dots are also
allowed. The values are arbitrary data, encoded using base64.

Create the secret using [`kubectl create`](/docs/user-guide/kubectl/kubectl_create/):

```shell
$ kubectl create -f ./secret.yaml
secret "mysecret" created
```

**Encoding Note:** The serialized JSON and YAML values of secret data are
encoded as base64 strings. Newlines are not valid within these strings and must
be omitted. When using the `base64` utility on Darwin/OS X, users should avoid
using the `-b` option to split long lines. Conversely, Linux users *should* add
the option `-w 0` to `base64` commands.

#### Decoding a Secret

Get back the secret created in the previous section:

```shell
$ kubectl get secret mysecret -o yaml
apiVersion: v1
data:
  username: YWRtaW4=
  password: MWYyZDFlMmU2N2Rm
kind: Secret
metadata:
  creationTimestamp: 2016-01-22T18:41:56Z
  name: mysecret
  namespace: default
  resourceVersion: "164619"
  selfLink: /api/v1/namespaces/default/secrets/mysecret
  uid: cfee02d6-c137-11e5-8d73-42010af00002
type: Opaque
```

Decode the password field:

```shell
$ echo "MWYyZDFlMmU2N2Rm" | base64 --decode
1f2d1e2e67df
```

### Using Secrets

Secrets can be mounted as data volumes or be exposed as environment variables to
be used by a container in a pod. They can also be used by other parts of the
system, without being directly exposed to the pod. For example, they can hold
credentials that other parts of the system should use to interact with external
systems on your behalf.

#### Using Secrets as Files from a Pod

To consume a Secret in a volume in a Pod:

1. Create a secret or use an existing one. Multiple pods can reference the same secret.
1. Modify your Pod definition to add a volume under `spec.volumes[]`. Name the volume anything, and have a `spec.volumes[].secret.secretName` field equal to the name of the secret object.
1. Add a `spec.containers[].volumeMounts[]` to each container that needs the secret. Specify `spec.containers[].volumeMounts[].readOnly = true` and `spec.containers[].volumeMounts[].mountPath` to an unused directory name where you would like the secrets to appear.
1. Modify your image and/or command line so that the program looks for files in that directory. Each key in the secret `data` map becomes the filename under `mountPath`.

This is an example of a pod that mounts a secret in a volume:

```json
{
  "apiVersion": "v1",
  "kind": "Pod",
  "metadata": {
    "name": "mypod",
    "namespace": "myns"
  },
  "spec": {
    "containers": [{
      "name": "mypod",
      "image": "redis",
      "volumeMounts": [{
        "name": "foo",
        "mountPath": "/etc/foo",
        "readOnly": true
      }]
    }],
    "volumes": [{
      "name": "foo",
      "secret": {
        "secretName": "mysecret"
      }
    }]
  }
}
```

Each secret you want to use needs to be referred to in `spec.volumes`.

If there are multiple containers in the pod, then each container needs its
own `volumeMounts` block, but only one `spec.volumes` entry is needed per secret.

You can package many files into one secret, or use many secrets, whichever is convenient.

**Projection of secret keys to specific paths**

You can also control the paths within the volume where Secret keys are projected.
Use the `spec.volumes[].secret.items` field to change the target path of each key:

```json
{
  "apiVersion": "v1",
  "kind": "Pod",
  "metadata": {
    "name": "mypod",
    "namespace": "myns"
  },
  "spec": {
    "containers": [{
      "name": "mypod",
      "image": "redis",
      "volumeMounts": [{
        "name": "foo",
        "mountPath": "/etc/foo",
        "readOnly": true
      }]
    }],
    "volumes": [{
      "name": "foo",
      "secret": {
        "secretName": "mysecret",
        "items": [{
          "key": "username",
          "path": "my-group/my-username"
        }]
      }
    }]
  }
}
```

What will happen:

* The `username` secret is stored under the file `/etc/foo/my-group/my-username` instead of `/etc/foo/username`.
* The `password` secret is not projected.

If `spec.volumes[].secret.items` is used, only keys specified in `items` are projected.
To consume all keys from the secret, all of them must be listed in the `items` field.
All listed keys must exist in the corresponding secret. Otherwise, the volume is not created.

**Secret files permissions**

You can also specify the permission mode bits for the files that are part of a secret.
If you don't specify any, `0644` is used by default. You can specify a default
mode for the whole secret volume and override it per key if needed.

For example, you can specify a default mode like this:

```json
{
  "apiVersion": "v1",
  "kind": "Pod",
  "metadata": {
    "name": "mypod",
    "namespace": "myns"
  },
  "spec": {
    "containers": [{
      "name": "mypod",
      "image": "redis",
      "volumeMounts": [{
        "name": "foo",
        "mountPath": "/etc/foo"
      }]
    }],
    "volumes": [{
      "name": "foo",
      "secret": {
        "secretName": "mysecret",
        "defaultMode": 256
      }
    }]
  }
}
```

Then, the secret will be mounted on `/etc/foo` and all the files created by the
secret volume mount will have permission `0400`.

Note that the JSON spec doesn't support octal notation, so use the value 256 for
0400 permissions. If you use yaml instead of json for the pod, you can use octal
notation to specify permissions in a more natural way.
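
For instance, a YAML equivalent of the pod above (same names as the JSON example) might use octal notation directly:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: mypod
  namespace: myns
spec:
  containers:
  - name: mypod
    image: redis
    volumeMounts:
    - name: foo
      mountPath: /etc/foo
  volumes:
  - name: foo
    secret:
      secretName: mysecret
      defaultMode: 0400   # octal; equivalent to 256 in the JSON spec
```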

You can also use mapping, as in the previous example, and specify different
permissions for different files like this:

```json
{
  "apiVersion": "v1",
  "kind": "Pod",
  "metadata": {
    "name": "mypod",
    "namespace": "myns"
  },
  "spec": {
    "containers": [{
      "name": "mypod",
      "image": "redis",
      "volumeMounts": [{
        "name": "foo",
        "mountPath": "/etc/foo"
      }]
    }],
    "volumes": [{
      "name": "foo",
      "secret": {
        "secretName": "mysecret",
        "items": [{
          "key": "username",
          "path": "my-group/my-username",
          "mode": 511
        }]
      }
    }]
  }
}
```

In this case, the resulting file `/etc/foo/my-group/my-username` will have a
permission value of `0777`. Owing to JSON limitations, you must specify the mode
in decimal notation.

Note that this permission value might be displayed in decimal notation if you
read it later.

**Consuming Secret Values from Volumes**

Inside the container that mounts a secret volume, the secret keys appear as
files and the secret values are base-64 decoded and stored inside these files.
This is the result of commands
executed inside the container from the example above:

```shell
$ ls /etc/foo/
username
password
$ cat /etc/foo/username
admin
$ cat /etc/foo/password
1f2d1e2e67df
```

The program in a container is responsible for reading the secrets from the
files.

**Mounted Secrets are updated automatically**

When a secret that is already consumed in a volume is updated, projected keys are eventually updated as well.
The kubelet checks whether the mounted secret is fresh on every periodic sync.
However, it uses its local TTL-based cache for getting the current value of the secret.
As a result, the total delay from the moment the secret is updated to the moment new keys are
projected to the pod can be as long as the kubelet sync period plus the TTL of the secrets cache in the kubelet.

#### Using Secrets as Environment Variables

To use a secret in an environment variable in a pod:

1. Create a secret or use an existing one. Multiple pods can reference the same secret.
1. Modify your Pod definition so that each container that should consume a secret key has an environment variable for that key. The environment variable that consumes the secret key should populate the secret's name and key in `env[x].valueFrom.secretKeyRef`.
1. Modify your image and/or command line so that the program looks for values in the specified environment variables.

This is an example of a pod that uses secrets from environment variables:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: secret-env-pod
spec:
  containers:
    - name: mycontainer
      image: redis
      env:
        - name: SECRET_USERNAME
          valueFrom:
            secretKeyRef:
              name: mysecret
              key: username
        - name: SECRET_PASSWORD
          valueFrom:
            secretKeyRef:
              name: mysecret
              key: password
  restartPolicy: Never
```

**Consuming Secret Values from Environment Variables**

Inside a container that consumes a secret in environment variables, the secret keys appear as
normal environment variables containing the base-64 decoded values of the secret data.
This is the result of commands executed inside the container from the example above:

```shell
$ echo $SECRET_USERNAME
admin
$ echo $SECRET_PASSWORD
1f2d1e2e67df
```

#### Using imagePullSecrets

An imagePullSecret is a way to pass a secret that contains a Docker (or other) image registry
password to the Kubelet so it can pull a private image on behalf of your Pod.

**Manually specifying an imagePullSecret**

Use of imagePullSecrets is described in the [images documentation](/docs/concepts/containers/images/#specifying-imagepullsecrets-on-a-pod).
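
As a hedged sketch (the registry address, credentials, and names below are placeholders, not from this page), creating such a secret and referencing it from a pod might look like:

```shell
# Create a docker-registry secret from placeholder credentials
$ kubectl create secret docker-registry myregistrykey \
    --docker-server=registry.example.com \
    --docker-username=janedoe \
    --docker-password=xxxxxxxx \
    --docker-email=jane@example.com
```

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: private-image-pod              # illustrative name
spec:
  containers:
  - name: app
    image: registry.example.com/myapp:v1   # placeholder private image
  imagePullSecrets:
  - name: myregistrykey                # references the secret created above
```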

### Arranging for imagePullSecrets to be Automatically Attached

You can manually create an imagePullSecret, and reference it from
a serviceAccount. Any pods created with that serviceAccount,
or that default to use that serviceAccount, will get their `imagePullSecrets`
field set to that of the service account.
See [here](/docs/user-guide/service-accounts/#adding-imagepullsecrets-to-a-service-account)
for a detailed explanation of that process.

#### Automatic Mounting of Manually Created Secrets

We plan to extend the service account behavior so that manually created
secrets (e.g. one containing a token for accessing a github account)
can be automatically attached to pods based on their service account.
*This is not implemented yet. See [issue 9902](http://issue.k8s.io/9902).*

## Details

### Restrictions

Secret volume sources are validated to ensure that the specified object
reference actually points to an object of type `Secret`. Therefore, a secret
needs to be created before any pods that depend on it.

Secret API objects reside in a namespace. They can only be referenced by pods
in that same namespace.

Individual secrets are limited to 1MB in size. This is to discourage creation
of very large secrets which would exhaust apiserver and kubelet memory.
However, creation of many smaller secrets could also exhaust memory. More
comprehensive limits on memory usage due to secrets are a planned feature.

Kubelet only supports use of secrets for Pods it gets from the API server.
This includes any pods created using kubectl, or indirectly via a replication
controller. It does not include pods created via the kubelet's
`--manifest-url` flag, its `--config` flag, or its REST API (these are
not common ways to create pods).

### Secret and Pod Lifetime interaction

When a pod is created via the API, there is no check whether a referenced
secret exists. Once a pod is scheduled, the kubelet will try to fetch the
secret value. If the secret cannot be fetched because it does not exist or
because of a temporary lack of connection to the API server, kubelet will
periodically retry. It will report an event about the pod explaining the
reason it is not started yet. Once the secret is fetched, the kubelet will
create and mount a volume containing it. None of the pod's containers will
start until all the pod's volumes are mounted.
## Use cases

### Use-Case: Pod with ssh keys

Create a secret containing some ssh keys:

```shell
$ kubectl create secret generic ssh-key-secret --from-file=ssh-privatekey=/path/to/.ssh/id_rsa --from-file=ssh-publickey=/path/to/.ssh/id_rsa.pub
```

**Security Note:** think carefully before sending your own ssh keys: other users of the cluster may have access to the secret. Use a service account which you want to be accessible to all the users with whom you share the Kubernetes cluster, and which you can revoke if it is compromised.

Now we can create a pod which references the secret with the ssh key and consumes it in a volume:

```json
{
  "kind": "Pod",
  "apiVersion": "v1",
  "metadata": {
    "name": "secret-test-pod",
    "labels": {
      "name": "secret-test"
    }
  },
  "spec": {
    "volumes": [
      {
        "name": "secret-volume",
        "secret": {
          "secretName": "ssh-key-secret"
        }
      }
    ],
    "containers": [
      {
        "name": "ssh-test-container",
        "image": "mySshImage",
        "volumeMounts": [
          {
            "name": "secret-volume",
            "readOnly": true,
            "mountPath": "/etc/secret-volume"
          }
        ]
      }
    ]
  }
}
```

When the container's command runs, the pieces of the key will be available in:

```shell
/etc/secret-volume/ssh-publickey
/etc/secret-volume/ssh-privatekey
```

The container is then free to use the secret data to establish an ssh connection.

### Use-Case: Pods with prod / test credentials

This example illustrates a pod which consumes a secret containing prod credentials and another pod which consumes a secret with test environment credentials.

Make the secrets:

```shell
$ kubectl create secret generic prod-db-secret --from-literal=username=produser --from-literal=password=Y4nys7f11
secret "prod-db-secret" created
$ kubectl create secret generic test-db-secret --from-literal=username=testuser --from-literal=password=iluvtests
secret "test-db-secret" created
```

Now make the pods:

```json
{
  "apiVersion": "v1",
  "kind": "List",
  "items": [
    {
      "kind": "Pod",
      "apiVersion": "v1",
      "metadata": {
        "name": "prod-db-client-pod",
        "labels": {
          "name": "prod-db-client"
        }
      },
      "spec": {
        "volumes": [
          {
            "name": "secret-volume",
            "secret": {
              "secretName": "prod-db-secret"
            }
          }
        ],
        "containers": [
          {
            "name": "db-client-container",
            "image": "myClientImage",
            "volumeMounts": [
              {
                "name": "secret-volume",
                "readOnly": true,
                "mountPath": "/etc/secret-volume"
              }
            ]
          }
        ]
      }
    },
    {
      "kind": "Pod",
      "apiVersion": "v1",
      "metadata": {
        "name": "test-db-client-pod",
        "labels": {
          "name": "test-db-client"
        }
      },
      "spec": {
        "volumes": [
          {
            "name": "secret-volume",
            "secret": {
              "secretName": "test-db-secret"
            }
          }
        ],
        "containers": [
          {
            "name": "db-client-container",
            "image": "myClientImage",
            "volumeMounts": [
              {
                "name": "secret-volume",
                "readOnly": true,
                "mountPath": "/etc/secret-volume"
              }
            ]
          }
        ]
      }
    }
  ]
}
```

Both containers will have the following files present on their filesystems:

```shell
/etc/secret-volume/username
/etc/secret-volume/password
```

Note how the specs for the two pods differ only in one field; this facilitates creating pods with different capabilities from a common pod config template.

You could further simplify the base pod specification by using two Service Accounts: one called, say, `prod-user` with the `prod-db-secret`, and one called, say, `test-user` with the `test-db-secret`. Then, the pod spec can be shortened to, for example:

```json
{
  "kind": "Pod",
  "apiVersion": "v1",
  "metadata": {
    "name": "prod-db-client-pod",
    "labels": {
      "name": "prod-db-client"
    }
  },
  "spec": {
    "serviceAccount": "prod-db-client",
    "containers": [
      {
        "name": "db-client-container",
        "image": "myClientImage"
      }
    ]
  }
}
```

### Use-case: Dotfiles in secret volume

In order to make a piece of data 'hidden' (i.e., in a file whose name begins with a dot character), simply make that key begin with a dot. For example, when the following secret is mounted into a volume:

```json
{
  "kind": "Secret",
  "apiVersion": "v1",
  "metadata": {
    "name": "dotfile-secret"
  },
  "data": {
    ".secret-file": "dmFsdWUtMg0KDQo="
  }
}

{
  "kind": "Pod",
  "apiVersion": "v1",
  "metadata": {
    "name": "secret-dotfiles-pod"
  },
  "spec": {
    "volumes": [
      {
        "name": "secret-volume",
        "secret": {
          "secretName": "dotfile-secret"
        }
      }
    ],
    "containers": [
      {
        "name": "dotfile-test-container",
        "image": "gcr.io/google_containers/busybox",
        "command": [ "ls", "-l", "/etc/secret-volume" ],
        "volumeMounts": [
          {
            "name": "secret-volume",
            "readOnly": true,
            "mountPath": "/etc/secret-volume"
          }
        ]
      }
    ]
  }
}
```

The `secret-volume` will contain a single file, called `.secret-file`, and the `dotfile-test-container` will have this file present at the path `/etc/secret-volume/.secret-file`.

**NOTE**

Files beginning with dot characters are hidden from the output of `ls -l`; you must use `ls -la` to see them when listing directory contents.

### Use-case: Secret visible to one container in a pod

Consider a program that needs to handle HTTP requests, do some complex business logic, and then sign some messages with an HMAC. Because it has complex application logic, there might be an unnoticed remote file reading exploit in the server, which could expose the private key to an attacker.

This could be divided into two processes in two containers: a frontend container which handles user interaction and business logic, but which cannot see the private key; and a signer container that can see the private key, and responds to simple signing requests from the frontend (e.g. over localhost networking).

With this partitioned approach, an attacker now has to trick the application server into doing something rather arbitrary, which may be harder than getting it to read a file.

<!-- TODO: explain how to do this while still using automation. -->

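A minimal sketch of such a pod is shown below; the container names, image names, and the `signing-key-secret` Secret are hypothetical, and only the signer container mounts the secret volume:

```json
{
  "kind": "Pod",
  "apiVersion": "v1",
  "metadata": {
    "name": "hmac-signer-pod"
  },
  "spec": {
    "volumes": [
      {
        "name": "signing-key",
        "secret": {
          "secretName": "signing-key-secret"
        }
      }
    ],
    "containers": [
      {
        "name": "frontend-container",
        "image": "myFrontendImage"
      },
      {
        "name": "signer-container",
        "image": "mySignerImage",
        "volumeMounts": [
          {
            "name": "signing-key",
            "readOnly": true,
            "mountPath": "/etc/signing-key"
          }
        ]
      }
    ]
  }
}
```

Because the frontend container does not list the secret volume in its `volumeMounts`, the signing key is only visible inside the signer container.
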
## Security Properties

### Protections

Because `secret` objects can be created independently of the `pods` that use them, there is less risk of the secret being exposed during the workflow of creating, viewing, and editing pods. The system can also take additional precautions with `secret` objects, such as avoiding writing them to disk where possible.

A secret is only sent to a node if a pod on that node requires it. It is not written to disk; it is stored in a tmpfs and deleted once the pod that depends on it is deleted.

On most Kubernetes-project-maintained distributions, communication between users and the apiserver, and from the apiserver to the kubelets, is protected by SSL/TLS. Secrets are protected when transmitted over these channels.

Secret data on nodes is stored in tmpfs volumes and thus does not come to rest on the node.

There may be secrets for several pods on the same node. However, only the secrets that a pod requests are potentially visible within its containers. Therefore, one Pod does not have access to the secrets of another pod.

There may be several containers in a pod. However, each container in a pod has to request the secret volume in its `volumeMounts` for it to be visible within the container. This can be used to construct useful [security partitions at the Pod level](#use-case-secret-visible-to-one-container-in-a-pod).

### Risks

- In the API server, secret data is stored as plaintext in etcd; therefore:
  - Administrators should limit access to etcd to admin users.
  - Secret data in the API server is at rest on the disk that etcd uses; admins may want to wipe/shred disks used by etcd when no longer in use.
- Applications still need to protect the value of a secret after reading it from the volume, such as not accidentally logging it or transmitting it to an untrusted party.
- A user who can create a pod that uses a secret can also see the value of that secret. Even if apiserver policy does not allow that user to read the secret object, the user could run a pod which exposes the secret (see the sketch after this list).
- If multiple replicas of etcd are run, then the secrets will be shared between them. By default, etcd does not secure peer-to-peer communication with SSL/TLS, though this can be configured.
- Currently, anyone with root on any node can read any secret from the apiserver, by impersonating the kubelet. It is a planned feature to only send secrets to nodes that actually require them, to restrict the impact of a root exploit on a single node.

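As referenced above, here is a minimal sketch of how a pod can expose a secret that its creator cannot read directly through the API; the secret name `someones-secret` and the busybox command are illustrative:

```json
{
  "kind": "Pod",
  "apiVersion": "v1",
  "metadata": {
    "name": "secret-dump-pod"
  },
  "spec": {
    "volumes": [
      {
        "name": "secret-volume",
        "secret": {
          "secretName": "someones-secret"
        }
      }
    ],
    "containers": [
      {
        "name": "secret-dump-container",
        "image": "gcr.io/google_containers/busybox",
        "command": [ "sh", "-c", "cat /etc/secret-volume/*" ],
        "volumeMounts": [
          {
            "name": "secret-volume",
            "readOnly": true,
            "mountPath": "/etc/secret-volume"
          }
        ]
      }
    ]
  }
}
```

The container's stdout, and therefore its logs, would then contain the secret values.
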
@@ -1,6 +1,18 @@
 $( document ).ready(function() {
     var oldURLs = ["/README.md","/README.html","/index.md",".html",".md","/v1.1/","/v1.0/"];
     var fwdDirs = ["examples/","cluster/","docs/devel","docs/design"];
+    var forwardingRules = [{
+        "from":"/docs/api-reference/v1/definitions",
+        "pattern":"#_v1_(\\w+)",
+        "to":"/docs/api-reference/v1.6",
+        "postfix":"/#<token>-v1-core"
+    },
+    {
+        "from":"/docs/user-guide/kubectl/kubectl_",
+        "pattern":"kubectl_(\\w+)",
+        "to":"/docs/user-guide/kubectl/v1.6",
+        "postfix":"/#<token>"
+    }];
     var doRedirect = false;
     var notHere = false;
     var forwardingURL = window.location.href;

@@ -26,6 +38,20 @@ $( document ).ready(function() {
         "to": "http://kubernetes.io/docs/whatisk8s/"
     }];
+
+    forwardingRules.forEach(function(rule) {
+        if (forwardingURL.indexOf(rule.from) > -1) {
+            var re = new RegExp(rule.pattern, 'g');
+            var matchary = re.exec(forwardingURL);
+            var newURL = rule.to;
+            if (matchary !== null) {
+                newURL += rule.postfix.replace("<token>", matchary[1]);
+            }
+            notHere = true;
+            window.location.replace(newURL);
+        }
+    });
+
     for (var i = 0; i < redirects.length; i++) {
         if (forwardingURL.indexOf(redirects[i].from) > -1){
             notHere = true;

@@ -41,6 +67,7 @@ $( document ).ready(function() {
             window.location.replace(newURL);
         }
     }
+
     if (!notHere) {
         for (var i = 0; i < oldURLs.length; i++) {
             if (forwardingURL.indexOf(oldURLs[i]) > -1 &&

@@ -2,7 +2,5 @@
 title: Kubernetes API Swagger Spec
 ---
 
----
-
 Kubernetes swagger UI has now been replaced by our generated API reference docs
-which can be accessed at http://kubernetes.io/docs/api-reference/README/.
+which can be accessed at [http://kubernetes.io/docs/api-reference/{{page.version}}/](/docs/api-reference/{{page.version}}/).

@@ -51,7 +51,6 @@
 REPO_TMPL = "https://github.com/kubernetes/kubernetes/tree/%s/%s/:splat"
 
 fixed_redirects = """# 301 redirects (301 is the default status when no other one is provided for each line)
-/third_party/swagger-ui /kubernetes/third_party/swagger-ui/
 /resource-quota /docs/admin/resourcequota/
 /horizontal-pod-autoscaler /docs/user-guide/horizontal-pod-autoscaling/
 /docs/user-guide/overview /docs/whatisk8s/

@@ -10,7 +10,6 @@ Disallow: /docs/user-guide/configuring-containers
 Disallow: /docs/user-guide/containers
 Disallow: /docs/user-guide/deploying-applications
 Disallow: /docs/user-guide/getting-into-containers
->>>>>>> fb2ab359... Remove from TOC/Search: pods/init-containers ...
 Disallow: /docs/user-guide/liveness/index
 Disallow: /docs/user-guide/pod-states
 Disallow: /docs/user-guide/simple-nginx