Merge branch 'master' into add-cilium

pull/3163/head
Andrew Chen 2017-04-03 11:36:06 -07:00 committed by GitHub
commit 4262df1669
53 changed files with 4645 additions and 4354 deletions


@ -17,7 +17,8 @@ defaults:
scope:
path: ""
values:
version: "v1.6.0"
fullversion: "v1.6.0"
version: "v1.6"
githubbranch: "master"
docsbranch: "master"
-


@ -10,6 +10,8 @@ toc:
- title: Working with Kubernetes Objects
section:
- docs/concepts/overview/working-with-objects/kubernetes-objects.md
- docs/concepts/overview/working-with-objects/names.md
- docs/concepts/overview/working-with-objects/namespaces.md
- docs/concepts/overview/working-with-objects/labels.md
- docs/concepts/overview/working-with-objects/annotations.md
- docs/concepts/overview/kubernetes-api.md
@ -24,17 +26,26 @@ toc:
- title: Pods
section:
- docs/concepts/workloads/pods/pod-overview.md
- docs/concepts/workloads/pods/pod.md
- docs/concepts/workloads/pods/pod-lifecycle.md
- docs/concepts/workloads/pods/init-containers.md
- title: Controllers
section:
- docs/concepts/workloads/controllers/replicaset.md
- docs/concepts/workloads/controllers/replicationcontroller.md
- docs/concepts/workloads/controllers/deployment.md
- docs/concepts/workloads/controllers/statefulset.md
- docs/concepts/workloads/controllers/petset.md
- docs/concepts/workloads/controllers/daemonset.md
- docs/concepts/workloads/controllers/garbage-collection.md
- title: Jobs
section:
- docs/concepts/jobs/run-to-completion-finite-workloads.md
- title: Nodes
section:
- docs/concepts/nodes/node.md
- title: Cluster Administration
section:
- docs/concepts/cluster-administration/manage-deployment.md
@ -44,6 +55,7 @@ toc:
- docs/concepts/cluster-administration/audit.md
- docs/concepts/cluster-administration/resource-usage-monitoring.md
- docs/concepts/cluster-administration/out-of-resource.md
- docs/concepts/cluster-administration/cluster-management.md
- docs/concepts/cluster-administration/multiple-clusters.md
- docs/concepts/cluster-administration/federation.md
- docs/concepts/cluster-administration/federation-service-discovery.md
@ -57,11 +69,14 @@ toc:
- title: Storage
section:
- docs/concepts/storage/volumes.md
- docs/concepts/storage/persistent-volumes.md
- title: Services, Load Balancing, and Networking
section:
- docs/concepts/services-networking/dns-pod-service.md
- docs/concepts/services-networking/connect-applications-service.md
- docs/concepts/services-networking/ingress.md
- docs/concepts/services-networking/networkpolicies.md
- title: Configuration
section:
@ -69,6 +84,7 @@ toc:
- docs/concepts/configuration/container-command-args.md
- docs/concepts/configuration/manage-compute-resources-container.md
- docs/concepts/configuration/assign-pod-node.md
- docs/concepts/configuration/secret.md
- title: Policies
section:


@ -1,10 +1,9 @@
bigheader: "Kubernetes Documentation"
bigheader: "Documentation Home"
abstract: "Documentation for using and learning about Kubernetes."
toc:
- docs/home/index.md
- title: Kubernetes Documentation Home
path: index.md
- docs/home/index.md
- title: Release Notes
path: https://github.com/kubernetes/kubernetes/releases/


@ -41,7 +41,6 @@ toc:
- docs/api-reference/v1/operations.html
- docs/api-reference/v1/definitions.html
- docs/api-reference/labels-annotations-taints.md
- kubernetes/third_party/swagger-ui/index.md
- title: Autoscaling API
section:


@ -5,7 +5,7 @@
<td>
<p><b>NOTICE</b></p>
<p>As of March 14, 2017, the Kubernetes SIG-Docs-Maintainers group have begun migration of the User Guide content as announced previously to the <a href="https://github.com/kubernetes/community/tree/master/sig-docs">SIG Docs community</a> through the <a href="https://groups.google.com/forum/#!forum/kubernetes-sig-docs">kubernetes-sig-docs</a> group and <a href="https://kubernetes.slack.com/messages/sig-docs/">kubernetes.slack.com #sig-docs</a> channel.</p>
<p>The user guides within this section are being refactored into topics within Tutorials, Tasks, and Concepts. Anything that has been moved will have a notice placed in its previous location as well as a link to its new location. The reorganization implements the table of contents (ToC) outlined in the <a href="https://docs.google.com/a/google.com/document/d/18hRCIorVarExB2eBVHTUR6eEJ2VVk5xq1iBmkQv8O6I/edit?usp=sharing">kubernetes-docs-toc</a> document and should improve the documentation's findability and readability for a wider range of audiences.</p>
<p>The user guides within this section are being refactored into topics within Tutorials, Tasks, and Concepts. Anything that has been moved will have a notice placed in its previous location as well as a link to its new location. The reorganization implements a new table of contents and should improve the documentation's findability and readability for a wider range of audiences.</p>
<p>For any questions, please contact: <a href="mailto:kubernetes-sig-docs@googlegroups.com">kubernetes-sig-docs@googlegroups.com</a></p>
</td>
</tr>


@ -5,197 +5,6 @@ assignees:
title: Cluster Management Guide
---
* TOC
{:toc}
{% include user-guide-content-moved.md %}
This document describes several topics related to the lifecycle of a cluster: creating a new cluster,
upgrading your cluster's
master and worker nodes, performing node maintenance (e.g. kernel upgrades), and upgrading the Kubernetes API version of a
running cluster.
## Creating and configuring a Cluster
To install Kubernetes on a set of machines, consult one of the existing [Getting Started guides](/docs/getting-started-guides/) depending on your environment.
## Upgrading a cluster
The current state of cluster upgrades is provider dependent, and some releases may require special care when upgrading. It is recommended that administrators consult both the [release notes](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG.md), as well as the version specific upgrade notes prior to upgrading their clusters.
* [Upgrading to 1.6](/docs/admin/upgrade-1-6)
### Upgrading Google Compute Engine clusters
Google Compute Engine Open Source (GCE-OSS) supports master upgrades by deleting and
recreating the master, while maintaining the same Persistent Disk (PD) to ensure that data is retained across the
upgrade.
Node upgrades for GCE use a [Managed Instance Group](https://cloud.google.com/compute/docs/instance-groups/); each node
is sequentially destroyed and then recreated with new software. Any Pods that are running on that node need to be
controlled by a Replication Controller, or manually re-created after the roll out.
Upgrades on open source Google Compute Engine (GCE) clusters are controlled by the `cluster/gce/upgrade.sh` script.
Get its usage by running `cluster/gce/upgrade.sh -h`.
For example, to upgrade just your master to a specific version (v1.0.2):
```shell
cluster/gce/upgrade.sh -M v1.0.2
```
Alternatively, to upgrade your entire cluster to the latest stable release:
```shell
cluster/gce/upgrade.sh release/stable
```
### Upgrading Google Container Engine (GKE) clusters
Google Container Engine automatically updates master components (e.g. `kube-apiserver`, `kube-scheduler`) to the latest
version. It also handles upgrading the operating system and other components that the master runs on.
The node upgrade process is user-initiated and is described in the [GKE documentation](https://cloud.google.com/container-engine/docs/clusters/upgrade).
### Upgrading clusters on other platforms
Different providers and tools will manage upgrades differently. It is recommended that you consult their main documentation regarding upgrades.
* [kops](https://github.com/kubernetes/kops)
* [kargo](https://github.com/kubernetes-incubator/kargo)
* [CoreOS Tectonic](https://coreos.com/tectonic/docs/latest/admin/upgrade.html)
* ...
## Resizing a cluster
If your cluster runs short on resources, you can easily add more machines to it if your cluster is running in [Node self-registration mode](/docs/admin/node/#self-registration-of-nodes).
If you're using GCE or GKE, you can do this by resizing the Instance Group managing your Nodes, either by modifying the number of instances on the `Compute > Compute Engine > Instance groups > your group > Edit group` [Google Cloud Console page](https://console.developers.google.com) or by using the gcloud CLI:
```shell
gcloud compute instance-groups managed resize kubernetes-minion-group --size 42 --zone $ZONE
```
The Instance Group will take care of putting the appropriate image on the new machines and starting them, while the Kubelet will register its Node with the API server to make it available for scheduling. If you scale the instance group down, the system will randomly choose Nodes to kill.
In other environments you may need to configure the machine yourself and tell the Kubelet on which machine the API server is running.
### Cluster autoscaling
If you are using GCE or GKE, you can configure your cluster so that it is automatically rescaled based on
pod needs.
As described in [Compute Resource](/docs/user-guide/compute-resources/), users can reserve CPU and memory for their pods.
This information is used by the Kubernetes scheduler to find a place to run the pod. If there is
no node that has enough free capacity (or that meets the pod's other requirements), then the pod has
to wait until some pods are terminated or a new node is added.
Cluster autoscaler looks for the pods that cannot be scheduled and checks if adding a new node, similar
to the others in the cluster, would help. If so, it resizes the cluster to accommodate the waiting pods.
Cluster autoscaler also scales down the cluster if it notices that some node is not needed anymore for
an extended period of time (currently 10 minutes, though this may change in the future).
Cluster autoscaler is configured per instance group (GCE) or node pool (GKE).
If you are using GCE, you can enable cluster autoscaler while creating a cluster with the kube-up.sh script.
To configure cluster autoscaler you have to set three environment variables:
* `KUBE_ENABLE_CLUSTER_AUTOSCALER` - enables cluster autoscaler if set to true.
* `KUBE_AUTOSCALER_MIN_NODES` - minimum number of nodes in the cluster.
* `KUBE_AUTOSCALER_MAX_NODES` - maximum number of nodes in the cluster.
Example:
```shell
KUBE_ENABLE_CLUSTER_AUTOSCALER=true KUBE_AUTOSCALER_MIN_NODES=3 KUBE_AUTOSCALER_MAX_NODES=10 NUM_NODES=5 ./cluster/kube-up.sh
```
On GKE you configure cluster autoscaler either on cluster creation or update, or when creating a particular node pool
(which you want to be autoscaled), by passing the flags `--enable-autoscaling`, `--min-nodes`, and `--max-nodes`
to the corresponding `gcloud` commands.
Examples:
```shell
gcloud container clusters create mytestcluster --zone=us-central1-b --enable-autoscaling --min-nodes=3 --max-nodes=10 --num-nodes=5
```
```shell
gcloud container clusters update mytestcluster --enable-autoscaling --min-nodes=1 --max-nodes=15
```
**Cluster autoscaler expects that nodes have not been manually modified (e.g. by adding labels via kubectl) as those properties would not be propagated to the new nodes within the same instance group.**
## Maintenance on a Node
If you need to reboot a node (such as for a kernel upgrade, libc upgrade, hardware repair, etc.), and the downtime is
brief, then when the Kubelet restarts, it will attempt to restart the pods scheduled to it. If the reboot takes longer
(the default time is 5 minutes, controlled by `--pod-eviction-timeout` on the controller-manager),
then the node controller will terminate the pods that are bound to the unavailable node. If there is a corresponding
replica set (or replication controller), then a new copy of the pod will be started on a different node. So, in the case where all
pods are replicated, upgrades can be done without special coordination, assuming that not all nodes will go down at the same time.
If you want more control over the upgrading process, you may use the following workflow:
Use `kubectl drain` to gracefully terminate all pods on the node while marking the node as unschedulable:
```shell
kubectl drain $NODENAME
```
This keeps new pods from landing on the node while you are trying to get them off.
For pods with a replica set, the pod will be replaced by a new pod which will be scheduled to a new node. Additionally, if the pod is part of a service, then clients will automatically be redirected to the new pod.
For pods with no replica set, you need to bring up a new copy of the pod, and assuming it is not part of a service, redirect clients to it.
Perform maintenance work on the node.
Make the node schedulable again:
```shell
kubectl uncordon $NODENAME
```
If you deleted the node's VM instance and created a new one, then a new schedulable node resource will
be created automatically (if you're using a cloud provider that supports
node discovery; currently this is only Google Compute Engine, not including CoreOS on Google Compute Engine using kube-register). See [Node](/docs/admin/node) for more details.
## Advanced Topics
### Upgrading to a different API version
When a new API version is released, you may need to upgrade a cluster to support the new API version (e.g. switching from 'v1' to 'v2' when 'v2' is launched).
This is an infrequent event, but it requires careful management. There is a sequence of steps to upgrade to a new API version.
1. Turn on the new API version.
1. Upgrade the cluster's storage to use the new version.
1. Upgrade all config files. Identify users of the old API version endpoints.
1. Update existing objects in the storage to new version by running `cluster/update-storage-objects.sh`.
1. Turn off the old API version.
### Turn on or off an API version for your cluster
Specific API versions can be turned on or off by passing the `--runtime-config=api/<version>` flag while bringing up the API server. For example, to turn off the v1 API, pass `--runtime-config=api/v1=false`.
`runtime-config` also supports two special keys, `api/all` and `api/legacy`, to control all and legacy APIs respectively.
For example, to turn off all API versions except v1, pass `--runtime-config=api/all=false,api/v1=true`.
For the purposes of these flags, _legacy_ APIs are those APIs which have been explicitly deprecated (e.g. `v1beta3`).
### Switching your cluster's storage API version
The objects that are stored to disk for a cluster's internal representation of the Kubernetes resources active in the cluster are written using a particular version of the API.
When the supported API changes, these objects may need to be rewritten in the newer API. Failure to do this will eventually result in resources that are no longer decodable or usable
by the Kubernetes API server.
The `KUBE_API_VERSIONS` environment variable for the `kube-apiserver` binary controls the API versions that are supported in the cluster. The first version in the list is used as the cluster's storage version. Hence, to set a specific version as the storage version, bring it to the front of the list of versions in the value of `KUBE_API_VERSIONS`. You need to restart the `kube-apiserver` binary
for changes to this variable to take effect.
### Switching your config files to a new API version
You can use the `kubectl convert` command to convert config files between different API versions.
```shell
kubectl convert -f pod.yaml --output-version v1
```
For more options, please refer to the usage of the [kubectl convert](/docs/user-guide/kubectl/kubectl_convert/) command.
[Cluster Management](/docs/concepts/cluster-administration/cluster-management/)


@ -4,170 +4,6 @@ assignees:
title: Daemon Sets
---
* TOC
{:toc}
{% include user-guide-content-moved.md %}
## What is a DaemonSet?
A _DaemonSet_ ensures that all (or some) nodes run a copy of a pod. As nodes are added to the
cluster, pods are added to them. As nodes are removed from the cluster, those pods are garbage
collected. Deleting a DaemonSet will clean up the pods it created.
Some typical uses of a DaemonSet are:
- running a cluster storage daemon, such as `glusterd`, `ceph`, on each node.
- running a logs collection daemon on every node, such as `fluentd` or `logstash`.
- running a node monitoring daemon on every node, such as [Prometheus Node Exporter](
https://github.com/prometheus/node_exporter), `collectd`, New Relic agent, or Ganglia `gmond`.
In a simple case, one DaemonSet, covering all nodes, would be used for each type of daemon.
A more complex setup might use multiple DaemonSets for a single type of daemon, but with
different flags and/or different memory and cpu requests for different hardware types.
## Writing a DaemonSet Spec
### Required Fields
As with all other Kubernetes config, a DaemonSet needs `apiVersion`, `kind`, and `metadata` fields. For
general information about working with config files, see [deploying applications](/docs/user-guide/deploying-applications/),
[configuring containers](/docs/user-guide/configuring-containers/), and [working with resources](/docs/user-guide/working-with-resources/) documents.
A DaemonSet also needs a [`.spec`](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status) section.
### Pod Template
The `.spec.template` is the only required field of the `.spec`.
The `.spec.template` is a [pod template](/docs/user-guide/replication-controller/#pod-template).
It has exactly the same schema as a [pod](/docs/user-guide/pods), except
it is nested and does not have an `apiVersion` or `kind`.
In addition to required fields for a pod, a pod template in a DaemonSet has to specify appropriate
labels (see [pod selector](#pod-selector)).
A pod template in a DaemonSet must have a [`RestartPolicy`](/docs/user-guide/pod-states)
equal to `Always`, or be unspecified, which defaults to `Always`.
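Putting these required fields together, a minimal DaemonSet manifest might look like the following sketch. The name, labels, and image are hypothetical, and `extensions/v1beta1` is assumed as the API group/version for DaemonSet in this release:

```yaml
apiVersion: extensions/v1beta1   # assumed DaemonSet API group/version for this release
kind: DaemonSet
metadata:
  name: fluentd-logging          # hypothetical name
spec:
  template:
    metadata:
      labels:
        name: fluentd-logging    # template labels are required (see Pod Selector below)
    spec:
      containers:
      - name: fluentd
        image: fluentd:v0.12     # hypothetical image
      # restartPolicy is omitted and therefore defaults to Always
```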
### Pod Selector
The `.spec.selector` field is a pod selector. It works the same as the `.spec.selector` of
a [Job](/docs/concepts/jobs/run-to-completion-finite-workloads/) or other new resources.
The `spec.selector` is an object consisting of two fields:
* `matchLabels` - works the same as the `.spec.selector` of a [ReplicationController](/docs/user-guide/replication-controller/)
* `matchExpressions` - allows building more sophisticated selectors by specifying a key, a
list of values, and an operator that relates the key and values.
When both are specified, the result is ANDed.
If the `.spec.selector` is specified, it must match the `.spec.template.metadata.labels`. If not
specified, they are defaulted to be equal. Config with these not matching will be rejected by the API.
Also you should not normally create any pods whose labels match this selector, either directly, via
another DaemonSet, or via another controller such as a ReplicationController. Otherwise, the DaemonSet
controller will think that those pods were created by it. Kubernetes will not stop you from doing
this. One case where you might want to do this is to manually create a pod with a different value on
a node for testing.
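As a sketch (the `tier` label key and its value are hypothetical), a selector combining both fields, ANDed together and matching the template labels, could look like:

```yaml
spec:
  selector:
    matchLabels:
      name: fluentd-logging          # must match .spec.template.metadata.labels
    matchExpressions:
    - key: tier                      # hypothetical label key
      operator: In
      values:
      - logging
  template:
    metadata:
      labels:
        name: fluentd-logging
        tier: logging                # template labels satisfy both selector clauses
```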
### Running Pods on Only Some Nodes
If you specify a `.spec.template.spec.nodeSelector`, then the DaemonSet controller will
create pods on nodes which match that [node
selector](/docs/user-guide/node-selection/). Likewise, if you specify a `.spec.template.spec.affinity`,
then the DaemonSet controller will create pods on nodes which match that [node affinity](/docs/user-guide/node-selection/).
If you do not specify either, then the DaemonSet controller will create pods on all nodes.
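For example, a hedged fragment that restricts daemon pods to nodes carrying a hypothetical `disktype=ssd` label:

```yaml
spec:
  template:
    spec:
      nodeSelector:
        disktype: ssd   # hypothetical node label; pods are created only on matching nodes
```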
## How Daemon Pods are Scheduled
Normally, the machine that a pod runs on is selected by the Kubernetes scheduler. However, pods
created by the Daemon controller have the machine already selected (`.spec.nodeName` is specified
when the pod is created, so it is ignored by the scheduler). Therefore:
- the [`unschedulable`](/docs/admin/node/#manual-node-administration) field of a node is not respected
by the DaemonSet controller.
- DaemonSet controller can make pods even when the scheduler has not been started, which can help cluster
bootstrap.
Daemon pods do respect [taints and tolerations](/docs/user-guide/node-selection/index.md#taints-and-tolerations-beta-feature), but they are
created with `NoExecute` tolerations for the `node.alpha.kubernetes.io/notReady` and `node.alpha.kubernetes.io/unreachable`
taints with no `tolerationSeconds`. This ensures that when the `TaintBasedEvictions` alpha feature is enabled,
they will not be evicted when there are node problems such as a network partition. (When the
`TaintBasedEvictions` feature is not enabled, they are also not evicted in these scenarios, but
due to hard-coded behavior of the NodeController rather than due to tolerations).
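Expressed in pod spec terms, the automatically added tolerations are roughly equivalent to the following sketch (the DaemonSet controller adds these for you; no `tolerationSeconds` is set):

```yaml
tolerations:
- key: node.alpha.kubernetes.io/notReady
  operator: Exists
  effect: NoExecute
- key: node.alpha.kubernetes.io/unreachable
  operator: Exists
  effect: NoExecute
```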
## Communicating with Daemon Pods
Some possible patterns for communicating with pods in a DaemonSet are:
- **Push**: Pods in the DaemonSet are configured to send updates to another service, such
as a stats database. They do not have clients.
- **NodeIP and Known Port**: Pods in the DaemonSet use a `hostPort`, so that the pods are reachable via the node IPs. Clients know the list of node IPs somehow, and know the port by convention.
- **DNS**: Create a [headless service](/docs/user-guide/services/#headless-services) with the same pod selector,
and then discover DaemonSets using the `endpoints` resource or retrieve multiple A records from
DNS.
- **Service**: Create a service with the same pod selector, and use the service to reach a
daemon on a random node. (No way to reach specific node.)
## Updating a DaemonSet
If node labels are changed, the DaemonSet will promptly add pods to newly matching nodes and delete
pods from newly not-matching nodes.
You can modify the pods that a DaemonSet creates. However, pods do not allow all
fields to be updated. Also, the DaemonSet controller will use the original template the next
time a node (even with the same name) is created.
You can delete a DaemonSet. If you specify `--cascade=false` with `kubectl`, then the pods
will be left on the nodes. You can then create a new DaemonSet with a different template.
The new DaemonSet with the different template will recognize all the existing pods as having
matching labels. It will not modify or delete them despite a mismatch in the pod template.
You will need to force new pod creation by deleting the pod or deleting the node.
In Kubernetes version 1.6 and later, you can [perform a rolling update](/docs/tasks/manage-daemon/update-daemon-set/) on a DaemonSet.
Future releases of Kubernetes will support controlled updating of nodes.
## Alternatives to DaemonSet
### Init Scripts
It is certainly possible to run daemon processes by directly starting them on a node (e.g. using
`init`, `upstartd`, or `systemd`). This is perfectly fine. However, there are several advantages to
running such processes via a DaemonSet:
- Ability to monitor and manage logs for daemons in the same way as applications.
- Same config language and tools (e.g. pod templates, `kubectl`) for daemons and applications.
- Future versions of Kubernetes will likely support integration between DaemonSet-created
pods and node upgrade workflows.
- Running daemons in containers with resource limits increases isolation of daemons from app
containers. However, this can also be accomplished by running the daemons in a container but not in a pod
(e.g. start directly via Docker).
### Bare Pods
It is possible to create pods directly which specify a particular node to run on. However,
a DaemonSet replaces pods that are deleted or terminated for any reason, such as in the case of
node failure or disruptive node maintenance, such as a kernel upgrade. For this reason, you should
use a DaemonSet rather than creating individual pods.
### Static Pods
It is possible to create pods by writing a file to a certain directory watched by Kubelet. These
are called [static pods](/docs/admin/static-pods/).
Unlike DaemonSet, static pods cannot be managed with kubectl
or other Kubernetes API clients. Static pods do not depend on the apiserver, making them useful
in cluster bootstrapping cases. Also, static pods may be deprecated in the future.
### Replication Controller
DaemonSets are similar to [Replication Controllers](/docs/user-guide/replication-controller) in that
they both create pods, and those pods have processes which are not expected to terminate (e.g. web servers,
storage servers).
Use a replication controller for stateless services, like frontends, where scaling up and down the
number of replicas and rolling out updates are more important than controlling exactly which host
the pod runs on. Use a DaemonSet when it is important that a copy of a pod always run on
all or certain hosts, and when it needs to start before other pods.
[DaemonSets](/docs/concepts/workloads/controllers/daemonset/)


@ -84,7 +84,7 @@ sudo docker run -it --rm --privileged --net=host \
gcr.io/google_containers/node-test:0.2
```
Node conformance test is a containerized version of [node e2e test](https://github.com/kubernetes/kubernetes/blob/{{page.version}}/docs/devel/e2e-node-tests.md).
Node conformance test is a containerized version of [node e2e test](https://github.com/kubernetes/community/blob/master/contributors/devel/e2e-node-tests.md).
By default, it runs all conformance tests.
Theoretically, you can run any node e2e test if you configure the container and


@ -5,255 +5,6 @@ assignees:
title: Nodes
---
* TOC
{:toc}
{% include user-guide-content-moved.md %}
## What is a node?
A `node` is a worker machine in Kubernetes, previously known as a `minion`. A node
may be a VM or physical machine, depending on the cluster. Each node has
the services necessary to run [pods](/docs/user-guide/pods) and is managed by the master
components. The services on a node include Docker, kubelet and kube-proxy. See
[The Kubernetes Node](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/architecture.md#the-kubernetes-node) section in the
architecture design doc for more details.
## Node Status
A node's status contains the following information:
* [Addresses](#Addresses)
* ~~[Phase](#Phase)~~ **deprecated**
* [Condition](#Condition)
* [Capacity](#Capacity)
* [Info](#Info)
Each section is described in detail below.
### Addresses
The usage of these fields varies depending on your cloud provider or bare metal configuration.
* HostName: The hostname as reported by the node's kernel. Can be overridden via the kubelet `--hostname-override` parameter.
* ExternalIP: Typically the IP address of the node that is externally routable (available from outside the cluster).
* InternalIP: Typically the IP address of the node that is routable only within the cluster.
### Phase
Deprecated: node phase is no longer used.
### Condition
The `conditions` field describes the status of all `Running` nodes.
| Node Condition | Description |
|----------------|-------------|
| `OutOfDisk` | `True` if there is insufficient free space on the node for adding new pods, otherwise `False` |
| `Ready` | `True` if the node is healthy and ready to accept pods, `False` if the node is not healthy and is not accepting pods, and `Unknown` if the node controller has not heard from the node in the last 40 seconds |
The node condition is represented as a JSON object. For example, the following response describes a healthy node.
```json
"conditions": [
{
"kind": "Ready",
"status": "True"
}
]
```
If the Status of the Ready condition is "Unknown" or "False" for longer than the `pod-eviction-timeout`, an argument passed to the [kube-controller-manager](/docs/admin/kube-controller-manager/), all of the Pods on the node are scheduled for deletion by the Node Controller. The default eviction timeout duration is **five minutes**. In some cases when the node is unreachable, the apiserver is unable to communicate with the kubelet on it. The decision to delete the pods cannot be communicated to the kubelet until it re-establishes communication with the apiserver. In the meantime, the pods which are scheduled for deletion may continue to run on the partitioned node.
In versions of Kubernetes prior to 1.5, the node controller would [force delete](/docs/user-guide/pods/#force-deletion-of-pods) these unreachable pods from the apiserver. However, in 1.5 and higher, the node controller does not force delete pods until it is confirmed that they have stopped running in the cluster. One can see these pods which may be running on an unreachable node as being in the "Terminating" or "Unknown" states. In cases where Kubernetes cannot deduce from the underlying infrastructure if a node has permanently left a cluster, the cluster administrator may need to delete the node object by hand. Deleting the node object from Kubernetes causes all the Pod objects running on it to be deleted from the apiserver, freeing up their names.
### Capacity
Describes the resources available on the node: CPU, memory and the maximum
number of pods that can be scheduled onto the node.
### Info
General information about the node, such as kernel version, Kubernetes version
(kubelet and kube-proxy version), Docker version (if used), OS name.
The information is gathered by Kubelet from the node.
## Management
Unlike [pods](/docs/user-guide/pods) and [services](/docs/user-guide/services),
a node is not inherently created by Kubernetes: it is created externally by cloud
providers like Google Compute Engine, or exists in your pool of physical or virtual
machines. What this means is that when Kubernetes creates a node, it is really
just creating an object that represents the node. After creation, Kubernetes
will check whether the node is valid or not. For example, if you try to create
a node from the following content:
```json
{
  "kind": "Node",
  "apiVersion": "v1",
  "metadata": {
    "name": "10.240.79.157",
    "labels": {
      "name": "my-first-k8s-node"
    }
  }
}
```
Kubernetes will create a node object internally (the representation), and
validate the node by health checking based on the `metadata.name` field (we
assume `metadata.name` can be resolved). If the node is valid, i.e. all necessary
services are running, it is eligible to run a pod; otherwise, it will be
ignored for any cluster activity until it becomes valid. Note that Kubernetes
will keep the object for the invalid node unless it is explicitly deleted by
the client, and it will keep checking to see if it becomes valid.
Currently, there are three components that interact with the Kubernetes node
interface: node controller, kubelet, and kubectl.
### Node Controller
The node controller is a Kubernetes master component which manages various
aspects of nodes.
The node controller has multiple roles in a node's life. The first is assigning a
CIDR block to the node when it is registered (if CIDR assignment is turned on).
The second is keeping the node controller's internal list of nodes up to date with
the cloud provider's list of available machines. When running in a cloud
environment, whenever a node is unhealthy the node controller asks the cloud
provider if the VM for that node is still available. If not, the node
controller deletes the node from its list of nodes.
The third is monitoring the nodes' health. The node controller is
responsible for updating the NodeReady condition of NodeStatus to
ConditionUnknown when a node becomes unreachable (i.e. the node controller stops
receiving heartbeats for some reason, e.g. due to the node being down), and then later evicting
all the pods from the node (using graceful termination) if the node continues
to be unreachable. (The default timeouts are 40s to start reporting
ConditionUnknown and 5m after that to start evicting pods.) The node controller
checks the state of each node every `--node-monitor-period` seconds.
In Kubernetes 1.4, we updated the logic of the node controller to better handle
cases when a large number of nodes have problems reaching the master
(e.g. because the master has networking problem). Starting with 1.4, the node
controller will look at the state of all nodes in the cluster when making a
decision about pod eviction.
In most cases, the node controller limits the eviction rate to
`--node-eviction-rate` (default 0.1) per second, meaning it won't evict pods
from more than 1 node per 10 seconds.
The node eviction behavior changes when a node in a given availability zone
becomes unhealthy. The node controller checks what percentage of nodes in the zone
are unhealthy (NodeReady condition is ConditionUnknown or ConditionFalse) at
the same time. If the fraction of unhealthy nodes is at least
`--unhealthy-zone-threshold` (default 0.55) then the eviction rate is reduced:
if the cluster is small (i.e. has less than or equal to
`--large-cluster-size-threshold` nodes - default 50) then evictions are
stopped, otherwise the eviction rate is reduced to
`--secondary-node-eviction-rate` (default 0.01) per second. The reason these
policies are implemented per availability zone is because one availability zone
might become partitioned from the master while the others remain connected. If
your cluster does not span multiple cloud provider availability zones, then
there is only one availability zone (the whole cluster).
A key reason for spreading your nodes across availability zones is so that the
workload can be shifted to healthy zones when one entire zone goes down.
Therefore, if all nodes in a zone are unhealthy then node controller evicts at
the normal rate `--node-eviction-rate`. The corner case is when all zones are
completely unhealthy (i.e. there are no healthy nodes in the cluster). In such
case, the node controller assumes that there's some problem with master
connectivity and stops all evictions until some connectivity is restored.
Starting in Kubernetes 1.6, the NodeController is also responsible for evicting
pods that are running on nodes with `NoExecute` taints, when the pods do not tolerate
the taints. Additionally, as an alpha feature that is disabled by default, the
NodeController is responsible for adding taints corresponding to node problems like
node unreachable or not ready. See [this documentation](/docs/user-guide/node-selection/index.md#taints-and-tolerations-beta-feature)
for details about `NoExecute` taints and the alpha feature.
### Self-Registration of Nodes
When the kubelet flag `--register-node` is true (the default), the kubelet will attempt to
register itself with the API server. This is the preferred pattern, used by most distros.
For self-registration, the kubelet is started with the following options:
- `--api-servers` - Location of the apiservers.
- `--kubeconfig` - Path to credentials to authenticate itself to the apiserver.
- `--cloud-provider` - How to talk to a cloud provider to read metadata about itself.
- `--register-node` - Automatically register with the API server.
- `--register-with-taints` - Register the node with the given list of taints (comma separated `<key>=<value>:<effect>`). No-op if `register-node` is false.
- `--node-ip` IP address of the node.
- `--node-labels` - Labels to add when registering the node in the cluster.
- `--node-status-update-frequency` - Specifies how often kubelet posts node status to master.
Currently, any kubelet is authorized to create/modify any node resource, but in practice it only creates/modifies
its own. (In the future, we plan to only allow a kubelet to modify its own node resource.)
#### Manual Node Administration
A cluster administrator can create and modify node objects.
If the administrator wishes to create node objects manually, set the kubelet flag
`--register-node=false`.
The administrator can modify node resources (regardless of the setting of `--register-node`).
Modifications include setting labels on the node and marking it unschedulable.
Labels on nodes can be used in conjunction with node selectors on pods to control scheduling,
e.g. to constrain a pod to only be eligible to run on a subset of the nodes.
Marking a node as unschedulable will prevent new pods from being scheduled to that
node, but will not affect any existing pods on the node. This is useful as a
preparatory step before a node reboot, etc. For example, to mark a node
unschedulable, run this command:
```shell
kubectl cordon $NODENAME
```
Note that pods which are created by a DaemonSet controller bypass the Kubernetes scheduler,
and do not respect the unschedulable attribute on a node. The assumption is that daemons belong on
the machine even if it is being drained of applications in preparation for a reboot.
### Node capacity
The capacity of the node (number of cpus and amount of memory) is part of the node object.
Normally, nodes register themselves and report their capacity when creating the node object. If
you are doing [manual node administration](#manual-node-administration), then you need to set node
capacity when adding a node.
The Kubernetes scheduler ensures that there are enough resources for all the pods on a node. It
checks that the sum of the limits of containers on the node is no greater than the node capacity. It
includes all containers started by the kubelet, but not containers started directly by Docker nor
processes not in containers.
If you want to explicitly reserve resources for non-pod processes, you can create a placeholder
pod. Use the following template:
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: resource-reserver
spec:
  containers:
  - name: sleep-forever
    image: gcr.io/google_containers/pause:0.8.0
    resources:
      limits:
        cpu: 100m
        memory: 100Mi
```
Set the `cpu` and `memory` values to the amount of resources you want to reserve.
Place the file in the manifest directory (`--config=DIR` flag of kubelet). Do this
on each kubelet where you want to reserve resources.
## API Object
Node is a top-level resource in the Kubernetes REST API. More details about the
API object can be found at: [Node API
object](/docs/api-reference/v1.6/#node-v1-core).
[Nodes](/docs/concepts/nodes/node/)


@ -1,7 +1,7 @@
---
assignees:
- mml
title: Cluster Management Guide
title: Cluster Management Guide for Version 1.6
---
* TOC
@ -23,4 +23,4 @@ startup:
$ kube-apiserver --storage-backend='etcd2' $(EXISTING_ARGS)
```
However, for long-term maintenance of the cluster, we recommend that the operator plan an outage window in order to perform a [v2->v3 data upgrade](https://coreos.com/etcd/docs/latest/upgrades/upgrade_3_0.html).
However, for long-term maintenance of the cluster, we recommend that the operator plan an outage window in order to perform a [v2->v3 data upgrade](https://coreos.com/etcd/docs/latest/upgrades/upgrade_3_0.html).


@ -0,0 +1,201 @@
---
assignees:
- lavalamp
- thockin
title: Cluster Management
---
* TOC
{:toc}
This document describes several topics related to the lifecycle of a cluster: creating a new cluster,
upgrading your cluster's
master and worker nodes, performing node maintenance (e.g. kernel upgrades), and upgrading the Kubernetes API version of a
running cluster.
## Creating and configuring a Cluster
To install Kubernetes on a set of machines, consult one of the existing [Getting Started guides](/docs/getting-started-guides/) depending on your environment.
## Upgrading a cluster
The current state of cluster upgrades is provider dependent, and some releases may require special care when upgrading. It is recommended that administrators consult both the [release notes](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG.md), as well as the version specific upgrade notes prior to upgrading their clusters.
* [Upgrading to 1.6](/docs/admin/upgrade-1-6)
### Upgrading Google Compute Engine clusters
Google Compute Engine Open Source (GCE-OSS) supports master upgrades by deleting and
recreating the master, while maintaining the same Persistent Disk (PD) to ensure that data is retained across the
upgrade.
Node upgrades for GCE use a [Managed Instance Group](https://cloud.google.com/compute/docs/instance-groups/); each node
is sequentially destroyed and then recreated with new software. Any Pods that are running on that node need to be
controlled by a Replication Controller, or manually re-created after the roll out.
Upgrades on open source Google Compute Engine (GCE) clusters are controlled by the `cluster/gce/upgrade.sh` script.
Get its usage by running `cluster/gce/upgrade.sh -h`.
For example, to upgrade just your master to a specific version (v1.0.2):
```shell
cluster/gce/upgrade.sh -M v1.0.2
```
Alternatively, to upgrade your entire cluster to the latest stable release:
```shell
cluster/gce/upgrade.sh release/stable
```
### Upgrading Google Container Engine (GKE) clusters
Google Container Engine automatically updates master components (e.g. `kube-apiserver`, `kube-scheduler`) to the latest
version. It also handles upgrading the operating system and other components that the master runs on.
The node upgrade process is user-initiated and is described in the [GKE documentation](https://cloud.google.com/container-engine/docs/clusters/upgrade).
### Upgrading clusters on other platforms
Different providers and tools will manage upgrades differently. It is recommended that you consult their main documentation regarding upgrades.
* [kops](https://github.com/kubernetes/kops)
* [kargo](https://github.com/kubernetes-incubator/kargo)
* [CoreOS Tectonic](https://coreos.com/tectonic/docs/latest/admin/upgrade.html)
* ...
## Resizing a cluster
If your cluster runs short on resources, you can easily add more machines to it if your cluster is running in [Node self-registration mode](/docs/admin/node/#self-registration-of-nodes).
If you're using GCE or GKE, you can do this by resizing the Instance Group managing your Nodes, either by modifying the number of instances on the `Compute > Compute Engine > Instance groups > your group > Edit group` [Google Cloud Console page](https://console.developers.google.com) or by using the gcloud CLI:
```shell
gcloud compute instance-groups managed resize kubernetes-minion-group --size 42 --zone $ZONE
```
The Instance Group will take care of putting the appropriate image on the new machines and starting them, while the Kubelet will register its Node with the API server to make it available for scheduling. If you scale the instance group down, the system will randomly choose Nodes to kill.
In other environments you may need to configure the machine yourself and tell the Kubelet on which machine the API server is running.
### Cluster autoscaling
If you are using GCE or GKE, you can configure your cluster so that it is automatically rescaled based on
pod needs.
As described in [Compute Resource](/docs/user-guide/compute-resources/), users can reserve CPU and memory for their pods.
This information is used by the Kubernetes scheduler to find a place to run the pod. If there is
no node that has enough free capacity (or that meets the pod's other requirements), then the pod has
to wait until some pods are terminated or a new node is added.
Cluster autoscaler looks for the pods that cannot be scheduled and checks if adding a new node, similar
to the others in the cluster, would help. If so, it resizes the cluster to accommodate the waiting pods.
Cluster autoscaler also scales down the cluster if it notices that some node is not needed anymore for
an extended period of time (currently 10 minutes, though this may change in the future).
Cluster autoscaler is configured per instance group (GCE) or node pool (GKE).
If you are using GCE, you can enable cluster autoscaler while creating a cluster with the kube-up.sh script.
To configure cluster autoscaler you have to set three environment variables:
* `KUBE_ENABLE_CLUSTER_AUTOSCALER` - enables cluster autoscaler if set to true.
* `KUBE_AUTOSCALER_MIN_NODES` - minimum number of nodes in the cluster.
* `KUBE_AUTOSCALER_MAX_NODES` - maximum number of nodes in the cluster.
Example:
```shell
KUBE_ENABLE_CLUSTER_AUTOSCALER=true KUBE_AUTOSCALER_MIN_NODES=3 KUBE_AUTOSCALER_MAX_NODES=10 NUM_NODES=5 ./cluster/kube-up.sh
```
On GKE you configure cluster autoscaler either on cluster creation or update, or when creating a particular node pool
(which you want to be autoscaled), by passing the flags `--enable-autoscaling`, `--min-nodes`, and `--max-nodes`
to the corresponding `gcloud` commands.
Examples:
```shell
gcloud container clusters create mytestcluster --zone=us-central1-b --enable-autoscaling --min-nodes=3 --max-nodes=10 --num-nodes=5
```
```shell
gcloud container clusters update mytestcluster --enable-autoscaling --min-nodes=1 --max-nodes=15
```
**Cluster autoscaler expects that nodes have not been manually modified (e.g. by adding labels via kubectl) as those properties would not be propagated to the new nodes within the same instance group.**
## Maintenance on a Node
If you need to reboot a node (such as for a kernel upgrade, libc upgrade, hardware repair, etc.), and the downtime is
brief, then when the Kubelet restarts, it will attempt to restart the pods scheduled to it. If the reboot takes longer
(the default time is 5 minutes, controlled by `--pod-eviction-timeout` on the controller-manager),
then the node controller will terminate the pods that are bound to the unavailable node. If there is a corresponding
replica set (or replication controller), then a new copy of the pod will be started on a different node. So, in the case where all
pods are replicated, upgrades can be done without special coordination, assuming that not all nodes will go down at the same time.
If you want more control over the upgrading process, you may use the following workflow:
Use `kubectl drain` to gracefully terminate all pods on the node while marking the node as unschedulable:
```shell
kubectl drain $NODENAME
```
This keeps new pods from landing on the node while you are trying to get them off.
For pods with a replica set, the pod will be replaced by a new pod which will be scheduled to a new node. Additionally, if the pod is part of a service, then clients will automatically be redirected to the new pod.
For pods with no replica set, you need to bring up a new copy of the pod, and assuming it is not part of a service, redirect clients to it.
Perform maintenance work on the node.
Make the node schedulable again:
```shell
kubectl uncordon $NODENAME
```
If you deleted the node's VM instance and created a new one, then a new schedulable node resource will
be created automatically (if you're using a cloud provider that supports
node discovery; currently this is only Google Compute Engine, not including CoreOS on Google Compute Engine using kube-register). See [Node](/docs/admin/node) for more details.
## Advanced Topics
### Upgrading to a different API version
When a new API version is released, you may need to upgrade a cluster to support the new API version (e.g. switching from 'v1' to 'v2' when 'v2' is launched).
This is an infrequent event, but it requires careful management. There is a sequence of steps to upgrade to a new API version.
1. Turn on the new API version.
1. Upgrade the cluster's storage to use the new version.
1. Upgrade all config files. Identify users of the old API version endpoints.
1. Update existing objects in the storage to new version by running `cluster/update-storage-objects.sh`.
1. Turn off the old API version.
### Turn on or off an API version for your cluster
Specific API versions can be turned on or off by passing the `--runtime-config=api/<version>` flag while bringing up the API server. For example, to turn off the v1 API, pass `--runtime-config=api/v1=false`.
`runtime-config` also supports two special keys, `api/all` and `api/legacy`, to control all and legacy APIs respectively.
For example, to turn off all API versions except v1, pass `--runtime-config=api/all=false,api/v1=true`.
For the purposes of these flags, _legacy_ APIs are those APIs which have been explicitly deprecated (e.g. `v1beta3`).
### Switching your cluster's storage API version
The objects that are stored to disk for a cluster's internal representation of the Kubernetes resources active in the cluster are written using a particular version of the API.
When the supported API changes, these objects may need to be rewritten in the newer API. Failure to do this will eventually result in resources that are no longer decodable or usable
by the Kubernetes API server.
The `KUBE_API_VERSIONS` environment variable for the `kube-apiserver` binary controls the API versions that are supported in the cluster. The first version in the list is used as the cluster's storage version. Hence, to set a specific version as the storage version, bring it to the front of the list of versions in the value of `KUBE_API_VERSIONS`. You need to restart the `kube-apiserver` binary
for changes to this variable to take effect.
### Switching your config files to a new API version
You can use the `kubectl convert` command to convert config files between different API versions.
```shell
kubectl convert -f pod.yaml --output-version v1
```
For more options, please refer to the usage of the [kubectl convert](/docs/user-guide/kubectl/kubectl_convert/) command.


@ -25,7 +25,7 @@ The kubelet has a single default network plugin, and a default network common to
## Network Plugin Requirements
Besides providing the [`NetworkPlugin` interface](https://github.com/kubernetes/kubernetes/tree/{{page.version}}/pkg/kubelet/network/plugins.go) to configure and clean up pod networking, the plugin may also need specific support for kube-proxy. The iptables proxy obviously depends on iptables, and the plugin may need to ensure that container traffic is made available to iptables. For example, if the plugin connects containers to a Linux bridge, the plugin must set the `net/bridge/bridge-nf-call-iptables` sysctl to `1` to ensure that the iptables proxy functions correctly. If the plugin does not use a Linux bridge (but instead something like Open vSwitch or some other mechanism) it should ensure container traffic is appropriately routed for the proxy.
Besides providing the [`NetworkPlugin` interface](https://github.com/kubernetes/kubernetes/tree/{{page.fullversion}}/pkg/kubelet/network/plugins.go) to configure and clean up pod networking, the plugin may also need specific support for kube-proxy. The iptables proxy obviously depends on iptables, and the plugin may need to ensure that container traffic is made available to iptables. For example, if the plugin connects containers to a Linux bridge, the plugin must set the `net/bridge/bridge-nf-call-iptables` sysctl to `1` to ensure that the iptables proxy functions correctly. If the plugin does not use a Linux bridge (but instead something like Open vSwitch or some other mechanism) it should ensure container traffic is appropriately routed for the proxy.
By default if no kubelet network plugin is specified, the `noop` plugin is used, which sets `net/bridge/bridge-nf-call-iptables=1` to ensure simple configurations (like docker with a bridge) work correctly with the iptables proxy.


@ -0,0 +1,795 @@
---
assignees:
- mikedanese
title: Secrets
---
Objects of type `secret` are intended to hold sensitive information, such as
passwords, OAuth tokens, and ssh keys. Putting this information in a `secret`
is safer and more flexible than putting it verbatim in a `pod` definition or in
a docker image. See [Secrets design document](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/secrets.md) for more information.
* TOC
{:toc}
## Overview of Secrets
A Secret is an object that contains a small amount of sensitive data such as
a password, a token, or a key. Such information might otherwise be put in a
Pod specification or in an image; putting it in a Secret object allows for
more control over how it is used, and reduces the risk of accidental exposure.
Users can create secrets, and the system also creates some secrets.
To use a secret, a pod needs to reference the secret.
A secret can be used with a pod in two ways: as files in a [volume](/docs/concepts/storage/volumes/) mounted on one or more of
its containers, or used by kubelet when pulling images for the pod.
### Built-in Secrets
#### Service Accounts Automatically Create and Attach Secrets with API Credentials
Kubernetes automatically creates secrets which contain credentials for
accessing the API and it automatically modifies your pods to use this type of
secret.
The automatic creation and use of API credentials can be disabled or overridden
if desired. However, if all you need to do is securely access the apiserver,
this is the recommended workflow.
See the [Service Account](/docs/user-guide/service-accounts) documentation for more
information on how Service Accounts work.
### Creating your own Secrets
#### Creating a Secret Using kubectl create secret
Say that some pods need to access a database. The
username and password that the pods should use are in the files
`./username.txt` and `./password.txt` on your local machine.
```shell
# Create files needed for rest of example.
$ echo -n "admin" > ./username.txt
$ echo -n "1f2d1e2e67df" > ./password.txt
```
The `kubectl create secret` command
packages these files into a Secret and creates
the object on the Apiserver.
```shell
$ kubectl create secret generic db-user-pass --from-file=./username.txt --from-file=./password.txt
secret "db-user-pass" created
```
You can check that the secret was created like this:
```shell
$ kubectl get secrets
NAME                  TYPE      DATA      AGE
db-user-pass          Opaque    2         51s

$ kubectl describe secrets/db-user-pass
Name:            db-user-pass
Namespace:       default
Labels:          <none>
Annotations:     <none>

Type:            Opaque

Data
====
password.txt:    12 bytes
username.txt:    5 bytes
```
Note that neither `get` nor `describe` shows the contents of the file by default.
This is to protect the secret from being accidentally exposed to an onlooker,
or from being stored in a terminal log.
See [decoding a secret](#decoding-a-secret) for how to see the contents.
#### Creating a Secret Manually
You can also create a secret object in a file first,
in json or yaml format, and then create that object.
Each item must be base64 encoded:
```shell
$ echo -n "admin" | base64
YWRtaW4=
$ echo -n "1f2d1e2e67df" | base64
MWYyZDFlMmU2N2Rm
```
Now write a secret object that looks like this:
```yaml
apiVersion: v1
kind: Secret
metadata:
  name: mysecret
type: Opaque
data:
  username: YWRtaW4=
  password: MWYyZDFlMmU2N2Rm
```
The data field is a map. Its keys must match
[`DNS_SUBDOMAIN`](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/design/identifiers.md), except that leading dots are also
allowed. The values are arbitrary data, encoded using base64.
Create the secret using [`kubectl create`](/docs/user-guide/kubectl/kubectl_create/):
```shell
$ kubectl create -f ./secret.yaml
secret "mysecret" created
```
**Encoding Note:** The serialized JSON and YAML values of secret data are
encoded as base64 strings. Newlines are not valid within these strings and must
be omitted. When using the `base64` utility on Darwin/OS X, users should avoid
using the `-b` option to split long lines. Conversely, Linux users *should* add
the option `-w 0` to `base64` commands.
#### Decoding a Secret
Get back the secret created in the previous section:
```shell
$ kubectl get secret mysecret -o yaml
apiVersion: v1
data:
  username: YWRtaW4=
  password: MWYyZDFlMmU2N2Rm
kind: Secret
metadata:
  creationTimestamp: 2016-01-22T18:41:56Z
  name: mysecret
  namespace: default
  resourceVersion: "164619"
  selfLink: /api/v1/namespaces/default/secrets/mysecret
  uid: cfee02d6-c137-11e5-8d73-42010af00002
type: Opaque
```
Decode the password field:
```shell
$ echo "MWYyZDFlMmU2N2Rm" | base64 --decode
1f2d1e2e67df
```
### Using Secrets
Secrets can be mounted as data volumes or be exposed as environment variables to
be used by a container in a pod. They can also be used by other parts of the
system, without being directly exposed to the pod. For example, they can hold
credentials that other parts of the system should use to interact with external
systems on your behalf.
#### Using Secrets as Files from a Pod
To consume a Secret in a volume in a Pod:
1. Create a secret or use an existing one. Multiple pods can reference the same secret.
1. Modify your Pod definition to add a volume under `spec.volumes[]`. Name the volume anything, and have a `spec.volumes[].secret.secretName` field equal to the name of the secret object.
1. Add a `spec.containers[].volumeMounts[]` to each container that needs the secret. Specify `spec.containers[].volumeMounts[].readOnly = true` and `spec.containers[].volumeMounts[].mountPath` to an unused directory name where you would like the secrets to appear.
1. Modify your image and/or command line so that the program looks for files in that directory. Each key in the secret `data` map becomes the filename under `mountPath`.
This is an example of a pod that mounts a secret in a volume:
```json
{
  "apiVersion": "v1",
  "kind": "Pod",
  "metadata": {
    "name": "mypod",
    "namespace": "myns"
  },
  "spec": {
    "containers": [{
      "name": "mypod",
      "image": "redis",
      "volumeMounts": [{
        "name": "foo",
        "mountPath": "/etc/foo",
        "readOnly": true
      }]
    }],
    "volumes": [{
      "name": "foo",
      "secret": {
        "secretName": "mysecret"
      }
    }]
  }
}
```
Each secret you want to use needs to be referred to in `spec.volumes`.
If there are multiple containers in the pod, then each container needs its
own `volumeMounts` block, but only one `spec.volumes` is needed per secret.
You can package many files into one secret, or use many secrets, whichever is convenient.
**Projection of secret keys to specific paths**
You can also control the paths within the volume where Secret keys are projected.
Use the `spec.volumes[].secret.items` field to change the target path of each key:
```json
{
"apiVersion": "v1",
"kind": "Pod",
"metadata": {
"name": "mypod",
"namespace": "myns"
},
"spec": {
"containers": [{
"name": "mypod",
"image": "redis",
"volumeMounts": [{
"name": "foo",
"mountPath": "/etc/foo",
"readOnly": true
}]
}],
"volumes": [{
"name": "foo",
"secret": {
"secretName": "mysecret",
"items": [{
"key": "username",
"path": "my-group/my-username"
}]
}
}]
}
}
```
What will happen:
* The `username` key is stored in the file `/etc/foo/my-group/my-username` instead of `/etc/foo/username`.
* The `password` key is not projected at all.
If `spec.volumes[].secret.items` is used, only keys specified in `items` are projected.
To consume all keys from the secret, all of them must be listed in the `items` field.
All listed keys must exist in the corresponding secret. Otherwise, the volume is not created.
**Secret files permissions**
You can also specify the permission mode bits for the files that are part of a secret.
If you don't specify any, `0644` is used by default. You can specify a default
mode for the whole secret volume and override it per key if needed.
For example, you can specify a default mode like this:
```json
{
"apiVersion": "v1",
"kind": "Pod",
"metadata": {
"name": "mypod",
"namespace": "myns"
},
"spec": {
"containers": [{
"name": "mypod",
"image": "redis",
"volumeMounts": [{
"name": "foo",
"mountPath": "/etc/foo"
}]
}],
"volumes": [{
"name": "foo",
"secret": {
"secretName": "mysecret",
"defaultMode": 256
}
}]
}
}
```
Then, the secret will be mounted on `/etc/foo` and all the files created by the
secret volume mount will have permission `0400`.
Note that the JSON spec doesn't support octal notation, so use the value 256 for
0400 permissions. If you use yaml instead of json for the pod, you can use octal
notation to specify permissions in a more natural way.
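For instance, a YAML sketch of the same pod (same assumed names as above) can set the mode in octal directly:
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: mypod
  namespace: myns
spec:
  containers:
  - name: mypod
    image: redis
    volumeMounts:
    - name: foo
      mountPath: /etc/foo
  volumes:
  - name: foo
    secret:
      secretName: mysecret
      defaultMode: 0400   # octal literal; equivalent to 256 in the JSON example above
```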
You can also use mapping, as in the previous example, and specify different
permission for different files like this:
```json
{
"apiVersion": "v1",
"kind": "Pod",
"metadata": {
"name": "mypod",
"namespace": "myns"
},
"spec": {
"containers": [{
"name": "mypod",
"image": "redis",
"volumeMounts": [{
"name": "foo",
"mountPath": "/etc/foo"
}]
}],
"volumes": [{
"name": "foo",
"secret": {
"secretName": "mysecret",
"items": [{
"key": "username",
"path": "my-group/my-username",
"mode": 511
}]
}
}]
}
}
```
In this case, the file at `/etc/foo/my-group/my-username` will have a
permission value of `0777`. Owing to JSON limitations, you must specify the mode
in decimal notation (`511` here).
Note that this permission value might be displayed in decimal notation if you
read it later.
**Consuming Secret Values from Volumes**
Inside the container that mounts a secret volume, the secret keys appear as
files and the secret values are base-64 decoded and stored inside these files.
This is the result of commands
executed inside the container from the example above:
```shell
$ ls /etc/foo/
username
password
$ cat /etc/foo/username
admin
$ cat /etc/foo/password
1f2d1e2e67df
```
The program in a container is responsible for reading the secrets from the
files.
**Mounted Secrets are updated automatically**
When a secret that is already consumed in a volume is updated, projected keys are eventually updated as well.
The kubelet checks whether the mounted secret is fresh on every periodic sync.
However, it uses its local TTL-based cache for getting the current value of the secret.
As a result, the total delay from the moment when the secret is updated to the moment when new keys are
projected to the pod can be as long as the kubelet sync period plus the TTL of the secrets cache in the kubelet.
#### Using Secrets as Environment Variables
To use a secret in an environment variable in a pod:
1. Create a secret or use an existing one. Multiple pods can reference the same secret.
1. Modify your Pod definition in each container that you wish to consume the value of a secret key to add an environment variable for each secret key you wish to consume. The environment variable that consumes the secret key should populate the secret's name and key in `env[x].valueFrom.secretKeyRef`.
1. Modify your image and/or command line so that the program looks for values in the specified environment variables
This is an example of a pod that uses secrets from environment variables:
```yaml
apiVersion: v1
kind: Pod
metadata:
name: secret-env-pod
spec:
containers:
- name: mycontainer
image: redis
env:
- name: SECRET_USERNAME
valueFrom:
secretKeyRef:
name: mysecret
key: username
- name: SECRET_PASSWORD
valueFrom:
secretKeyRef:
name: mysecret
key: password
restartPolicy: Never
```
**Consuming Secret Values from Environment Variables**
Inside a container that consumes a secret via environment variables, the secret keys appear as
normal environment variables containing the base-64 decoded values of the secret data.
This is the result of commands executed inside the container from the example above:
```shell
$ echo $SECRET_USERNAME
admin
$ echo $SECRET_PASSWORD
1f2d1e2e67df
```
#### Using imagePullSecrets
An imagePullSecret is a way to pass a secret that contains a Docker (or other) image registry
password to the Kubelet so it can pull a private image on behalf of your Pod.
**Manually specifying an imagePullSecret**
Use of imagePullSecrets is described in the [images documentation](/docs/concepts/containers/images/#specifying-imagepullsecrets-on-a-pod).
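For reference, such a registry secret is typically created with `kubectl create secret docker-registry`; the values below are placeholders:
```shell
$ kubectl create secret docker-registry myregistrykey \
    --docker-server=DOCKER_REGISTRY_SERVER \
    --docker-username=DOCKER_USER \
    --docker-password=DOCKER_PASSWORD \
    --docker-email=DOCKER_EMAIL
secret "myregistrykey" created
```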
**Arranging for imagePullSecrets to be Automatically Attached**
You can manually create an imagePullSecret, and reference it from
a serviceAccount. Any pods created with that serviceAccount,
or that default to use that serviceAccount, will get their imagePullSecrets
field set to that of the service account.
See [here](/docs/user-guide/service-accounts/#adding-imagepullsecrets-to-a-service-account)
for a detailed explanation of that process.
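As a rough sketch of that process (the linked page is authoritative, and the secret `myregistrykey` is assumed to exist already), the default service account can be patched to reference the secret:
```shell
$ kubectl patch serviceaccount default -p '{"imagePullSecrets": [{"name": "myregistrykey"}]}'
```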
#### Automatic Mounting of Manually Created Secrets
We plan to extend the service account behavior so that manually created
secrets (e.g. one containing a token for accessing a GitHub account)
can be automatically attached to pods based on their service account.
*This is not implemented yet. See [issue 9902](http://issue.k8s.io/9902).*
## Details
### Restrictions
Secret volume sources are validated to ensure that the specified object
reference actually points to an object of type `Secret`. Therefore, a secret
needs to be created before any pods that depend on it.
Secret API objects reside in a namespace. They can only be referenced by pods
in that same namespace.
Individual secrets are limited to 1MB in size. This is to discourage creation
of very large secrets which would exhaust apiserver and kubelet memory.
However, creation of many smaller secrets could also exhaust memory. More
comprehensive limits on memory usage due to secrets are a planned feature.
Kubelet only supports the use of secrets for Pods it gets from the API server.
This includes any pods created using kubectl, or indirectly via a replication
controller. It does not include pods created via the kubelet's
`--manifest-url` flag, its `--config` flag, or its REST API (these are
not common ways to create pods).
### Secret and Pod Lifetime interaction
When a pod is created via the API, there is no check whether a referenced
secret exists. Once a pod is scheduled, the kubelet will try to fetch the
secret value. If the secret cannot be fetched because it does not exist or
because of a temporary lack of connection to the API server, kubelet will
periodically retry. It will report an event about the pod explaining the
reason it is not started yet. Once the secret is fetched, the kubelet will
create and mount a volume containing it. None of the pod's containers will
start until all the pod's volumes are mounted.
## Use cases
### Use-Case: Pod with ssh keys
Create a secret containing some ssh keys:
```shell
$ kubectl create secret generic ssh-key-secret --from-file=ssh-privatekey=/path/to/.ssh/id_rsa --from-file=ssh-publickey=/path/to/.ssh/id_rsa.pub
```
**Security Note:** think carefully before sending your own ssh keys: other users of the cluster may have access to the secret. Use a service account which you want to be accessible to all the users with whom you share the Kubernetes cluster, and which you can revoke if it is compromised.
Now we can create a pod which references the secret with the ssh key and
consumes it in a volume:
```json
{
"kind": "Pod",
"apiVersion": "v1",
"metadata": {
"name": "secret-test-pod",
"labels": {
"name": "secret-test"
}
},
"spec": {
"volumes": [
{
"name": "secret-volume",
"secret": {
"secretName": "ssh-key-secret"
}
}
],
"containers": [
{
"name": "ssh-test-container",
"image": "mySshImage",
"volumeMounts": [
{
"name": "secret-volume",
"readOnly": true,
"mountPath": "/etc/secret-volume"
}
]
}
]
}
}
```
When the container's command runs, the pieces of the key will be available in:
```shell
/etc/secret-volume/ssh-publickey
/etc/secret-volume/ssh-privatekey
```
The container is then free to use the secret data to establish an ssh connection.
### Use-Case: Pods with prod / test credentials
This example illustrates a pod which consumes a secret containing prod
credentials and another pod which consumes a secret with test environment
credentials.
Make the secrets:
```shell
$ kubectl create secret generic prod-db-secret --from-literal=username=produser --from-literal=password=Y4nys7f11
secret "prod-db-secret" created
$ kubectl create secret generic test-db-secret --from-literal=username=testuser --from-literal=password=iluvtests
secret "test-db-secret" created
```
Now make the pods:
```json
{
"apiVersion": "v1",
"kind": "List",
"items":
[{
"kind": "Pod",
"apiVersion": "v1",
"metadata": {
"name": "prod-db-client-pod",
"labels": {
"name": "prod-db-client"
}
},
"spec": {
"volumes": [
{
"name": "secret-volume",
"secret": {
"secretName": "prod-db-secret"
}
}
],
"containers": [
{
"name": "db-client-container",
"image": "myClientImage",
"volumeMounts": [
{
"name": "secret-volume",
"readOnly": true,
"mountPath": "/etc/secret-volume"
}
]
}
]
}
},
{
"kind": "Pod",
"apiVersion": "v1",
"metadata": {
"name": "test-db-client-pod",
"labels": {
"name": "test-db-client"
}
},
"spec": {
"volumes": [
{
"name": "secret-volume",
"secret": {
"secretName": "test-db-secret"
}
}
],
"containers": [
{
"name": "db-client-container",
"image": "myClientImage",
"volumeMounts": [
{
"name": "secret-volume",
"readOnly": true,
"mountPath": "/etc/secret-volume"
}
]
}
]
}
}]
}
```
Both containers will have the following files present on their filesystems:
```shell
/etc/secret-volume/username
/etc/secret-volume/password
```
Note how the specs for the two pods differ only in one field; this facilitates
creating pods with different capabilities from a common pod config template.
You could further simplify the base pod specification by using two Service Accounts:
one called, say, `prod-user` with the `prod-db-secret`, and one called, say,
`test-user` with the `test-db-secret`. Then, the pod spec can be shortened to, for example:
```json
{
"kind": "Pod",
"apiVersion": "v1",
"metadata": {
"name": "prod-db-client-pod",
"labels": {
"name": "prod-db-client"
}
},
"spec": {
"serviceAccount": "prod-db-client",
"containers": [
{
"name": "db-client-container",
"image": "myClientImage"
}
]
}
}
```
### Use-case: Dotfiles in secret volume
In order to make a piece of data 'hidden' (that is, in a file whose name begins with a dot character), simply
make that key begin with a dot. For example, when the following secret is mounted into a volume:
```json
{
"kind": "Secret",
"apiVersion": "v1",
"metadata": {
"name": "dotfile-secret"
},
"data": {
".secret-file": "dmFsdWUtMg0KDQo="
}
}
{
"kind": "Pod",
"apiVersion": "v1",
"metadata": {
"name": "secret-dotfiles-pod"
},
"spec": {
"volumes": [
{
"name": "secret-volume",
"secret": {
"secretName": "dotfile-secret"
}
}
],
"containers": [
{
"name": "dotfile-test-container",
"image": "gcr.io/google_containers/busybox",
"command": [ "ls", "-l", "/etc/secret-volume" ],
"volumeMounts": [
{
"name": "secret-volume",
"readOnly": true,
"mountPath": "/etc/secret-volume"
}
]
}
]
}
}
```
The `secret-volume` will contain a single file, called `.secret-file`, and
the `dotfile-test-container` will have this file present at the path
`/etc/secret-volume/.secret-file`.
**NOTE**
Files beginning with dot characters are hidden from the output of `ls -l`;
you must use `ls -la` to see them when listing directory contents.
### Use-case: Secret visible to one container in a pod
Consider a program that needs to handle HTTP requests, do some complex business
logic, and then sign some messages with an HMAC. Because it has complex
application logic, there might be an unnoticed remote file reading exploit in
the server, which could expose the private key to an attacker.
This could be divided into two processes in two containers: a frontend container
which handles user interaction and business logic, but which cannot see the
private key; and a signer container that can see the private key, and responds
to simple signing requests from the frontend (e.g. over localhost networking).
With this partitioned approach, an attacker now has to trick the application
server into doing something rather arbitrary, which may be harder than getting
it to read a file.
<!-- TODO: explain how to do this while still using automation. -->
## Security Properties
### Protections
Because `secret` objects can be created independently of the `pods` that use
them, there is less risk of the secret being exposed during the workflow of
creating, viewing, and editing pods. The system can also take additional
precautions with `secret` objects, such as avoiding writing them to disk where
possible.
A secret is only sent to a node if a pod on that node requires it. It is not
written to disk. It is stored in a tmpfs. It is deleted once the pod that
depends on it is deleted.
On most Kubernetes-project-maintained distributions, communication from the user
to the apiserver, and from the apiserver to the kubelets, is protected by SSL/TLS.
Secrets are protected when transmitted over these channels.
Secret data on nodes is stored in tmpfs volumes and thus does not come to rest
on the node.
There may be secrets for several pods on the same node. However, only the
secrets that a pod requests are potentially visible within its containers.
Therefore, one Pod does not have access to the secrets of another pod.
There may be several containers in a pod. However, each container in a pod has
to request the secret volume in its `volumeMounts` for it to be visible within
the container. This can be used to construct useful [security partitions at the
Pod level](#use-case-secret-visible-to-one-container-in-a-pod).
### Risks
- In the API server secret data is stored as plaintext in etcd; therefore:
- Administrators should limit access to etcd to admin users
- Secret data in the API server is at rest on the disk that etcd uses; admins may want to wipe/shred disks
used by etcd when no longer in use
- Applications still need to protect the value of secret after reading it from the volume,
such as not accidentally logging it or transmitting it to an untrusted party.
- A user who can create a pod that uses a secret can also see the value of that secret. Even
if apiserver policy does not allow that user to read the secret object, the user could
run a pod which exposes the secret.
- If multiple replicas of etcd are run, then the secrets will be shared between them.
By default, etcd does not secure peer-to-peer communication with SSL/TLS, though this can be configured.
- Currently, anyone with root on any node can read any secret from the apiserver,
by impersonating the kubelet. It is a planned feature to only send secrets to
nodes that actually require them, to restrict the impact of a root exploit on a
single node.
docs/concepts/nodes/node.md
@ -0,0 +1,259 @@
---
assignees:
- caesarxuchao
- dchen1107
title: Nodes
---
* TOC
{:toc}
## What is a node?
A `node` is a worker machine in Kubernetes, previously known as a `minion`. A node
may be a VM or physical machine, depending on the cluster. Each node has
the services necessary to run [pods](/docs/user-guide/pods) and is managed by the master
components. The services on a node include Docker, kubelet and kube-proxy. See
[The Kubernetes Node](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/architecture.md#the-kubernetes-node) section in the
architecture design doc for more details.
## Node Status
A node's status contains the following information:
* [Addresses](#addresses)
* ~~[Phase](#phase)~~ **deprecated**
* [Condition](#condition)
* [Capacity](#capacity)
* [Info](#info)
Each section is described in detail below.
### Addresses
The usage of these fields varies depending on your cloud provider or bare metal configuration.
* HostName: The hostname as reported by the node's kernel. Can be overridden via the kubelet `--hostname-override` parameter.
* ExternalIP: Typically the IP address of the node that is externally routable (available from outside the cluster).
* InternalIP: Typically the IP address of the node that is routable only within the cluster.
### Phase
Deprecated: node phase is no longer used.
### Condition
The `conditions` field describes the status of all `Running` nodes.
| Node Condition | Description |
|----------------|-------------|
| `OutOfDisk` | `True` if there is insufficient free space on the node for adding new pods, otherwise `False` |
| `Ready` | `True` if the node is healthy and ready to accept pods, `False` if the node is not healthy and is not accepting pods, and `Unknown` if the node controller has not heard from the node in the last 40 seconds |
The node condition is represented as a JSON object. For example, the following response describes a healthy node.
```json
"conditions": [
{
"kind": "Ready",
"status": "True"
}
]
```
If the Status of the Ready condition is "Unknown" or "False" for longer than the `pod-eviction-timeout`, an argument passed to the [kube-controller-manager](/docs/admin/kube-controller-manager/), all of the Pods on the node are scheduled for deletion by the Node Controller. The default eviction timeout duration is **five minutes**. In some cases when the node is unreachable, the apiserver is unable to communicate with the kubelet on it. The decision to delete the pods cannot be communicated to the kubelet until it re-establishes communication with the apiserver. In the meantime, the pods which are scheduled for deletion may continue to run on the partitioned node.
In versions of Kubernetes prior to 1.5, the node controller would [force delete](/docs/user-guide/pods/#force-deletion-of-pods) these unreachable pods from the apiserver. However, in 1.5 and higher, the node controller does not force delete pods until it is confirmed that they have stopped running in the cluster. One can see these pods which may be running on an unreachable node as being in the "Terminating" or "Unknown" states. In cases where Kubernetes cannot deduce from the underlying infrastructure if a node has permanently left a cluster, the cluster administrator may need to delete the node object by hand. Deleting the node object from Kubernetes causes all the Pod objects running on it to be deleted from the apiserver, freeing up their names.
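For example, if you know the underlying machine is permanently gone, the node object can be deleted by hand (the node name below is a placeholder):
```shell
$ kubectl delete node <node-name>
```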
### Capacity
Describes the resources available on the node: CPU, memory and the maximum
number of pods that can be scheduled onto the node.
### Info
General information about the node, such as kernel version, Kubernetes version
(kubelet and kube-proxy version), Docker version (if used), OS name.
The information is gathered by Kubelet from the node.
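To inspect this status information yourself, commands such as the following can be used (the node name is a placeholder):
```shell
$ kubectl describe node <node-name>      # human-readable summary of addresses, conditions, capacity, and info
$ kubectl get node <node-name> -o yaml   # the full Node object, including status
```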
## Management
Unlike [pods](/docs/user-guide/pods) and [services](/docs/user-guide/services),
a node is not inherently created by Kubernetes: it is created externally by cloud
providers like Google Compute Engine, or exists in your pool of physical or virtual
machines. What this means is that when Kubernetes creates a node, it is really
just creating an object that represents the node. After creation, Kubernetes
will check whether the node is valid or not. For example, if you try to create
a node from the following content:
```json
{
"kind": "Node",
"apiVersion": "v1",
"metadata": {
"name": "10.240.79.157",
"labels": {
"name": "my-first-k8s-node"
}
}
}
```
Kubernetes will create a node object internally (the representation), and
validate the node by health checking based on the `metadata.name` field (we
assume `metadata.name` can be resolved). If the node is valid, i.e. all necessary
services are running, it is eligible to run a pod; otherwise, it will be
ignored for any cluster activity until it becomes valid. Note that Kubernetes
will keep the object for the invalid node unless it is explicitly deleted by
the client, and it will keep checking to see if it becomes valid.
Currently, there are three components that interact with the Kubernetes node
interface: node controller, kubelet, and kubectl.
### Node Controller
The node controller is a Kubernetes master component which manages various
aspects of nodes.
The node controller has multiple roles in a node's life. The first is assigning a
CIDR block to the node when it is registered (if CIDR assignment is turned on).
The second is keeping the node controller's internal list of nodes up to date with
the cloud provider's list of available machines. When running in a cloud
environment, whenever a node is unhealthy the node controller asks the cloud
provider if the VM for that node is still available. If not, the node
controller deletes the node from its list of nodes.
The third is monitoring the nodes' health. The node controller is
responsible for updating the NodeReady condition of NodeStatus to
ConditionUnknown when a node becomes unreachable (i.e. the node controller stops
receiving heartbeats for some reason, e.g. due to the node being down), and then later evicting
all the pods from the node (using graceful termination) if the node continues
to be unreachable. (The default timeouts are 40s to start reporting
ConditionUnknown and 5m after that to start evicting pods.) The node controller
checks the state of each node every `--node-monitor-period` seconds.
In Kubernetes 1.4, we updated the logic of the node controller to better handle
cases when a large number of nodes have problems reaching the master
(e.g. because the master has a networking problem). Starting with 1.4, the node
controller looks at the state of all nodes in the cluster when making a
decision about pod eviction.
In most cases, the node controller limits the eviction rate to
`--node-eviction-rate` (default 0.1) per second, meaning it won't evict pods
from more than one node per 10 seconds.
The node eviction behavior changes when a node in a given availability zone
becomes unhealthy. The node controller checks what percentage of nodes in the zone
are unhealthy (NodeReady condition is ConditionUnknown or ConditionFalse) at
the same time. If the fraction of unhealthy nodes is at least
`--unhealthy-zone-threshold` (default 0.55) then the eviction rate is reduced:
if the cluster is small (i.e. has less than or equal to
`--large-cluster-size-threshold` nodes - default 50) then evictions are
stopped, otherwise the eviction rate is reduced to
`--secondary-node-eviction-rate` (default 0.01) per second. The reason these
policies are implemented per availability zone is because one availability zone
might become partitioned from the master while the others remain connected. If
your cluster does not span multiple cloud provider availability zones, then
there is only one availability zone (the whole cluster).
A key reason for spreading your nodes across availability zones is so that the
workload can be shifted to healthy zones when one entire zone goes down.
Therefore, if all nodes in a zone are unhealthy then node controller evicts at
the normal rate `--node-eviction-rate`. The corner case is when all zones are
completely unhealthy (i.e. there are no healthy nodes in the cluster). In such
a case, the node controller assumes that there's some problem with master
connectivity and stops all evictions until some connectivity is restored.
Starting in Kubernetes 1.6, the NodeController is also responsible for evicting
pods that are running on nodes with `NoExecute` taints, when the pods do not tolerate
the taints. Additionally, as an alpha feature that is disabled by default, the
NodeController is responsible for adding taints corresponding to node problems like
node unreachable or not ready. See [this documentation](/docs/user-guide/node-selection/#taints-and-tolerations-beta-feature)
for details about `NoExecute` taints and the alpha feature.
### Self-Registration of Nodes
When the kubelet flag `--register-node` is true (the default), the kubelet will attempt to
register itself with the API server. This is the preferred pattern, used by most distros.
For self-registration, the kubelet is started with the following options:
- `--api-servers` - Location of the apiservers.
- `--kubeconfig` - Path to credentials to authenticate itself to the apiserver.
- `--cloud-provider` - How to talk to a cloud provider to read metadata about itself.
- `--register-node` - Automatically register with the API server.
- `--register-with-taints` - Register the node with the given list of taints (comma separated `<key>=<value>:<effect>`). No-op if `register-node` is false.
- `--node-ip` IP address of the node.
- `--node-labels` - Labels to add when registering the node in the cluster.
- `--node-status-update-frequency` - Specifies how often kubelet posts node status to master.
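For illustration, a self-registering kubelet might be started with a subset of these flags; every value below is a placeholder:
```shell
kubelet \
  --api-servers=https://my-apiserver.example.com:6443 \
  --kubeconfig=/var/lib/kubelet/kubeconfig \
  --cloud-provider=gce \
  --register-node=true \
  --node-labels=environment=production,disktype=ssd
```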
Currently, any kubelet is authorized to create/modify any node resource, but in practice it only creates/modifies
its own. (In the future, we plan to only allow a kubelet to modify its own node resource.)
#### Manual Node Administration
A cluster administrator can create and modify node objects.
If the administrator wishes to create node objects manually, set the kubelet flag
`--register-node=false`.
The administrator can modify node resources (regardless of the setting of `--register-node`).
Modifications include setting labels on the node and marking it unschedulable.
Labels on nodes can be used in conjunction with node selectors on pods to control scheduling,
e.g. to constrain a pod to only be eligible to run on a subset of the nodes.
Marking a node as unschedulable will prevent new pods from being scheduled to that
node, but will not affect any existing pods on the node. This is useful as a
preparatory step before a node reboot, etc. For example, to mark a node
unschedulable, run this command:
```shell
kubectl cordon $NODENAME
```
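To make the node schedulable again afterwards:
```shell
kubectl uncordon $NODENAME
```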
Note that pods which are created by a DaemonSet controller bypass the Kubernetes scheduler,
and do not respect the unschedulable attribute on a node. The assumption is that daemons belong on
the machine even if it is being drained of applications in preparation for a reboot.
### Node capacity
The capacity of the node (number of cpus and amount of memory) is part of the node object.
Normally, nodes register themselves and report their capacity when creating the node object. If
you are doing [manual node administration](#manual-node-administration), then you need to set node
capacity when adding a node.
The Kubernetes scheduler ensures that there are enough resources for all the pods on a node. It
checks that the sum of the limits of containers on the node is no greater than the node capacity. It
includes all containers started by the kubelet, but not containers started directly by Docker nor
processes not in containers.
If you want to explicitly reserve resources for non-pod processes, you can create a placeholder
pod. Use the following template:
```yaml
apiVersion: v1
kind: Pod
metadata:
name: resource-reserver
spec:
containers:
- name: sleep-forever
image: gcr.io/google_containers/pause:0.8.0
resources:
limits:
cpu: 100m
memory: 100Mi
```
Set the `cpu` and `memory` values to the amount of resources you want to reserve.
Place the file in the manifest directory (`--config=DIR` flag of kubelet). Do this
on each kubelet where you want to reserve resources.
## API Object
Node is a top-level resource in the Kubernetes REST API. More details about the
API object can be found at: [Node API
object](/docs/api-reference/v1.6/#node-v1-core).
@ -28,9 +28,7 @@ What constitutes a compatible change and how to change the API are detailed by t
Complete API details are documented using [Swagger v1.2](http://swagger.io/) and [OpenAPI](https://www.openapis.org/). The Kubernetes apiserver (aka "master") exposes an API that can be used to retrieve the Swagger v1.2 Kubernetes API spec located at `/swaggerapi`. You can also enable a UI to browse the API documentation at `/swagger-ui` by passing the `--enable-swagger-ui=true` flag to apiserver.
We also host a version of the [latest v1.2 API documentation UI](http://kubernetes.io/kubernetes/third_party/swagger-ui/). This is updated with the latest release, so if you are using a different version of Kubernetes you will want to use the spec from your apiserver.
Starting with kubernetes 1.4, OpenAPI spec is also available at [`/swagger.json`](https://github.com/kubernetes/kubernetes/blob/master/api/openapi-spec/swagger.json). While we are transitioning from Swagger v1.2 to OpenAPI (aka Swagger v2.0), some of the tools such as kubectl and swagger-ui are still using v1.2 spec. OpenAPI spec is in Beta as of Kubernetes 1.5.
Kubernetes implements an alternative Protobuf based serialization format for the API that is primarily intended for intra-cluster communication, documented in the [design proposal](https://github.com/kubernetes/kubernetes/blob/{{ page.githubbranch }}/docs/proposals/protobuf.md) and the IDL files for each schema are located in the Go packages that define the API objects.

@ -0,0 +1,18 @@
---
assignees:
- mikedanese
- thockin
title: Names
---
All objects in the Kubernetes REST API are unambiguously identified by a Name and a UID.
For non-unique user-provided attributes, Kubernetes provides [labels](/docs/user-guide/labels) and [annotations](/docs/user-guide/annotations).
## Names
Names are generally client-provided. Only one object of a given kind can have a given name at a time (i.e., they are spatially unique). But if you delete an object, you can make a new object with the same name. Names are used to refer to an object in a resource URL, such as `/api/v1/pods/some-name`. By convention, the names of Kubernetes resources should be up to a maximum length of 253 characters and consist of lower case alphanumeric characters, `-`, and `.`, but certain resources have more specific restrictions. See the [identifiers design doc](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/identifiers.md) for the precise syntax rules for names.
## UIDs
UIDs are generated by Kubernetes. Every object created over the whole lifetime of a Kubernetes cluster has a distinct UID (i.e., they are spatially and temporally unique).

@ -0,0 +1,88 @@
---
assignees:
- derekwaynecarr
- mikedanese
- thockin
title: Namespaces
---
Kubernetes supports multiple virtual clusters backed by the same physical cluster.
These virtual clusters are called namespaces.
## When to Use Multiple Namespaces
Namespaces are intended for use in environments with many users spread across multiple
teams or projects. For clusters with a few to tens of users, you should not
need to create or think about namespaces at all. Start using namespaces when you
need the features they provide.
Namespaces provide a scope for names. Names of resources need to be unique within a namespace, but not across namespaces.
Namespaces are a way to divide cluster resources between multiple uses (via [resource quota](/docs/admin/resourcequota/)).
In future versions of Kubernetes, objects in the same namespace will have the same
access control policies by default.
It is not necessary to use multiple namespaces just to separate slightly different
resources, such as different versions of the same software: use [labels](/docs/user-guide/labels) to distinguish
resources within the same namespace.
## Working with Namespaces
Creation and deletion of namespaces are described in the [Admin Guide documentation
for namespaces](/docs/admin/namespaces/).
### Viewing namespaces
You can list the current namespaces in a cluster using:
```shell
$ kubectl get namespaces
NAME LABELS STATUS
default <none> Active
kube-system <none> Active
```
Kubernetes starts with two initial namespaces:
* `default` The default namespace for objects with no other namespace
* `kube-system` The namespace for objects created by the Kubernetes system
### Setting the namespace for a request
To temporarily set the namespace for a request, use the `--namespace` flag.
For example:
```shell
$ kubectl --namespace=<insert-namespace-name-here> run nginx --image=nginx
$ kubectl --namespace=<insert-namespace-name-here> get pods
```
### Setting the namespace preference
You can permanently save the namespace for all subsequent kubectl commands in that
context.
```shell
$ kubectl config set-context $(kubectl config current-context) --namespace=<insert-namespace-name-here>
# Validate it
$ kubectl config view | grep namespace:
```
## Namespaces and DNS
When you create a [Service](/docs/user-guide/services), it creates a corresponding [DNS entry](/docs/admin/dns).
This entry is of the form `<service-name>.<namespace-name>.svc.cluster.local`, which means
that if a container just uses `<service-name>` it will resolve to the service which
is local to a namespace. This is useful for using the same configuration across
multiple namespaces such as Development, Staging and Production. If you want to reach
across namespaces, you need to use the fully qualified domain name (FQDN).
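As a rough sketch (the service and namespace names are assumptions), from inside a pod in the `development` namespace:
```shell
$ nslookup my-service                                # resolves my-service in the development namespace
$ nslookup my-service.production.svc.cluster.local   # the FQDN reaches the production namespace
```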
## Not All Objects are in a Namespace
Most Kubernetes resources (e.g. pods, services, replication controllers, and others) are
in some namespace. However, namespace resources are not themselves in a namespace.
And low-level resources, such as [nodes](/docs/admin/node) and
persistentVolumes, are not in any namespace. Events are an exception: they may or may not
have a namespace, depending on the object the event is about.

@ -0,0 +1,294 @@
---
assignees:
- bprashanth
title: Ingress Resources
---
* TOC
{:toc}
__Terminology__
Throughout this doc you will see a few terms that are sometimes used interchangeably elsewhere, which might cause confusion. This section attempts to clarify them.
* Node: A single virtual or physical machine in a Kubernetes cluster.
* Cluster: A group of nodes firewalled from the internet, that are the primary compute resources managed by Kubernetes.
* Edge router: A router that enforces the firewall policy for your cluster. This could be a gateway managed by a cloudprovider or a physical piece of hardware.
* Cluster network: A set of links, logical or physical, that facilitate communication within a cluster according to the [Kubernetes networking model](/docs/admin/networking/). Examples of a Cluster network include Overlays such as [flannel](https://github.com/coreos/flannel#flannel) or SDNs such as [OVS](/docs/admin/ovs-networking/).
* Service: A Kubernetes [Service](/docs/user-guide/services/) that identifies a set of pods using label selectors. Unless mentioned otherwise, Services are assumed to have virtual IPs only routable within the cluster network.
## What is Ingress?
Typically, services and pods have IPs only routable by the cluster network. All traffic that ends up at an edge router is either dropped or forwarded elsewhere. Conceptually, this might look like:
```
internet
|
------------
[ Services ]
```
An Ingress is a collection of rules that allow inbound connections to reach the cluster services.
```
internet
|
[ Ingress ]
--|-----|--
[ Services ]
```
It can be configured to give services externally-reachable URLs, load balance traffic, terminate SSL, offer name-based virtual hosting, and more. Users request ingress by POSTing the Ingress resource to the API server. An [Ingress controller](#ingress-controllers) is responsible for fulfilling the Ingress, usually with a loadbalancer, though it may also configure your edge router or additional frontends to help handle the traffic in an HA manner.
## Prerequisites
Before you start using the Ingress resource, there are a few things you should understand. The Ingress is a beta resource, not available in any Kubernetes release prior to 1.1. You need an Ingress controller to satisfy an Ingress; simply creating the resource will have no effect.
GCE/GKE deploys an ingress controller on the master. You can deploy any number of custom ingress controllers in a pod. You must annotate each ingress with the appropriate class, as indicated [here](https://github.com/kubernetes/ingress/tree/master/controllers/nginx#running-multiple-ingress-controllers) and [here](https://github.com/kubernetes/ingress/blob/master/controllers/gce/BETA_LIMITATIONS.md#disabling-glbc).
Make sure you review the [beta limitations](https://github.com/kubernetes/ingress/blob/master/controllers/gce/BETA_LIMITATIONS.md) of this controller. In environments other than GCE/GKE, you need to [deploy a controller](https://github.com/kubernetes/ingress/tree/master/controllers) as a pod.
## The Ingress Resource
A minimal Ingress might look like:
```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: test-ingress
spec:
rules:
- http:
paths:
- path: /testpath
backend:
serviceName: test
servicePort: 80
```
*POSTing this to the API server will have no effect if you have not configured an [Ingress controller](#ingress-controllers).*
__Lines 1-4__: As with all other Kubernetes config, an Ingress needs `apiVersion`, `kind`, and `metadata` fields. For general information about working with config files, see [here](/docs/user-guide/deploying-applications), [here](/docs/user-guide/configuring-containers), and [here](/docs/user-guide/working-with-resources).
__Lines 5-7__: Ingress [spec](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status) has all the information needed to configure a loadbalancer or proxy server. Most importantly, it contains a list of rules matched against all incoming requests. Currently the Ingress resource only supports http rules.
__Lines 8-9__: Each http rule contains the following information: A host (e.g.: foo.bar.com, defaults to * in this example), a list of paths (e.g.: /testpath) each of which has an associated backend (test:80). Both the host and path must match the content of an incoming request before the loadbalancer directs traffic to the backend.
__Lines 10-12__: A backend is a service:port combination as described in the [services doc](/docs/user-guide/services). Ingress traffic is typically sent directly to the endpoints matching a backend.
__Global Parameters__: For the sake of simplicity the example Ingress has no global parameters, see the [api-reference](https://releases.k8s.io/{{page.githubbranch}}/pkg/apis/extensions/v1beta1/types.go) for a full definition of the resource. One can specify a global default backend in the absence of which requests that don't match a path in the spec are sent to the default backend of the Ingress controller.
## Ingress controllers
In order for the Ingress resource to work, the cluster must have an Ingress controller running. This is unlike other types of controllers, which typically run as part of the `kube-controller-manager` binary, and which are typically started automatically as part of cluster creation. You need to choose the ingress controller implementation that is the best fit for your cluster, or implement one. Examples and instructions can be found [here](https://github.com/kubernetes/ingress/tree/master/controllers).
## Before you begin
The following document describes a set of cross platform features exposed through the Ingress resource. Ideally, all Ingress controllers should fulfill this specification, but we're not there yet. The docs for the GCE and nginx controllers are [here](https://github.com/kubernetes/ingress/blob/master/controllers/gce/README.md) and [here](https://github.com/kubernetes/ingress/blob/master/controllers/nginx/README.md) respectively. **Make sure you review controller specific docs so you understand the caveats of each one**.
## Types of Ingress
### Single Service Ingress
There are existing Kubernetes concepts that allow you to expose a single service (see [alternatives](#alternatives)), however you can do so through an Ingress as well, by specifying a *default backend* with no rules.
{% include code.html language="yaml" file="ingress.yaml" ghlink="/docs/concepts/services-networking/ingress.yaml" %}
If you create it using `kubectl create -f` you should see:
```shell
$ kubectl get ing
NAME RULE BACKEND ADDRESS
test-ingress - testsvc:80 107.178.254.228
```
Where `107.178.254.228` is the IP allocated by the Ingress controller to satisfy this Ingress. The `RULE` column shows that all traffic sent to the IP is directed to the Kubernetes Service listed under `BACKEND`.
### Simple fanout
As described previously, pods within Kubernetes have IPs only visible on the cluster network, so we need something at the edge accepting ingress traffic and proxying it to the right endpoints. This component is usually a highly available loadbalancer. An Ingress allows you to keep the number of loadbalancers down to a minimum. For example, a setup like:
```shell
foo.bar.com -> 178.91.123.132 -> / foo s1:80
/ bar s2:80
```
would require an Ingress such as:
```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: test
spec:
rules:
- host: foo.bar.com
http:
paths:
- path: /foo
backend:
serviceName: s1
servicePort: 80
- path: /bar
backend:
serviceName: s2
servicePort: 80
```
When you create the Ingress with `kubectl create -f`:
```shell
$ kubectl get ing
NAME RULE BACKEND ADDRESS
test -
foo.bar.com
/foo s1:80
/bar s2:80
```
The Ingress controller will provision an implementation specific loadbalancer that satisfies the Ingress, as long as the services (s1, s2) exist. When it has done so, you will see the address of the loadbalancer under the last column of the Ingress.
### Name based virtual hosting
Name-based virtual hosts use multiple host names for the same IP address.
```
foo.bar.com --| |-> foo.bar.com s1:80
| 178.91.123.132 |
bar.foo.com --| |-> bar.foo.com s2:80
```
The following Ingress tells the backing loadbalancer to route requests based on the [Host header](https://tools.ietf.org/html/rfc7230#section-5.4).
```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: test
spec:
rules:
- host: foo.bar.com
http:
paths:
- backend:
serviceName: s1
servicePort: 80
- host: bar.foo.com
http:
paths:
- backend:
serviceName: s2
servicePort: 80
```
__Default Backends__: An Ingress with no rules, like the one shown in the previous section, sends all traffic to a single default backend. You can use the same technique to tell a loadbalancer where to find your website's 404 page, by specifying a set of rules *and* a default backend. Traffic is routed to your default backend if none of the Hosts in your Ingress match the Host in the request header, and/or none of the paths match the url of the request.
### TLS
You can secure an Ingress by specifying a [secret](/docs/user-guide/secrets) that contains a TLS private key and certificate. Currently the Ingress only supports a single TLS port, 443, and assumes TLS termination. If the TLS configuration section in an Ingress specifies different hosts, they will be multiplexed on the same port according to the hostname specified through the SNI TLS extension (provided the Ingress controller supports SNI). The TLS secret must contain keys named `tls.crt` and `tls.key` that contain the certificate and private key to use for TLS, e.g.:
```yaml
apiVersion: v1
data:
tls.crt: base64 encoded cert
tls.key: base64 encoded key
kind: Secret
metadata:
name: testsecret
namespace: default
type: Opaque
```
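A secret carrying these two keys can also be created from local certificate and key files with `kubectl create secret tls` (the file paths below are placeholders; the resulting secret has type `kubernetes.io/tls` rather than `Opaque`, but it contains the required `tls.crt` and `tls.key` keys):
```shell
$ kubectl create secret tls testsecret --cert=path/to/tls.crt --key=path/to/tls.key
```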
Referencing this secret in an Ingress will tell the Ingress controller to secure the channel from the client to the loadbalancer using TLS:
```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: no-rules-map
spec:
tls:
- secretName: testsecret
backend:
serviceName: s1
servicePort: 80
```
Note that there is a gap between TLS features supported by various Ingress controllers. Please refer to documentation on [nginx](https://github.com/kubernetes/ingress/blob/master/controllers/nginx/README.md#https), [GCE](https://github.com/kubernetes/ingress/blob/master/controllers/gce/README.md#tls), or any other platform specific Ingress controller to understand how TLS works in your environment.
### Loadbalancing
An Ingress controller is bootstrapped with some loadbalancing policy settings that it applies to all Ingress, such as the loadbalancing algorithm, backend weight scheme etc. More advanced loadbalancing concepts (e.g.: persistent sessions, dynamic weights) are not yet exposed through the Ingress. You can still get these features through the [service loadbalancer](https://github.com/kubernetes/contrib/tree/master/service-loadbalancer). With time, we plan to distill loadbalancing patterns that are applicable cross platform into the Ingress resource.
It's also worth noting that even though health checks are not exposed directly through the Ingress, there exist parallel concepts in Kubernetes such as [readiness probes](/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/) which allow you to achieve the same end result. Please review the controller specific docs to see how they handle health checks ([nginx](https://github.com/kubernetes/ingress/blob/master/controllers/nginx/README.md), [GCE](https://github.com/kubernetes/ingress/blob/master/controllers/gce/README.md#health-checks)).
## Updating an Ingress
Say you'd like to add a new host to an existing Ingress; you can update it by editing the resource:
```shell
$ kubectl get ing
NAME RULE BACKEND ADDRESS
test - 178.91.123.132
foo.bar.com
/foo s1:80
$ kubectl edit ing test
```
This should pop up an editor with the existing YAML. Modify it to include the new host:
```yaml
spec:
rules:
- host: foo.bar.com
http:
paths:
- backend:
serviceName: s1
servicePort: 80
path: /foo
- host: bar.baz.com
http:
paths:
- backend:
serviceName: s2
servicePort: 80
path: /foo
..
```
Saving it will update the resource in the API server, which should tell the Ingress controller to reconfigure the loadbalancer.
```shell
$ kubectl get ing
NAME RULE BACKEND ADDRESS
test - 178.91.123.132
foo.bar.com
/foo s1:80
bar.baz.com
/foo s2:80
```
You can achieve the same by invoking `kubectl replace -f` on a modified Ingress yaml file.
## Failing across availability zones
Techniques for spreading traffic across failure domains differ between cloud providers. Please check the documentation of the relevant Ingress controller for details. Please refer to the federation [doc](/docs/user-guide/federation/) for details on deploying Ingress in a federated cluster.
## Future Work
* Various modes of HTTPS/TLS support (e.g.: SNI, re-encryption)
* Requesting an IP or Hostname via claims
* Combining L4 and L7 Ingress
* More Ingress controllers
Please track the [L7 and Ingress proposal](https://github.com/kubernetes/kubernetes/pull/12827) for more details on the evolution of the resource, and the [Ingress repository](https://github.com/kubernetes/ingress/tree/master) for more details on the evolution of various Ingress controllers.
## Alternatives
You can expose a Service in multiple ways that don't directly involve the Ingress resource:
* Use [Service.Type=LoadBalancer](/docs/user-guide/services/#type-loadbalancer)
* Use [Service.Type=NodePort](/docs/user-guide/services/#type-nodeport)
* Use a [Port Proxy](https://github.com/kubernetes/contrib/tree/master/for-demos/proxy-to-service)
* Deploy the [Service loadbalancer](https://github.com/kubernetes/contrib/tree/master/service-loadbalancer). This allows you to share a single IP among multiple Services and achieve more advanced loadbalancing through Service Annotations.

@ -0,0 +1,9 @@
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: test-ingress
spec:
backend:
serviceName: testsvc
servicePort: 80

@ -0,0 +1,98 @@
---
assignees:
- thockin
- caseydavenport
title: Network Policies
---
* TOC
{:toc}
A network policy is a specification of how groups of pods are allowed to communicate with each other and with other network endpoints.
`NetworkPolicy` resources use labels to select pods and define whitelist rules which allow traffic to the selected pods in addition to what is allowed by the isolation policy for a given namespace.
## Prerequisites
You must enable the `extensions/v1beta1/networkpolicies` runtime config in your apiserver to enable this resource.
You must also be using a networking solution which supports `NetworkPolicy` - simply creating the
resource without a controller to implement it will have no effect.
## Configuring Namespace Isolation Policy
Isolation can be configured on a per-namespace basis. Once isolation is configured on a namespace it will be applied to all pods in that namespace. Currently, only isolation policy on inbound traffic (ingress) can be defined.
The following ingress isolation types are supported:
- `DefaultDeny`: Pods in the namespace will be inaccessible from any source except the pod's local node.
Ingress isolation can be enabled using an annotation on the Namespace.
```yaml
kind: Namespace
apiVersion: v1
metadata:
annotations:
net.beta.kubernetes.io/network-policy: |
{
"ingress": {
"isolation": "DefaultDeny"
}
}
```
To configure the annotation via `kubectl`:
```shell
{% raw %}
kubectl annotate ns <namespace> "net.beta.kubernetes.io/network-policy={\"ingress\": {\"isolation\": \"DefaultDeny\"}}"
{% endraw %}
```
See the [NetworkPolicy getting started guide](/docs/getting-started-guides/network-policy/walkthrough) for an example.
## The `NetworkPolicy` Resource
See the [api-reference](/docs/api-reference/extensions/v1beta1/definitions/#_v1beta1_networkpolicy) for a full definition of the resource.
A minimal `NetworkPolicy` might look like this:
```yaml
apiVersion: extensions/v1beta1
kind: NetworkPolicy
metadata:
name: test-network-policy
namespace: default
spec:
podSelector:
matchLabels:
role: db
ingress:
- from:
- namespaceSelector:
matchLabels:
project: myproject
- podSelector:
matchLabels:
role: frontend
ports:
- protocol: TCP
port: 6379
```
*POSTing this to the API server will have no effect unless your chosen networking solution supports network policy.*
__Mandatory Fields__: As with all other Kubernetes config, a `NetworkPolicy` needs `apiVersion`, `kind`, and `metadata` fields. For general information about working with config files, see [here](/docs/user-guide/simple-yaml), [here](/docs/user-guide/configuring-containers), and [here](/docs/user-guide/working-with-resources).
__spec__: `NetworkPolicy` [spec](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status) has all the information needed to define a network isolation policy in the deployed controller.
__podSelector__: Each `NetworkPolicy` includes a `podSelector` which selects the grouping of pods to which the `ingress` rules in the policy apply.
__ingress__: Each `NetworkPolicy` includes a list of whitelist `ingress` rules. Each rule allows traffic which matches both the `from` and `ports` sections.
This example NetworkPolicy has the following characteristics:
1. applies to all pods in the default namespace with the label "role=db"
2. allows tcp/6379 ingress traffic to the "role=db" pods from any pod in the current namespace with the label "role=frontend" (due to the podSelector list element)
3. allows tcp/6379 ingress traffic to the "role=db" pods from any pod in the namespace "myproject" (due to the namespaceSelector list element)

@ -0,0 +1,679 @@
---
assignees:
- jsafrane
- mikedanese
- saad-ali
- thockin
title: Persistent Volumes
---
This document describes the current state of `PersistentVolumes` in Kubernetes. Familiarity with [volumes](/docs/concepts/storage/volumes/) is suggested.
* TOC
{:toc}
## Introduction
Managing storage is a distinct problem from managing compute. The `PersistentVolume` subsystem provides an API for users and administrators that abstracts details of how storage is provided from how it is consumed. To do this we introduce two new API resources: `PersistentVolume` and `PersistentVolumeClaim`.
A `PersistentVolume` (PV) is a piece of networked storage in the cluster that has been provisioned by an administrator. It is a resource in the cluster just like a node is a cluster resource. PVs are volume plugins like Volumes, but have a lifecycle independent of any individual pod that uses the PV. This API object captures the details of the implementation of the storage, be that NFS, iSCSI, or a cloud-provider-specific storage system.
A `PersistentVolumeClaim` (PVC) is a request for storage by a user. It is similar to a pod. Pods consume node resources and PVCs consume PV resources. Pods can request specific levels of resources (CPU and Memory). Claims can request specific size and access modes (e.g., can be mounted once read/write or many times read-only).
While `PersistentVolumeClaims` allow a user to consume abstract storage
resources, it is common that users need `PersistentVolumes` with varying
properties, such as performance, for different problems. Cluster administrators
need to be able to offer a variety of `PersistentVolumes` that differ in more
ways than just size and access modes, without exposing users to the details of
how those volumes are implemented. For these needs there is the `StorageClass`
resource.
A `StorageClass` provides a way for administrators to describe the "classes" of
storage they offer. Different classes might map to quality-of-service levels,
or to backup policies, or to arbitrary policies determined by the cluster
administrators. Kubernetes itself is unopinionated about what classes
represent. This concept is sometimes called "profiles" in other storage
systems.
Please see the [detailed walkthrough with working examples](/docs/user-guide/persistent-volumes/walkthrough/).
## Lifecycle of a volume and claim
PVs are resources in the cluster. PVCs are requests for those resources and also act as claim checks to the resource. The interaction between PVs and PVCs follows this lifecycle:
### Provisioning
There are two ways PVs may be provisioned: statically or dynamically.
#### Static
A cluster administrator creates a number of PVs. They carry the details of the real storage which is available for use by cluster users. They exist in the Kubernetes API and are available for consumption.
#### Dynamic
When none of the static PVs the administrator created matches a user's `PersistentVolumeClaim`, the cluster may try to dynamically provision a volume specially for the PVC. This provisioning is based on `StorageClasses`: the PVC must request a class and the administrator must have created and configured that class in order for dynamic provisioning to occur. Claims that request the class `""` effectively disable dynamic provisioning for themselves.
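As a sketch, a `StorageClass` that claims could request might look like this; the name, provisioner, and parameters are assumptions and vary by environment:
```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: slow
provisioner: kubernetes.io/gce-pd   # the provisioner plugin is environment specific
parameters:
  type: pd-standard
```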
### Binding
A user creates, or has already created in the case of dynamic provisioning, a `PersistentVolumeClaim` with a specific amount of storage requested and with certain access modes. A control loop in the master watches for new PVCs, finds a matching PV (if possible), and binds them together. If a PV was dynamically provisioned for a new PVC, the loop will always bind that PV to the PVC. Otherwise, the user will always get at least what they asked for, but the volume may be in excess of what was requested. Once bound, `PersistentVolumeClaim` binds are exclusive, regardless of the mode used to bind them.
Claims will remain unbound indefinitely if a matching volume does not exist. Claims will be bound as matching volumes become available. For example, a cluster provisioned with many 50Gi PVs would not match a PVC requesting 100Gi. The PVC can be bound when a 100Gi PV is added to the cluster.
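For orientation, a claim that could participate in this binding might look like the following; the name, size, and class are assumptions:
```yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: myclaim
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 3Gi
  storageClassName: slow
```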
### Using
Pods use claims as volumes. The cluster inspects the claim to find the bound volume and mounts that volume for a pod. For volumes which support multiple access modes, the user specifies which mode desired when using their claim as a volume in a pod.
Once a user has a claim and that claim is bound, the bound PV belongs to the user for as long as they need it. Users schedule Pods and access their claimed PVs by including a persistentVolumeClaim in their Pod's volumes block. [See below for syntax details](#claims-as-volumes).
### Releasing
When a user is done with their volume, they can delete the PVC objects from the API which allows reclamation of the resource. The volume is considered "released" when the claim is deleted, but it is not yet available for another claim. The previous claimant's data remains on the volume which must be handled according to policy.
### Reclaiming
The reclaim policy for a `PersistentVolume` tells the cluster what to do with the volume after it has been released of its claim. Currently, volumes can either be Retained, Recycled or Deleted. Retention allows for manual reclamation of the resource. For those volume plugins that support it, deletion removes both the `PersistentVolume` object from Kubernetes, as well as deleting the associated storage asset in external infrastructure (such as an AWS EBS, GCE PD, Azure Disk, or Cinder volume). Volumes that were dynamically provisioned are always deleted.
#### Recycling
If supported by the appropriate volume plugin, recycling performs a basic scrub (`rm -rf /thevolume/*`) on the volume and makes it available again for a new claim.
An administrator can also configure a custom recycler pod template using the Kubernetes controller manager command line arguments as described [here](/docs/admin/kube-controller-manager/). The custom recycler pod template must contain a `volumes` specification, as shown in the example below:
```yaml
apiVersion: v1
kind: Pod
metadata:
name: pv-recycler-
namespace: default
spec:
restartPolicy: Never
volumes:
- name: vol
hostPath:
path: /any/path/it/will/be/replaced
containers:
- name: pv-recycler
image: "gcr.io/google_containers/busybox"
command: ["/bin/sh", "-c", "test -e /scrub && rm -rf /scrub/..?* /scrub/.[!.]* /scrub/* && test -z \"$(ls -A /scrub)\" || exit 1"]
volumeMounts:
- name: vol
mountPath: /scrub
```
Note that the particular path specified in the custom recycler pod template in the `volumes` part is replaced with the particular path of the volume that is being recycled.
## Types of Persistent Volumes
`PersistentVolume` types are implemented as plugins. Kubernetes currently supports the following plugins:
* GCEPersistentDisk
* AWSElasticBlockStore
* AzureFile
* AzureDisk
* FC (Fibre Channel)
* Flocker
* NFS
* iSCSI
* RBD (Ceph Block Device)
* CephFS
* Cinder (OpenStack block storage)
* Glusterfs
* VsphereVolume
* Quobyte Volumes
* HostPath (single node testing only -- local storage is not supported in any way and WILL NOT WORK in a multi-node cluster)
* VMware Photon
* Portworx Volumes
* ScaleIO Volumes
## Persistent Volumes
Each PV contains a spec and status, which is the specification and status of the volume.
```yaml
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv0003
spec:
capacity:
storage: 5Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Recycle
storageClassName: slow
nfs:
path: /tmp
server: 172.17.0.2
```
### Capacity
Generally, a PV will have a specific storage capacity. This is set using the PV's `capacity` attribute. See the Kubernetes [Resource Model](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/resources.md) to understand the units expected by `capacity`.
Currently, storage size is the only resource that can be set or requested. Future attributes may include IOPS, throughput, etc.
### Access Modes
A `PersistentVolume` can be mounted on a host in any way supported by the resource provider. As shown in the table below, providers will have different capabilities and each PV's access modes are set to the specific modes supported by that particular volume. For example, NFS can support multiple read/write clients, but a specific NFS PV might be exported on the server as read-only. Each PV gets its own set of access modes describing that specific PV's capabilities.
The access modes are:
* ReadWriteOnce -- the volume can be mounted as read-write by a single node
* ReadOnlyMany -- the volume can be mounted read-only by many nodes
* ReadWriteMany -- the volume can be mounted as read-write by many nodes
In the CLI, the access modes are abbreviated to:
* RWO - ReadWriteOnce
* ROX - ReadOnlyMany
* RWX - ReadWriteMany
> __Important!__ A volume can only be mounted using one access mode at a time, even if it supports many. For example, a GCEPersistentDisk can be mounted as ReadWriteOnce by a single node or ReadOnlyMany by many nodes, but not at the same time.
| Volume Plugin | ReadWriteOnce| ReadOnlyMany| ReadWriteMany|
| :--- | :---: | :---: | :---: |
| AWSElasticBlockStore | &#x2713; | - | - |
| AzureFile | &#x2713; | &#x2713; | &#x2713; |
| AzureDisk | &#x2713; | - | - |
| CephFS | &#x2713; | &#x2713; | &#x2713; |
| Cinder | &#x2713; | - | - |
| FC | &#x2713; | &#x2713; | - |
| FlexVolume | &#x2713; | &#x2713; | - |
| Flocker | &#x2713; | - | - |
| GCEPersistentDisk | &#x2713; | &#x2713; | - |
| Glusterfs | &#x2713; | &#x2713; | &#x2713; |
| HostPath | &#x2713; | - | - |
| iSCSI | &#x2713; | &#x2713; | - |
| PhotonPersistentDisk | &#x2713; | - | - |
| Quobyte | &#x2713; | &#x2713; | &#x2713; |
| NFS | &#x2713; | &#x2713; | &#x2713; |
| RBD | &#x2713; | &#x2713; | - |
| VsphereVolume | &#x2713; | - | - |
| PortworxVolume | &#x2713; | - | &#x2713; |
| ScaleIO | &#x2713; | &#x2713; | - |
### Class
A PV can have a class, which is specified by setting the
`storageClassName` attribute to the name of a
`StorageClass`. A PV of a particular class can only be bound to PVCs requesting
that class. A PV with no `storageClassName` has no class and can only be bound
to PVCs that request no particular class.
In the past, the annotation `volume.beta.kubernetes.io/storage-class` was used instead
of the `storageClassName` attribute. This annotation still works; however,
it will be fully deprecated in a future Kubernetes release.
### Reclaim Policy
Current reclaim policies are:
* Retain -- manual reclamation
* Recycle -- basic scrub ("rm -rf /thevolume/*")
* Delete -- associated storage asset such as AWS EBS, GCE PD, Azure Disk, or OpenStack Cinder volume is deleted
Currently, only NFS and HostPath support recycling. AWS EBS, GCE PD, Azure Disk, and Cinder volumes support deletion.
### Phase
A volume will be in one of the following phases:
* Available -- a free resource that is not yet bound to a claim
* Bound -- the volume is bound to a claim
* Released -- the claim has been deleted, but the resource is not yet reclaimed by the cluster
* Failed -- the volume has failed its automatic reclamation
The CLI will show the name of the PVC bound to the PV.
### Mount Options
A Kubernetes administrator can specify additional mount options for when a Persistent Volume is being mounted on a node.
You can specify a mount option by using the annotation `volume.beta.kubernetes.io/mount-options` on
your Persistent Volume.
For example:
```yaml
apiVersion: "v1"
kind: "PersistentVolume"
metadata:
name: gce-disk-1
annotations:
volume.beta.kubernetes.io/mount-options: "discard"
spec:
capacity:
storage: "10Gi"
accessModes:
- "ReadWriteOnce"
gcePersistentDisk:
fsType: "ext4"
pdName: "gce-disk-1
```
A mount option is a string which will be cumulatively joined with the other options and used while mounting the volume to the disk.
Note that not all Persistent Volume types support mount options. In Kubernetes version 1.6, the following
volume types support mount options:
* GCEPersistentDisk
* AWSElasticBlockStore
* AzureFile
* AzureDisk
* NFS
* iSCSI
* RBD (Ceph Block Device)
* CephFS
* Cinder (OpenStack block storage)
* Glusterfs
* VsphereVolume
* Quobyte Volumes
* VMware Photon
## PersistentVolumeClaims
Each PVC contains a spec and status, which is the specification and status of the claim.
```yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: myclaim
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 8Gi
storageClassName: slow
selector:
matchLabels:
release: "stable"
matchExpressions:
- {key: environment, operator: In, values: [dev]}
```
### Access Modes
Claims use the same conventions as volumes when requesting storage with specific access modes.
### Resources
Claims, like pods, can request specific quantities of a resource. In this case, the request is for storage. The same [resource model](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/resources.md) applies to both volumes and claims.
### Selector
Claims can specify a [label selector](/docs/user-guide/labels/#label-selectors) to further filter the set of volumes. Only the volumes whose labels match the selector can be bound to the claim. The selector can consist of two fields:
* matchLabels - the volume must have a label with this value
* matchExpressions - a list of requirements made by specifying key, list of values, and operator that relates the key and values. Valid operators include In, NotIn, Exists, and DoesNotExist.
All of the requirements, from both `matchLabels` and `matchExpressions`, are ANDed together; they must all be satisfied in order to match.
### Class
A claim can request a particular class by specifying the name of a
`StorageClass` using the attribute `storageClassName`.
Only PVs of the requested class, ones with the same `storageClassName` as the PVC, can
be bound to the PVC.
PVCs don't necessarily have to request a class. A PVC with its `storageClassName` set
equal to `""` is always interpreted to be requesting a PV with no class, so it
can only be bound to PVs with no class (no annotation or one set equal to
`""`). A PVC with no `storageClassName` is not quite the same and is treated differently
by the cluster depending on whether the
[`DefaultStorageClass` admission plugin](/docs/admin/admission-controllers/#defaultstorageclass)
is turned on.
* If the admission plugin is turned on, the administrator may specify a
default `StorageClass`. All PVCs that have no `storageClassName` can be bound only to
PVs of that default. Specifying a default `StorageClass` is done by setting the
annotation `storageclass.kubernetes.io/is-default-class` equal to "true" in
a `StorageClass` object. If the administrator does not specify a default, the
cluster responds to PVC creation as if the admission plugin were turned off. If
more than one default is specified, the admission plugin forbids the creation of
all PVCs.
* If the admission plugin is turned off, there is no notion of a default
`StorageClass`. All PVCs that have no `storageClassName` can be bound only to PVs that
have no class. In this case, the PVCs that have no `storageClassName` are treated the
same way as PVCs that have their `storageClassName` set to `""`.
Depending on the installation method, a default `StorageClass` may be deployed
to the Kubernetes cluster by the addon manager during installation.
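For illustration, a default `StorageClass` is simply one that carries the annotation mentioned above; a sketch (the provisioner and parameters here are only examples):

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: standard
  annotations:
    storageclass.kubernetes.io/is-default-class: "true"   # marks this class as the cluster default
provisioner: kubernetes.io/gce-pd
parameters:
  type: pd-standard
```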
When a PVC specifies a `selector` in addition to requesting a `StorageClass`,
the requirements are ANDed together: only a PV of the requested class and with
the requested labels may be bound to the PVC. Note that currently, a PVC with a
non-empty `selector` can't have a PV dynamically provisioned for it.
In the past, the annotation `volume.beta.kubernetes.io/storage-class` was used instead
of the `storageClassName` attribute. This annotation still works; however,
it won't be supported in a future Kubernetes release.
## Claims As Volumes
Pods access storage by using the claim as a volume. Claims must exist in the same namespace as the pod using the claim. The cluster finds the claim in the pod's namespace and uses it to get the `PersistentVolume` backing the claim. The volume is then mounted to the host and into the pod.
```yaml
kind: Pod
apiVersion: v1
metadata:
name: mypod
spec:
containers:
- name: myfrontend
image: dockerfile/nginx
volumeMounts:
- mountPath: "/var/www/html"
name: mypd
volumes:
- name: mypd
persistentVolumeClaim:
claimName: myclaim
```
### A Note on Namespaces
`PersistentVolume` binds are exclusive, and since `PersistentVolumeClaims` are namespaced objects, mounting claims with "Many" modes (`ROX`, `RWX`) is only possible within one namespace.
## StorageClasses
Each `StorageClass` contains the fields `provisioner` and `parameters`, which
are used when a `PersistentVolume` belonging to the class needs to be
dynamically provisioned.
The name of a `StorageClass` object is significant, and is how users can
request a particular class. Administrators set the name and other parameters
of a class when first creating `StorageClass` objects, and the objects cannot
be updated once they are created.
Administrators can specify a default `StorageClass` just for PVCs that don't
request any particular class to bind to: see the
[`PersistentVolumeClaim` section](#persistentvolumeclaims)
for details.
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: standard
provisioner: kubernetes.io/aws-ebs
parameters:
type: gp2
```
### Provisioner
Storage classes have a provisioner that determines what volume plugin is used
for provisioning PVs. This field must be specified.
You are not restricted to specifying the "internal" provisioners
listed here (whose names are prefixed with "kubernetes.io" and shipped
alongside Kubernetes). You can also run and specify external provisioners,
which are independent programs that follow a [specification](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/volume-provisioning.md)
defined by Kubernetes. Authors of external provisioners have full discretion
over where their code lives, how the provisioner is shipped, how it needs to be
run, what volume plugin it uses (including Flex), etc. The repository [kubernetes-incubator/external-storage](https://github.com/kubernetes-incubator/external-storage)
houses a library for writing external provisioners that implements the bulk of
the specification plus various community-maintained external provisioners.
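For illustration only, a class backed by an external provisioner might look like the following sketch; the provisioner name `example.com/nfs` and its parameters are placeholders, not a shipped provisioner:

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: external-nfs
provisioner: example.com/nfs       # placeholder external provisioner name
parameters:
  server: nfs.example.com          # parameters are defined by the external provisioner itself
  path: /exported/share
```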
### Parameters
Storage classes have parameters that describe volumes belonging to the storage
class. Different parameters may be accepted depending on the `provisioner`. For
example, the value `io1`, for the parameter `type`, and the parameter
`iopsPerGB` are specific to EBS. When a parameter is omitted, some default is
used.
#### AWS
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: slow
provisioner: kubernetes.io/aws-ebs
parameters:
type: io1
zone: us-east-1d
iopsPerGB: "10"
```
* `type`: `io1`, `gp2`, `sc1`, `st1`. See [AWS docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html) for details. Default: `gp2`.
* `zone`: AWS zone. If not specified, a random zone from those where Kubernetes cluster has a node is chosen.
* `iopsPerGB`: only for `io1` volumes. I/O operations per second per GiB. The AWS volume plugin multiplies this with the size of the requested volume to compute IOPS of the volume and caps it at 20,000 IOPS (the maximum supported by AWS; see the [AWS docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html)). A string is expected here, i.e. `"10"`, not `10`.
* `encrypted`: denotes whether the EBS volume should be encrypted or not. Valid values are `"true"` or `"false"`. A string is expected here, i.e. `"true"`, not `true`.
* `kmsKeyId`: optional. The full Amazon Resource Name of the key to use when encrypting the volume. If none is supplied but `encrypted` is true, a key is generated by AWS. See AWS docs for valid ARN value.
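For example, a class for encrypted volumes using the `encrypted` and `kmsKeyId` parameters above might look like this sketch (the ARN is a placeholder):

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: encrypted-gp2
provisioner: kubernetes.io/aws-ebs
parameters:
  type: gp2
  encrypted: "true"          # a string, not a boolean
  kmsKeyId: "arn:aws:kms:us-east-1:111122223333:key/placeholder-key-id"   # placeholder ARN
```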
#### GCE
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: slow
provisioner: kubernetes.io/gce-pd
parameters:
type: pd-standard
zone: us-central1-a
```
* `type`: `pd-standard` or `pd-ssd`. Default: `pd-standard`
* `zone`: GCE zone. If not specified, a random zone in the same region as controller-manager will be chosen.
#### Glusterfs
```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: slow
provisioner: kubernetes.io/glusterfs
parameters:
resturl: "http://127.0.0.1:8081"
restauthenabled: "true"
restuser: "admin"
secretNamespace: "default"
secretName: "heketi-secret"
```
* `resturl`: Gluster REST service/Heketi service url which provisions gluster volumes on demand. The general format should be `IPaddress:Port` and this is a mandatory parameter for the GlusterFS dynamic provisioner. If the Heketi service is exposed as a routable service in an OpenShift/Kubernetes setup, this can have a format similar to
`http://heketi-storage-project.cloudapps.mystorage.com`, where the fqdn is a resolvable Heketi service url.
* `restauthenabled` : Gluster REST service authentication boolean that enables authentication to the REST server. If this value is `"true"`, `restuser` and `restuserkey` or `secretNamespace` + `secretName` have to be filled. This option is deprecated; authentication is enabled when any of `restuser`, `restuserkey`, `secretName` or `secretNamespace` is specified.
* `restuser` : Gluster REST service/Heketi user who has access to create volumes in the Gluster Trusted Pool.
* `restuserkey` : Gluster REST service/Heketi user's password which will be used for authentication to the REST server. This parameter is deprecated in favor of `secretNamespace` + `secretName`.
* `secretNamespace` + `secretName` : Identification of Secret instance that contains user password to use when talking to Gluster REST service. These parameters are optional, empty password will be used when both `secretNamespace` and `secretName` are omitted. The provided secret must have type "kubernetes.io/glusterfs", e.g. created in this way:
```
$ kubectl create secret generic heketi-secret --type="kubernetes.io/glusterfs" --from-literal=key='opensesame' --namespace=default
```
#### OpenStack Cinder
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: gold
provisioner: kubernetes.io/cinder
parameters:
type: fast
availability: nova
```
* `type`: [VolumeType](http://docs.openstack.org/admin-guide/dashboard-manage-volumes.html) created in Cinder. Default is empty.
* `availability`: Availability Zone. Default is empty.
#### vSphere
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: fast
provisioner: kubernetes.io/vsphere-volume
parameters:
diskformat: zeroedthick
```
* `diskformat`: `thin`, `zeroedthick` and `eagerzeroedthick`. Default: `"thin"`.
#### Ceph RBD
```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: fast
provisioner: kubernetes.io/rbd
parameters:
monitors: 10.16.153.105:6789
adminId: kube
adminSecretName: ceph-secret
adminSecretNamespace: kube-system
pool: kube
userId: kube
userSecretName: ceph-secret-user
```
* `monitors`: Ceph monitors, comma delimited. This parameter is required.
* `adminId`: Ceph client ID that is capable of creating images in the pool. Default is "admin".
* `adminSecretNamespace`: The namespace for `adminSecretName`. Default is "default".
* `adminSecretName`: Secret Name for `adminId`. This parameter is required. The provided secret must have type "kubernetes.io/rbd".
* `pool`: Ceph RBD pool. Default is "rbd".
* `userId`: Ceph client ID that is used to map the RBD image. Default is the same as `adminId`.
* `userSecretName`: The name of Ceph Secret for `userId` to map RBD image. It must exist in the same namespace as PVCs. This parameter is required. The provided secret must have type "kubernetes.io/rbd", e.g. created in this way:
```
$ kubectl create secret generic ceph-secret --type="kubernetes.io/rbd" --from-literal=key='QVFEQ1pMdFhPUnQrSmhBQUFYaERWNHJsZ3BsMmNjcDR6RFZST0E9PQ==' --namespace=kube-system
```
#### Quobyte
```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: slow
provisioner: kubernetes.io/quobyte
parameters:
quobyteAPIServer: "http://138.68.74.142:7860"
registry: "138.68.74.142:7861"
adminSecretName: "quobyte-admin-secret"
adminSecretNamespace: "kube-system"
user: "root"
group: "root"
quobyteConfig: "BASE"
quobyteTenant: "DEFAULT"
```
* `quobyteAPIServer`: API Server of Quobyte in the format `http(s)://api-server:7860`
* `registry`: Quobyte registry to use to mount the volume. You can specify the registry as a ``<host>:<port>`` pair, or if you want to specify multiple registries, just put a comma between them, e.g. ``<host1>:<port>,<host2>:<port>,<host3>:<port>``. The host can be an IP address, or if you have working DNS you can also provide DNS names.
* `adminSecretNamespace`: The namespace for `adminSecretName`. Default is "default".
* `adminSecretName`: secret that holds information about the Quobyte user and the password to authenticate against the API server. The provided secret must have type "kubernetes.io/quobyte", e.g. created in this way:
```
$ kubectl create secret generic quobyte-admin-secret --type="kubernetes.io/quobyte" --from-literal=key='opensesame' --namespace=kube-system
```
* `user`: maps all access to this user. Default is "root".
* `group`: maps all access to this group. Default is "nfsnobody".
* `quobyteConfig`: use the specified configuration to create the volume. You can create a new configuration or modify an existing one with the Web console or the quobyte CLI. Default is "BASE".
* `quobyteTenant`: use the specified tenant ID to create/delete the volume. This Quobyte tenant has to be already present in Quobyte. Default is "DEFAULT".
#### Azure Disk
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: slow
provisioner: kubernetes.io/azure-disk
parameters:
skuName: Standard_LRS
location: eastus
storageAccount: azure_storage_account_name
```
* `skuName`: Azure storage account Sku tier. Default is empty.
* `location`: Azure storage account location. Default is empty.
* `storageAccount`: Azure storage account name. If storage account is not provided, all storage accounts associated with the resource group are searched to find one that matches `skuName` and `location`. If storage account is provided, it must reside in the same resource group as the cluster, and `skuName` and `location` are ignored.
#### Portworx Volume
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: portworx-io-priority-high
provisioner: kubernetes.io/portworx-volume
parameters:
repl: "1"
snap_interval: "70"
io_priority: "high"
```
* `fs`: filesystem to be laid out: [none/xfs/ext4] (default: `ext4`).
* `block_size`: block size in Kbytes (default: `32`).
* `repl`: number of synchronous replicas to be provided in the form of a replication factor [1..3] (default: `1`). A string is expected here, i.e. `"1"`, not `1`.
* `io_priority`: determines whether the volume will be created from higher-performance or lower-priority storage [high/medium/low] (default: `low`).
* `snap_interval`: clock/time interval in minutes for when to trigger snapshots. Snapshots are incremental based on the difference with the prior snapshot; 0 disables snaps (default: `0`). A string is expected here, i.e. `"70"`, not `70`.
* `aggregation_level`: specifies the number of chunks the volume would be distributed into; 0 indicates a non-aggregated volume (default: `0`). A string is expected here, i.e. `"0"`, not `0`.
* `ephemeral`: specifies whether the volume should be cleaned up after unmount or should be persistent. The `emptyDir` use case can set this value to true, and the persistent volume use case (such as for databases like Cassandra) should set it to false [true/false] (default: `false`). A string is expected here, i.e. `"true"`, not `true`.
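A sketch that spells out the remaining parameters as well; every value is passed as a string, and the values shown here are illustrative:

```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: portworx-db
provisioner: kubernetes.io/portworx-volume
parameters:
  fs: "ext4"
  block_size: "32"
  repl: "2"
  io_priority: "medium"
  snap_interval: "0"
  aggregation_level: "0"
  ephemeral: "false"
```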
#### ScaleIO
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: slow
provisioner: kubernetes.io/scaleio
parameters:
gateway: https://192.168.99.200:443/api
system: scaleio
protectionDomain: default
storagePool: default
  storageMode: ThinProvisioned
secretRef: sio-secret
readOnly: false
fsType: xfs
```
* `provisioner`: attribute is set to `kubernetes.io/scaleio`
* `gateway`: address to a ScaleIO API gateway (required)
* `system`: the name of the ScaleIO system (required)
* `protectionDomain`: the name of the ScaleIO protection domain
* `storagePool`: the name of the volume storage pool
* `storageMode`: the storage provision mode: `ThinProvisioned` (default) or `ThickProvisioned`
* `secretRef`: reference to a configured Secret object (required, see detail below)
* `readOnly`: specifies the access mode to the mounted volume
* `fsType`: the file system to use for the volume
The ScaleIO Kubernetes volume plugin requires a configured Secret object.
The secret must be created with type `kubernetes.io/scaleio` and be in the same namespace as the PVC that references it,
as shown in the following command:
```
$> kubectl create secret generic sio-secret --type="kubernetes.io/scaleio" --from-literal=username=sioadmin --from-literal=password=d2NABDNjMA== --namespace=default
```
## Writing Portable Configuration
If you're writing configuration templates or examples that run on a wide range of clusters
and need persistent storage, we recommend that you use the following pattern:
- Do include PersistentVolumeClaim objects in your bundle of config (alongside Deployments, ConfigMaps, etc).
- Do not include PersistentVolume objects in the config, since the user instantiating the config may not have
permission to create PersistentVolumes.
- Give the user the option of providing a storage class name when instantiating the template.
- If the user provides a storage class name, and the cluster is version 1.4 or newer, put that value into the `volume.beta.kubernetes.io/storage-class` annotation of the PVC (see the sketch at the end of this section).
  This will cause the PVC to match the right storage class if the cluster has StorageClasses enabled by the admin.
- If the user does not provide a storage class name or the cluster is version 1.3, then instead put a `volume.alpha.kubernetes.io/storage-class: default` annotation on the PVC.
- This will cause a PV to be automatically provisioned for the user with sane default characteristics on some clusters.
- Despite the word `alpha` in the name, the code behind this annotation has `beta` level support.
- Do not use `volume.beta.kubernetes.io/storage-class:` with any value, including the empty string, since it will prevent the DefaultStorageClass admission controller, if enabled,
from applying a default class to the PVC.
- In your tooling, do watch for PVCs that are not getting bound after some time and surface this to the user, as this may indicate that the cluster has no dynamic
storage support (in which case the user should create a matching PV) or the cluster has no storage system (in which case the user cannot deploy config requiring
PVCs).
- In the future, we expect most clusters to have `DefaultStorageClass` enabled, and to have some form of storage available. However, there may not be any
storage class names which work on all clusters, so continue to not set one by default.
At some point, the alpha annotation will cease to have meaning, but the unset `storageClassName` field on the PVC
will have the desired effect.
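For example, a portable PVC following this pattern might look like the sketch below, where `STORAGE_CLASS_NAME` stands for the value supplied by the user when instantiating the template:

```yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: my-app-data                                              # illustrative name
  annotations:
    volume.beta.kubernetes.io/storage-class: STORAGE_CLASS_NAME  # set only when the user provides a class name
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 8Gi
```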
---
assignees:
- erictune
title: Daemon Sets
---
* TOC
{:toc}
## What is a DaemonSet?
A _DaemonSet_ ensures that all (or some) nodes run a copy of a pod. As nodes are added to the
cluster, pods are added to them. As nodes are removed from the cluster, those pods are garbage
collected. Deleting a DaemonSet will clean up the pods it created.
Some typical uses of a DaemonSet are:
- running a cluster storage daemon, such as `glusterd`, `ceph`, on each node.
- running a logs collection daemon on every node, such as `fluentd` or `logstash`.
- running a node monitoring daemon on every node, such as [Prometheus Node Exporter](
https://github.com/prometheus/node_exporter), `collectd`, New Relic agent, or Ganglia `gmond`.
In a simple case, one DaemonSet, covering all nodes, would be used for each type of daemon.
A more complex setup might use multiple DaemonSets for a single type of daemon, but with
different flags and/or different memory and CPU requests for different hardware types.
## Writing a DaemonSet Spec
### Required Fields
As with all other Kubernetes config, a DaemonSet needs `apiVersion`, `kind`, and `metadata` fields. For
general information about working with config files, see [deploying applications](/docs/user-guide/deploying-applications/),
[configuring containers](/docs/user-guide/configuring-containers/), and [working with resources](/docs/user-guide/working-with-resources/) documents.
A DaemonSet also needs a [`.spec`](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status) section.
### Pod Template
The `.spec.template` is the only required field of the `.spec`.
The `.spec.template` is a [pod template](/docs/user-guide/replication-controller/#pod-template).
It has exactly the same schema as a [pod](/docs/user-guide/pods), except
it is nested and does not have an `apiVersion` or `kind`.
In addition to required fields for a pod, a pod template in a DaemonSet has to specify appropriate
labels (see [pod selector](#pod-selector)).
A pod template in a DaemonSet must have a [`RestartPolicy`](/docs/user-guide/pod-states)
equal to `Always`, or be unspecified, which defaults to `Always`.
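For example, a minimal DaemonSet manifest might look like the following sketch; the `fluentd-logging` name, labels, and image are illustrative placeholders, and the `extensions/v1beta1` API group is assumed for this Kubernetes version:

```yaml
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: fluentd-logging          # placeholder name
spec:
  template:
    metadata:
      labels:
        name: fluentd-logging    # labels used by the (defaulted) pod selector
    spec:
      containers:
      - name: fluentd
        image: fluentd:v0.12     # placeholder log-collection image
        resources:
          limits:
            memory: 200Mi
          requests:
            cpu: 100m
            memory: 200Mi
```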
### Pod Selector
The `.spec.selector` field is a pod selector. It works the same as the `.spec.selector` of
a [Job](/docs/concepts/jobs/run-to-completion-finite-workloads/) or other new resources.
The `spec.selector` is an object consisting of two fields:
* `matchLabels` - works the same as the `.spec.selector` of a [ReplicationController](/docs/user-guide/replication-controller/)
* `matchExpressions` - allows you to build more sophisticated selectors by specifying a key, a
list of values, and an operator that relates the key and values.
When the two are specified, the result is ANDed.
If the `.spec.selector` is specified, it must match the `.spec.template.metadata.labels`. If not
specified, they are defaulted to be equal. Configuration with these not matching will be rejected by the API.
Also, you should not normally create any pods whose labels match this selector, either directly, via
another DaemonSet, or via another controller such as a ReplicationController. Otherwise, the DaemonSet
controller will think that those pods were created by it. Kubernetes will not stop you from doing
this. One case where you might want to do this is to manually create a pod with a different value on
a node for testing.
### Running Pods on Only Some Nodes
If you specify a `.spec.template.spec.nodeSelector`, then the DaemonSet controller will
create pods on nodes which match that [node
selector](/docs/user-guide/node-selection/). Likewise, if you specify a `.spec.template.spec.affinity`,
then the DaemonSet controller will create pods on nodes which match that [node affinity](/docs/user-guide/node-selection/).
If you do not specify either, then the DaemonSet controller will create pods on all nodes.
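For example, the relevant fragment of a DaemonSet spec restricted to SSD-labeled nodes might look like this sketch (the `disktype=ssd` label is illustrative):

```yaml
spec:
  template:
    spec:
      nodeSelector:
        disktype: ssd    # daemon pods are created only on nodes carrying this label
```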
## How Daemon Pods are Scheduled
Normally, the machine that a pod runs on is selected by the Kubernetes scheduler. However, pods
created by the Daemon controller have the machine already selected (`.spec.nodeName` is specified
when the pod is created, so it is ignored by the scheduler). Therefore:
- the [`unschedulable`](/docs/admin/node/#manual-node-administration) field of a node is not respected
by the DaemonSet controller.
- the DaemonSet controller can make pods even when the scheduler has not been started, which can help with cluster
bootstrapping.
Daemon pods do respect [taints and tolerations](/docs/user-guide/node-selection/index.md#taints-and-tolerations-beta-feature), but they are
created with `NoExecute` tolerations for the `node.alpha.kubernetes.io/notReady` and `node.alpha.kubernetes.io/unreachable`
taints with no `tolerationSeconds`. This ensures that when the `TaintBasedEvictions` alpha feature is enabled,
they will not be evicted when there are node problems such as a network partition. (When the
`TaintBasedEvictions` feature is not enabled, they are also not evicted in these scenarios, but
due to hard-coded behavior of the NodeController rather than due to tolerations).
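For illustration, the automatically added tolerations would look roughly like the following fragment of a pod spec; this is only a sketch, and the `Exists` operator is assumed here:

```yaml
tolerations:
- key: node.alpha.kubernetes.io/notReady
  operator: Exists
  effect: NoExecute
- key: node.alpha.kubernetes.io/unreachable
  operator: Exists
  effect: NoExecute
```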
## Communicating with Daemon Pods
Some possible patterns for communicating with pods in a DaemonSet are:
- **Push**: Pods in the DaemonSet are configured to send updates to another service, such
as a stats database. They do not have clients.
- **NodeIP and Known Port**: Pods in the DaemonSet use a `hostPort`, so that the pods are reachable via the node IPs. Clients know the list of node IPs somehow, and know the port by convention.
- **DNS**: Create a [headless service](/docs/user-guide/services/#headless-services) with the same pod selector,
and then discover DaemonSets using the `endpoints` resource or retrieve multiple A records from
DNS.
- **Service**: Create a service with the same pod selector, and use the service to reach a
daemon on a random node. (There is no way to reach a specific node.)
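For the DNS pattern above, a headless service that shares the daemon pods' selector might look like this sketch (the name, selector, and port are placeholders):

```yaml
kind: Service
apiVersion: v1
metadata:
  name: daemon-discovery         # placeholder name
spec:
  clusterIP: None                # headless: DNS returns the daemon pod IPs directly
  selector:
    name: fluentd-logging        # must match the DaemonSet's pod labels
  ports:
  - port: 24224                  # placeholder port
```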
## Updating a DaemonSet
If node labels are changed, the DaemonSet will promptly add pods to newly matching nodes and delete
pods from newly not-matching nodes.
You can modify the pods that a DaemonSet creates. However, pods do not allow all
fields to be updated. Also, the DaemonSet controller will use the original template the next
time a node (even with the same name) is created.
You can delete a DaemonSet. If you specify `--cascade=false` with `kubectl`, then the pods
will be left on the nodes. You can then create a new DaemonSet with a different template.
The new DaemonSet with the different template will recognize all the existing pods as having
matching labels. It will not modify or delete them despite a mismatch in the pod template.
You will need to force new pod creation by deleting the pod or deleting the node.
In Kubernetes version 1.6 and later, you can [perform a rolling update](/docs/tasks/manage-daemon/update-daemon-set/) on a DaemonSet.
Future releases of Kubernetes will support controlled updating of nodes.
## Alternatives to DaemonSet
### Init Scripts
It is certainly possible to run daemon processes by directly starting them on a node (e.g. using
`init`, `upstartd`, or `systemd`). This is perfectly fine. However, there are several advantages to
running such processes via a DaemonSet:
- Ability to monitor and manage logs for daemons in the same way as applications.
- Same config language and tools (e.g. pod templates, `kubectl`) for daemons and applications.
- Future versions of Kubernetes will likely support integration between DaemonSet-created
pods and node upgrade workflows.
- Running daemons in containers with resource limits increases isolation between daemons and app
containers. However, this can also be accomplished by running the daemons in a container but not in a pod
(e.g. start directly via Docker).
### Bare Pods
It is possible to create pods directly which specify a particular node to run on. However,
a DaemonSet replaces pods that are deleted or terminated for any reason, such as in the case of
node failure or disruptive node maintenance, such as a kernel upgrade. For this reason, you should
use a DaemonSet rather than creating individual pods.
### Static Pods
It is possible to create pods by writing a file to a certain directory watched by Kubelet. These
are called [static pods](/docs/admin/static-pods/).
Unlike DaemonSet, static pods cannot be managed with kubectl
or other Kubernetes API clients. Static pods do not depend on the apiserver, making them useful
in cluster bootstrapping cases. Also, static pods may be deprecated in the future.
### Replication Controller
DaemonSets are similar to [Replication Controllers](/docs/user-guide/replication-controller) in that
they both create pods, and those pods have processes which are not expected to terminate (e.g. web servers,
storage servers).
Use a replication controller for stateless services, like frontends, where scaling up and down the
number of replicas and rolling out updates are more important than controlling exactly which host
the pod runs on. Use a DaemonSet when it is important that a copy of a pod always runs on
all or certain hosts, and when it needs to start before other pods.
---
assignees:
- bgrant0607
- janetkuo
title: Deployments
---
{:toc}
## What is a Deployment?
A _Deployment_ provides declarative updates for [Pods](/docs/user-guide/pods/) and [Replica Sets](/docs/user-guide/replicasets/) (the next-generation Replication Controller).
You only need to describe the desired state in a Deployment object, and the Deployment
controller will change the actual state to the desired state at a controlled rate for you.
You can define Deployments to create new resources, or replace existing ones
with new ones.
A typical use case is:
* Create a Deployment to bring up a Replica Set and Pods.
* Check the status of a Deployment to see if it succeeds or not.
* Later, update that Deployment to recreate the Pods (for example, to use a new image).
* Roll back to an earlier Deployment revision if the current Deployment isn't stable.
* Pause and resume a Deployment.
## Creating a Deployment
Here is an example Deployment. It creates a Replica Set to
bring up 3 nginx Pods.
{% include code.html language="yaml" file="nginx-deployment.yaml" ghlink="/docs/concepts/workloads/controllers/nginx-deployment.yaml" %}
Run the example by downloading the example file and then running this command:
```shell
$ kubectl create -f docs/user-guide/nginx-deployment.yaml --record
deployment "nginx-deployment" created
```
Setting the kubectl flag `--record` to `true` allows you to record the current command in the annotations of the resources being created or updated. It will be useful for future introspection; for example, to see the commands executed in each Deployment revision.
Then running `get` immediately will give:
```shell
$ kubectl get deployments
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
nginx-deployment 3 0 0 0 1s
```
This indicates that the Deployment's number of desired replicas is 3 (according to deployment's `.spec.replicas`), the number of current replicas (`.status.replicas`) is 0, the number of up-to-date replicas (`.status.updatedReplicas`) is 0, and the number of available replicas (`.status.availableReplicas`) is also 0.
Running the `get` again a few seconds later, should give:
```shell
$ kubectl get deployments
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
nginx-deployment 3 3 3 3 18s
```
This indicates that the Deployment has created all three replicas, and all replicas are up-to-date (contains the latest pod template) and available (pod status is ready for at least Deployment's `.spec.minReadySeconds`). Running `kubectl get rs` and `kubectl get pods` will show the Replica Set (RS) and Pods created.
```shell
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-2035384211 3 3 0 18s
```
You may notice that the name of the Replica Set is always `<the name of the Deployment>-<hash value of the pod template>`.
```shell
$ kubectl get pods --show-labels
NAME READY STATUS RESTARTS AGE LABELS
nginx-deployment-2035384211-7ci7o 1/1 Running 0 18s app=nginx,pod-template-hash=2035384211
nginx-deployment-2035384211-kzszj 1/1 Running 0 18s app=nginx,pod-template-hash=2035384211
nginx-deployment-2035384211-qqcnn 1/1 Running 0 18s app=nginx,pod-template-hash=2035384211
```
The created Replica Set will ensure that there are three nginx Pods at all times.
**Note:** You must specify an appropriate selector and pod template labels for a Deployment (in this case, `app = nginx`); that is, they must not overlap with other controllers (including other Deployments, Replica Sets, Replication Controllers, etc.). Kubernetes won't stop you from doing that, and if you end up with multiple controllers that have overlapping selectors, those controllers will fight with each other and won't behave correctly.
## Updating a Deployment
**Note:** a Deployment's rollout is triggered if and only if the Deployment's pod template (i.e. `.spec.template`) is changed,
e.g. updating labels or container images of the template. Other updates, such as scaling the Deployment, will not trigger a rollout.
Suppose that we now want to update the nginx Pods to start using the `nginx:1.9.1` image
instead of the `nginx:1.7.9` image.
```shell
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
deployment "nginx-deployment" image updated
```
Alternatively, we can `edit` the Deployment and change `.spec.template.spec.containers[0].image` from `nginx:1.7.9` to `nginx:1.9.1`:
```shell
$ kubectl edit deployment/nginx-deployment
deployment "nginx-deployment" edited
```
To see its rollout status, simply run:
```shell
$ kubectl rollout status deployment/nginx-deployment
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
deployment "nginx-deployment" successfully rolled out
```
After the rollout succeeds, you may want to `get` the Deployment:
```shell
$ kubectl get deployments
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
nginx-deployment 3 3 3 3 36s
```
The number of up-to-date replicas indicates that the Deployment has updated the replicas to the latest configuration.
The current replicas count indicates the total replicas this Deployment manages, and the available replicas count indicates the
number of current replicas that are available.
We can run `kubectl get rs` to see that the Deployment updated the Pods by creating a new Replica Set and scaling it up to 3 replicas, as well as scaling down the old Replica Set to 0 replicas.
```shell
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-1564180365 3 3 0 6s
nginx-deployment-2035384211 0 0 0 36s
```
Running `get pods` should now show only the new Pods:
```shell
$ kubectl get pods
NAME READY STATUS RESTARTS AGE
nginx-deployment-1564180365-khku8 1/1 Running 0 14s
nginx-deployment-1564180365-nacti 1/1 Running 0 14s
nginx-deployment-1564180365-z9gth 1/1 Running 0 14s
```
Next time we want to update these Pods, we only need to update the Deployment's pod template again.
A Deployment can ensure that only a certain number of Pods may be down while they are being updated. By
default, it ensures that at least 75% of the desired number of Pods are
up (25% max unavailable).
A Deployment can also ensure that only a certain number of Pods may be created above the desired number of Pods. By default, it ensures that at most 25% more than the desired number of Pods are up (25% max surge).
For example, if you look at the above Deployment closely, you will see that
it first created a new Pod, then deleted some old Pods and created new ones. It
does not kill old Pods until a sufficient number of new Pods have come up, and does not create new Pods until a sufficient number of old Pods have been killed. It makes sure that the number of available Pods is at least 2 and the number of total Pods is at most 4.
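These bounds come from the Deployment's rolling update strategy. A sketch of the relevant fragment of a Deployment spec, with the default values written out explicitly:

```yaml
spec:
  replicas: 3
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: "25%"    # at most 25% of the desired Pods may be unavailable during the update
      maxSurge: "25%"          # at most 25% more Pods than desired may exist during the update
```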
```shell
$ kubectl describe deployments
Name: nginx-deployment
Namespace: default
CreationTimestamp: Tue, 15 Mar 2016 12:01:06 -0700
Labels: app=nginx
Selector: app=nginx
Replicas: 3 updated | 3 total | 3 available | 0 unavailable
StrategyType: RollingUpdate
MinReadySeconds: 0
RollingUpdateStrategy: 1 max unavailable, 1 max surge
OldReplicaSets: <none>
NewReplicaSet: nginx-deployment-1564180365 (3/3 replicas created)
Events:
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
--------- -------- ----- ---- ------------- -------- ------ -------
36s 36s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-2035384211 to 3
23s 23s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 1
23s 23s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 2
23s 23s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 2
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 0
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 3
```
Here we see that when we first created the Deployment, it created a Replica Set (nginx-deployment-2035384211) and scaled it up to 3 replicas directly.
When we updated the Deployment, it created a new Replica Set (nginx-deployment-1564180365) and scaled it up to 1 and then scaled down the old Replica Set to 2, so that at least 2 Pods were available and at most 4 Pods were created at all times.
It then continued scaling up and down the new and the old Replica Set, with the same rolling update strategy. Finally, we'll have 3 available replicas in the new Replica Set, and the old Replica Set is scaled down to 0.
### Multiple Updates
Each time a new Deployment object is observed by the Deployment controller, a Replica Set is
created to bring up the desired Pods if there is no existing Replica Set doing so.
Existing Replica Sets controlling Pods whose labels match `.spec.selector` but whose
template does not match `.spec.template` are scaled down.
Eventually, the new Replica Set will be scaled to `.spec.replicas` and all old Replica Sets will
be scaled to 0.
If you update a Deployment while an existing rollout is in progress,
the Deployment will create a new Replica Set as per the update and start scaling that up, and
will roll over the Replica Set that it was scaling up previously -- it will add it to its list of old Replica Sets and will
start scaling it down.
For example, suppose you create a Deployment to create 5 replicas of `nginx:1.7.9`,
but then update the Deployment to create 5 replicas of `nginx:1.9.1`, when only 3
replicas of `nginx:1.7.9` had been created. In that case, the Deployment will immediately start
killing the 3 `nginx:1.7.9` Pods that it had created, and will start creating
`nginx:1.9.1` Pods. It will not wait for 5 replicas of `nginx:1.7.9` to be created
before changing course.
## Rolling Back a Deployment
Sometimes you may want to roll back a Deployment; for example, when the Deployment is not stable, such as crash looping.
By default, the two previous revisions of a Deployment's rollout history are kept in the system so that you can roll back anytime you want
(you can change that by modifying the [revision history limit](/docs/user-guide/deployments/#revision-history-limit)).
**Note:** a Deployment's revision is created when a Deployment's rollout is triggered. This means that the new revision is created
if and only if the Deployment's pod template (i.e. `.spec.template`) is changed, e.g. updating labels or container images of the template.
Other updates, such as scaling the Deployment, will not create a Deployment revision -- so that we can facilitate simultaneous manual- or
auto-scaling. This implies that when you roll back to an earlier revision, only the Deployment's pod template part will be rolled back.
Suppose that we made a typo while updating the Deployment, by putting the image name as `nginx:1.91` instead of `nginx:1.9.1`:
```shell
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.91
deployment "nginx-deployment" image updated
```
The rollout will be stuck.
```shell
$ kubectl rollout status deployments nginx-deployment
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
```
Press Ctrl-C to stop the above rollout status watch. For more information on stuck rollouts, [read more here](#deployment-status).
You will also see that both the number of old replicas (nginx-deployment-1564180365 and nginx-deployment-2035384211) and new replicas (nginx-deployment-3066724191) are 2.
```shell
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-1564180365 2 2 0 25s
nginx-deployment-2035384211 0 0 0 36s
nginx-deployment-3066724191 2 2 2 6s
```
Looking at the Pods created, you will see that the 2 Pods created by new Replica Set are stuck in an image pull loop.
```shell
$ kubectl get pods
NAME READY STATUS RESTARTS AGE
nginx-deployment-1564180365-70iae 1/1 Running 0 25s
nginx-deployment-1564180365-jbqqo 1/1 Running 0 25s
nginx-deployment-3066724191-08mng 0/1 ImagePullBackOff 0 6s
nginx-deployment-3066724191-eocby 0/1 ImagePullBackOff 0 6s
```
Note that the Deployment controller will stop the bad rollout automatically, and will stop scaling up the new Replica Set.
```shell
$ kubectl describe deployment
Name: nginx-deployment
Namespace: default
CreationTimestamp: Tue, 15 Mar 2016 14:48:04 -0700
Labels: app=nginx
Selector: app=nginx
Replicas: 2 updated | 3 total | 2 available | 2 unavailable
StrategyType: RollingUpdate
MinReadySeconds: 0
RollingUpdateStrategy: 1 max unavailable, 1 max surge
OldReplicaSets: nginx-deployment-1564180365 (2/2 replicas created)
NewReplicaSet: nginx-deployment-3066724191 (2/2 replicas created)
Events:
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
--------- -------- ----- ---- ------------- -------- ------ -------
1m 1m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-2035384211 to 3
22s 22s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 1
22s 22s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 2
22s 22s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 2
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 0
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 3
13s 13s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 1
13s 13s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-1564180365 to 2
13s 13s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 2
```
To fix this, we need to roll back to a previous revision of the Deployment that is stable.
### Checking Rollout History of a Deployment
First, check the revisions of this deployment:
```shell
$ kubectl rollout history deployment/nginx-deployment
deployments "nginx-deployment":
REVISION CHANGE-CAUSE
1 kubectl create -f docs/user-guide/nginx-deployment.yaml --record
2 kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
3 kubectl set image deployment/nginx-deployment nginx=nginx:1.91
```
Because we recorded the command while creating this Deployment using `--record`, we can easily see the changes we made in each revision.
To further see the details of each revision, run:
```shell
$ kubectl rollout history deployment/nginx-deployment --revision=2
deployments "nginx-deployment" revision 2
Labels: app=nginx
pod-template-hash=1159050644
Annotations: kubernetes.io/change-cause=kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
Containers:
nginx:
Image: nginx:1.9.1
Port: 80/TCP
QoS Tier:
cpu: BestEffort
memory: BestEffort
Environment Variables: <none>
No volumes.
```
### Rolling Back to a Previous Revision
Now we've decided to undo the current rollout and roll back to the previous revision:
```shell
$ kubectl rollout undo deployment/nginx-deployment
deployment "nginx-deployment" rolled back
```
Alternatively, you can roll back to a specific revision by specifying it with `--to-revision`:
```shell
$ kubectl rollout undo deployment/nginx-deployment --to-revision=2
deployment "nginx-deployment" rolled back
```
For more details about rollout related commands, read [`kubectl rollout`](/docs/user-guide/kubectl/kubectl_rollout/).
The Deployment is now rolled back to a previous stable revision. As you can see, a `DeploymentRollback` event for rolling back to revision 2 is generated from Deployment controller.
```shell
$ kubectl get deployment
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
nginx-deployment 3 3 3 3 30m
$ kubectl describe deployment
Name: nginx-deployment
Namespace: default
CreationTimestamp: Tue, 15 Mar 2016 14:48:04 -0700
Labels: app=nginx
Selector: app=nginx
Replicas: 3 updated | 3 total | 3 available | 0 unavailable
StrategyType: RollingUpdate
MinReadySeconds: 0
RollingUpdateStrategy: 1 max unavailable, 1 max surge
OldReplicaSets: <none>
NewReplicaSet: nginx-deployment-1564180365 (3/3 replicas created)
Events:
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
--------- -------- ----- ---- ------------- -------- ------ -------
30m 30m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-2035384211 to 3
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 1
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 2
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 2
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 0
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 2
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 1
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-1564180365 to 2
2m 2m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-3066724191 to 0
2m 2m 1 {deployment-controller } Normal DeploymentRollback Rolled back deployment "nginx-deployment" to revision 2
29m 2m 2 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 3
```
### Clean up Policy
You can set the `.spec.revisionHistoryLimit` field to specify how much revision history of this Deployment you want to keep. By default,
all revision history will be kept; explicitly setting this field to `0` disallows a Deployment from being rolled back.
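For example, to keep only a bounded amount of history, the field can be set in the Deployment spec (a fragment; the value `10` is illustrative):

```yaml
spec:
  replicas: 3
  revisionHistoryLimit: 10    # keep at most 10 old Replica Sets for rollback
```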
## Scaling a Deployment
You can scale a Deployment by using the following command:
```shell
$ kubectl scale deployment nginx-deployment --replicas 10
deployment "nginx-deployment" scaled
```
Assuming [horizontal pod autoscaling](/docs/user-guide/horizontal-pod-autoscaling/walkthrough.md) is enabled
in your cluster, you can set up an autoscaler for your Deployment and choose the minimum and maximum number of
Pods you want to run based on the CPU utilization of your existing Pods.
```shell
$ kubectl autoscale deployment nginx-deployment --min=10 --max=15 --cpu-percent=80
deployment "nginx-deployment" autoscaled
```
RollingUpdate Deployments support running multiple versions of an application at the same time. When you
or an autoscaler scales a RollingUpdate Deployment that is in the middle of a rollout (either in progress
or paused), then the Deployment controller will balance the additional replicas in the existing active
ReplicaSets (ReplicaSets with Pods) in order to mitigate risk. This is called *proportional scaling*.
For example, you are running a Deployment with 10 replicas, [maxSurge](#max-surge)=3, and [maxUnavailable](#max-unavailable)=2.
```shell
$ kubectl get deploy
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
nginx-deployment 10 10 10 10 50s
```
You update to a new image which happens to be unresolvable from inside the cluster.
```shell
$ kubectl set image deploy/nginx-deployment nginx=nginx:sometag
deployment "nginx-deployment" image updated
```
The image update starts a new rollout with ReplicaSet nginx-deployment-1989198191 but it's blocked due to the
maxUnavailable requirement that we mentioned above.
```shell
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-1989198191 5 5 0 9s
nginx-deployment-618515232 8 8 8 1m
```
Then a new scaling request for the Deployment comes along. The autoscaler increments the Deployment replicas
to 15. The Deployment controller needs to decide where to add these new 5 replicas. If we weren't using
proportional scaling, all 5 of them would be added in the new ReplicaSet. With proportional scaling, we
spread the additional replicas across all ReplicaSets. Bigger proportions go to the ReplicaSets with the
most replicas and lower proportions go to ReplicaSets with fewer replicas. Any leftovers are added to the
ReplicaSet with the most replicas. ReplicaSets with zero replicas are not scaled up.
In our example above, 3 replicas will be added to the old ReplicaSet and 2 replicas will be added to the
new ReplicaSet. The rollout process should eventually move all replicas to the new ReplicaSet, assuming
the new replicas become healthy.
```shell
$ kubectl get deploy
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
nginx-deployment 15 18 7 8 7m
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-1989198191 7 7 0 7m
nginx-deployment-618515232 11 11 11 7m
```
## Pausing and Resuming a Deployment
You can also pause a Deployment mid-rollout and then resume it. One use case for this is to support canary deployments.
Update the Deployment again and then pause the Deployment with `kubectl rollout pause`:
```shell
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1; kubectl rollout pause deployment/nginx-deployment
deployment "nginx-deployment" image updated
deployment "nginx-deployment" paused
```
Note that the Deployment keeps serving its current state, but new updates to the Deployment have no effect as long as the Deployment is paused.
The Deployment was still in progress when we paused it, so the scaling up and down of its ReplicaSets is paused too.
```shell
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-1564180365 2 2 2 1h
nginx-deployment-2035384211 2 2 0 1h
nginx-deployment-3066724191 0 0 0 1h
```
In a separate terminal, watch for rollout status changes and you'll see the rollout won't continue:
```shell
$ kubectl rollout status deployment/nginx-deployment
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
```
To resume the Deployment, simply do `kubectl rollout resume`:
```shell
$ kubectl rollout resume deployment/nginx-deployment
deployment "nginx-deployment" resumed
```
Then the Deployment will continue and finish the rollout:
```shell
$ kubectl rollout status deployment/nginx-deployment
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
Waiting for deployment spec update to be observed...
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
deployment nginx-deployment successfully rolled out
```
```shell
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-1564180365 3 3 3 1h
nginx-deployment-2035384211 0 0 0 1h
nginx-deployment-3066724191 0 0 0 1h
```
Note: You cannot roll back a paused Deployment until you resume it.
## Deployment status
A Deployment enters various states during its lifecycle. It can be [progressing](#progressing-deployment) while rolling out a new ReplicaSet,
it can be [complete](#complete-deployment), or it can [fail to progress](#failed-deployment).
### Progressing Deployment
Kubernetes marks a Deployment as _progressing_ when one of the following tasks is performed:
* The Deployment is in the process of creating a new ReplicaSet.
* The Deployment is scaling up an existing ReplicaSet.
* The Deployment is scaling down an existing ReplicaSet.
* New pods become available.
You can monitor the progress for a Deployment by using `kubectl rollout status`.
### Complete Deployment
Kubernetes marks a Deployment as _complete_ when it has the following characteristics:
* The Deployment has minimum availability. Minimum availability means that the Deployment's number of available replicas
equals or exceeds the number required by the Deployment strategy.
* All of the replicas associated with the Deployment have been updated to the latest version you've specified, meaning any
updates you've requested have been completed.
* No old pods for the Deployment are running.
You can check if a Deployment has completed by using `kubectl rollout status`. If the rollout completed successfully, `kubectl rollout status` returns a zero exit code.
```shell
$ kubectl rollout status deploy/nginx
Waiting for rollout to finish: 2 of 3 updated replicas are available...
deployment "nginx" successfully rolled out
$ echo $?
0
```
### Failed Deployment
Your Deployment may get stuck trying to deploy its newest ReplicaSet without ever completing. This can occur due to some of the following factors:
* Insufficient quota
* Readiness probe failures
* Image pull errors
* Insufficient permissions
* Limit ranges
* Application runtime misconfiguration
One way you can detect this condition is to specify a deadline parameter in your Deployment spec: ([`spec.progressDeadlineSeconds`](#progress-deadline-seconds)). `spec.progressDeadlineSeconds` denotes the number of seconds the Deployment controller waits before indicating (via the Deployment status) that the Deployment progress has stalled.
The following `kubectl` command sets the spec with `progressDeadlineSeconds` to make the controller report lack of progress for a Deployment after 10 minutes:
```shell
$ kubectl patch deployment/nginx-deployment -p '{"spec":{"progressDeadlineSeconds":600}}'
"nginx-deployment" patched
```
Once the deadline has been exceeded, the Deployment controller adds a DeploymentCondition with the following attributes to
the Deployment's `status.conditions`:
* Type=Progressing
* Status=False
* Reason=ProgressDeadlineExceeded
See the [Kubernetes API conventions](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#typical-status-properties) for more information on status conditions.
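If you want to inspect these conditions directly, here is a sketch of a JSONPath query against the `nginx-deployment` used above (the query string is illustrative, not the only way to do this):
```shell
# Print the current reason of the Progressing condition
$ kubectl get deployment nginx-deployment -o jsonpath='{.status.conditions[?(@.type=="Progressing")].reason}'
```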
Note that in version 1.5, Kubernetes will take no action on a stalled Deployment other than to report a status condition with
`Reason=ProgressDeadlineExceeded`.
**Note:** If you pause a Deployment, Kubernetes does not check progress against your specified deadline. You can safely pause a Deployment in the middle of a rollout and resume without triggering the condition for exceeding the deadline.
You may experience transient errors with your Deployments, either due to a low timeout that you have set or due to any other kind
of error that can be treated as transient. For example, let's suppose you have insufficient quota. If you describe the Deployment
you will notice the following section:
```shell
$ kubectl describe deployment nginx-deployment
<...>
Conditions:
Type Status Reason
---- ------ ------
Available True MinimumReplicasAvailable
Progressing True ReplicaSetUpdated
ReplicaFailure True FailedCreate
<...>
```
If you run `kubectl get deployment nginx-deployment -o yaml`, the Deployment status might look like this:
```
status:
availableReplicas: 2
conditions:
- lastTransitionTime: 2016-10-04T12:25:39Z
lastUpdateTime: 2016-10-04T12:25:39Z
message: Replica set "nginx-deployment-4262182780" is progressing.
reason: ReplicaSetUpdated
status: "True"
type: Progressing
- lastTransitionTime: 2016-10-04T12:25:42Z
lastUpdateTime: 2016-10-04T12:25:42Z
message: Deployment has minimum availability.
reason: MinimumReplicasAvailable
status: "True"
type: Available
- lastTransitionTime: 2016-10-04T12:25:39Z
lastUpdateTime: 2016-10-04T12:25:39Z
message: 'Error creating: pods "nginx-deployment-4262182780-" is forbidden: exceeded quota:
object-counts, requested: pods=1, used: pods=3, limited: pods=2'
reason: FailedCreate
status: "True"
type: ReplicaFailure
observedGeneration: 3
replicas: 2
unavailableReplicas: 2
```
Eventually, once the Deployment progress deadline is exceeded, Kubernetes updates the status and the reason for the Progressing condition:
```
Conditions:
Type Status Reason
---- ------ ------
Available True MinimumReplicasAvailable
Progressing False ProgressDeadlineExceeded
ReplicaFailure True FailedCreate
```
You can address an issue of insufficient quota by scaling down your Deployment, by scaling down other controllers you may be running,
or by increasing quota in your namespace. If you satisfy the quota conditions and the Deployment controller then completes the Deployment
rollout, you'll see the Deployment's status update with a successful condition (`Status=True` and `Reason=NewReplicaSetAvailable`).
```
Conditions:
Type Status Reason
---- ------ ------
Available True MinimumReplicasAvailable
Progressing True NewReplicaSetAvailable
```
`Type=Available` with `Status=True` means that your Deployment has minimum availability. Minimum availability is dictated
by the parameters specified in the deployment strategy. `Type=Progressing` with `Status=True` means that your Deployment
is either in the middle of a rollout and it is progressing or that it has successfully completed its progress and the minimum
required new replicas are available (see the Reason of the condition for the particulars - in our case
`Reason=NewReplicaSetAvailable` means that the Deployment is complete).
You can check if a Deployment has failed to progress by using `kubectl rollout status`. `kubectl rollout status` returns a non-zero exit code if the Deployment has exceeded the progression deadline.
```shell
$ kubectl rollout status deploy/nginx
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
error: deployment "nginx" exceeded its progress deadline
$ echo $?
1
```
### Operating on a failed deployment
All actions that apply to a complete Deployment also apply to a failed Deployment. You can scale it up/down, roll back
to a previous revision, or even pause it if you need to apply multiple tweaks in the Deployment pod template.
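For example, assuming the stalled `nginx-deployment` from the examples above, any of the following still works:
```shell
# Scale the stalled Deployment down while you investigate
$ kubectl scale deployment nginx-deployment --replicas=3

# Roll back to the previous revision
$ kubectl rollout undo deployment/nginx-deployment

# Pause it so you can batch several fixes into a single rollout
$ kubectl rollout pause deployment/nginx-deployment
```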
## Use Cases
### Canary Deployment
If you want to roll out releases to a subset of users or servers using the Deployment, you can create multiple Deployments, one for each release,
following the canary pattern described in [managing resources](/docs/concepts/cluster-administration/manage-deployment/#canary-deployments).
## Writing a Deployment Spec
As with all other Kubernetes configs, a Deployment needs `apiVersion`, `kind`, and
`metadata` fields. For general information about working with config files,
see [deploying applications](/docs/user-guide/deploying-applications), [configuring containers](/docs/user-guide/configuring-containers), and [using kubectl to manage resources](/docs/user-guide/working-with-resources) documents.
A Deployment also needs a [`.spec` section](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status).
### Pod Template
The `.spec.template` is the only required field of the `.spec`.
The `.spec.template` is a [pod template](/docs/user-guide/replication-controller/#pod-template). It has exactly
the same schema as a [Pod](/docs/user-guide/pods), except it is nested and does not have an
`apiVersion` or `kind`.
In addition to required fields for a Pod, a pod template in a Deployment must specify appropriate
labels (i.e. don't overlap with other controllers, see [selector](#selector)) and an appropriate restart policy.
Only a [`.spec.template.spec.restartPolicy`](/docs/user-guide/pod-states/) equal to `Always` is allowed, which is the default
if not specified.
### Replicas
`.spec.replicas` is an optional field that specifies the number of desired Pods. It defaults
to 1.
### Selector
`.spec.selector` is an optional field that specifies a [label selector](/docs/user-guide/labels/#label-selectors) for the Pods
targeted by this deployment.
If specified, `.spec.selector` must match `.spec.template.metadata.labels`, or it will
be rejected by the API. If `.spec.selector` is unspecified, `.spec.selector.matchLabels` will be defaulted to
`.spec.template.metadata.labels`.
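As a minimal sketch, here is the nginx Deployment from the beginning of this page with an explicit selector that matches its pod template labels:
```yaml
apiVersion: apps/v1beta1
kind: Deployment
metadata:
  name: nginx-deployment
spec:
  replicas: 3
  selector:
    matchLabels:
      app: nginx        # must match the pod template labels below
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: nginx:1.7.9
        ports:
        - containerPort: 80
```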
A Deployment may kill Pods whose labels match the selector if their
template is different from `.spec.template` or if the total number of such Pods
exceeds `.spec.replicas`. It brings up new Pods with `.spec.template` if the
number of Pods is less than the desired number.
Note that you should not create other Pods whose labels match this selector, either directly, via another Deployment, or via another controller such as a ReplicaSet or a ReplicationController. Otherwise, the Deployment will think that those Pods were created by it. Kubernetes will not stop you from doing this.
If you have multiple controllers with overlapping selectors, they will fight with each other and won't behave correctly.
### Strategy
`.spec.strategy` specifies the strategy used to replace old Pods by new ones.
`.spec.strategy.type` can be "Recreate" or "RollingUpdate". "RollingUpdate" is
the default value.
#### Recreate Deployment
All existing Pods are killed before new ones are created when
`.spec.strategy.type==Recreate`.
#### Rolling Update Deployment
The Deployment updates Pods in a [rolling update](/docs/tasks/run-application/rolling-update-replication-controller/) fashion
when `.spec.strategy.type==RollingUpdate`.
You can specify `maxUnavailable` and `maxSurge` to control
the rolling update process.
##### Max Unavailable
`.spec.strategy.rollingUpdate.maxUnavailable` is an optional field that specifies the
maximum number of Pods that can be unavailable during the update process.
The value can be an absolute number (e.g. 5) or a percentage of desired Pods
(e.g. 10%).
The absolute number is calculated from the percentage by rounding down.
This cannot be 0 if `.spec.strategy.rollingUpdate.maxSurge` is 0.
By default, a fixed value of 1 is used.
For example, when this value is set to 30%, the old Replica Set can be scaled down to
70% of desired Pods immediately when the rolling update starts. Once new Pods are
ready, old Replica Set can be scaled down further, followed by scaling up the new Replica Set,
ensuring that the total number of Pods available at all times during the
update is at least 70% of the desired Pods.
##### Max Surge
`.spec.strategy.rollingUpdate.maxSurge` is an optional field that specifies the
maximum number of Pods that can be created above the desired number of Pods.
Value can be an absolute number (e.g. 5) or a percentage of desired Pods
(e.g. 10%).
This cannot be 0 if `MaxUnavailable` is 0.
The absolute number is calculated from percentage by rounding up.
By default, a value of 1 is used.
For example, when this value is set to 30%, the new Replica Set can be scaled up immediately when
the rolling update starts, such that the total number of old and new Pods do not exceed
130% of desired Pods. Once old Pods have been killed,
the new Replica Set can be scaled up further, ensuring that the total number of Pods running
at any time during the update is at most 130% of desired Pods.
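Putting the two fields together, here is a sketch of the strategy portion of a Deployment spec using the 30% values from the examples above (only the `.spec.strategy` fragment is shown):
```yaml
strategy:
  type: RollingUpdate
  rollingUpdate:
    maxUnavailable: 30%   # at least 70% of the desired Pods stay available
    maxSurge: 30%         # at most 130% of the desired Pods run during the update
```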
### Progress Deadline Seconds
`.spec.progressDeadlineSeconds` is an optional field that specifies the number of seconds you want
to wait for your Deployment to progress before the system reports back that the Deployment has
[failed progressing](#failed-deployment) - surfaced as a condition with `Type=Progressing`, `Status=False`,
and `Reason=ProgressDeadlineExceeded` in the status of the resource. The deployment controller will keep
retrying the Deployment. In the future, once automatic rollback will be implemented, the deployment
controller will roll back a Deployment as soon as it observes such a condition.
If specified, this field needs to be greater than `.spec.minReadySeconds`.
### Min Ready Seconds
`.spec.minReadySeconds` is an optional field that specifies the
minimum number of seconds for which a newly created Pod should be ready
without any of its containers crashing, for it to be considered available.
This defaults to 0 (the Pod is considered available as soon as it is ready).
To learn more about when a Pod is considered ready, see [Container Probes](/docs/user-guide/pod-states/#container-probes).
### Rollback To
`.spec.rollbackTo` is an optional field with the configuration the Deployment is rolling back to. Setting this field will trigger a rollback, and this field will be cleared every time a rollback is done.
#### Revision
`.spec.rollbackTo.revision` is an optional field specifying the revision to roll back to. This defaults to 0, meaning roll back to the last revision in history.
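In practice you rarely set this field by hand; `kubectl rollout undo` sets it for you. For example, rolling back to revision 2 as shown earlier on this page:
```shell
# Equivalent to setting .spec.rollbackTo.revision to 2
$ kubectl rollout undo deployment/nginx-deployment --to-revision=2
```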
### Revision History Limit
A Deployment's revision history is stored in the ReplicaSets it controls.
`.spec.revisionHistoryLimit` is an optional field that specifies the number of old ReplicaSets to retain to allow rollback. Its ideal value depends on the frequency and stability of new Deployments. If this field is not set, all old ReplicaSets are kept by default, consuming resources in `etcd` and crowding the output of `kubectl get rs`. The configuration of each Deployment revision is stored in its ReplicaSets; therefore, once an old ReplicaSet is deleted, you lose the ability to roll back to that revision of the Deployment.
More specifically, setting this field to zero means that all old ReplicaSets with 0 replicas will be cleaned up.
In this case, a new Deployment rollout cannot be undone, since its revision history is cleaned up.
### Paused
`.spec.paused` is an optional boolean field for pausing and resuming a Deployment. It defaults to false (a Deployment is not paused).
## Alternative to Deployments
### kubectl rolling update
[Kubectl rolling update](/docs/user-guide/kubectl/kubectl_rolling-update) updates Pods and Replication Controllers in a similar fashion.
But Deployments are recommended, since they are declarative, server-side, and have additional features, such as rolling back to any previous revision even after the rolling update is done.

View File

@ -0,0 +1,45 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: frontend
# these labels can be applied automatically
# from the labels in the pod template if not set
# labels:
# app: guestbook
# tier: frontend
spec:
# this replicas value is default
# modify it according to your case
replicas: 3
# selector can be applied automatically
# from the labels in the pod template if not set,
# but we are specifying the selector here to
# demonstrate its usage.
selector:
matchLabels:
tier: frontend
matchExpressions:
- {key: tier, operator: In, values: [frontend]}
template:
metadata:
labels:
app: guestbook
tier: frontend
spec:
containers:
- name: php-redis
image: gcr.io/google_samples/gb-frontend:v3
resources:
requests:
cpu: 100m
memory: 100Mi
env:
- name: GET_HOSTS_FROM
value: dns
# If your cluster config does not include a dns service, then to
# instead access environment variables to find service host
# info, comment out the 'value: dns' line above, and uncomment the
# line below.
# value: env
ports:
- containerPort: 80

View File

@ -0,0 +1,11 @@
apiVersion: autoscaling/v1
kind: HorizontalPodAutoscaler
metadata:
name: frontend-scaler
spec:
scaleTargetRef:
kind: ReplicaSet
name: frontend
minReplicas: 3
maxReplicas: 10
targetCPUUtilizationPercentage: 50

View File

@ -0,0 +1,16 @@
apiVersion: apps/v1beta1
kind: Deployment
metadata:
name: nginx-deployment
spec:
replicas: 3
template:
metadata:
labels:
app: nginx
spec:
containers:
- name: nginx
image: nginx:1.7.9
ports:
- containerPort: 80

View File

@ -15,4 +15,430 @@ redirect_from:
__Warning:__ Starting in Kubernetes version 1.5, PetSet has been renamed to [StatefulSet](/docs/concepts/abstractions/controllers/statefulsets). To use (or continue to use) PetSet in Kubernetes 1.5, you _must_ [migrate](/docs/tasks/manage-stateful-set/upgrade-pet-set-to-stateful-set/) your existing PetSets to StatefulSets. For information on working with StatefulSet, see the tutorial on [how to run replicated stateful applications](/docs/tutorials/stateful-application/run-replicated-stateful-application).
__This document has been deprecated__, but can still apply if you're using
Kubernetes version 1.4 or earlier.
* TOC
{:toc}
__Terminology__
Throughout this doc you will see a few terms that are sometimes used interchangeably elsewhere, which might cause confusion. This section attempts to clarify them.
* Node: A single virtual or physical machine in a Kubernetes cluster.
* Cluster: A group of nodes in a single failure domain, unless mentioned otherwise.
* Persistent Volume Claim (PVC): A request for storage, typically a [persistent volume](/docs/user-guide/persistent-volumes/walkthrough/).
* Host name: The hostname attached to the UTS namespace of the pod, i.e. the output of `hostname` in the pod.
* DNS/Domain name: A *cluster local* domain name resolvable using standard methods (e.g.: [gethostbyname](http://linux.die.net/man/3/gethostbyname)).
* Ordinality: the property of being "ordinal", or occupying a position in a sequence.
* Pet: a single member of a PetSet; more generally, a stateful application.
* Peer: a process running a server, capable of communicating with other such processes.
__Prerequisites__
This doc assumes familiarity with the following Kubernetes concepts:
* [Pods](/docs/user-guide/pods/single-container/)
* [Cluster DNS](/docs/admin/dns/)
* [Headless Services](/docs/user-guide/services/#headless-services)
* [Persistent Volumes](/docs/concepts/storage/volumes/)
* [Persistent Volume Provisioning](http://releases.k8s.io/{{page.githubbranch}}/examples/persistent-volume-provisioning/README.md)
You need a working Kubernetes cluster at version >= 1.3, with a healthy DNS [cluster addon](http://releases.k8s.io/{{page.githubbranch}}/cluster/addons/README.md) at version >= 15. You cannot use PetSet on a hosted Kubernetes provider that has disabled `alpha` resources.
## What is a PetSet?
In Kubernetes, most pod management abstractions group pods into disposable units of work that compose a microservice. Replication controllers, for example, are designed with a weak guarantee: that there should be N replicas of a particular pod template. The pods are treated as stateless units; if one of them is unhealthy or superseded by a newer version, the system just disposes of it.
```
          foo.default.svc.cluster.local
                   |service|
                  /         \
      | pod-asdf |           | pod-zxcv |
```
A PetSet, in contrast, is a group of stateful pods that require a stronger notion of identity. The document refers to these as "clustered applications".
```
      *.foo.default.svc.cluster.local
        | mysql-0 |  <->  | mysql-1 |
          [pv 0]            [pv 1]
```
The coordinated deployment of clustered applications is notoriously hard. They require stronger notions of identity and membership, which they use in opaque internal protocols, and are especially prone to race conditions and deadlock. Traditionally, administrators have deployed these applications by leveraging nodes as stable, long-lived entities with persistent storage and static IPs.
The goal of PetSet is to decouple this dependency by assigning identities to individual instances of an application that are not anchored to the underlying physical infrastructure. For the rest of this document we will refer to these entities as "Pets". Our use of this term is borrowed from the older "Pets vs Cattle" analogy.
__Relationship between Pets and Pods__: PetSet requires there be {0..N-1} Pets. Each Pet has a deterministic name - PetSetName-Ordinal, and a unique identity. Each Pet has at most one pod, and each PetSet has at most one Pet with a given identity.
## When to use PetSet?
A PetSet ensures that a specified number of "pets" with unique identities are running at any given time. The identity of a Pet is comprised of:
* a stable hostname, available in DNS
* an ordinal index
* stable storage: linked to the ordinal & hostname
These properties are useful in deploying stateful applications. However most stateful applications are also clustered, meaning they form groups with strict membership requirements that rely on stored state. PetSet also helps with the 2 most common problems encountered managing such clustered applications:
* discovery of peers for quorum
* startup/teardown ordering
Only use PetSet if your application requires some or all of these properties. Managing pods as stateless replicas is vastly easier.
Example workloads for PetSet:
* Databases like MySQL or PostgreSQL that require a single instance attached to an NFS persistent volume at any time
* Clustered software like Zookeeper, Etcd, or Elasticsearch that require stable membership.
## Alpha limitations
Before you start deploying applications as PetSets, there are a few limitations you should understand.
* PetSet is an *alpha* resource, not available in any Kubernetes release prior to 1.3.
* As with all alpha/beta resources, it can be disabled through the `--runtime-config` option passed to the apiserver, and in fact most likely will be disabled on hosted offerings of Kubernetes.
* The only updatable field on a PetSet is `replicas`
* The storage for a given pet must either be provisioned by a [persistent volume provisioner](http://releases.k8s.io/{{page.githubbranch}}/examples/persistent-volume-provisioning/README.md) based on the requested `storage class`, or pre-provisioned by an admin. Note that persistent volume provisioning is also currently in alpha.
* Deleting and/or scaling a PetSet down will *not* delete the volumes associated with the PetSet. This is done to ensure safety first: your data is more valuable than an automatic purge of all related PetSet resources. **Deleting the Persistent Volume Claims will result in a deletion of the associated volumes**.
* All PetSets currently require a "governing service", or a Service responsible for the network identity of the pets. The user is responsible for this Service.
* Updating an existing PetSet is currently a manual process, meaning you either need to deploy a new PetSet with the new image version, or orphan Pets one by one, update their image, and join them back to the cluster.
## Example PetSet
We'll create a basic PetSet to demonstrate how Pets are assigned unique and "sticky" identities.
{% include code.html language="yaml" file="petset.yaml" ghlink="/docs/concepts/workloads/controllers/petset.yaml" %}
Saving this config into `petset.yaml` and submitting it to a Kubernetes cluster should create the defined PetSet and Pets it manages:
```shell
$ kubectl create -f petset.yaml
service "nginx" created
petset "nginx" created
```
## Pet Identity
The identity of a Pet sticks to it, regardless of which node it's (re) scheduled on. We can examine the identity of the pets we just created.
### Ordinal index
You should see 2 pods with predictable names, formatted as `$(petset name)-$(ordinal index assigned by petset controller)`:
```shell
$ kubectl get po
NAME READY STATUS RESTARTS AGE
web-0 1/1 Running 0 10m
web-1 1/1 Running 0 10m
```
### Stable storage
There are 2 persistent volumes, one per pod. They are created automatically by the PetSet based on the `volumeClaimTemplates` field:
```shell
$ kubectl get pv
NAME CAPACITY ACCESSMODES STATUS CLAIM REASON AGE
pvc-90234946-3717-11e6-a46e-42010af00002 1Gi RWO Bound default/www-web-0 11m
pvc-902733c2-3717-11e6-a46e-42010af00002 1Gi RWO Bound default/www-web-1 11m
```
### Network identity
The network identity has 2 parts. First, we created a headless Service that controls the domain within which we create Pets. The domain managed by this Service takes the form: `$(service name).$(namespace).svc.cluster.local`, where "cluster.local" is the [cluster domain](http://releases.k8s.io/{{page.githubbranch}}/build/kube-dns/README.md#how-do-i-configure-it). As each pet is created, it gets a matching DNS subdomain, taking the form: `$(petname).$(governing service domain)`, where the governing service is defined by the `serviceName` field on the PetSet.
Here are some examples of choices for Cluster Domain, Service name, PetSet name, and how that affects the DNS names for the Pets and the hostnames in the Pet's pods:
Cluster Domain | Service (ns/name) | PetSet (ns/name) | PetSet Domain | Pet DNS | Pet Hostname |
-------------- | ----------------- | ----------------- | -------------- | ------- | ------------ |
cluster.local | default/nginx | default/web | nginx.default.svc.cluster.local | web-{0..N-1}.nginx.default.svc.cluster.local | web-{0..N-1} |
cluster.local | foo/nginx | foo/web | nginx.foo.svc.cluster.local | web-{0..N-1}.nginx.foo.svc.cluster.local | web-{0..N-1} |
kube.local | foo/nginx | foo/web | nginx.foo.svc.kube.local | web-{0..N-1}.nginx.foo.svc.kube.local | web-{0..N-1} |
Note that Cluster Domain will be set to `cluster.local` unless [otherwise configured](http://releases.k8s.io/{{page.githubbranch}}/build/kube-dns/README.md#how-do-i-configure-it).
Let's verify our assertion with a simple test.
```shell
$ kubectl get svc
NAME CLUSTER-IP EXTERNAL-IP PORT(S) AGE
nginx None <none> 80/TCP 12m
...
```
First, the PetSet provides a stable hostname:
```shell
$ for i in 0 1; do kubectl exec web-$i -- sh -c 'hostname'; done
web-0
web-1
```
And the hostname is linked to the in-cluster DNS address:
```shell
$ kubectl run -i --tty --image busybox dns-test --restart=Never /bin/sh
dns-test # nslookup web-0.nginx
Server: 10.0.0.10
Address 1: 10.0.0.10 kube-dns.kube-system.svc.cluster.local
Name: web-0.nginx
Address 1: 10.180.3.5
dns-test # nslookup web-1.nginx
Server: 10.0.0.10
Address 1: 10.0.0.10 kube-dns.kube-system.svc.cluster.local
Name: web-1.nginx
Address 1: 10.180.0.9
```
The containers are running nginx webservers, which by default will look for an index.html file in `/usr/share/nginx/html/index.html`. That directory is backed by a `PersistentVolume` created by the PetSet. So let's write our hostname there:
```shell
$ for i in 0 1; do
kubectl exec web-$i -- sh -c 'echo $(hostname) > /usr/share/nginx/html/index.html';
done
```
And verify each webserver serves its own hostname:
```shell
$ for i in 0 1; do kubectl exec -it web-$i -- curl localhost; done
web-0
web-1
```
Now delete all pods in the petset:
```shell
$ kubectl delete po -l app=nginx
pod "web-0" deleted
pod "web-1" deleted
```
Wait for them to come back up, and try to retrieve the previously written hostname through the DNS name of the peer. They match, because the storage, DNS name, and hostname stick to the Pet no matter where it gets scheduled:
```shell
$ kubectl exec -it web-1 -- curl web-0.nginx
web-0
$ kubectl exec -it web-0 -- curl web-1.nginx
web-1
```
## Peer discovery
A pet can piece together its own identity:
1. Use the [downward api](/docs/user-guide/downward-api/) to find its pod name
2. Run `hostname` to find its DNS name
3. Run `mount` or `df` to find its volumes (usually this is unnecessary)
It's not necessary to "discover" the governing Service of a PetSet; since it's known at creation time, you can simply pass it down through an [environment variable](/docs/user-guide/environment-guide).
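As a sketch of the above (the environment variable names are illustrative, not required by PetSet), a container in the podTemplate can learn its own pod name through the downward API and receive the governing Service name as a plain environment variable:
```yaml
containers:
- name: nginx
  image: gcr.io/google_containers/nginx-slim:0.8
  env:
  - name: POD_NAME               # the pod name doubles as the Pet's hostname
    valueFrom:
      fieldRef:
        fieldPath: metadata.name
  - name: GOVERNING_SERVICE      # passed down at creation time
    value: "nginx"
```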
Usually pets also need to find their peers. In the previous nginx example, we just used `kubectl` to get the names of existing pods, and as humans, we could tell which ones belonged to a given PetSet. Another way to find peers is by contacting the API server, just like `kubectl`, but that has several disadvantages (you end up implementing a Kubernetes specific init system that runs as pid 1 in your application container).
PetSet gives you a way to discover your peers using DNS records. To illustrate this we can use the previous example (note: one usually doesn't `apt-get` in a container).
```shell
$ kubectl exec -it web-0 /bin/sh
web-0 # apt-get update && apt-get install -y dnsutils
...
web-0 # nslookup -type=srv nginx.default
Server: 10.0.0.10
Address: 10.0.0.10#53
nginx.default.svc.cluster.local service = 10 50 0 web-1.ub.default.svc.cluster.local.
nginx.default.svc.cluster.local service = 10 50 0 web-0.ub.default.svc.cluster.local.
```
## Updating a PetSet
You cannot update any field of the PetSet except `spec.replicas` and the `containers` in the podTemplate. Updating `spec.replicas` will scale the PetSet; updating `containers` will not have any effect until a Pet is deleted, at which time it is recreated with the modified podTemplate.
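For example, here is a sketch of updating the image of the first container in the podTemplate with a JSON patch (running Pets keep the old image until they are deleted and recreated):
```shell
$ kubectl patch petset web --type='json' \
    -p='[{"op":"replace","path":"/spec/template/spec/containers/0/image","value":"gcr.io/google_containers/nginx-slim:0.7"}]'
```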
## Scaling a PetSet
You can scale a PetSet by updating the "replicas" field. Note however that the controller will only:
1. Create one pet at a time, in order from {0..N-1}, and wait till each one is in [Running and Ready](/docs/user-guide/pod-states) before creating the next
2. Delete one pet at a time, in reverse order from {N-1..0}, and wait till each one is completely shutdown (past its [terminationGracePeriodSeconds](/docs/user-guide/pods/index#termination-of-pods)) before deleting the next
```shell
$ kubectl get po
NAME READY STATUS RESTARTS AGE
web-0 1/1 Running 0 30s
web-1 1/1 Running 0 36s
$ kubectl patch petset web -p '{"spec":{"replicas":3}}'
"web" patched
$ kubectl get po
NAME READY STATUS RESTARTS AGE
web-0 1/1 Running 0 40s
web-1 1/1 Running 0 46s
web-2 1/1 Running 0 8s
```
You can also use the `kubectl scale` command:
```shell
$ kubectl get petset
NAME DESIRED CURRENT AGE
web 3 3 24m
$ kubectl scale petset web --replicas=5
petset "web" scaled
$ kubectl get po --watch-only
NAME READY STATUS RESTARTS AGE
web-0 1/1 Running 0 10m
web-1 1/1 Running 0 27m
web-2 1/1 Running 0 10m
web-3 1/1 Running 0 3m
web-4 0/1 ContainerCreating 0 48s
$ kubectl get petset web
NAME DESIRED CURRENT AGE
web 5 5 30m
```
Note, however, that scaling up to N and back down to M *will not* delete the volumes of the N-M removed pets, as described in the section on [deletion](#deleting-a-petset); scaling back up to N creates new pets that use the same volumes. To see this in action, scale the PetSet back down to 3:
```shell
$ kubectl get po --watch-only
web-4 1/1 Terminating 0 4m
web-4 1/1 Terminating 0 4m
web-3 1/1 Terminating 0 6m
web-3 1/1 Terminating 0 6m
```
Note that we still have 5 pvcs:
```shell
$ kubectl get pvc
NAME STATUS VOLUME CAPACITY ACCESSMODES AGE
www-web-0 Bound pvc-42ca5cef-8113-11e6-82f6-42010af00002 1Gi RWO 32m
www-web-1 Bound pvc-42de30af-8113-11e6-82f6-42010af00002 1Gi RWO 32m
www-web-2 Bound pvc-ba416413-8115-11e6-82f6-42010af00002 1Gi RWO 14m
www-web-3 Bound pvc-ba45f19c-8115-11e6-82f6-42010af00002 1Gi RWO 14m
www-web-4 Bound pvc-ba47674a-8115-11e6-82f6-42010af00002 1Gi RWO 14m
```
This allows you to upgrade the image of a petset and have it come back up with the same data, as described in the next section.
## Image upgrades
PetSet currently *does not* support automated image upgrades, as noted in the section on [limitations](#alpha-limitations). However, you can update the `image` field of any container in the podTemplate and delete Pets one by one; the PetSet controller will recreate each one with the new image.
Edit the image on the PetSet to `gcr.io/google_containers/nginx-slim:0.7` and delete 1 Pet:
```shell{% raw %}
$ for p in 0 1 2; do kubectl get po web-$p --template '{{range $i, $c := .spec.containers}}{{$c.image}}{{end}}'; echo; done
gcr.io/google_containers/nginx-slim:0.8
gcr.io/google_containers/nginx-slim:0.8
gcr.io/google_containers/nginx-slim:0.8
$ kubectl delete po web-0
pod "web-0" deleted
$ for p in 0 1 2; do kubectl get po web-$p --template '{{range $i, $c := .spec.containers}}{{$c.image}}{{end}}'; echo; done
gcr.io/google_containers/nginx-slim:0.7
gcr.io/google_containers/nginx-slim:0.8
gcr.io/google_containers/nginx-slim:0.8
{% endraw %}```
Delete the remaining 2:
```shell
$ kubectl delete po web-1 web-2
pod "web-1" deleted
pod "web-2" deleted
```
Wait till the PetSet is stable and check the images:
```shell{% raw %}
$ for p in 0 1 2; do kubectl get po web-$p --template '{{range $i, $c := .spec.containers}}{{$c.image}}{{end}}'; echo; done
gcr.io/google_containers/nginx-slim:0.7
gcr.io/google_containers/nginx-slim:0.7
gcr.io/google_containers/nginx-slim:0.7
{% endraw %}```
## Deleting a PetSet
Deleting a PetSet through kubectl will scale it down to 0, thereby deleting all the Pets. If you wish to delete just the PetSet and not the Pets, use `--cascade=false`:
```shell
$ kubectl delete -f petset.yaml --cascade=false
petset "web" deleted
$ kubectl get po -l app=nginx
NAME READY STATUS RESTARTS AGE
web-0 1/1 Running 0 21h
web-1 1/1 Running 0 21h
$ kubectl delete po -l app=nginx
pod "web-0" deleted
pod "web-1" deleted
```
Deleting the pods will *not* delete the volumes. Until we finalize the recycle policy for these volumes, they will have to be cleaned up by an admin. This is to ensure that you have the chance to copy data off the volume before deleting it. Simply deleting the PVC after the pods have left the [terminating state](/docs/user-guide/pods/index#termination-of-pods) should trigger deletion of the backing Persistent Volumes.
**Note: you will lose all your data once the PVC is deleted; do this with caution.**
```shell
$ kubectl get po -l app=nginx
$ kubectl get pvc -l app=nginx
NAME STATUS VOLUME CAPACITY ACCESSMODES AGE
www-web-0 Bound pvc-62d271cd-3822-11e6-b1b7-42010af00002 0 21h
www-web-1 Bound pvc-62d6750e-3822-11e6-b1b7-42010af00002 0 21h
$ kubectl delete pvc -l app=nginx
$ kubectl get pv
```
If you simply want to clean everything:
```shell{% raw %}
$ grace=$(kubectl get po web-0 --template '{{.spec.terminationGracePeriodSeconds}}')
$ kubectl delete petset,po -l app=nginx
$ sleep $grace
$ kubectl delete pvc -l app=nginx
{% endraw %}
```
## Troubleshooting
You might have noticed an `annotations` field in all the PetSets shown above.
```yaml
annotations:
pod.alpha.kubernetes.io/initialized: "true"
```
This field is a debugging hook. It pauses any scale up/down operations on the entire PetSet. If you'd like to pause the PetSet after each Pet, set it to `false` in the template, wait for each Pet to come up, verify it has initialized correctly, and then set it to `true` using `kubectl edit` on the pet (setting it to `false` on *any pet* is enough to pause the PetSet). If you don't need it, create the PetSet with it set to `true` as shown. This is surprisingly useful in debugging bootstrapping race conditions.
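For example, here is a sketch of flipping the hook back to `true` on a single pet without opening an editor:
```shell
# Equivalent to changing the annotation with `kubectl edit` on the pet
$ kubectl patch pod web-0 -p '{"metadata":{"annotations":{"pod.alpha.kubernetes.io/initialized":"true"}}}'
```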
## Future Work
There are a lot of planned improvements, since PetSet is still in alpha.
* Data gravity and local storage
* Richer notification events
* Public network identities
* WAN cluster deployments (multi-AZ/region/cloud provider)
* Image and node upgrades
This list goes on; if you have examples, ideas, or thoughts, please contribute.
## Alternatives
Deploying one RC of size 1/Service per pod is a popular alternative, as is simply deploying a DaemonSet that utilizes the identity of a Node.
## Next steps
* Learn about [StatefulSet](/docs/concepts/abstractions/controllers/statefulsets/),
the replacement for PetSet introduced in Kubernetes version 1.5.
* [Migrate your existing PetSets to StatefulSets](/docs/tasks/manage-stateful-set/upgrade-pet-set-to-stateful-set/)
when upgrading to Kubernetes version 1.5 or higher.

View File

@ -0,0 +1,51 @@
# A headless service to create DNS records
apiVersion: v1
kind: Service
metadata:
name: nginx
labels:
app: nginx
spec:
ports:
- port: 80
name: web
# *.nginx.default.svc.cluster.local
clusterIP: None
selector:
app: nginx
---
apiVersion: apps/v1alpha1
kind: PetSet
metadata:
name: web
spec:
serviceName: "nginx"
replicas: 2
template:
metadata:
labels:
app: nginx
annotations:
pod.alpha.kubernetes.io/initialized: "true"
spec:
terminationGracePeriodSeconds: 0
containers:
- name: nginx
image: gcr.io/google_containers/nginx-slim:0.8
ports:
- containerPort: 80
name: web
volumeMounts:
- name: www
mountPath: /usr/share/nginx/html
volumeClaimTemplates:
- metadata:
name: www
annotations:
volume.alpha.kubernetes.io/storage-class: anything
spec:
accessModes: [ "ReadWriteOnce" ]
resources:
requests:
storage: 1Gi

View File

@ -0,0 +1,102 @@
---
assignees:
- Kashomon
- bprashanth
- madhusudancs
title: Replica Sets
---
* TOC
{:toc}
## What is a ReplicaSet?
ReplicaSet is the next-generation Replication Controller. The only difference
between a _ReplicaSet_ and a
[_Replication Controller_](/docs/user-guide/replication-controller/) right now is
the selector support. ReplicaSet supports the new set-based selector requirements
as described in the [labels user guide](/docs/user-guide/labels/#label-selectors)
whereas a Replication Controller only supports equality-based selector requirements.
Most [`kubectl`](/docs/user-guide/kubectl/) commands that support
Replication Controllers also support ReplicaSets. One exception is the
[`rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update/) command. If
you want the rolling update functionality please consider using Deployments
instead. Also, the
[`rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update/) command is
imperative whereas Deployments are declarative, so we recommend using Deployments
through the [`rollout`](/docs/user-guide/kubectl/kubectl_rollout/) command.
While ReplicaSets can be used independently, today it's mainly used by
[Deployments](/docs/user-guide/deployments/) as a mechanism to orchestrate pod
creation, deletion and updates. When you use Deployments you don't have to worry
about managing the ReplicaSets that they create. Deployments own and manage
their ReplicaSets.
## When to use a ReplicaSet?
A ReplicaSet ensures that a specified number of pod “replicas” are running at any given
time. However, a Deployment is a higher-level concept that manages ReplicaSets and
provides declarative updates to pods along with a lot of other useful features.
Therefore, we recommend using Deployments instead of directly using ReplicaSets, unless
you require custom update orchestration or don't require updates at all.
This actually means that you may never need to manipulate ReplicaSet objects:
use a Deployment directly and define your application in its spec section.
## Example
{% include code.html language="yaml" file="frontend.yaml" ghlink="/docs/concepts/workloads/controllers/frontend.yaml" %}
Saving this config into `frontend.yaml` and submitting it to a Kubernetes cluster should
create the defined ReplicaSet and the pods that it manages.
```shell
$ kubectl create -f frontend.yaml
replicaset "frontend" created
$ kubectl describe rs/frontend
Name: frontend
Namespace: default
Image(s): gcr.io/google_samples/gb-frontend:v3
Selector: tier=frontend,tier in (frontend)
Labels: app=guestbook,tier=frontend
Replicas: 3 current / 3 desired
Pods Status: 3 Running / 0 Waiting / 0 Succeeded / 0 Failed
No volumes.
Events:
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
--------- -------- ----- ---- ------------- -------- ------ -------
1m 1m 1 {replicaset-controller } Normal SuccessfulCreate Created pod: frontend-qhloh
1m 1m 1 {replicaset-controller } Normal SuccessfulCreate Created pod: frontend-dnjpy
1m 1m 1 {replicaset-controller } Normal SuccessfulCreate Created pod: frontend-9si5l
$ kubectl get pods
NAME READY STATUS RESTARTS AGE
frontend-9si5l 1/1 Running 0 1m
frontend-dnjpy 1/1 Running 0 1m
frontend-qhloh 1/1 Running 0 1m
```
## ReplicaSet as a Horizontal Pod Autoscaler target
A ReplicaSet can also be a target for
[Horizontal Pod Autoscalers (HPA)](/docs/user-guide/horizontal-pod-autoscaling/),
i.e. a ReplicaSet can be auto-scaled by an HPA. Here is an example HPA targeting
the ReplicaSet we created in the previous example.
{% include code.html language="yaml" file="hpa-rs.yaml" ghlink="/docs/concepts/workloads/controllers/hpa-rs.yaml" %}
Saving this config into `hpa-rs.yaml` and submitting it to a Kubernetes cluster should
create the defined HPA that autoscales the target ReplicaSet depending on the CPU usage
of the replicated pods.
```shell
kubectl create -f hpa-rs.yaml
```
Alternatively, you can just use the `kubectl autoscale` command to accomplish the same thing
(and it's easier!):
```shell
kubectl autoscale rs frontend --min=3 --max=10 --cpu-percent=50
```

View File

@ -0,0 +1,19 @@
apiVersion: v1
kind: ReplicationController
metadata:
name: nginx
spec:
replicas: 3
selector:
app: nginx
template:
metadata:
name: nginx
labels:
app: nginx
spec:
containers:
- name: nginx
image: nginx
ports:
- containerPort: 80

View File

@ -0,0 +1,261 @@
---
assignees:
- bprashanth
- janetkuo
title: Replication Controller
---
* TOC
{:toc}
## What is a ReplicationController?
A _ReplicationController_ ensures that a specified number of pod "replicas" are running at any one
time. In other words, a ReplicationController makes sure that a pod or homogeneous set of pods are
always up and available.
If there are too many pods, it will kill some. If there are too few, the
ReplicationController will start more. Unlike manually created pods, the pods maintained by a
ReplicationController are automatically replaced if they fail, get deleted, or are terminated.
For example, your pods get re-created on a node after disruptive maintenance such as a kernel upgrade.
For this reason, we recommend that you use a ReplicationController even if your application requires
only a single pod. You can think of a ReplicationController as something similar to a process supervisor,
but rather than individual processes on a single node, the ReplicationController supervises multiple pods
across multiple nodes.
ReplicationController is often abbreviated to "rc" or "rcs" in discussion, and as a shortcut in
kubectl commands.
A simple case is to create 1 ReplicationController object in order to reliably run one instance of
a Pod indefinitely. A more complex use case is to run several identical replicas of a replicated
service, such as web servers.
## Running an example ReplicationController
Here is an example ReplicationController config. It runs 3 copies of the nginx web server.
{% include code.html language="yaml" file="replication.yaml" ghlink="/docs/concepts/workloads/controllers/replication.yaml" %}
Run the example job by downloading the example file and then running this command:
```shell
$ kubectl create -f ./replication.yaml
replicationcontroller "nginx" created
```
Check on the status of the ReplicationController using this command:
```shell
$ kubectl describe replicationcontrollers/nginx
Name: nginx
Namespace: default
Image(s): nginx
Selector: app=nginx
Labels: app=nginx
Replicas: 3 current / 3 desired
Pods Status: 0 Running / 3 Waiting / 0 Succeeded / 0 Failed
Events:
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
--------- -------- ----- ---- ------------- ---- ------ -------
20s 20s 1 {replication-controller } Normal SuccessfulCreate Created pod: nginx-qrm3m
20s 20s 1 {replication-controller } Normal SuccessfulCreate Created pod: nginx-3ntk0
20s 20s 1 {replication-controller } Normal SuccessfulCreate Created pod: nginx-4ok8v
```
Here, 3 pods have been made, but none are running yet, perhaps because the image is being pulled.
A little later, the same command may show:
```shell
Pods Status: 3 Running / 0 Waiting / 0 Succeeded / 0 Failed
```
To list all the pods that belong to the rc in a machine readable form, you can use a command like this:
```shell
$ pods=$(kubectl get pods --selector=app=nginx --output=jsonpath={.items..metadata.name})
echo $pods
nginx-3ntk0 nginx-4ok8v nginx-qrm3m
```
Here, the selector is the same as the selector for the ReplicationController (seen in the
`kubectl describe` output, and in a different form in `replication.yaml`). The `--output=jsonpath` option
specifies an expression that just gets the name from each pod in the returned list.
## Writing a ReplicationController Spec
As with all other Kubernetes config, a ReplicationController needs `apiVersion`, `kind`, and `metadata` fields. For
general information about working with config files, see [here](/docs/user-guide/simple-yaml/),
[here](/docs/user-guide/configuring-containers/), and [here](/docs/user-guide/working-with-resources/).
A ReplicationController also needs a [`.spec` section](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status).
### Pod Template
The `.spec.template` is the only required field of the `.spec`.
The `.spec.template` is a [pod template](#pod-template). It has exactly
the same schema as a [pod](/docs/user-guide/pods/), except it is nested and does not have an `apiVersion` or
`kind`.
In addition to required fields for a Pod, a pod template in a ReplicationController must specify appropriate
labels (i.e. don't overlap with other controllers, see [pod selector](#pod-selector)) and an appropriate restart policy.
Only a [`.spec.template.spec.restartPolicy`](/docs/user-guide/pod-states/) equal to `Always` is allowed, which is the default
if not specified.
For local container restarts, ReplicationControllers delegate to an agent on the node,
for example the [Kubelet](/docs/admin/kubelet/) or Docker.
### Labels on the ReplicationController
The ReplicationController can itself have labels (`.metadata.labels`). Typically, you
would set these the same as the `.spec.template.metadata.labels`; if `.metadata.labels` is not specified
then it is defaulted to `.spec.template.metadata.labels`. However, they are allowed to be
different, and the `.metadata.labels` do not affect the behavior of the ReplicationController.
### Pod Selector
The `.spec.selector` field is a [label selector](/docs/user-guide/labels/#label-selectors). A replication
controller manages all the pods with labels which match the selector. It does not distinguish
between pods which it created or deleted versus pods which some other person or process created or
deleted. This allows the ReplicationController to be replaced without affecting the running pods.
If specified, the `.spec.template.metadata.labels` must be equal to the `.spec.selector`, or it will
be rejected by the API. If `.spec.selector` is unspecified, it will be defaulted to
`.spec.template.metadata.labels`.
Also you should not normally create any pods whose labels match this selector, either directly, via
another ReplicationController or via another controller such as Job. Otherwise, the
ReplicationController will think that those pods were created by it. Kubernetes will not stop you
from doing this.
If you do end up with multiple controllers that have overlapping selectors, you
will have to manage the deletion yourself (see [below](#updating-a-replication-controller)).
### Multiple Replicas
You can specify how many pods should run concurrently by setting `.spec.replicas` to the number
of pods you would like to have running concurrently. The number running at any time may be higher
or lower, such as if the replica count was just increased or decreased, or if a pod is gracefully
shut down and a replacement starts early.
If you do not specify `.spec.replicas`, then it defaults to 1.
## Working with ReplicationControllers
### Deleting a ReplicationController and its Pods
To delete a ReplicationController and all its pods, use [`kubectl
delete`](/docs/user-guide/kubectl/kubectl_delete/). Kubectl will scale the ReplicationController to zero and wait
for it to delete each pod before deleting the ReplicationController itself. If this kubectl
command is interrupted, it can be restarted.
When using the REST API or go client library, you need to do the steps explicitly (scale replicas to
0, wait for pod deletions, then delete the ReplicationController).
### Deleting just a ReplicationController
You can delete a ReplicationController without affecting any of its pods.
Using kubectl, specify the `--cascade=false` option to [`kubectl delete`](/docs/user-guide/kubectl/kubectl_delete/).
When using the REST API or go client library, simply delete the ReplicationController object.
Once the original is deleted, you can create a new ReplicationController to replace it. As long
as the old and new `.spec.selector` are the same, then the new one will adopt the old pods.
However, it will not make any effort to make existing pods match a new, different pod template.
To update pods to a new spec in a controlled way, use a [rolling update](#rolling-updates).
### Isolating pods from a ReplicationController
Pods may be removed from a ReplicationController's target set by changing their labels. This technique may be used to remove pods from service for debugging, data recovery, etc. Pods that are removed in this way will be replaced automatically (assuming that the number of replicas is not also changed).
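For example, assuming the nginx ReplicationController created above, here is a sketch of pulling one of its pods out of the target set for debugging (the new label value is illustrative):
```shell
# The pod no longer matches the selector app=nginx, so the
# ReplicationController starts a replacement automatically.
$ kubectl label pod nginx-qrm3m app=nginx-debug --overwrite
```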
## Common usage patterns
### Rescheduling
As mentioned above, whether you have 1 pod you want to keep running, or 1000, a ReplicationController will ensure that the specified number of pods exists, even in the event of node failure or pod termination (e.g., due to an action by another control agent).
### Scaling
The ReplicationController makes it easy to scale the number of replicas up or down, either manually or by an auto-scaling control agent, by simply updating the `replicas` field.
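For example, here is a sketch of scaling the nginx ReplicationController from the example above:
```shell
$ kubectl scale rc nginx --replicas=5
```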
### Rolling updates
The ReplicationController is designed to facilitate rolling updates to a service by replacing pods one-by-one.
As explained in [#1353](http://issue.k8s.io/1353), the recommended approach is to create a new ReplicationController with 1 replica, scale the new (+1) and old (-1) controllers one by one, and then delete the old controller after it reaches 0 replicas. This predictably updates the set of pods regardless of unexpected failures.
Ideally, the rolling update controller would take application readiness into account, and would ensure that a sufficient number of pods were productively serving at any given time.
The two ReplicationControllers would need to create pods with at least one differentiating label, such as the image tag of the primary container of the pod, since it is typically image updates that motivate rolling updates.
Rolling update is implemented in the client tool
[`kubectl rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update). Visit [`kubectl rolling-update` task](/docs/tasks/run-application/rolling-update-replication-controller/) for more concrete examples.
### Multiple release tracks
In addition to running multiple releases of an application while a rolling update is in progress, it's common to run multiple releases for an extended period of time, or even continuously, using multiple release tracks. The tracks would be differentiated by labels.
For instance, a service might target all pods with `tier in (frontend), environment in (prod)`. Now say you have 10 replicated pods that make up this tier. But you want to be able to 'canary' a new version of this component. You could set up a ReplicationController with `replicas` set to 9 for the bulk of the replicas, with labels `tier=frontend, environment=prod, track=stable`, and another ReplicationController with `replicas` set to 1 for the canary, with labels `tier=frontend, environment=prod, track=canary`. Now the service is covering both the canary and non-canary pods. But you can mess with the ReplicationControllers separately to test things out, monitor the results, etc.
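Here is a sketch of the two ReplicationControllers described above (the names and the canary image tag are illustrative):
```yaml
apiVersion: v1
kind: ReplicationController
metadata:
  name: frontend-stable
spec:
  replicas: 9
  selector:
    tier: frontend
    environment: prod
    track: stable
  template:
    metadata:
      labels:
        tier: frontend
        environment: prod
        track: stable
    spec:
      containers:
      - name: frontend
        image: gcr.io/google_samples/gb-frontend:v3
        ports:
        - containerPort: 80
---
apiVersion: v1
kind: ReplicationController
metadata:
  name: frontend-canary
spec:
  replicas: 1
  selector:
    tier: frontend
    environment: prod
    track: canary
  template:
    metadata:
      labels:
        tier: frontend
        environment: prod
        track: canary
    spec:
      containers:
      - name: frontend
        image: gcr.io/google_samples/gb-frontend:v4   # illustrative canary tag
        ports:
        - containerPort: 80
```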
### Using ReplicationControllers with Services
Multiple ReplicationControllers can sit behind a single service, so that, for example, some traffic
goes to the old version, and some goes to the new version.
A ReplicationController will never terminate on its own, but it isn't expected to be as long-lived as services. Services may be composed of pods controlled by multiple ReplicationControllers, and it is expected that many ReplicationControllers may be created and destroyed over the lifetime of a service (for instance, to perform an update of pods that run the service). Both services themselves and their clients should remain oblivious to the ReplicationControllers that maintain the pods of the services.
## Writing programs for Replication
Pods created by a ReplicationController are intended to be fungible and semantically identical, though their configurations may become heterogeneous over time. This is an obvious fit for replicated stateless servers, but ReplicationControllers can also be used to maintain availability of master-elected, sharded, and worker-pool applications. Such applications should use dynamic work assignment mechanisms, such as the [etcd lock module](https://coreos.com/docs/distributed-configuration/etcd-modules/) or [RabbitMQ work queues](https://www.rabbitmq.com/tutorials/tutorial-two-python.html), as opposed to static/one-time customization of the configuration of each pod, which is considered an anti-pattern. Any pod customization performed, such as vertical auto-sizing of resources (e.g., cpu or memory), should be performed by another online controller process, not unlike the ReplicationController itself.
## Responsibilities of the ReplicationController
The ReplicationController simply ensures that the desired number of pods matches its label selector and are operational. Currently, only terminated pods are excluded from its count. In the future, [readiness](http://issue.k8s.io/620) and other information available from the system may be taken into account, we may add more controls over the replacement policy, and we plan to emit events that could be used by external clients to implement arbitrarily sophisticated replacement and/or scale-down policies.
The ReplicationController is forever constrained to this narrow responsibility. It itself will not perform readiness nor liveness probes. Rather than performing auto-scaling, it is intended to be controlled by an external auto-scaler (as discussed in [#492](http://issue.k8s.io/492)), which would change its `replicas` field. We will not add scheduling policies (e.g., [spreading](http://issue.k8s.io/367#issuecomment-48428019)) to the ReplicationController. Nor should it verify that the pods controlled match the currently specified template, as that would obstruct auto-sizing and other automated processes. Similarly, completion deadlines, ordering dependencies, configuration expansion, and other features belong elsewhere. We even plan to factor out the mechanism for bulk pod creation ([#170](http://issue.k8s.io/170)).
The ReplicationController is intended to be a composable building-block primitive. We expect higher-level APIs and/or tools to be built on top of it and other complementary primitives for user convenience in the future. The "macro" operations currently supported by kubectl (run, stop, scale, rolling-update) are proof-of-concept examples of this. For instance, we could imagine something like [Asgard](http://techblog.netflix.com/2012/06/asgard-web-based-cloud-management-and.html) managing ReplicationControllers, auto-scalers, services, scheduling policies, canaries, etc.
## API Object
Replication controller is a top-level resource in the Kubernetes REST API. More details about the
API object can be found at: [ReplicationController API
object](/docs/api-reference/v1.6/#replicationcontroller-v1-core).
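For quick reference, a minimal ReplicationController manifest might look like the following sketch (the name, labels, and image are placeholders):

```yaml
apiVersion: v1
kind: ReplicationController
metadata:
  name: nginx-rc              # placeholder name
spec:
  replicas: 3                 # desired number of pods
  selector:
    app: nginx                # must match the labels in the pod template
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: nginx:1.7.9    # placeholder image
        ports:
        - containerPort: 80
```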
## Alternatives to ReplicationController
### ReplicaSet
[`ReplicaSet`](/docs/user-guide/replicasets/) is the next-generation ReplicationController that supports the new [set-based label selector](/docs/user-guide/labels/#set-based-requirement).
It's mainly used by [`Deployment`](/docs/user-guide/deployments/) as a mechanism to orchestrate pod creation, deletion and updates.
Note that we recommend using Deployments instead of directly using Replica Sets, unless you require custom update orchestration or don't require updates at all.
### Deployment (Recommended)
[`Deployment`](/docs/user-guide/deployments/) is a higher-level API object that updates its underlying Replica Sets and their Pods
in a similar fashion as `kubectl rolling-update`. Deployments are recommended if you want this rolling update functionality,
because unlike `kubectl rolling-update`, they are declarative, server-side, and have additional features.
### Bare Pods
Unlike in the case where a user directly created pods, a ReplicationController replaces pods that are deleted or terminated for any reason, such as in the case of node failure or disruptive node maintenance, such as a kernel upgrade. For this reason, we recommend that you use a ReplicationController even if your application requires only a single pod. Think of it similarly to a process supervisor, only it supervises multiple pods across multiple nodes instead of individual processes on a single node. A ReplicationController delegates local container restarts to some agent on the node (e.g., Kubelet or Docker).
### Job
Use a [`Job`](/docs/concepts/jobs/run-to-completion-finite-workloads/) instead of a ReplicationController for pods that are expected to terminate on their own
(i.e. batch jobs).
### DaemonSet
Use a [`DaemonSet`](/docs/admin/daemons/) instead of a ReplicationController for pods that provide a
machine-level function, such as machine monitoring or machine logging. These pods have a lifetime that is tied
to a machine lifetime: the pod needs to be running on the machine before other pods start, and it is
safe to terminate when the machine is otherwise ready to be rebooted or shut down.
## For more information
Read [Run a Stateless Application Using a Replication Controller](/docs/tutorials/stateless-application/run-stateless-ap-replication-controller/).
@ -0,0 +1,196 @@
---
assignees:
title: Pods
---
* TOC
{:toc}
_Pods_ are the smallest deployable units of computing that can be created and
managed in Kubernetes.
## What is a Pod?
A _pod_ (as in a pod of whales or pea pod) is a group of one or more containers
(such as Docker containers), the shared storage for those containers, and
options about how to run the containers. Pods are always co-located and
co-scheduled, and run in a shared context. A pod models an
application-specific "logical host" - it contains one or more application
containers which are relatively tightly coupled &mdash; in a pre-container
world, they would have executed on the same physical or virtual machine.
While Kubernetes supports more container runtimes than just Docker, Docker is
the most commonly known runtime, and it helps to describe pods in Docker terms.
The shared context of a pod is a set of Linux namespaces, cgroups, and
potentially other facets of isolation - the same things that isolate a Docker
container. Within a pod's context, the individual applications may have
further sub-isolations applied.
Containers within a pod share an IP address and port space, and
can find each other via `localhost`. They can also communicate with each
other using standard inter-process communications like SystemV semaphores or
POSIX shared memory. Containers in different pods have distinct IP addresses
and cannot communicate by IPC.
Applications within a pod also have access to shared volumes, which are defined
as part of a pod and are made available to be mounted into each application's
filesystem.
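As an illustrative sketch of such a pod (the names, images, and sidecar command are placeholders), two containers can share an `emptyDir` volume and reach each other over `localhost`:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: shared-context-example        # placeholder name
spec:
  volumes:
  - name: shared-data
    emptyDir: {}                      # pod-scoped volume shared by both containers
  containers:
  - name: web
    image: nginx:1.7.9                # placeholder image; serves files from the shared volume
    volumeMounts:
    - name: shared-data
      mountPath: /usr/share/nginx/html
  - name: content-puller
    image: busybox                    # placeholder sidecar that writes into the shared volume
    command: ["sh", "-c", "echo hello > /pod-data/index.html && sleep 3600"]
    volumeMounts:
    - name: shared-data
      mountPath: /pod-data
```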
In terms of [Docker](https://www.docker.com/) constructs, a pod is modelled as
a group of Docker containers with shared namespaces and shared
[volumes](/docs/concepts/storage/volumes/). PID namespace sharing is not yet implemented in Docker.
Like individual application containers, pods are considered to be relatively
ephemeral (rather than durable) entities. As discussed in [life of a
pod](/docs/user-guide/pod-states/), pods are created, assigned a unique ID (UID), and
scheduled to nodes where they remain until termination (according to restart
policy) or deletion. If a node dies, the pods scheduled to that node are
scheduled for deletion, after a timeout period. A given pod (as defined by a UID) is not
"rescheduled" to a new node; instead, it can be replaced by an identical pod,
with even the same name if desired, but with a new UID (see [replication
controller](/docs/user-guide/replication-controller/) for more details). (In the future, a
higher-level API may support pod migration.)
When something is said to have the same lifetime as a pod, such as a volume,
that means that it exists as long as that pod (with that UID) exists. If that
pod is deleted for any reason, even if an identical replacement is created, the
related thing (e.g. volume) is also destroyed and created anew.
![pod diagram](/images/docs/pod.svg){: style="max-width: 50%" }
*A multi-container pod that contains a file puller and a
web server that uses a persistent volume for shared storage between the containers.*
## Motivation for pods
### Management
Pods are a model of the pattern of multiple cooperating processes which form a
cohesive unit of service. They simplify application deployment and management
by providing a higher-level abstraction than the set of their constituent
applications. Pods serve as units of deployment, horizontal scaling, and
replication. Colocation (co-scheduling), shared fate (e.g. termination),
coordinated replication, resource sharing, and dependency management are
handled automatically for containers in a pod.
### Resource sharing and communication
Pods enable data sharing and communication among their constituents.
The applications in a pod all use the same network namespace (same IP and port
space), and can thus "find" each other and communicate using `localhost`.
Because of this, applications in a pod must coordinate their usage of ports.
Each pod has an IP address in a flat shared networking space that has full
communication with other physical computers and pods across the network.
The hostname is set to the pod's Name for the application containers within the
pod. [More details on networking](/docs/admin/networking/).
In addition to defining the application containers that run in the pod, the pod
specifies a set of shared storage volumes. Volumes enable data to survive
container restarts and to be shared among the applications within the pod.
## Uses of pods
Pods can be used to host vertically integrated application stacks (e.g. LAMP),
but their primary motivation is to support co-located, co-managed helper
programs, such as:
* content management systems, file and data loaders, local cache managers, etc.
* log and checkpoint backup, compression, rotation, snapshotting, etc.
* data change watchers, log tailers, logging and monitoring adapters, event publishers, etc.
* proxies, bridges, and adapters
* controllers, managers, configurators, and updaters
Individual pods are not intended to run multiple instances of the same
application, in general.
For a longer explanation, see [The Distributed System ToolKit: Patterns for
Composite
Containers](http://blog.kubernetes.io/2015/06/the-distributed-system-toolkit-patterns.html).
## Alternatives considered
_Why not just run multiple programs in a single (Docker) container?_
1. Transparency. Making the containers within the pod visible to the
infrastructure enables the infrastructure to provide services to those
containers, such as process management and resource monitoring. This
facilitates a number of conveniences for users.
2. Decoupling software dependencies. The individual containers may be
versioned, rebuilt and redeployed independently. Kubernetes may even support
live updates of individual containers someday.
3. Ease of use. Users don't need to run their own process managers, worry about
signal and exit-code propagation, etc.
4. Efficiency. Because the infrastructure takes on more responsibility,
containers can be lighter weight.
_Why not support affinity-based co-scheduling of containers?_
That approach would provide co-location, but would not provide most of the
benefits of pods, such as resource sharing, IPC, guaranteed fate sharing, and
simplified management.
## Durability of pods (or lack thereof)
Pods aren't intended to be treated as durable entities. They won't survive scheduling failures, node failures, or other evictions, such as due to lack of resources, or in the case of node maintenance.
In general, users shouldn't need to create pods directly. They should almost always use controllers (e.g., [Deployments](/docs/user-guide/deployments/)), even for singletons. Controllers provide self-healing with a cluster scope, as well as replication and rollout management.
The use of collective APIs as the primary user-facing primitive is relatively common among cluster scheduling systems, including [Borg](https://research.google.com/pubs/pub43438.html), [Marathon](https://mesosphere.github.io/marathon/docs/rest-api.html), [Aurora](http://aurora.apache.org/documentation/latest/configuration-reference/#job-schema), and [Tupperware](http://www.slideshare.net/Docker/aravindnarayanan-facebook140613153626phpapp02-37588997).
Pod is exposed as a primitive in order to facilitate:
* scheduler and controller pluggability
* support for pod-level operations without the need to "proxy" them via controller APIs
* decoupling of pod lifetime from controller lifetime, such as for bootstrapping
* decoupling of controllers and services &mdash; the endpoint controller just watches pods
* clean composition of Kubelet-level functionality with cluster-level functionality &mdash; Kubelet is effectively the "pod controller"
* high-availability applications, which will expect pods to be replaced in advance of their termination and certainly in advance of deletion, such as in the case of planned evictions, image prefetching, or live pod migration [#3949](http://issue.k8s.io/3949)
There is new first-class support for stateful pods with the [StatefulSet](/docs/concepts/abstractions/controllers/statefulsets/) controller (currently in beta). The feature was alpha in 1.4 and was called [PetSet](/docs/user-guide/petset/). For prior versions of Kubernetes, best practice for having stateful pods is to create a replication controller with `replicas` equal to `1` and a corresponding service, see [this MySQL deployment example](/docs/tutorials/stateful-application/run-stateful-application/).
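A rough sketch of that pre-StatefulSet pattern might look like the following (names and image are placeholders; a real deployment would also need credentials and persistent storage):

```yaml
apiVersion: v1
kind: ReplicationController
metadata:
  name: mysql                  # placeholder name
spec:
  replicas: 1                  # a single, supervised stateful pod
  template:
    metadata:
      labels:
        app: mysql
    spec:
      containers:
      - name: mysql
        image: mysql:5.6       # placeholder image
        ports:
        - containerPort: 3306
---
apiVersion: v1
kind: Service
metadata:
  name: mysql
spec:
  selector:
    app: mysql                 # routes to whichever pod the controller keeps running
  ports:
  - port: 3306
```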
## Termination of Pods
Because pods represent running processes on nodes in the cluster, it is important to allow those processes to gracefully terminate when they are no longer needed (vs being violently killed with a KILL signal and having no chance to clean up). Users should be able to request deletion and know when processes terminate, but also be able to ensure that deletes eventually complete. When a user requests deletion of a pod the system records the intended grace period before the pod is allowed to be forcefully killed, and a TERM signal is sent to the main process in each container. Once the grace period has expired the KILL signal is sent to those processes and the pod is then deleted from the API server. If the Kubelet or the container manager is restarted while waiting for processes to terminate, the termination will be retried with the full grace period.
An example flow:
1. User sends command to delete Pod, with default grace period (30s)
2. The Pod in the API server is updated with the time beyond which the Pod is considered "dead" along with the grace period.
3. Pod shows up as "Terminating" when listed in client commands
4. (simultaneous with 3) When the Kubelet sees that a Pod has been marked as terminating because the time in 2 has been set, it begins the pod shutdown process.
1. If the pod has defined a [preStop hook](/docs/concepts/containers/container-lifecycle-hooks/#hook-details), it is invoked inside of the pod. If the `preStop` hook is still running after the grace period expires, step 2 is then invoked with a small (2 second) extended grace period.
2. The processes in the Pod are sent the TERM signal.
5. (simultaneous with 3) The Pod is removed from the endpoints list for the service, and is no longer considered part of the set of running pods for replication controllers. Pods that shut down slowly can continue to serve traffic while load balancers (like the service proxy) remove them from their rotations.
6. When the grace period expires, any processes still running in the Pod are killed with SIGKILL.
7. The Kubelet will finish deleting the Pod on the API server by setting grace period 0 (immediate deletion). The Pod disappears from the API and is no longer visible from the client.
By default, all deletes are graceful within 30 seconds. The `kubectl delete` command supports the `--grace-period=<seconds>` option which allows a user to override the default and specify their own value. The value `0` [force deletes](/docs/user-guide/pods/#force-termination-of-pods) the pod. In kubectl version >= 1.5, you must specify an additional flag `--force` along with `--grace-period=0` in order to perform force deletions.
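Both behaviors can also be expressed declaratively in the pod spec; the following is a minimal sketch (the image and `preStop` command are placeholders):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: graceful-shutdown-example     # placeholder name
spec:
  terminationGracePeriodSeconds: 60   # overrides the 30 second default for this pod
  containers:
  - name: app
    image: nginx:1.7.9                # placeholder image
    lifecycle:
      preStop:
        exec:
          command: ["/usr/sbin/nginx", "-s", "quit"]   # placeholder command run before TERM is sent
```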
### Force deletion of pods
Force deletion of a pod is defined as deletion of a pod from the cluster state and etcd immediately. When a force deletion is performed, the apiserver does not wait for confirmation from the kubelet that the pod has been terminated on the node it was running on. It removes the pod in the API immediately so a new pod can be created with the same name. On the node, pods that are set to terminate immediately will still be given a small grace period before being force killed.
Force deletions can be potentially dangerous for some pods and should be performed with caution. In case of StatefulSet pods, please refer to the task documentation for [deleting Pods from a StatefulSet](/docs/tasks/manage-stateful-set/delete-pods/#deleting-pods).
## Privileged mode for pod containers
From Kubernetes v1.1, any container in a pod can enable privileged mode, using the `privileged` flag on the `SecurityContext` of the container spec. This is useful for containers that want to use linux capabilities like manipulating the network stack and accessing devices. Processes within the container get almost the same privileges that are available to processes outside a container. With privileged mode, it should be easier to write network and volume plugins as separate pods that don't need to be compiled into the kubelet.
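For illustration, a minimal sketch of a pod with a privileged container (the name and image are placeholders):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: privileged-example      # placeholder name
spec:
  containers:
  - name: privileged-container
    image: busybox              # placeholder image
    command: ["sleep", "3600"]
    securityContext:
      privileged: true          # grants the container near host-level privileges
```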
If the master is running Kubernetes v1.1 or higher, and the nodes are running a version lower than v1.1, then new privileged pods will be accepted by the API server, but will not be launched. They will remain in the pending state.
If the user runs `kubectl describe pod FooPodName`, they can see the reason why the pod is in the pending state. The events table in the describe command output will say:
`Error validating pod "FooPodName"."FooPodNamespace" from api, ignoring: spec.containers[0].securityContext.privileged: forbidden '<*>(0xc2089d3248)true'`
If the master is running a version lower than v1.1, then privileged pods cannot be created. If a user attempts to create a pod that has a privileged container, they will get the following error:
`The Pod "FooPodName" is invalid.
spec.containers[0].securityContext.privileged: forbidden '<*>(0xc20b222db0)true'`
## API Object
Pod is a top-level resource in the Kubernetes REST API. More details about the
API object can be found at: [Pod API
object](/docs/api-reference/v1.6/#pod-v1-core).
@ -9,7 +9,7 @@ title: Running Kubernetes on AWS EC2
{:toc}
## Supported Production Grade Tools with High Availability Options
## Supported Production Grade Tools
* [Kubernetes Operations](https://github.com/kubernetes/kops) - Production Grade K8s Installation, Upgrades, and Management. Supports running Debian, Ubuntu, CentOS, and RHEL in AWS.
@ -17,16 +17,17 @@ title: Running Kubernetes on AWS EC2
---
## kube-up bash script
## kube-up is no longer supported in Kubernetes 1.6
> `kube-up.sh` is a legacy tool that is an easy way to spin up a cluster. This tool is being deprecated, and does not create a production ready environment.
> `kube-up.sh` is a legacy tool for launching clusters. It is deprecated, and removed entirely in Kubernetes 1.6.
### Prerequisites
1. You need an AWS account. Visit [http://aws.amazon.com](http://aws.amazon.com) to get started
2. Install and configure the [AWS Command Line Interface](http://aws.amazon.com/cli)
3. We recommend installing using an account which has full access to the AWS APIs.
1. This is only supported for Kubernetes 1.5 and earlier. Consider switching to one of the supported options.
2. You need an AWS account. Visit [http://aws.amazon.com](http://aws.amazon.com) to get started
3. Install and configure the [AWS Command Line Interface](http://aws.amazon.com/cli)
4. We recommend installing using an account which has full access to the AWS APIs.
NOTE: This script uses the 'default' AWS profile by default.
You may explicitly set the AWS profile to use using the `AWS_DEFAULT_PROFILE` environment variable:
@ -161,7 +162,6 @@ cluster/kube-down.sh
IaaS Provider | Config. Mgmt | OS | Networking | Docs | Conforms | Support Level
-------------------- | ------------ | ------------- | ---------- | --------------------------------------------- | ---------| ----------------------------
AWS | Saltstack | Debian/Ubuntu | k8s (VPC) | [docs](/docs/getting-started-guides/aws) | | Community ([@justinsb](https://github.com/justinsb))
AWS | kops | Debian | k8s (VPC) | [docs](https://github.com/kubernetes/kops) | | Community ([@justinsb](https://github.com/justinsb))
AWS | CoreOS | CoreOS | flannel | [docs](/docs/getting-started-guides/aws) | | Community
@ -450,7 +450,7 @@ You can find the docs at [Kubernetes Dashboard](https://github.com/kubernetes/da
## Launch other Services With Calico-Kubernetes
At this point, you have a fully functioning cluster running on Kubernetes with a master and two nodes networked with Calico. You can now follow any of the [standard documentation](https://github.com/kubernetes/kubernetes/tree/{{page.version}}.0/examples/) to set up other services on your cluster.
At this point, you have a fully functioning cluster running on Kubernetes with a master and two nodes networked with Calico. You can now follow any of the [standard documentation](https://github.com/kubernetes/kubernetes/tree/{{page.fullversion}}/examples/) to set up other services on your cluster.
## Connectivity to outside the cluster
@ -28,4 +28,4 @@ Explore the glossary of essential Kubernetes concepts. Some good starting points
## Design Docs
An archive of the design docs for Kubernetes functionality. Good starting points are [Kubernetes Architecture](https://github.com/kubernetes/kubernetes/blob/{{page.version}}/docs/design/architecture.md) and [Kubernetes Design Overview](https://github.com/kubernetes/kubernetes/tree/{{page.version}}/docs/design).
An archive of the design docs for Kubernetes functionality. Good starting points are [Kubernetes Architecture](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/architecture.md) and [Kubernetes Design Overview](https://github.com/kubernetes/kubernetes/tree/{{page.fullversion}}/docs/design).
@ -87,7 +87,7 @@ than what you expect to use.
If you specify a request, a Pod is guaranteed to be able to use that much
of the resource. See
[Resource QoS](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/resource-qos.md) for the difference between resource limits and requests.
[Resource QoS](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/resource-qos.md) for the difference between resource limits and requests.
## If you don't specify limits or requests
@ -29,7 +29,7 @@ When prompted, enter your Docker username and password.
The login process creates or updates a `config.json` file that holds an
authorization token.
View the `configfile.json` file:
View the `config.json` file:
cat ~/.docker/config.json
@ -5,27 +5,6 @@ assignees:
title: Annotations
---
We have [labels](/docs/user-guide/labels) for identifying metadata.
{% include user-guide-content-moved.md %}
It is also useful to be able to attach arbitrary non-identifying metadata, for retrieval by API clients such as tools, libraries, etc. This information may be large, may be structured or unstructured, may include characters not permitted by labels, etc. Such information would not be used for object selection and therefore doesn't belong in labels.
Like labels, annotations are key-value maps.
```json
"annotations": {
"key1" : "value1",
"key2" : "value2"
}
```
Possible information that could be recorded in annotations:
* fields managed by a declarative configuration layer, to distinguish them from client- and/or server-set default values and other auto-generated fields, fields set by auto-sizing/auto-scaling systems, etc., in order to facilitate merging
* build/release/image information (timestamps, release ids, git branch, PR numbers, image hashes, registry address, etc.)
* pointers to logging/monitoring/analytics/audit repos
* client library/tool information (e.g. for debugging purposes -- name, version, build info)
* other user and/or tool/system provenance info, such as URLs of related objects from other ecosystem components
* lightweight rollout tool metadata (config and/or checkpoints)
* phone/pager number(s) of person(s) responsible, or directory entry where that info could be found, such as a team website
Yes, this information could be stored in an external database or directory, but that would make it much harder to produce shared client libraries and tools for deployment, management, introspection, etc.
[Annotations](/docs/concepts/overview/working-with-objects/annotations/)
@ -5,805 +5,6 @@ assignees:
title: Deployments
---
* TOC
{:toc}
{% include user-guide-content-moved.md %}
## What is a Deployment?
A _Deployment_ provides declarative updates for [Pods](/docs/user-guide/pods/) and [Replica Sets](/docs/user-guide/replicasets/) (the next-generation Replication Controller).
You only need to describe the desired state in a Deployment object, and the Deployment
controller will change the actual state to the desired state at a controlled rate for you.
You can define Deployments to create new resources, or replace existing ones
by new ones.
A typical use case is:
* Create a Deployment to bring up a Replica Set and Pods.
* Check the status of a Deployment to see if it succeeds or not.
* Later, update that Deployment to recreate the Pods (for example, to use a new image).
* Rollback to an earlier Deployment revision if the current Deployment isn't stable.
* Pause and resume a Deployment.
## Creating a Deployment
Here is an example Deployment. It creates a Replica Set to
bring up 3 nginx Pods.
{% include code.html language="yaml" file="nginx-deployment.yaml" ghlink="/docs/user-guide/nginx-deployment.yaml" %}
Run the example by downloading the example file and then running this command:
```shell
$ kubectl create -f docs/user-guide/nginx-deployment.yaml --record
deployment "nginx-deployment" created
```
Setting the kubectl flag `--record` to `true` allows you to record the current command in the annotations of the resources being created or updated. This is useful for future introspection; for example, to see the commands executed in each Deployment revision.
Then running `get` immediately will give:
```shell
$ kubectl get deployments
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
nginx-deployment 3 0 0 0 1s
```
This indicates that the Deployment's number of desired replicas is 3 (according to deployment's `.spec.replicas`), the number of current replicas (`.status.replicas`) is 0, the number of up-to-date replicas (`.status.updatedReplicas`) is 0, and the number of available replicas (`.status.availableReplicas`) is also 0.
Running `get` again a few seconds later should give:
```shell
$ kubectl get deployments
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
nginx-deployment 3 3 3 3 18s
```
This indicates that the Deployment has created all three replicas, and all replicas are up-to-date (contains the latest pod template) and available (pod status is ready for at least Deployment's `.spec.minReadySeconds`). Running `kubectl get rs` and `kubectl get pods` will show the Replica Set (RS) and Pods created.
```shell
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-2035384211 3 3 0 18s
```
You may notice that the name of the Replica Set is always `<the name of the Deployment>-<hash value of the pod template>`.
```shell
$ kubectl get pods --show-labels
NAME READY STATUS RESTARTS AGE LABELS
nginx-deployment-2035384211-7ci7o 1/1 Running 0 18s app=nginx,pod-template-hash=2035384211
nginx-deployment-2035384211-kzszj 1/1 Running 0 18s app=nginx,pod-template-hash=2035384211
nginx-deployment-2035384211-qqcnn 1/1 Running 0 18s app=nginx,pod-template-hash=2035384211
```
The created Replica Set will ensure that there are three nginx Pods at all times.
**Note:** You must specify an appropriate selector and pod template labels for a Deployment (in this case, `app = nginx`), i.e. labels that don't overlap with those of other controllers (including other Deployments, Replica Sets, Replication Controllers, etc.). Kubernetes won't stop you from creating overlapping selectors, but if you end up with multiple controllers that have overlapping selectors, those controllers will fight with each other and won't behave correctly.
## Updating a Deployment
**Note:** a Deployment's rollout is triggered if and only if the Deployment's pod template (i.e. `.spec.template`) is changed,
e.g. updating labels or container images of the template. Other updates, such as scaling the Deployment, will not trigger a rollout.
Suppose that we now want to update the nginx Pods to start using the `nginx:1.9.1` image
instead of the `nginx:1.7.9` image.
```shell
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
deployment "nginx-deployment" image updated
```
Alternatively, we can `edit` the Deployment and change `.spec.template.spec.containers[0].image` from `nginx:1.7.9` to `nginx:1.9.1`:
```shell
$ kubectl edit deployment/nginx-deployment
deployment "nginx-deployment" edited
```
To see its rollout status, simply run:
```shell
$ kubectl rollout status deployment/nginx-deployment
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
deployment "nginx-deployment" successfully rolled out
```
After the rollout succeeds, you may want to `get` the Deployment:
```shell
$ kubectl get deployments
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
nginx-deployment 3 3 3 3 36s
```
The number of up-to-date replicas indicates that the Deployment has updated the replicas to the latest configuration.
The current replicas indicates the total replicas this Deployment manages, and the available replicas indicates the
number of current replicas that are available.
We can run `kubectl get rs` to see that the Deployment updated the Pods by creating a new Replica Set and scaling it up to 3 replicas, as well as scaling down the old Replica Set to 0 replicas.
```shell
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-1564180365 3 3 0 6s
nginx-deployment-2035384211 0 0 0 36s
```
Running `get pods` should now show only the new Pods:
```shell
$ kubectl get pods
NAME READY STATUS RESTARTS AGE
nginx-deployment-1564180365-khku8 1/1 Running 0 14s
nginx-deployment-1564180365-nacti 1/1 Running 0 14s
nginx-deployment-1564180365-z9gth 1/1 Running 0 14s
```
Next time we want to update these Pods, we only need to update the Deployment's pod template again.
Deployment can ensure that only a certain number of Pods may be down while they are being updated. By
default, it ensures that at least 75% of the desired number of Pods are up (25% max unavailable).
Deployment can also ensure that only a certain number of Pods may be created above the desired number of Pods. By default, it ensures that at most 25% more than the desired number of Pods are up (25% max surge).
For example, if you look at the above Deployment closely, you will see that
it first created a new Pod, then deleted some old Pods and created new ones. It
does not kill old Pods until a sufficient number of new Pods have come up, and does not create new Pods until a sufficient number of old Pods have been killed. It makes sure that the number of available Pods is at least 2 and the number of total Pods is at most 4.
```shell
$ kubectl describe deployments
Name: nginx-deployment
Namespace: default
CreationTimestamp: Tue, 15 Mar 2016 12:01:06 -0700
Labels: app=nginx
Selector: app=nginx
Replicas: 3 updated | 3 total | 3 available | 0 unavailable
StrategyType: RollingUpdate
MinReadySeconds: 0
RollingUpdateStrategy: 1 max unavailable, 1 max surge
OldReplicaSets: <none>
NewReplicaSet: nginx-deployment-1564180365 (3/3 replicas created)
Events:
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
--------- -------- ----- ---- ------------- -------- ------ -------
36s 36s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-2035384211 to 3
23s 23s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 1
23s 23s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 2
23s 23s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 2
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 0
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 3
```
Here we see that when we first created the Deployment, it created a Replica Set (nginx-deployment-2035384211) and scaled it up to 3 replicas directly.
When we updated the Deployment, it created a new Replica Set (nginx-deployment-1564180365) and scaled it up to 1 and then scaled down the old Replica Set to 2, so that at least 2 Pods were available and at most 4 Pods were created at all times.
It then continued scaling up and down the new and the old Replica Set, with the same rolling update strategy. Finally, we'll have 3 available replicas in the new Replica Set, and the old Replica Set is scaled down to 0.
### Multiple Updates
Each time a new deployment object is observed by the deployment controller, a Replica Set is
created to bring up the desired Pods if there is no existing Replica Set doing so.
Existing Replica Sets controlling Pods whose labels match `.spec.selector` but whose
template does not match `.spec.template` are scaled down.
Eventually, the new Replica Set will be scaled to `.spec.replicas` and all old Replica Sets will
be scaled to 0.
If you update a Deployment while an existing rollout is in progress,
the Deployment will create a new Replica Set as per the update and start scaling that up, and
will roll over the Replica Set that it was scaling up previously -- it will add it to its list of old Replica Sets and will
start scaling it down.
For example, suppose you create a Deployment to create 5 replicas of `nginx:1.7.9`,
but then update the Deployment to create 5 replicas of `nginx:1.9.1`, when only 3
replicas of `nginx:1.7.9` had been created. In that case, the Deployment will immediately start
killing the 3 `nginx:1.7.9` Pods that it had created, and will start creating
`nginx:1.9.1` Pods. It will not wait for 5 replicas of `nginx:1.7.9` to be created
before changing course.
## Rolling Back a Deployment
Sometimes you may want to rollback a Deployment; for example, when the Deployment is not stable, such as crash looping.
By default, the two previous revisions of a Deployment's rollout history are kept in the system so that you can roll back anytime you want
(you can change that by modifying the [revision history limit](/docs/user-guide/deployments/#revision-history-limit)).
**Note:** a Deployment's revision is created when a Deployment's rollout is triggered. This means that the new revision is created
if and only if the Deployment's pod template (i.e. `.spec.template`) is changed, e.g. updating labels or container images of the template.
Other updates, such as scaling the Deployment, will not create a Deployment revision -- so that we can facilitate simultaneous manual- or
auto-scaling. This implies that when you rollback to an earlier revision, only the Deployment's pod template part will be rolled back.
Suppose that we made a typo while updating the Deployment, by putting the image name as `nginx:1.91` instead of `nginx:1.9.1`:
```shell
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.91
deployment "nginx-deployment" image updated
```
The rollout will be stuck.
```shell
$ kubectl rollout status deployments nginx-deployment
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
```
Press Ctrl-C to stop the above rollout status watch. For more information on stuck rollouts, [read more here](#deployment-status).
You will also see that both the number of old replicas (nginx-deployment-1564180365 and nginx-deployment-2035384211) and new replicas (nginx-deployment-3066724191) are 2.
```shell
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-1564180365 2 2 0 25s
nginx-deployment-2035384211 0 0 0 36s
nginx-deployment-3066724191 2 2 2 6s
```
Looking at the Pods created, you will see that the 2 Pods created by the new Replica Set are stuck in an image pull loop.
```shell
$ kubectl get pods
NAME READY STATUS RESTARTS AGE
nginx-deployment-1564180365-70iae 1/1 Running 0 25s
nginx-deployment-1564180365-jbqqo 1/1 Running 0 25s
nginx-deployment-3066724191-08mng 0/1 ImagePullBackOff 0 6s
nginx-deployment-3066724191-eocby 0/1 ImagePullBackOff 0 6s
```
Note that the Deployment controller will stop the bad rollout automatically, and will stop scaling up the new Replica Set.
```shell
$ kubectl describe deployment
Name: nginx-deployment
Namespace: default
CreationTimestamp: Tue, 15 Mar 2016 14:48:04 -0700
Labels: app=nginx
Selector: app=nginx
Replicas: 2 updated | 3 total | 2 available | 2 unavailable
StrategyType: RollingUpdate
MinReadySeconds: 0
RollingUpdateStrategy: 1 max unavailable, 1 max surge
OldReplicaSets: nginx-deployment-1564180365 (2/2 replicas created)
NewReplicaSet: nginx-deployment-3066724191 (2/2 replicas created)
Events:
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
--------- -------- ----- ---- ------------- -------- ------ -------
1m 1m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-2035384211 to 3
22s 22s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 1
22s 22s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 2
22s 22s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 2
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 0
21s 21s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 3
13s 13s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 1
13s 13s 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-1564180365 to 2
13s 13s 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 2
```
To fix this, we need to roll back to a previous revision of the Deployment that is stable.
### Checking Rollout History of a Deployment
First, check the revisions of this deployment:
```shell
$ kubectl rollout history deployment/nginx-deployment
deployments "nginx-deployment":
REVISION CHANGE-CAUSE
1 kubectl create -f docs/user-guide/nginx-deployment.yaml --record
2 kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
3 kubectl set image deployment/nginx-deployment nginx=nginx:1.91
```
Because we recorded the command while creating this Deployment using `--record`, we can easily see the changes we made in each revision.
To further see the details of each revision, run:
```shell
$ kubectl rollout history deployment/nginx-deployment --revision=2
deployments "nginx-deployment" revision 2
Labels: app=nginx
pod-template-hash=1159050644
Annotations: kubernetes.io/change-cause=kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
Containers:
nginx:
Image: nginx:1.9.1
Port: 80/TCP
QoS Tier:
cpu: BestEffort
memory: BestEffort
Environment Variables: <none>
No volumes.
```
### Rolling Back to a Previous Revision
Now we've decided to undo the current rollout and rollback to the previous revision:
```shell
$ kubectl rollout undo deployment/nginx-deployment
deployment "nginx-deployment" rolled back
```
Alternatively, you can roll back to a specific revision by specifying it with `--to-revision`:
```shell
$ kubectl rollout undo deployment/nginx-deployment --to-revision=2
deployment "nginx-deployment" rolled back
```
For more details about rollout related commands, read [`kubectl rollout`](/docs/user-guide/kubectl/kubectl_rollout/).
The Deployment is now rolled back to a previous stable revision. As you can see, a `DeploymentRollback` event for rolling back to revision 2 is generated from Deployment controller.
```shell
$ kubectl get deployment
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
nginx-deployment 3 3 3 3 30m
$ kubectl describe deployment
Name: nginx-deployment
Namespace: default
CreationTimestamp: Tue, 15 Mar 2016 14:48:04 -0700
Labels: app=nginx
Selector: app=nginx
Replicas: 3 updated | 3 total | 3 available | 0 unavailable
StrategyType: RollingUpdate
MinReadySeconds: 0
RollingUpdateStrategy: 1 max unavailable, 1 max surge
OldReplicaSets: <none>
NewReplicaSet: nginx-deployment-1564180365 (3/3 replicas created)
Events:
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
--------- -------- ----- ---- ------------- -------- ------ -------
30m 30m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-2035384211 to 3
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 1
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 2
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 2
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-2035384211 to 0
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 2
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-3066724191 to 1
29m 29m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-1564180365 to 2
2m 2m 1 {deployment-controller } Normal ScalingReplicaSet Scaled down replica set nginx-deployment-3066724191 to 0
2m 2m 1 {deployment-controller } Normal DeploymentRollback Rolled back deployment "nginx-deployment" to revision 2
29m 2m 2 {deployment-controller } Normal ScalingReplicaSet Scaled up replica set nginx-deployment-1564180365 to 3
```
### Clean up Policy
You can set the `.spec.revisionHistoryLimit` field to specify how much revision history of this Deployment you want to keep. By default,
all revision history will be kept; explicitly setting this field to `0` prevents a Deployment from being rolled back.
## Scaling a Deployment
You can scale a Deployment by using the following command:
```shell
$ kubectl scale deployment nginx-deployment --replicas 10
deployment "nginx-deployment" scaled
```
Assuming [horizontal pod autoscaling](/docs/user-guide/horizontal-pod-autoscaling/walkthrough.md) is enabled
in your cluster, you can set up an autoscaler for your Deployment and choose the minimum and maximum number of
Pods you want to run based on the CPU utilization of your existing Pods.
```shell
$ kubectl autoscale deployment nginx-deployment --min=10 --max=15 --cpu-percent=80
deployment "nginx-deployment" autoscaled
```
RollingUpdate Deployments support running multiple versions of an application at the same time. When you
or an autoscaler scales a RollingUpdate Deployment that is in the middle of a rollout (either in progress
or paused), then the Deployment controller will balance the additional replicas in the existing active
ReplicaSets (ReplicaSets with Pods) in order to mitigate risk. This is called *proportional scaling*.
For example, you are running a Deployment with 10 replicas, [maxSurge](#max-surge)=3, and [maxUnavailable](#max-unavailable)=2.
```shell
$ kubectl get deploy
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
nginx-deployment 10 10 10 10 50s
```
You update to a new image which happens to be unresolvable from inside the cluster.
```shell
$ kubectl set image deploy/nginx-deployment nginx=nginx:sometag
deployment "nginx-deployment" image updated
```
The image update starts a new rollout with ReplicaSet nginx-deployment-1989198191 but it's blocked due to the
maxUnavailable requirement that we mentioned above.
```shell
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-1989198191 5 5 0 9s
nginx-deployment-618515232 8 8 8 1m
```
Then a new scaling request for the Deployment comes along. The autoscaler increments the Deployment replicas
to 15. The Deployment controller needs to decide where to add these new 5 replicas. If we weren't using
proportional scaling, all 5 of them would be added to the new ReplicaSet. With proportional scaling, we
spread the additional replicas across all ReplicaSets. Bigger proportions go to the ReplicaSets with the
most replicas and lower proportions go to ReplicaSets with fewer replicas. Any leftovers are added to the
ReplicaSet with the most replicas. ReplicaSets with zero replicas are not scaled up.
In our example above, 3 replicas will be added to the old ReplicaSet and 2 replicas will be added to the
new ReplicaSet (the old ReplicaSet holds 8 of the 13 existing replicas, so it receives about 8/13 of the 5 extra replicas, which rounds to 3; the new ReplicaSet holds 5 of the 13 and receives the remaining 2). The rollout process should eventually move all replicas to the new ReplicaSet, assuming
the new replicas become healthy.
```shell
$ kubectl get deploy
NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE
nginx-deployment 15 18 7 8 7m
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-1989198191 7 7 0 7m
nginx-deployment-618515232 11 11 11 7m
```
## Pausing and Resuming a Deployment
You can also pause a Deployment mid-way and then resume it. A use case is to support canary deployment.
Update the Deployment again and then pause the Deployment with `kubectl rollout pause`:
```shell
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1; kubectl rollout pause deployment/nginx-deployment
deployment "nginx-deployment" image updated
deployment "nginx-deployment" paused
```
Note that whatever state the Deployment is already in will continue to function, but new updates to the Deployment will not have any effect as long as the Deployment is paused.
The Deployment was still in progress when we paused it, so the actions of scaling up and down Replica Sets are paused too.
```shell
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-1564180365 2 2 2 1h
nginx-deployment-2035384211 2 2 0 1h
nginx-deployment-3066724191 0 0 0 1h
```
In a separate terminal, watch for rollout status changes and you'll see the rollout won't continue:
```shell
$ kubectl rollout status deployment/nginx-deployment
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
```
To resume the Deployment, simply do `kubectl rollout resume`:
```shell
$ kubectl rollout resume deployment/nginx-deployment
deployment "nginx-deployment" resumed
```
Then the Deployment will continue and finish the rollout:
```shell
$ kubectl rollout status deployment/nginx-deployment
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
Waiting for deployment spec update to be observed...
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
deployment nginx-deployment successfully rolled out
```
```shell
$ kubectl get rs
NAME DESIRED CURRENT READY AGE
nginx-deployment-1564180365 3 3 3 1h
nginx-deployment-2035384211 0 0 0 1h
nginx-deployment-3066724191 0 0 0 1h
```
Note: You cannot rollback a paused Deployment until you resume it.
## Deployment status
A Deployment enters various states during its lifecycle. It can be [progressing](#progressing-deployment) while rolling out a new ReplicaSet,
it can be [complete](#complete-deployment), or it can [fail to progress](#failed-deployment).
### Progressing Deployment
Kubernetes marks a Deployment as _progressing_ when one of the following tasks is performed:
* The Deployment is in the process of creating a new ReplicaSet.
* The Deployment is scaling up an existing ReplicaSet.
* The Deployment is scaling down an existing ReplicaSet.
* New pods become available.
You can monitor the progress for a Deployment by using `kubectl rollout status`.
### Complete Deployment
Kubernetes marks a Deployment as _complete_ when it has the following characteristics:
* The Deployment has minimum availability. Minimum availability means that the Deployment's number of available replicas
equals or exceeds the number required by the Deployment strategy.
* All of the replicas associated with the Deployment have been updated to the latest version you've specified, meaning any
updates you've requested have been completed.
* No old pods for the Deployment are running.
You can check if a Deployment has completed by using `kubectl rollout status`. If the rollout completed successfully, `kubectl rollout status` returns a zero exit code.
```shell
$ kubectl rollout status deploy/nginx
Waiting for rollout to finish: 2 of 3 updated replicas are available...
deployment "nginx" successfully rolled out
$ echo $?
0
```
### Failed Deployment
Your Deployment may get stuck trying to deploy its newest ReplicaSet without ever completing. This can occur due to some of the following factors:
* Insufficient quota
* Readiness probe failures
* Image pull errors
* Insufficient permissions
* Limit ranges
* Application runtime misconfiguration
One way you can detect this condition is to specify a deadline parameter in your Deployment spec: ([`spec.progressDeadlineSeconds`](#progress-deadline-seconds)). `spec.progressDeadlineSeconds` denotes the number of seconds the Deployment controller waits before indicating (via the Deployment status) that the Deployment progress has stalled.
The following `kubectl` command sets the spec with `progressDeadlineSeconds` to make the controller report lack of progress for a Deployment after 10 minutes:
```shell
$ kubectl patch deployment/nginx-deployment -p '{"spec":{"progressDeadlineSeconds":600}}'
"nginx-deployment" patched
```
Once the deadline has been exceeded, the Deployment controller adds a DeploymentCondition with the following attributes to
the Deployment's `status.conditions`:
* Type=Progressing
* Status=False
* Reason=ProgressDeadlineExceeded
See the [Kubernetes API conventions](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#typical-status-properties) for more information on status conditions.
Note that in version 1.5, Kubernetes will take no action on a stalled Deployment other than to report a status condition with
`Reason=ProgressDeadlineExceeded`.
**Note:** If you pause a Deployment, Kubernetes does not check progress against your specified deadline. You can safely pause a Deployment in the middle of a rollout and resume without triggering the condition for exceeding the deadline.
You may experience transient errors with your Deployments, either due to a low timeout that you have set or due to any other kind
of error that can be treated as transient. For example, let's suppose you have insufficient quota. If you describe the Deployment
you will notice the following section:
```shell
$ kubectl describe deployment nginx-deployment
<...>
Conditions:
Type Status Reason
---- ------ ------
Available True MinimumReplicasAvailable
Progressing True ReplicaSetUpdated
ReplicaFailure True FailedCreate
<...>
```
If you run `kubectl get deployment nginx-deployment -o yaml`, the Deployment status might look like this:
```
status:
availableReplicas: 2
conditions:
- lastTransitionTime: 2016-10-04T12:25:39Z
lastUpdateTime: 2016-10-04T12:25:39Z
message: Replica set "nginx-deployment-4262182780" is progressing.
reason: ReplicaSetUpdated
status: "True"
type: Progressing
- lastTransitionTime: 2016-10-04T12:25:42Z
lastUpdateTime: 2016-10-04T12:25:42Z
message: Deployment has minimum availability.
reason: MinimumReplicasAvailable
status: "True"
type: Available
- lastTransitionTime: 2016-10-04T12:25:39Z
lastUpdateTime: 2016-10-04T12:25:39Z
message: 'Error creating: pods "nginx-deployment-4262182780-" is forbidden: exceeded quota:
object-counts, requested: pods=1, used: pods=3, limited: pods=2'
reason: FailedCreate
status: "True"
type: ReplicaFailure
observedGeneration: 3
replicas: 2
unavailableReplicas: 2
```
Eventually, once the Deployment progress deadline is exceeded, Kubernetes updates the status and the reason for the Progressing condition:
```
Conditions:
Type Status Reason
---- ------ ------
Available True MinimumReplicasAvailable
Progressing False ProgressDeadlineExceeded
ReplicaFailure True FailedCreate
```
You can address an issue of insufficient quota by scaling down your Deployment, by scaling down other controllers you may be running,
or by increasing quota in your namespace. If you satisfy the quota conditions and the Deployment controller then completes the Deployment
rollout, you'll see the Deployment's status update with a successful condition (`Status=True` and `Reason=NewReplicaSetAvailable`).
```
Conditions:
Type Status Reason
---- ------ ------
Available True MinimumReplicasAvailable
Progressing True NewReplicaSetAvailable
```
`Type=Available` with `Status=True` means that your Deployment has minimum availability. Minimum availability is dictated
by the parameters specified in the deployment strategy. `Type=Progressing` with `Status=True` means that your Deployment
is either in the middle of a rollout and it is progressing or that it has successfully completed its progress and the minimum
required new replicas are available (see the Reason of the condition for the particulars - in our case
`Reason=NewReplicaSetAvailable` means that the Deployment is complete).
You can check if a Deployment has failed to progress by using `kubectl rollout status`. `kubectl rollout status` returns a non-zero exit code if the Deployment has exceeded the progression deadline.
```shell
$ kubectl rollout status deploy/nginx
Waiting for rollout to finish: 2 out of 3 new replicas have been updated...
error: deployment "nginx" exceeded its progress deadline
$ echo $?
1
```
### Operating on a failed deployment
All actions that apply to a complete Deployment also apply to a failed Deployment. You can scale it up/down, roll back
to a previous revision, or even pause it if you need to apply multiple tweaks in the Deployment pod template.
## Use Cases
### Canary Deployment
If you want to roll out releases to a subset of users or servers using the Deployment, you can create multiple Deployments, one for each release,
following the canary pattern described in [managing resources](/docs/concepts/cluster-administration/manage-deployment/#canary-deployments).
## Writing a Deployment Spec
As with all other Kubernetes configs, a Deployment needs `apiVersion`, `kind`, and
`metadata` fields. For general information about working with config files,
see [deploying applications](/docs/user-guide/deploying-applications), [configuring containers](/docs/user-guide/configuring-containers), and [using kubectl to manage resources](/docs/user-guide/working-with-resources) documents.
A Deployment also needs a [`.spec` section](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status).
### Pod Template
The `.spec.template` is the only required field of the `.spec`.
The `.spec.template` is a [pod template](/docs/user-guide/replication-controller/#pod-template). It has exactly
the same schema as a [Pod](/docs/user-guide/pods), except it is nested and does not have an
`apiVersion` or `kind`.
In addition to required fields for a Pod, a pod template in a Deployment must specify appropriate
labels (i.e. don't overlap with other controllers, see [selector](#selector)) and an appropriate restart policy.
Only a [`.spec.template.spec.restartPolicy`](/docs/user-guide/pod-states/) equal to `Always` is allowed, which is the default
if not specified.
### Replicas
`.spec.replicas` is an optional field that specifies the number of desired Pods. It defaults
to 1.
### Selector
`.spec.selector` is an optional field that specifies a [label selector](/docs/user-guide/labels/#label-selectors) for the Pods
targeted by this deployment.
If specified, `.spec.selector` must match `.spec.template.metadata.labels`, or it will
be rejected by the API. If `.spec.selector` is unspecified, `.spec.selector.matchLabels` will be defaulted to
`.spec.template.metadata.labels`.
Deployment may kill Pods whose labels match the selector, in the case that their
template is different than `.spec.template` or if the total number of such Pods
exceeds `.spec.replicas`. It will bring up new Pods with `.spec.template` if
the number of Pods is less than the desired number.
Note that you should not create other pods whose labels match this selector, either directly, via another Deployment or via another controller such as Replica Sets or Replication Controllers. Otherwise, the Deployment will think that those pods were created by it. Kubernetes will not stop you from doing this.
If you have multiple controllers with overlapping selectors, the controllers will fight with each other and won't behave correctly.
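A minimal sketch showing a selector that matches the pod template labels (`apps/v1beta1` is used here as one of the API versions available in this release; the name and image are placeholders):

```yaml
apiVersion: apps/v1beta1        # extensions/v1beta1 also works in this release
kind: Deployment
metadata:
  name: nginx-deployment        # placeholder name
spec:
  replicas: 3
  selector:
    matchLabels:
      app: nginx                # must match .spec.template.metadata.labels
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: nginx:1.7.9      # placeholder image
```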
### Strategy
`.spec.strategy` specifies the strategy used to replace old Pods by new ones.
`.spec.strategy.type` can be "Recreate" or "RollingUpdate". "RollingUpdate" is
the default value.
#### Recreate Deployment
All existing Pods are killed before new ones are created when
`.spec.strategy.type==Recreate`.
#### Rolling Update Deployment
The Deployment updates Pods in a [rolling update](/docs/tasks/run-application/rolling-update-replication-controller/) fashion
when `.spec.strategy.type==RollingUpdate`.
You can specify `maxUnavailable` and `maxSurge` to control
the rolling update process.
##### Max Unavailable
`.spec.strategy.rollingUpdate.maxUnavailable` is an optional field that specifies the
maximum number of Pods that can be unavailable during the update process.
The value can be an absolute number (e.g. 5) or a percentage of desired Pods
(e.g. 10%).
The absolute number is calculated from the percentage by rounding down.
This cannot be 0 if `.spec.strategy.rollingUpdate.maxSurge` is 0.
By default, a fixed value of 1 is used.
For example, when this value is set to 30%, the old Replica Set can be scaled down to
70% of desired Pods immediately when the rolling update starts. Once new Pods are
ready, old Replica Set can be scaled down further, followed by scaling up the new Replica Set,
ensuring that the total number of Pods available at all times during the
update is at least 70% of the desired Pods.
##### Max Surge
`.spec.strategy.rollingUpdate.maxSurge` is an optional field that specifies the
maximum number of Pods that can be created above the desired number of Pods.
Value can be an absolute number (e.g. 5) or a percentage of desired Pods
(e.g. 10%).
This cannot be 0 if `maxUnavailable` is 0.
The absolute number is calculated from the percentage by rounding up.
By default, a value of 1 is used.
For example, when this value is set to 30%, the new Replica Set can be scaled up immediately when
the rolling update starts, such that the total number of old and new Pods do not exceed
130% of desired Pods. Once old Pods have been killed,
the new Replica Set can be scaled up further, ensuring that the total number of Pods running
at any time during the update is at most 130% of desired Pods.
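For illustration, here is a sketch of the relevant portion of a Deployment `.spec` (the values are arbitrary):

```yaml
spec:
  replicas: 10
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 2     # at most 2 of the 10 desired Pods may be unavailable at any time
      maxSurge: 30%         # at most 13 Pods in total (10 + 30%, rounded up) may exist during the update
```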
### Progress Deadline Seconds
`.spec.progressDeadlineSeconds` is an optional field that specifies the number of seconds you want
to wait for your Deployment to progress before the system reports back that the Deployment has
[failed progressing](#failed-deployment) - surfaced as a condition with `Type=Progressing`, `Status=False`,
and `Reason=ProgressDeadlineExceeded` in the status of the resource. The deployment controller will keep
retrying the Deployment. In the future, once automatic rollback is implemented, the deployment
controller will roll back a Deployment as soon as it observes such a condition.
If specified, this field needs to be greater than `.spec.minReadySeconds`.
### Min Ready Seconds
`.spec.minReadySeconds` is an optional field that specifies the
minimum number of seconds for which a newly created Pod should be ready
without any of its containers crashing, for it to be considered available.
This defaults to 0 (the Pod will be considered available as soon as it is ready).
To learn more about when a Pod is considered ready, see [Container Probes](/docs/user-guide/pod-states/#container-probes).
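For example, a sketch of a Deployment spec fragment combining the two fields (the values are illustrative; `progressDeadlineSeconds` must stay greater than `minReadySeconds`):

```yaml
spec:
  minReadySeconds: 10
  progressDeadlineSeconds: 600
```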
### Rollback To
`.spec.rollbackTo` is an optional field with the configuration the Deployment is rolling back to. Setting this field will trigger a rollback, and this field will be cleared every time a rollback is done.
#### Revision
`.spec.rollbackTo.revision` is an optional field specifying the revision to rollback to. This defaults to 0, meaning rollback to the last revision in history.
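In practice you usually do not set `.spec.rollbackTo` by hand; `kubectl rollout undo` sets it for you. For example (assuming a Deployment named `nginx-deployment` with at least two revisions):

```shell
$ kubectl rollout undo deployment/nginx-deployment --to-revision=2
```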
### Revision History Limit
A deployment's revision history is stored in the replica sets it controls.
`.spec.revisionHistoryLimit` is an optional field that specifies the number of old Replica Sets to retain to allow rollback. Its ideal value depends on the frequency and stability of new deployments. All old Replica Sets will be kept by default, consuming resources in `etcd` and crowding the output of `kubectl get rs`, if this field is not set. The configuration of each Deployment revision is stored in its Replica Sets; therefore, once an old Replica Set is deleted, you lose the ability to roll back to that revision of the Deployment.
More specifically, setting this field to zero means that all old Replica Sets with 0 replicas will be cleaned up.
In this case, a new deployment rollout cannot be undone, since its revision history is cleaned up.
### Paused
`.spec.paused` is an optional boolean field for pausing and resuming a Deployment. It defaults to false (a Deployment is not paused).
## Alternative to Deployments
### kubectl rolling update
[Kubectl rolling update](/docs/user-guide/kubectl/kubectl_rolling-update) updates Pods and Replication Controllers in a similar fashion.
However, Deployments are recommended, since they are declarative, server-side, and have additional features, such as rolling back to any previous revision even after the rolling update is done.
[Deployments](/docs/concepts/workloads/controllers/deployment/)

View File

@ -5,14 +5,6 @@ assignees:
title: Names
---
All objects in the Kubernetes REST API are unambiguously identified by a Name and a UID.
{% include user-guide-content-moved.md %}
For non-unique user-provided attributes, Kubernetes provides [labels](/docs/user-guide/labels) and [annotations](/docs/user-guide/annotations).
## Names
Names are generally client-provided. Only one object of a given kind can have a given name at a time (i.e., they are spatially unique). But if you delete an object, you can make a new object with the same name. Names are used to refer to an object in a resource URL, such as `/api/v1/pods/some-name`. By convention, the names of Kubernetes resources should be up to a maximum length of 253 characters and consist of lower-case alphanumeric characters, `-`, and `.`, but certain resources have more specific restrictions. See the [identifiers design doc](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/identifiers.md) for the precise syntax rules for names.
## UIDs
UIDs are generated by Kubernetes. Every object created over the whole lifetime of a Kubernetes cluster has a distinct UID (i.e., they are spatially and temporally unique).
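For example (assuming a Pod named `nginx` exists in the current namespace), you can print both identifiers with `kubectl`:

```shell
$ kubectl get pod nginx -o jsonpath='{.metadata.name} {.metadata.uid}'
```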
[Names](/docs/concepts/overview/working-with-objects/names/)

View File

@ -4,291 +4,6 @@ assignees:
title: Ingress Resources
---
* TOC
{:toc}
{% include user-guide-content-moved.md %}
__Terminology__
Throughout this doc you will see a few terms that are sometimes used interchangeably elsewhere and that might cause confusion. This section attempts to clarify them.
* Node: A single virtual or physical machine in a Kubernetes cluster.
* Cluster: A group of nodes firewalled from the internet, that are the primary compute resources managed by Kubernetes.
* Edge router: A router that enforces the firewall policy for your cluster. This could be a gateway managed by a cloud provider or a physical piece of hardware.
* Cluster network: A set of links, logical or physical, that facilitate communication within a cluster according to the [Kubernetes networking model](/docs/admin/networking/). Examples of a Cluster network include Overlays such as [flannel](https://github.com/coreos/flannel#flannel) or SDNs such as [OVS](/docs/admin/ovs-networking/).
* Service: A Kubernetes [Service](/docs/user-guide/services/) that identifies a set of pods using label selectors. Unless mentioned otherwise, Services are assumed to have virtual IPs only routable within the cluster network.
## What is Ingress?
Typically, services and pods have IPs only routable by the cluster network. All traffic that ends up at an edge router is either dropped or forwarded elsewhere. Conceptually, this might look like:
```
internet
|
------------
[ Services ]
```
An Ingress is a collection of rules that allow inbound connections to reach the cluster services.
```
internet
|
[ Ingress ]
--|-----|--
[ Services ]
```
It can be configured to give services externally-reachable URLs, load balance traffic, terminate SSL, offer name-based virtual hosting, and more. Users request ingress by POSTing the Ingress resource to the API server. An [Ingress controller](#ingress-controllers) is responsible for fulfilling the Ingress, usually with a loadbalancer, though it may also configure your edge router or additional frontends to help handle the traffic in an HA manner.
## Prerequisites
Before you start using the Ingress resource, there are a few things you should understand. The Ingress is a beta resource, not available in any Kubernetes release prior to 1.1. You need an Ingress controller to satisfy an Ingress; simply creating the resource will have no effect.
GCE/GKE deploys an ingress controller on the master. You can deploy any number of custom ingress controllers in a pod. You must annotate each ingress with the appropriate class, as indicated [here](https://github.com/kubernetes/ingress/tree/master/controllers/nginx#running-multiple-ingress-controllers) and [here](https://github.com/kubernetes/ingress/blob/master/controllers/gce/BETA_LIMITATIONS.md#disabling-glbc).
Make sure you review the [beta limitations](https://github.com/kubernetes/ingress/blob/master/controllers/gce/BETA_LIMITATIONS.md) of this controller. In environments other than GCE/GKE, you need to [deploy a controller](https://github.com/kubernetes/ingress/tree/master/controllers) as a pod.
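For example, a sketch of such a class annotation on an Ingress (the class value depends on which controller you deploy):

```yaml
metadata:
  annotations:
    kubernetes.io/ingress.class: "nginx"
```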
## The Ingress Resource
A minimal Ingress might look like:
```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: test-ingress
spec:
rules:
- http:
paths:
- path: /testpath
backend:
serviceName: test
servicePort: 80
```
*POSTing this to the API server will have no effect if you have not configured an [Ingress controller](#ingress-controllers).*
__Lines 1-4__: As with all other Kubernetes config, an Ingress needs `apiVersion`, `kind`, and `metadata` fields. For general information about working with config files, see [here](/docs/user-guide/deploying-applications), [here](/docs/user-guide/configuring-containers), and [here](/docs/user-guide/working-with-resources).
__Lines 5-7__: Ingress [spec](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status) has all the information needed to configure a loadbalancer or proxy server. Most importantly, it contains a list of rules matched against all incoming requests. Currently the Ingress resource only supports http rules.
__Lines 8-9__: Each http rule contains the following information: A host (e.g.: foo.bar.com, defaults to * in this example), a list of paths (e.g.: /testpath) each of which has an associated backend (test:80). Both the host and path must match the content of an incoming request before the loadbalancer directs traffic to the backend.
__Lines 10-12__: A backend is a service:port combination as described in the [services doc](/docs/user-guide/services). Ingress traffic is typically sent directly to the endpoints matching a backend.
__Global Parameters__: For the sake of simplicity the example Ingress has no global parameters, see the [api-reference](https://releases.k8s.io/{{page.githubbranch}}/pkg/apis/extensions/v1beta1/types.go) for a full definition of the resource. One can specify a global default backend in the absence of which requests that don't match a path in the spec are sent to the default backend of the Ingress controller.
## Ingress controllers
In order for the Ingress resource to work, the cluster must have an Ingress controller running. This is unlike other types of controllers, which typically run as part of the `kube-controller-manager` binary, and which are typically started automatically as part of cluster creation. You need to choose the ingress controller implementation that is the best fit for your cluster, or implement one. Examples and instructions can be found [here](https://github.com/kubernetes/ingress/tree/master/controllers).
## Before you begin
The following document describes a set of cross platform features exposed through the Ingress resource. Ideally, all Ingress controllers should fulfill this specification, but we're not there yet. The docs for the GCE and nginx controllers are [here](https://github.com/kubernetes/ingress/blob/master/controllers/gce/README.md) and [here](https://github.com/kubernetes/ingress/blob/master/controllers/nginx/README.md) respectively. **Make sure you review controller specific docs so you understand the caveats of each one**.
## Types of Ingress
### Single Service Ingress
There are existing Kubernetes concepts that allow you to expose a single service (see [alternatives](#alternatives)), however you can do so through an Ingress as well, by specifying a *default backend* with no rules.
{% include code.html language="yaml" file="ingress.yaml" ghlink="/docs/user-guide/ingress.yaml" %}
If you create it using `kubectl create -f` you should see:
```shell
$ kubectl get ing
NAME RULE BACKEND ADDRESS
test-ingress - testsvc:80 107.178.254.228
```
Where `107.178.254.228` is the IP allocated by the Ingress controller to satisfy this Ingress. The `RULE` column shows that all traffic sent to the IP is directed to the Kubernetes Service listed under `BACKEND`.
### Simple fanout
As described previously, pods within Kubernetes have IPs only visible on the cluster network, so we need something at the edge accepting ingress traffic and proxying it to the right endpoints. This component is usually a highly available loadbalancer. An Ingress allows you to keep the number of loadbalancers down to a minimum. For example, a setup like:
```shell
foo.bar.com -> 178.91.123.132 -> / foo s1:80
/ bar s2:80
```
would require an Ingress such as:
```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: test
spec:
rules:
- host: foo.bar.com
http:
paths:
- path: /foo
backend:
serviceName: s1
servicePort: 80
- path: /bar
backend:
serviceName: s2
servicePort: 80
```
When you create the Ingress with `kubectl create -f`:
```shell
$ kubectl get ing
NAME RULE BACKEND ADDRESS
test -
foo.bar.com
/foo s1:80
/bar s2:80
```
The Ingress controller will provision an implementation specific loadbalancer that satisfies the Ingress, as long as the services (s1, s2) exist. When it has done so, you will see the address of the loadbalancer under the last column of the Ingress.
### Name based virtual hosting
Name-based virtual hosts use multiple host names for the same IP address.
```
foo.bar.com --| |-> foo.bar.com s1:80
| 178.91.123.132 |
bar.foo.com --| |-> bar.foo.com s2:80
```
The following Ingress tells the backing loadbalancer to route requests based on the [Host header](https://tools.ietf.org/html/rfc7230#section-5.4).
```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: test
spec:
rules:
- host: foo.bar.com
http:
paths:
- backend:
serviceName: s1
servicePort: 80
- host: bar.foo.com
http:
paths:
- backend:
serviceName: s2
servicePort: 80
```
__Default Backends__: An Ingress with no rules, like the one shown in the previous section, sends all traffic to a single default backend. You can use the same technique to tell a loadbalancer where to find your website's 404 page, by specifying a set of rules *and* a default backend. Traffic is routed to your default backend if none of the Hosts in your Ingress match the Host in the request header, and/or none of the paths match the URL of the request.
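For example, a sketch of an Ingress combining host rules with a default backend (the service names are placeholders):

```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: virtual-hosts-with-default
spec:
  backend:
    serviceName: default-http-backend
    servicePort: 80
  rules:
  - host: foo.bar.com
    http:
      paths:
      - backend:
          serviceName: s1
          servicePort: 80
```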
### TLS
You can secure an Ingress by specifying a [secret](/docs/user-guide/secrets) that contains a TLS private key and certificate. Currently the Ingress only supports a single TLS port, 443, and assumes TLS termination. If the TLS configuration section in an Ingress specifies different hosts, they will be multiplexed on the same port according to the hostname specified through the SNI TLS extension (provided the Ingress controller supports SNI). The TLS secret must contain keys named `tls.crt` and `tls.key` that contain the certificate and private key to use for TLS, e.g.:
```yaml
apiVersion: v1
data:
tls.crt: base64 encoded cert
tls.key: base64 encoded key
kind: Secret
metadata:
name: testsecret
namespace: default
type: Opaque
```
Referencing this secret in an Ingress will tell the Ingress controller to secure the channel from the client to the loadbalancer using TLS:
```yaml
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: no-rules-map
spec:
tls:
- secretName: testsecret
backend:
serviceName: s1
servicePort: 80
```
Note that there is a gap between TLS features supported by various Ingress controllers. Please refer to documentation on [nginx](https://github.com/kubernetes/ingress/blob/master/controllers/nginx/README.md#https), [GCE](https://github.com/kubernetes/ingress/blob/master/controllers/gce/README.md#tls), or any other platform specific Ingress controller to understand how TLS works in your environment.
### Loadbalancing
An Ingress controller is bootstrapped with some loadbalancing policy settings that it applies to all Ingress, such as the loadbalancing algorithm, backend weight scheme etc. More advanced loadbalancing concepts (e.g.: persistent sessions, dynamic weights) are not yet exposed through the Ingress. You can still get these features through the [service loadbalancer](https://github.com/kubernetes/contrib/tree/master/service-loadbalancer). With time, we plan to distill loadbalancing patterns that are applicable cross platform into the Ingress resource.
It's also worth noting that even though health checks are not exposed directly through the Ingress, there exist parallel concepts in Kubernetes such as [readiness probes](/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/) which allow you to achieve the same end result. Please review the controller specific docs to see how they handle health checks ([nginx](https://github.com/kubernetes/ingress/blob/master/controllers/nginx/README.md), [GCE](https://github.com/kubernetes/ingress/blob/master/controllers/gce/README.md#health-checks)).
## Updating an Ingress
Say you'd like to add a new Host to an existing Ingress; you can update it by editing the resource:
```shell
$ kubectl get ing
NAME RULE BACKEND ADDRESS
test - 178.91.123.132
foo.bar.com
/foo s1:80
$ kubectl edit ing test
```
This should pop up an editor with the existing YAML. Modify it to include the new Host.
```yaml
spec:
rules:
- host: foo.bar.com
http:
paths:
- backend:
serviceName: s1
servicePort: 80
path: /foo
- host: bar.baz.com
http:
paths:
- backend:
serviceName: s2
servicePort: 80
path: /foo
..
```
Saving it will update the resource in the API server, which should tell the Ingress controller to reconfigure the loadbalancer.
```shell
$ kubectl get ing
NAME RULE BACKEND ADDRESS
test - 178.91.123.132
foo.bar.com
/foo s1:80
bar.baz.com
/foo s2:80
```
You can achieve the same by invoking `kubectl replace -f` on a modified Ingress yaml file.
## Failing across availability zones
Techniques for spreading traffic across failure domains differ between cloud providers. Please check the documentation of the relevant Ingress controller for details. Please refer to the federation [doc](/docs/user-guide/federation/) for details on deploying Ingress in a federated cluster.
## Future Work
* Various modes of HTTPS/TLS support (e.g.: SNI, re-encryption)
* Requesting an IP or Hostname via claims
* Combining L4 and L7 Ingress
* More Ingress controllers
Please track the [L7 and Ingress proposal](https://github.com/kubernetes/kubernetes/pull/12827) for more details on the evolution of the resource, and the [Ingress repository](https://github.com/kubernetes/ingress/tree/master) for more details on the evolution of various Ingress controllers.
## Alternatives
You can expose a Service in multiple ways that don't directly involve the Ingress resource:
* Use [Service.Type=LoadBalancer](/docs/user-guide/services/#type-loadbalancer)
* Use [Service.Type=NodePort](/docs/user-guide/services/#type-nodeport)
* Use a [Port Proxy](https://github.com/kubernetes/contrib/tree/master/for-demos/proxy-to-service)
* Deploy the [Service loadbalancer](https://github.com/kubernetes/contrib/tree/master/service-loadbalancer). This allows you to share a single IP among multiple Services and achieve more advanced loadbalancing through Service Annotations.
[Ingress Resources](/docs/concepts/services-networking/ingress/)

View File

@ -6,83 +6,6 @@ assignees:
title: Namespaces
---
Kubernetes supports multiple virtual clusters backed by the same physical cluster.
These virtual clusters are called namespaces.
{% include user-guide-content-moved.md %}
## When to Use Multiple Namespaces
Namespaces are intended for use in environments with many users spread across multiple
teams or projects. For clusters with a few to tens of users, you should not
need to create or think about namespaces at all. Start using namespaces when you
need the features they provide.
Namespaces provide a scope for names. Names of resources need to be unique within a namespace, but not across namespaces.
Namespaces are a way to divide cluster resources between multiple uses (via [resource quota](/docs/admin/resourcequota/)).
In future versions of Kubernetes, objects in the same namespace will have the same
access control policies by default.
It is not necessary to use multiple namespaces just to separate slightly different
resources, such as different versions of the same software: use [labels](/docs/user-guide/labels) to distinguish
resources within the same namespace.
## Working with Namespaces
Creation and deletion of namespaces are described in the [Admin Guide documentation
for namespaces](/docs/admin/namespaces).
### Viewing namespaces
You can list the current namespaces in a cluster using:
```shell
$ kubectl get namespaces
NAME LABELS STATUS
default <none> Active
kube-system <none> Active
```
Kubernetes starts with two initial namespaces:
* `default` The default namespace for objects with no other namespace
* `kube-system` The namespace for objects created by the Kubernetes system
### Setting the namespace for a request
To temporarily set the namespace for a request, use the `--namespace` flag.
For example:
```shell
$ kubectl --namespace=<insert-namespace-name-here> run nginx --image=nginx
$ kubectl --namespace=<insert-namespace-name-here> get pods
```
### Setting the namespace preference
You can permanently save the namespace for all subsequent kubectl commands in that
context.
```shell
$ kubectl config set-context $(kubectl config current-context) --namespace=<insert-namespace-name-here>
# Validate it
$ kubectl config view | grep namespace:
```
## Namespaces and DNS
When you create a [Service](/docs/user-guide/services), it creates a corresponding [DNS entry](/docs/admin/dns).
This entry is of the form `<service-name>.<namespace-name>.svc.cluster.local`, which means
that if a container just uses `<service-name>` it will resolve to the service which
is local to a namespace. This is useful for using the same configuration across
multiple namespaces such as Development, Staging and Production. If you want to reach
across namespaces, you need to use the fully qualified domain name (FQDN).
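For example (a sketch that assumes a Service named `my-service` in the `staging` namespace, and a pod named `busybox` that has `nslookup` available):

```shell
# Resolves only within busybox's own namespace
$ kubectl exec -ti busybox -- nslookup my-service
# Resolves from any namespace
$ kubectl exec -ti busybox -- nslookup my-service.staging.svc.cluster.local
```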
## Not All Objects are in a Namespace
Most Kubernetes resources (e.g. pods, services, replication controllers, and others) are
in some namespace. However, namespace resources are not themselves in a namespace,
and low-level resources, such as [nodes](/docs/admin/node) and
persistentVolumes, are not in any namespace. Events are an exception: they may or may not
have a namespace, depending on the object the event is about.
[Namespaces](/docs/concepts/overview/working-with-objects/namespaces/)

View File

@ -5,94 +5,6 @@ assignees:
title: Network Policies
---
* TOC
{:toc}
{% include user-guide-content-moved.md %}
A network policy is a specification of how selections of pods are allowed to communicate with each other and other network endpoints.
`NetworkPolicy` resources use labels to select pods and define whitelist rules which allow traffic to the selected pods in addition to what is allowed by the isolation policy for a given namespace.
## Prerequisites
You must enable the `extensions/v1beta1/networkpolicies` runtime config in your apiserver to enable this resource.
You must also be using a networking solution which supports `NetworkPolicy` - simply creating the
resource without a controller to implement it will have no effect.
## Configuring Namespace Isolation Policy
Isolation can be configured on a per-namespace basis. Once isolation is configured on a namespace it will be applied to all pods in that namespace. Currently, only isolation policy on inbound traffic (ingress) can be defined.
The following ingress isolation types are supported:
- `DefaultDeny`: Pods in the namespace will be inaccessible from any source except the pod's local node.
Ingress isolation can be enabled using an annotation on the Namespace.
```yaml
kind: Namespace
apiVersion: v1
metadata:
annotations:
net.beta.kubernetes.io/network-policy: |
{
"ingress": {
"isolation": "DefaultDeny"
}
}
```
To configure the annotation via `kubectl`:
```shell
{% raw %}
kubectl annotate ns <namespace> "net.beta.kubernetes.io/network-policy={\"ingress\": {\"isolation\": \"DefaultDeny\"}}"
{% endraw %}
```
See the [NetworkPolicy getting started guide](/docs/getting-started-guides/network-policy/walkthrough) for an example.
## The `NetworkPolicy` Resource
See the [api-reference](/docs/api-reference/extensions/v1beta1/definitions/#_v1beta1_networkpolicy) for a full definition of the resource.
A minimal `NetworkPolicy` might look like this:
```yaml
apiVersion: extensions/v1beta1
kind: NetworkPolicy
metadata:
name: test-network-policy
namespace: default
spec:
podSelector:
matchLabels:
role: db
ingress:
- from:
- namespaceSelector:
matchLabels:
project: myproject
- podSelector:
matchLabels:
role: frontend
ports:
- protocol: tcp
port: 6379
```
*POSTing this to the API server will have no effect unless your chosen networking solution supports network policy.*
__Mandatory Fields__: As with all other Kubernetes config, a `NetworkPolicy` needs `apiVersion`, `kind`, and `metadata` fields. For general information about working with config files, see [here](/docs/user-guide/simple-yaml), [here](/docs/user-guide/configuring-containers), and [here](/docs/user-guide/working-with-resources).
__spec__: `NetworkPolicy` [spec](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status) has all the information needed to define a network isolation policy in the deployed controller.
__podSelector__: Each `NetworkPolicy` includes a `podSelector` which selects the grouping of pods to which the `ingress` rules in the policy apply.
__ingress__: Each `NetworkPolicy` includes a list of whitelist `ingress` rules. Each rule allows traffic which matches both the `from` and `ports` sections.
This example NetworkPolicy has the following characteristics:
1. applies to all pods in the default namespace with the label "role=db"
2. allows tcp/6379 ingress traffic to the "role=db" pods from any pod in the current namespace with the label "role=frontend" (due to the podSelector list element)
3. allows tcp/6379 ingress traffic to the "role=db" pods from any pod in the namespace "myproject" (due to the namespaceSelector list element)
[Network Policies](/docs/concepts/services-networking/networkpolicies/)

View File

@ -7,673 +7,6 @@ assignees:
title: Persistent Volumes
---
This document describes the current state of `PersistentVolumes` in Kubernetes. Familiarity with [volumes](/docs/concepts/storage/volumes/) is suggested.
{% include user-guide-content-moved.md %}
* TOC
{:toc}
## Introduction
Managing storage is a distinct problem from managing compute. The `PersistentVolume` subsystem provides an API for users and administrators that abstracts details of how storage is provided from how it is consumed. To do this we introduce two new API resources: `PersistentVolume` and `PersistentVolumeClaim`.
A `PersistentVolume` (PV) is a piece of networked storage in the cluster that has been provisioned by an administrator. It is a resource in the cluster just like a node is a cluster resource. PVs are volume plugins like Volumes, but have a lifecycle independent of any individual pod that uses the PV. This API object captures the details of the implementation of the storage, be that NFS, iSCSI, or a cloud-provider-specific storage system.
A `PersistentVolumeClaim` (PVC) is a request for storage by a user. It is similar to a pod. Pods consume node resources and PVCs consume PV resources. Pods can request specific levels of resources (CPU and Memory). Claims can request specific size and access modes (e.g., can be mounted once read/write or many times read-only).
While `PersistentVolumeClaims` allow a user to consume abstract storage
resources, it is common that users need `PersistentVolumes` with varying
properties, such as performance, for different problems. Cluster administrators
need to be able to offer a variety of `PersistentVolumes` that differ in more
ways than just size and access modes, without exposing users to the details of
how those volumes are implemented. For these needs there is the `StorageClass`
resource.
A `StorageClass` provides a way for administrators to describe the "classes" of
storage they offer. Different classes might map to quality-of-service levels,
or to backup policies, or to arbitrary policies determined by the cluster
administrators. Kubernetes itself is unopinionated about what classes
represent. This concept is sometimes called "profiles" in other storage
systems.
Please see the [detailed walkthrough with working examples](/docs/user-guide/persistent-volumes/walkthrough/).
## Lifecycle of a volume and claim
PVs are resources in the cluster. PVCs are requests for those resources and also act as claim checks to the resource. The interaction between PVs and PVCs follows this lifecycle:
### Provisioning
There are two ways PVs may be provisioned: statically or dynamically.
#### Static
A cluster administrator creates a number of PVs. They carry the details of the real storage which is available for use by cluster users. They exist in the Kubernetes API and are available for consumption.
#### Dynamic
When none of the static PVs the administrator created matches a user's `PersistentVolumeClaim`, the cluster may try to dynamically provision a volume specially for the PVC. This provisioning is based on `StorageClasses`: the PVC must request a class and the administrator must have created and configured that class in order for dynamic provisioning to occur. Claims that request the class `""` effectively disable dynamic provisioning for themselves.
### Binding
A user creates, or has already created in the case of dynamic provisioning, a `PersistentVolumeClaim` with a specific amount of storage requested and with certain access modes. A control loop in the master watches for new PVCs, finds a matching PV (if possible), and binds them together. If a PV was dynamically provisioned for a new PVC, the loop will always bind that PV to the PVC. Otherwise, the user will always get at least what they asked for, but the volume may be in excess of what was requested. Once bound, `PersistentVolumeClaim` binds are exclusive, regardless of the mode used to bind them.
Claims will remain unbound indefinitely if a matching volume does not exist. Claims will be bound as matching volumes become available. For example, a cluster provisioned with many 50Gi PVs would not match a PVC requesting 100Gi. The PVC can be bound when a 100Gi PV is added to the cluster.
### Using
Pods use claims as volumes. The cluster inspects the claim to find the bound volume and mounts that volume for a pod. For volumes which support multiple access modes, the user specifies which mode desired when using their claim as a volume in a pod.
Once a user has a claim and that claim is bound, the bound PV belongs to the user for as long as they need it. Users schedule Pods and access their claimed PVs by including a persistentVolumeClaim in their Pod's volumes block. [See below for syntax details](#claims-as-volumes).
### Releasing
When a user is done with their volume, they can delete the PVC objects from the API which allows reclamation of the resource. The volume is considered "released" when the claim is deleted, but it is not yet available for another claim. The previous claimant's data remains on the volume which must be handled according to policy.
### Reclaiming
The reclaim policy for a `PersistentVolume` tells the cluster what to do with the volume after it has been released of its claim. Currently, volumes can either be Retained, Recycled or Deleted. Retention allows for manual reclamation of the resource. For those volume plugins that support it, deletion removes both the `PersistentVolume` object from Kubernetes, as well as deleting the associated storage asset in external infrastructure (such as an AWS EBS, GCE PD, Azure Disk, or Cinder volume). Volumes that were dynamically provisioned are always deleted.
#### Recycling
If supported by the appropriate volume plugin, recycling performs a basic scrub (`rm -rf /thevolume/*`) on the volume and makes it available again for a new claim.
However, an administrator can configure a custom recycler pod template using the Kubernetes controller manager command line arguments as described [here](/docs/admin/kube-controller-manager/). The custom recycler pod template must contain a `volumes` specification, as shown in the example below:
```yaml
apiVersion: v1
kind: Pod
metadata:
name: pv-recycler-
namespace: default
spec:
restartPolicy: Never
volumes:
- name: vol
hostPath:
path: /any/path/it/will/be/replaced
containers:
- name: pv-recycler
image: "gcr.io/google_containers/busybox"
command: ["/bin/sh", "-c", "test -e /scrub && rm -rf /scrub/..?* /scrub/.[!.]* /scrub/* && test -z \"$(ls -A /scrub)\" || exit 1"]
volumeMounts:
- name: vol
mountPath: /scrub
```
Note that the particular path specified in the custom recycler pod template in the `volumes` part is replaced with the particular path of the volume that is being recycled.
## Types of Persistent Volumes
`PersistentVolume` types are implemented as plugins. Kubernetes currently supports the following plugins:
* GCEPersistentDisk
* AWSElasticBlockStore
* AzureFile
* AzureDisk
* FC (Fibre Channel)
* Flocker
* NFS
* iSCSI
* RBD (Ceph Block Device)
* CephFS
* Cinder (OpenStack block storage)
* Glusterfs
* VsphereVolume
* Quobyte Volumes
* HostPath (single node testing only -- local storage is not supported in any way and WILL NOT WORK in a multi-node cluster)
* VMware Photon
* Portworx Volumes
* ScaleIO Volumes
## Persistent Volumes
Each PV contains a spec and status, which is the specification and status of the volume.
```yaml
apiVersion: v1
kind: PersistentVolume
metadata:
name: pv0003
spec:
capacity:
storage: 5Gi
accessModes:
- ReadWriteOnce
persistentVolumeReclaimPolicy: Recycle
storageClassName: slow
nfs:
path: /tmp
server: 172.17.0.2
```
### Capacity
Generally, a PV will have a specific storage capacity. This is set using the PV's `capacity` attribute. See the Kubernetes [Resource Model](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/resources.md) to understand the units expected by `capacity`.
Currently, storage size is the only resource that can be set or requested. Future attributes may include IOPS, throughput, etc.
### Access Modes
A `PersistentVolume` can be mounted on a host in any way supported by the resource provider. As shown in the table below, providers will have different capabilities and each PV's access modes are set to the specific modes supported by that particular volume. For example, NFS can support multiple read/write clients, but a specific NFS PV might be exported on the server as read-only. Each PV gets its own set of access modes describing that specific PV's capabilities.
The access modes are:
* ReadWriteOnce -- the volume can be mounted as read-write by a single node
* ReadOnlyMany -- the volume can be mounted read-only by many nodes
* ReadWriteMany -- the volume can be mounted as read-write by many nodes
In the CLI, the access modes are abbreviated to:
* RWO - ReadWriteOnce
* ROX - ReadOnlyMany
* RWX - ReadWriteMany
> __Important!__ A volume can only be mounted using one access mode at a time, even if it supports many. For example, a GCEPersistentDisk can be mounted as ReadWriteOnce by a single node or ReadOnlyMany by many nodes, but not at the same time.
| Volume Plugin | ReadWriteOnce| ReadOnlyMany| ReadWriteMany|
| :--- | :---: | :---: | :---: |
| AWSElasticBlockStore | &#x2713; | - | - |
| AzureFile | &#x2713; | &#x2713; | &#x2713; |
| AzureDisk | &#x2713; | - | - |
| CephFS | &#x2713; | &#x2713; | &#x2713; |
| Cinder | &#x2713; | - | - |
| FC | &#x2713; | &#x2713; | - |
| FlexVolume | &#x2713; | &#x2713; | - |
| Flocker | &#x2713; | - | - |
| GCEPersistentDisk | &#x2713; | &#x2713; | - |
| Glusterfs | &#x2713; | &#x2713; | &#x2713; |
| HostPath | &#x2713; | - | - |
| iSCSI | &#x2713; | &#x2713; | - |
| PhotonPersistentDisk | &#x2713; | - | - |
| Quobyte | &#x2713; | &#x2713; | &#x2713; |
| NFS | &#x2713; | &#x2713; | &#x2713; |
| RBD | &#x2713; | &#x2713; | - |
| VsphereVolume | &#x2713; | - | - |
| PortworxVolume | &#x2713; | - | &#x2713; |
| ScaleIO | &#x2713; | &#x2713; | - |
### Class
A PV can have a class, which is specified by setting the
`storageClassName` attribute to the name of a
`StorageClass`. A PV of a particular class can only be bound to PVCs requesting
that class. A PV with no `storageClassName` has no class and can only be bound
to PVCs that request no particular class.
In the past, the annotation `volume.beta.kubernetes.io/storage-class` was used instead
of the `storageClassName` attribute. This annotation still works; however,
it will become fully deprecated in a future Kubernetes release.
### Reclaim Policy
Current reclaim policies are:
* Retain -- manual reclamation
* Recycle -- basic scrub ("rm -rf /thevolume/*")
* Delete -- associated storage asset such as AWS EBS, GCE PD, Azure Disk, or OpenStack Cinder volume is deleted
Currently, only NFS and HostPath support recycling. AWS EBS, GCE PD, Azure Disk, and Cinder volumes support deletion.
### Phase
A volume will be in one of the following phases:
* Available -- a free resource that is not yet bound to a claim
* Bound -- the volume is bound to a claim
* Released -- the claim has been deleted, but the resource is not yet reclaimed by the cluster
* Failed -- the volume has failed its automatic reclamation
The CLI will show the name of the PVC bound to the PV.
### Mount Options
A Kubernetes administrator can specify additional mount options for when a Persistent Volume is being mounted on a node.
You can specify a mount option by using the annotation `volume.beta.kubernetes.io/mount-options` on
your Persistent Volume.
For example:
```yaml
apiVersion: "v1"
kind: "PersistentVolume"
metadata:
name: gce-disk-1
annotations:
volume.beta.kubernetes.io/mount-options: "discard"
spec:
capacity:
storage: "10Gi"
accessModes:
- "ReadWriteOnce"
gcePersistentDisk:
fsType: "ext4"
pdName: "gce-disk-1
```
A mount option is a string which will be cumulatively joined and used while mounting the volume to the disk.
Note that not all Persistent volume types support mount options. In Kubernetes version 1.6, the following
volume types support mount options.
* GCEPersistentDisk
* AWSElasticBlockStore
* AzureFile
* AzureDisk
* NFS
* iSCSI
* RBD (Ceph Block Device)
* CephFS
* Cinder (OpenStack block storage)
* Glusterfs
* VsphereVolume
* Quobyte Volumes
* VMware Photon
## PersistentVolumeClaims
Each PVC contains a spec and status, which is the specification and status of the claim.
```yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: myclaim
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 8Gi
storageClassName: slow
selector:
matchLabels:
release: "stable"
matchExpressions:
- {key: environment, operator: In, values: [dev]}
```
### Access Modes
Claims use the same conventions as volumes when requesting storage with specific access modes.
### Resources
Claims, like pods, can request specific quantities of a resource. In this case, the request is for storage. The same [resource model](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/resources.md) applies to both volumes and claims.
### Selector
Claims can specify a [label selector](/docs/user-guide/labels/#label-selectors) to further filter the set of volumes. Only the volumes whose labels match the selector can be bound to the claim. The selector can consist of two fields:
* matchLabels - the volume must have a label with this value
* matchExpressions - a list of requirements made by specifying key, list of values, and operator that relates the key and values. Valid operators include In, NotIn, Exists, and DoesNotExist.
All of the requirements, from both `matchLabels` and `matchExpressions`, are ANDed together -- they must all be satisfied in order to match.
### Class
A claim can request a particular class by specifying the name of a
`StorageClass` using the attribute `storageClassName`.
Only PVs of the requested class, ones with the same `storageClassName` as the PVC, can
be bound to the PVC.
PVCs don't necessarily have to request a class. A PVC with its `storageClassName` set
equal to `""` is always interpreted to be requesting a PV with no class, so it
can only be bound to PVs with no class (no annotation or one set equal to
`""`). A PVC with no `storageClassName` is not quite the same and is treated differently
by the cluster depending on whether the
[`DefaultStorageClass` admission plugin](/docs/admin/admission-controllers/#defaultstorageclass)
is turned on.
* If the admission plugin is turned on, the administrator may specify a
default `StorageClass`. All PVCs that have no `storageClassName` can be bound only to
PVs of that default. Specifying a default `StorageClass` is done by setting the
annotation `storageclass.kubernetes.io/is-default-class` equal to "true" in
a `StorageClass` object. If the administrator does not specify a default, the
cluster responds to PVC creation as if the admission plugin were turned off. If
more than one default is specified, the admission plugin forbids the creation of
all PVCs.
* If the admission plugin is turned off, there is no notion of a default
`StorageClass`. All PVCs that have no `storageClassName` can be bound only to PVs that
have no class. In this case, the PVCs that have no `storageClassName` are treated the
same way as PVCs that have their `storageClassName` set to `""`.
Depending on the installation method, a default StorageClass may be deployed
to a Kubernetes cluster by the addon manager during installation.
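For illustration, a sketch of a `StorageClass` marked as the cluster default (the name and provisioner are placeholders):

```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: standard
  annotations:
    storageclass.kubernetes.io/is-default-class: "true"
provisioner: kubernetes.io/gce-pd
parameters:
  type: pd-standard
```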
When a PVC specifies a `selector` in addition to requesting a `StorageClass`,
the requirements are ANDed together: only a PV of the requested class and with
the requested labels may be bound to the PVC. Note that currently, a PVC with a
non-empty `selector` can't have a PV dynamically provisioned for it.
In the past, the annotation `volume.beta.kubernetes.io/storage-class` was used instead
of the `storageClassName` attribute. This annotation still works; however,
it won't be supported in a future Kubernetes release.
## Claims As Volumes
Pods access storage by using the claim as a volume. Claims must exist in the same namespace as the pod using the claim. The cluster finds the claim in the pod's namespace and uses it to get the `PersistentVolume` backing the claim. The volume is then mounted to the host and into the pod.
```yaml
kind: Pod
apiVersion: v1
metadata:
name: mypod
spec:
containers:
- name: myfrontend
image: dockerfile/nginx
volumeMounts:
- mountPath: "/var/www/html"
name: mypd
volumes:
- name: mypd
persistentVolumeClaim:
claimName: myclaim
```
### A Note on Namespaces
`PersistentVolumes` binds are exclusive, and since `PersistentVolumeClaims` are namespaced objects, mounting claims with "Many" modes (`ROX`, `RWX`) is only possible within one namespace.
## StorageClasses
Each `StorageClass` contains the fields `provisioner` and `parameters`, which
are used when a `PersistentVolume` belonging to the class needs to be
dynamically provisioned.
The name of a `StorageClass` object is significant, and is how users can
request a particular class. Administrators set the name and other parameters
of a class when first creating `StorageClass` objects, and the objects cannot
be updated once they are created.
Administrators can specify a default `StorageClass` just for PVCs that don't
request any particular class to bind to: see the
[`PersistentVolumeClaim` section](#persistentvolumeclaims)
for details.
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: standard
provisioner: kubernetes.io/aws-ebs
parameters:
type: gp2
```
### Provisioner
Storage classes have a provisioner that determines what volume plugin is used
for provisioning PVs. This field must be specified.
You are not restricted to specifying the "internal" provisioners
listed here (whose names are prefixed with "kubernetes.io" and shipped
alongside Kubernetes). You can also run and specify external provisioners,
which are independent programs that follow a [specification](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/volume-provisioning.md)
defined by Kubernetes. Authors of external provisioners have full discretion
over where their code lives, how the provisioner is shipped, how it needs to be
run, what volume plugin it uses (including Flex), etc. The repository [kubernetes-incubator/external-storage](https://github.com/kubernetes-incubator/external-storage)
houses a library for writing external provisioners that implements the bulk of
the specification plus various community-maintained external provisioners.
### Parameters
Storage classes have parameters that describe volumes belonging to the storage
class. Different parameters may be accepted depending on the `provisioner`. For
example, the value `io1`, for the parameter `type`, and the parameter
`iopsPerGB` are specific to EBS. When a parameter is omitted, some default is
used.
#### AWS
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: slow
provisioner: kubernetes.io/aws-ebs
parameters:
type: io1
zone: us-east-1d
iopsPerGB: "10"
```
* `type`: `io1`, `gp2`, `sc1`, `st1`. See [AWS docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html) for details. Default: `gp2`.
* `zone`: AWS zone. If not specified, a random zone from those where Kubernetes cluster has a node is chosen.
* `iopsPerGB`: only for `io1` volumes. I/O operations per second per GiB. The AWS volume plugin multiplies this with the size of the requested volume to compute the IOPS of the volume and caps it at 20,000 IOPS (the maximum supported by AWS; see [AWS docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html)). A string is expected here, i.e. `"10"`, not `10`.
* `encrypted`: denotes whether the EBS volume should be encrypted or not. Valid values are `"true"` or `"false"`. A string is expected here, i.e. `"true"`, not `true`.
* `kmsKeyId`: optional. The full Amazon Resource Name of the key to use when encrypting the volume. If none is supplied but `encrypted` is true, a key is generated by AWS. See AWS docs for valid ARN value.
#### GCE
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: slow
provisioner: kubernetes.io/gce-pd
parameters:
type: pd-standard
zone: us-central1-a
```
* `type`: `pd-standard` or `pd-ssd`. Default: `pd-standard`
* `zone`: GCE zone. If not specified, a random zone in the same region as controller-manager will be chosen.
#### Glusterfs
```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: slow
provisioner: kubernetes.io/glusterfs
parameters:
resturl: "http://127.0.0.1:8081"
restauthenabled: "true"
restuser: "admin"
secretNamespace: "default"
secretName: "heketi-secret"
```
* `resturl`: Gluster REST service/Heketi service URL which provisions gluster volumes on demand. The general format should be `IPaddress:Port` and this is a mandatory parameter for the GlusterFS dynamic provisioner. If the Heketi service is exposed as a routable service in an OpenShift/Kubernetes setup, this can have a format similar to
`http://heketi-storage-project.cloudapps.mystorage.com` where the FQDN is a resolvable Heketi service URL.
* `restauthenabled` : Gluster REST service authentication boolean that enables authentication to the REST server. If this value is 'true', `restuser` and `restuserkey` or `secretNamespace` + `secretName` have to be filled. This option is deprecated, authentication is enabled when any of `restuser`, `restuserkey`, `secretName` or `secretNamespace` is specified.
* `restuser` : Gluster REST service/Heketi user who has access to create volumes in the Gluster Trusted Pool.
* `restuserkey` : Gluster REST service/Heketi user's password which will be used for authentication to the REST server. This parameter is deprecated in favor of `secretNamespace` + `secretName`.
* `secretNamespace` + `secretName` : Identification of a Secret instance that contains the user password to use when talking to the Gluster REST service. These parameters are optional; an empty password will be used when both `secretNamespace` and `secretName` are omitted. The provided secret must have type "kubernetes.io/glusterfs", e.g. created in this way:
```
$ kubectl create secret generic heketi-secret --type="kubernetes.io/glusterfs" --from-literal=key='opensesame' --namespace=default
```
#### OpenStack Cinder
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: gold
provisioner: kubernetes.io/cinder
parameters:
type: fast
availability: nova
```
* `type`: [VolumeType](http://docs.openstack.org/admin-guide/dashboard-manage-volumes.html) created in Cinder. Default is empty.
* `availability`: Availability Zone. Default is empty.
#### vSphere
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: fast
provisioner: kubernetes.io/vsphere-volume
parameters:
diskformat: zeroedthick
```
* `diskformat`: `thin`, `zeroedthick` and `eagerzeroedthick`. Default: `"thin"`.
#### Ceph RBD
```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: fast
provisioner: kubernetes.io/rbd
parameters:
monitors: 10.16.153.105:6789
adminId: kube
adminSecretName: ceph-secret
adminSecretNamespace: kube-system
pool: kube
userId: kube
userSecretName: ceph-secret-user
```
* `monitors`: Ceph monitors, comma delimited. This parameter is required.
* `adminId`: Ceph client ID that is capable of creating images in the pool. Default is "admin".
* `adminSecretNamespace`: The namespace for `adminSecretName`. Default is "default".
* `adminSecretName`: Secret Name for `adminId`. This parameter is required. The provided secret must have type "kubernetes.io/rbd".
* `pool`: Ceph RBD pool. Default is "rbd".
* `userId`: Ceph client ID that is used to map the RBD image. Default is the same as `adminId`.
* `userSecretName`: The name of Ceph Secret for `userId` to map RBD image. It must exist in the same namespace as PVCs. This parameter is required. The provided secret must have type "kubernetes.io/rbd", e.g. created in this way:
```
$ kubectl create secret generic ceph-secret --type="kubernetes.io/rbd" --from-literal=key='QVFEQ1pMdFhPUnQrSmhBQUFYaERWNHJsZ3BsMmNjcDR6RFZST0E9PQ==' --namespace=kube-system
```
#### Quobyte
```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: slow
provisioner: kubernetes.io/quobyte
parameters:
quobyteAPIServer: "http://138.68.74.142:7860"
registry: "138.68.74.142:7861"
adminSecretName: "quobyte-admin-secret"
adminSecretNamespace: "kube-system"
user: "root"
group: "root"
quobyteConfig: "BASE"
quobyteTenant: "DEFAULT"
```
* `quobyteAPIServer`: API Server of Quobyte in the format `http(s)://api-server:7860`
* `registry`: Quobyte registry to use to mount the volume. You can specify the registry as ``<host>:<port>`` pair or if you want to specify multiple registries you just have to put a comma between them e.g. ``<host1>:<port>,<host2>:<port>,<host3>:<port>``. The host can be an IP address or if you have a working DNS you can also provide the DNS names.
* `adminSecretNamespace`: The namespace for `adminSecretName`. Default is "default".
* `adminSecretName`: secret that holds information about the Quobyte user and the password to authenticate against the API server. The provided secret must have type "kubernetes.io/quobyte", e.g. created in this way:
```
$ kubectl create secret generic quobyte-admin-secret --type="kubernetes.io/quobyte" --from-literal=key='opensesame' --namespace=kube-system
```
* `user`: maps all access to this user. Default is "root".
* `group`: maps all access to this group. Default is "nfsnobody".
* `quobyteConfig`: use the specified configuration to create the volume. You can create a new configuration or modify an existing one with the Web console or the quobyte CLI. Default is "BASE".
* `quobyteTenant`: use the specified tenant ID to create/delete the volume. This Quobyte tenant has to be already present in Quobyte. Default is "DEFAULT".
#### Azure Disk
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: slow
provisioner: kubernetes.io/azure-disk
parameters:
skuName: Standard_LRS
location: eastus
storageAccount: azure_storage_account_name
```
* `skuName`: Azure storage account Sku tier. Default is empty.
* `location`: Azure storage account location. Default is empty.
* `storageAccount`: Azure storage account name. If storage account is not provided, all storage accounts associated with the resource group are searched to find one that matches `skuName` and `location`. If storage account is provided, it must reside in the same resource group as the cluster, and `skuName` and `location` are ignored.
#### Portworx Volume
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: portworx-io-priority-high
provisioner: kubernetes.io/portworx-volume
parameters:
repl: "1"
snap_interval: "70"
io_priority: "high"
```
* `fs`: filesystem to be laid out: [none/xfs/ext4] (default: `ext4`).
* `block_size`: block size in Kbytes (default: `32`).
* `repl`: number of synchronous replicas to be provided in the form of replication factor [1..3] (default: `1`) A string is expected here i.e.`"1"` and not `1`.
* `io_priority`: determines whether the volume will be created from higher performance or a lower priority storage [high/medium/low] (default: `low`).
* `snap_interval`: clock/time interval in minutes for when to trigger snapshots. Snapshots are incremental based on difference with the prior snapshot, 0 disables snaps (default: `0`). A string is expected here i.e. `"70"` and not `70`.
* `aggregation_level`: specifies the number of chunks the volume would be distributed into, 0 indicates a non-aggregated volume (default: `0`). A string is expected here i.e. `"0"` and not `0`
* `ephemeral`: specifies whether the volume should be cleaned-up after unmount or should be persistent. `emptyDir` use case can set this value to true and `persistent volumes` use case such as for databases like Cassandra should set to false, [true/false] (default `false`). A string is expected here i.e. `"true"` and not `true`.
#### ScaleIO
```yaml
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
name: slow
provisioner: kubernetes.io/scaleio
parameters:
gateway: https://192.168.99.200:443/api
system: scaleio
protectionDomain: default
storagePool: default
storageMode: ThinProvisionned
secretRef: sio-secret
readOnly: false
fsType: xfs
```
* `provisioner`: attribute is set to `kubernetes.io/scaleio`
* `gateway`: address to a ScaleIO API gateway (required)
* `system`: the name of the ScaleIO system (required)
* `protectionDomain`: the name of the ScaleIO protection domain
* `storagePool`: the name of the volume storage pool
* `storageMode`: the storage provision mode: `ThinProvisionned` (default) or `ThickProvisionned`
* `secretRef`: reference to a configured Secret object (required, see detail below)
* `readOnly`: specifies the access mode to the mounted volume
* `fsType`: the file system to use for the volume
The ScaleIO Kubernetes volume plugin requires a configured Secret object.
The secret must be created with type `kubernetes.io/scaleio` and use the same namespace value as that of the PVC where it is referenced
as shown in the following command:
```
$> kubectl create secret generic sio-secret --type="kubernetes.io/scaleio" --from-literal=username=sioadmin --from-literal=password=d2NABDNjMA== --namespace=default
```
## Writing Portable Configuration
If you're writing configuration templates or examples that run on a wide range of clusters
and need persistent storage, we recommend that you use the following pattern:
- Do include PersistentVolumeClaim objects in your bundle of config (alongside Deployments, ConfigMaps, etc).
- Do not include PersistentVolume objects in the config, since the user instantiating the config may not have
permission to create PersistentVolumes.
- Give the user the option of providing a storage class name when instantiating the template.
- If the user provides a storage class name, and the cluster is version 1.4 or newer, put that value into the `volume.beta.kubernetes.io/storage-class` annotation of the PVC.
This will cause the PVC to match the right storage class if the cluster has StorageClasses enabled by the admin.
- If the user does not provide a storage class name or the cluster is version 1.3, then instead put a `volume.alpha.kubernetes.io/storage-class: default` annotation on the PVC.
- This will cause a PV to be automatically provisioned for the user with sane default characteristics on some clusters.
- Despite the word `alpha` in the name, the code behind this annotation has `beta` level support.
- Do not use `volume.beta.kubernetes.io/storage-class:` with any value including the empty string since it will prevent the DefaultStorageClass admission controller
from running if it is enabled.
- In your tooling, do watch for PVCs that are not getting bound after some time and surface this to the user, as this may indicate that the cluster has no dynamic
storage support (in which case the user should create a matching PV) or the cluster has no storage system (in which case the user cannot deploy config requiring
PVCs).
- In the future, we expect most clusters to have `DefaultStorageClass` enabled, and to have some form of storage available. However, there may not be any
storage class names which work on all clusters, so continue to not set one by default.
At some point, the alpha annotation will cease to have meaning, but the unset `storageClass` field on the PVC
will have the desired effect.
[Persistent Volumes](/docs/concepts/storage/persistent-volumes/)

View File

@ -10,435 +10,6 @@ assignees:
title: Pet Sets
---
__Warning:__ Starting in Kubernetes version 1.5, PetSet has been renamed to
[StatefulSet](/docs/concepts/abstractions/controllers/statefulsets/).
To use (or continue to use) PetSet in Kubernetes 1.5 or higher, you must
[migrate your existing PetSets to StatefulSets](/docs/tasks/manage-stateful-set/upgrade-pet-set-to-stateful-set/).
__This document has been deprecated__, but can still apply if you're using
Kubernetes version 1.4 or earlier.
* TOC
{:toc}
__Terminology__
Throughout this doc you will see a few terms that are sometimes used interchangeably elsewhere and that might cause confusion. This section attempts to clarify them.
* Node: A single virtual or physical machine in a Kubernetes cluster.
* Cluster: A group of nodes in a single failure domain, unless mentioned otherwise.
* Persistent Volume Claim (PVC): A request for storage, typically a [persistent volume](/docs/user-guide/persistent-volumes/walkthrough/).
* Host name: The hostname attached to the UTS namespace of the pod, i.e. the output of `hostname` in the pod.
* DNS/Domain name: A *cluster local* domain name resolvable using standard methods (e.g.: [gethostbyname](http://linux.die.net/man/3/gethostbyname)).
* Ordinality: the property of being "ordinal", or occupying a position in a sequence.
* Pet: a single member of a PetSet; more generally, a stateful application.
* Peer: a process running a server, capable of communicating with other such processes.
__Prerequisites__
This doc assumes familiarity with the following Kubernetes concepts:
* [Pods](/docs/user-guide/pods/single-container/)
* [Cluster DNS](/docs/admin/dns/)
* [Headless Services](/docs/user-guide/services/#headless-services)
* [Persistent Volumes](/docs/concepts/storage/volumes/)
* [Persistent Volume Provisioning](http://releases.k8s.io/{{page.githubbranch}}/examples/persistent-volume-provisioning/README.md)
You need a working Kubernetes cluster at version >= 1.3, with a healthy DNS [cluster addon](http://releases.k8s.io/{{page.githubbranch}}/cluster/addons/README.md) at version >= 15. You cannot use PetSet on a hosted Kubernetes provider that has disabled `alpha` resources.
## What is a PetSet?
In Kubernetes, most pod management abstractions group pods into disposable units of work that compose a micro service. Replication controllers, for example, are designed with a weak guarantee - that there should be N replicas of a particular pod template. The pods are treated as stateless units; if one of them is unhealthy or superseded by a newer version, the system just disposes of it.
```
foo.default.svc.cluster.local
       |service|
      /        \
| pod-asdf |  | pod-zxcv |
```
A PetSet, in contrast, is a group of stateful pods that require a stronger notion of identity. The document refers to these as "clustered applications".
```
*.foo.default.svc.cluster.local
  | mysql-0 |  <->  | mysql-1 |
    [pv 0]            [pv 1]
```
The coordinated deployment of clustered applications is notoriously hard. They require stronger notions of identity and membership, which they use in opaque internal protocols, and are especially prone to race conditions and deadlock. Traditionally, administrators have deployed these applications by leveraging nodes as stable, long-lived entities with persistent storage and static IPs.
The goal of PetSet is to decouple this dependency by assigning identities to individual instances of an application that are not anchored to the underlying physical infrastructure. For the rest of this document we will refer to these entities as "Pets". Our use of this term is predated by the "Pets vs Cattle" analogy.
__Relationship between Pets and Pods__: PetSet requires there be {0..N-1} Pets. Each Pet has a deterministic name - PetSetName-Ordinal, and a unique identity. Each Pet has at most one pod, and each PetSet has at most one Pet with a given identity.
## When to use PetSet?
A PetSet ensures that a specified number of "pets" with unique identities are running at any given time. The identity of a Pet is comprised of:
* a stable hostname, available in DNS
* an ordinal index
* stable storage: linked to the ordinal & hostname
These properties are useful in deploying stateful applications. However most stateful applications are also clustered, meaning they form groups with strict membership requirements that rely on stored state. PetSet also helps with the 2 most common problems encountered managing such clustered applications:
* discovery of peers for quorum
* startup/teardown ordering
Only use PetSet if your application requires some or all of these properties. Managing pods as stateless replicas is vastly easier.
Example workloads for PetSet:
* Databases like MySQL or PostgreSQL that require a single instance attached to an NFS persistent volume at any time
* Clustered software like Zookeeper, Etcd, or Elasticsearch that require stable membership.
## Alpha limitations
Before you start deploying applications as PetSets, there are a few limitations you should understand.
* PetSet is an *alpha* resource, not available in any Kubernetes release prior to 1.3.
* As with all alpha/beta resources, it can be disabled through the `--runtime-config` option passed to the apiserver, and in fact most likely will be disabled on hosted offerings of Kubernetes.
* The only updatable field on a PetSet is `replicas`
* The storage for a given pet must either be provisioned by a [persistent volume provisioner](http://releases.k8s.io/{{page.githubbranch}}/examples/persistent-volume-provisioning/README.md) based on the requested `storage class`, or pre-provisioned by an admin. Note that persistent volume provisioning is also currently in alpha.
* Deleting and/or scaling a PetSet down will *not* delete the volumes associated with the PetSet. This is done to ensure data safety: your data is more valuable than an automatic purge of all related PetSet resources. **Deleting the Persistent Volume Claims will result in a deletion of the associated volumes**.
* All PetSets currently require a "governing service", or a Service responsible for the network identity of the pets. The user is responsible for this Service.
* Updating an existing PetSet is currently a manual process, meaning you either need to deploy a new PetSet with the new image version, or orphan Pets one by one, update their image, and join them back to the cluster.
## Example PetSet
We'll create a basic PetSet to demonstrate how Pets are assigned unique and "sticky" identities.
{% include code.html language="yaml" file="petset.yaml" ghlink="/docs/user-guide/petset.yaml" %}
Saving this config into `petset.yaml` and submitting it to a Kubernetes cluster should create the defined PetSet and Pets it manages:
```shell
$ kubectl create -f petset.yaml
service "nginx" created
petset "nginx" created
```
## Pet Identity
The identity of a Pet sticks to it, regardless of which node it's (re) scheduled on. We can examine the identity of the pets we just created.
### Ordinal index
You should see 2 pods with predictable names formatted as `$(petset name)-$(ordinal index assigned by petset controller)`:
```shell
$ kubectl get po
NAME READY STATUS RESTARTS AGE
web-0 1/1 Running 0 10m
web-1 1/1 Running 0 10m
```
### Stable storage
Two persistent volumes, one per pod. These are created automatically by the PetSet based on the `volumeClaimTemplates` field:
```shell
$ kubectl get pv
NAME CAPACITY ACCESSMODES STATUS CLAIM REASON AGE
pvc-90234946-3717-11e6-a46e-42010af00002 1Gi RWO Bound default/www-web-0 11m
pvc-902733c2-3717-11e6-a46e-42010af00002 1Gi RWO Bound default/www-web-1 11m
```
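For reference, these claims come from a `volumeClaimTemplates` entry on the PetSet spec. The sketch below shows the general shape; the storage size and the alpha storage-class annotation are assumptions and may not match the bundled `petset.yaml` exactly.

```yaml
volumeClaimTemplates:
  - metadata:
      name: www            # claims are named www-web-0, www-web-1, ...
      annotations:
        volume.alpha.kubernetes.io/storage-class: anything   # assumed: dynamic provisioning via the alpha annotation
    spec:
      accessModes: [ "ReadWriteOnce" ]
      resources:
        requests:
          storage: 1Gi     # matches the 1Gi capacity shown above
```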
### Network identity
The network identity has 2 parts. First, we created a headless Service that controls the domain within which we create Pets. The domain managed by this Service takes the form: `$(service name).$(namespace).svc.cluster.local`, where "cluster.local" is the [cluster domain](http://releases.k8s.io/{{page.githubbranch}}/build/kube-dns/README.md#how-do-i-configure-it). As each pet is created, it gets a matching DNS subdomain, taking the form: `$(petname).$(governing service domain)`, where the governing service is defined by the `serviceName` field on the PetSet.
Here are some examples of choices for Cluster Domain, Service name, PetSet name, and how that affects the DNS names for the Pets and the hostnames in the Pet's pods:
Cluster Domain | Service (ns/name) | PetSet (ns/name) | PetSet Domain | Pet DNS | Pet Hostname |
-------------- | ----------------- | ----------------- | -------------- | ------- | ------------ |
cluster.local | default/nginx | default/web | nginx.default.svc.cluster.local | web-{0..N-1}.nginx.default.svc.cluster.local | web-{0..N-1} |
cluster.local | foo/nginx | foo/web | nginx.foo.svc.cluster.local | web-{0..N-1}.nginx.foo.svc.cluster.local | web-{0..N-1} |
kube.local | foo/nginx | foo/web | nginx.foo.svc.kube.local | web-{0..N-1}.nginx.foo.svc.kube.local | web-{0..N-1} |
Note that Cluster Domain will be set to `cluster.local` unless [otherwise configured](http://releases.k8s.io/{{page.githubbranch}}/build/kube-dns/README.md#how-do-i-configure-it).
Let's verify our assertion with a simple test.
```shell
$ kubectl get svc
NAME CLUSTER-IP EXTERNAL-IP PORT(S) AGE
nginx None <none> 80/TCP 12m
...
```
First, the PetSet provides a stable hostname:
```shell
$ for i in 0 1; do kubectl exec web-$i -- sh -c 'hostname'; done
web-0
web-1
```
And the hostname is linked to the in-cluster DNS address:
```shell
$ kubectl run -i --tty --image busybox dns-test --restart=Never /bin/sh
dns-test # nslookup web-0.nginx
Server: 10.0.0.10
Address 1: 10.0.0.10 kube-dns.kube-system.svc.cluster.local
Name: web-0.nginx
Address 1: 10.180.3.5
dns-test # nslookup web-1.nginx
Server: 10.0.0.10
Address 1: 10.0.0.10 kube-dns.kube-system.svc.cluster.local
Name: web-1.nginx
Address 1: 10.180.0.9
```
The containers are running nginx webservers, which by default will look for an index.html file in `/usr/share/nginx/html/index.html`. That directory is backed by a `PersistentVolume` created by the PetSet. So let's write our hostname there:
```shell
$ for i in 0 1; do
kubectl exec web-$i -- sh -c 'echo $(hostname) > /usr/share/nginx/html/index.html';
done
```
And verify each webserver serves its own hostname:
```shell
$ for i in 0 1; do kubectl exec -it web-$i -- curl localhost; done
web-0
web-1
```
Now delete all pods in the petset:
```shell
$ kubectl delete po -l app=nginx
pod "web-0" deleted
pod "web-1" deleted
```
Wait for them to come back up, and try to retrieve the previously written hostname through the DNS name of the peer. They match, because the storage, DNS name, and hostname stick to the Pet no matter where it gets scheduled:
```shell
$ kubectl exec -it web-1 -- curl web-0.nginx
web-0
$ kubectl exec -it web-0 -- curl web-1.nginx
web-1
```
## Peer discovery
A pet can piece together its own identity:
1. Use the [downward api](/docs/user-guide/downward-api/) to find its pod name
2. Run `hostname` to find its DNS name
3. Run `mount` or `df` to find its volumes (usually this is unnecessary)
It's not necessary to "discover" the governing Service of a PetSet; since it's known at creation time, you can simply pass it down through an [environment variable](/docs/user-guide/environment-guide).
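For example, a container in the pod template might surface both pieces of information as environment variables; the variable names below are illustrative:

```yaml
containers:
  - name: nginx
    image: gcr.io/google_containers/nginx-slim:0.8
    env:
      - name: POD_NAME
        valueFrom:
          fieldRef:
            fieldPath: metadata.name   # downward API: this pet's own pod name
      - name: PEER_SERVICE
        value: "nginx"                 # the governing Service, known at creation time
```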
Usually pets also need to find their peers. In the previous nginx example, we just used `kubectl` to get the names of existing pods, and as humans, we could tell which ones belonged to a given PetSet. Another way to find peers is by contacting the API server, just like `kubectl`, but that has several disadvantages (you end up implementing a Kubernetes specific init system that runs as pid 1 in your application container).
PetSet gives you a way to discover your peers using DNS records. To illustrate this we can use the previous example (note: one usually doesn't `apt-get` in a container).
```shell
$ kubectl exec -it web-0 /bin/sh
web-0 # apt-get update && apt-get install -y dnsutils
...
web-0 # nslookup -type=srv nginx.default
Server: 10.0.0.10
Address: 10.0.0.10#53
nginx.default.svc.cluster.local service = 10 50 0 web-1.ub.default.svc.cluster.local.
nginx.default.svc.cluster.local service = 10 50 0 web-0.ub.default.svc.cluster.local.
```
## Updating a PetSet
You cannot update any field of the PetSet except `spec.replicas` and the `containers` in the podTemplate. Updating `spec.replicas` will scale the PetSet; updating `containers` will not have any effect until a Pet is deleted, at which time it is recreated with the modified podTemplate.
## Scaling a PetSet
You can scale a PetSet by updating the "replicas" field. Note however that the controller will only:
1. Create one pet at a time, in order from {0..N-1}, and wait until each one is in [Running and Ready](/docs/user-guide/pod-states) before creating the next
2. Delete one pet at a time, in reverse order from {N-1..0}, and wait until each one is completely shut down (past its [terminationGracePeriodSeconds](/docs/user-guide/pods/index#termination-of-pods)) before deleting the next
```shell
$ kubectl get po
NAME READY STATUS RESTARTS AGE
web-0 1/1 Running 0 30s
web-1 1/1 Running 0 36s
$ kubectl patch petset web -p '{"spec":{"replicas":3}}'
"web" patched
$ kubectl get po
NAME READY STATUS RESTARTS AGE
web-0 1/1 Running 0 40s
web-1 1/1 Running 0 46s
web-2 1/1 Running 0 8s
```
You can also use the `kubectl scale` command:
```shell
$ kubectl get petset
NAME DESIRED CURRENT AGE
web 3 3 24m
$ kubectl scale petset web --replicas=5
petset "web" scaled
$ kubectl get po --watch-only
NAME READY STATUS RESTARTS AGE
web-0 1/1 Running 0 10m
web-1 1/1 Running 0 27m
web-2 1/1 Running 0 10m
web-3 1/1 Running 0 3m
web-4 0/1 ContainerCreating 0 48s
$ kubectl get petset web
NAME DESIRED CURRENT AGE
web 5 5 30m
```
Note, however, that scaling up to N and back down to M *will not* delete the volumes of the pets that were removed, as described in the section on [deletion](#deleting-a-petset); scaling back up creates new pets that use the same volumes. To see this in action, scale the PetSet back down to 3:
```shell
$ kubectl get po --watch-only
web-4 1/1 Terminating 0 4m
web-4 1/1 Terminating 0 4m
web-3 1/1 Terminating 0 6m
web-3 1/1 Terminating 0 6m
```
Note that we still have 5 pvcs:
```shell
$ kubectl get pvc
NAME STATUS VOLUME CAPACITY ACCESSMODES AGE
www-web-0 Bound pvc-42ca5cef-8113-11e6-82f6-42010af00002 1Gi RWO 32m
www-web-1 Bound pvc-42de30af-8113-11e6-82f6-42010af00002 1Gi RWO 32m
www-web-2 Bound pvc-ba416413-8115-11e6-82f6-42010af00002 1Gi RWO 14m
www-web-3 Bound pvc-ba45f19c-8115-11e6-82f6-42010af00002 1Gi RWO 14m
www-web-4 Bound pvc-ba47674a-8115-11e6-82f6-42010af00002 1Gi RWO 14m
```
This allows you to upgrade the image of a petset and have it come back up with the same data, as described in the next section.
## Image upgrades
PetSet currently *does not* support automated image upgrades, as noted in the section on [limitations](#alpha-limitations). However, you can update the `image` field of any container in the podTemplate and delete Pets one by one; the PetSet controller will recreate each one with the new image.
Edit the image on the PetSet to `gcr.io/google_containers/nginx-slim:0.7` and delete 1 Pet:
```shell{% raw %}
$ for p in 0 1 2; do kubectl get po web-$p --template '{{range $i, $c := .spec.containers}}{{$c.image}}{{end}}'; echo; done
gcr.io/google_containers/nginx-slim:0.8
gcr.io/google_containers/nginx-slim:0.8
gcr.io/google_containers/nginx-slim:0.8
$ kubectl delete po web-0
pod "web-0" deleted
$ for p in 0 1 2; do kubectl get po web-$p --template '{{range $i, $c := .spec.containers}}{{$c.image}}{{end}}'; echo; done
gcr.io/google_containers/nginx-slim:0.7
gcr.io/google_containers/nginx-slim:0.8
gcr.io/google_containers/nginx-slim:0.8
{% endraw %}```
Delete the remaining 2:
```shell
$ kubectl delete po web-1 web-2
pod "web-1" deleted
pod "web-2" deleted
```
Wait till the PetSet is stable and check the images:
```shell{% raw %}
$ for p in 0 1 2; do kubectl get po web-$p --template '{{range $i, $c := .spec.containers}}{{$c.image}}{{end}}'; echo; done
gcr.io/google_containers/nginx-slim:0.7
gcr.io/google_containers/nginx-slim:0.7
gcr.io/google_containers/nginx-slim:0.7
{% endraw %}```
## Deleting a PetSet
Deleting a PetSet through kubectl will scale it down to 0, thereby deleting all the Pets. If you wish to delete just the PetSet and not the Pets, use `--cascade=false`:
```shell
$ kubectl delete -f petset.yaml --cascade=false
petset "web" deleted
$ kubectl get po -l app=nginx
NAME READY STATUS RESTARTS AGE
web-0 1/1 Running 0 21h
web-1 1/1 Running 0 21h
$ kubectl delete po -l app=nginx
pod "web-0" deleted
pod "web-1" deleted
```
Deleting the pods will *not* delete the volumes. Until we finalize the recycle policy for these volumes, they will have to be cleaned up by an admin. This is to ensure that you have the chance to copy data off the volume before deleting it. Simply deleting the PVC after the pods have left the [terminating state](/docs/user-guide/pods/index#termination-of-pods) should trigger deletion of the backing Persistent Volumes.
**Note: you will lose all your data once the PVC is deleted, do this with caution.**
```shell
$ kubectl get po -l app=nginx
$ kubectl get pvc -l app=nginx
NAME STATUS VOLUME CAPACITY ACCESSMODES AGE
www-web-0 Bound pvc-62d271cd-3822-11e6-b1b7-42010af00002 0 21h
www-web-1 Bound pvc-62d6750e-3822-11e6-b1b7-42010af00002 0 21h
$ kubectl delete pvc -l app=nginx
$ kubectl get pv
```
If you simply want to clean everything:
```shell{% raw %}
$ grace=$(kubectl get po web-0 --template '{{.spec.terminationGracePeriodSeconds}}')
$ kubectl delete petset,po -l app=nginx
$ sleep $grace
$ kubectl delete pvc -l app=nginx
{% endraw %}
```
## Troubleshooting
You might have noticed an `annotations` field in all the PetSets shown above.
```yaml
annotations:
pod.alpha.kubernetes.io/initialized: "true"
```
This field is a debugging hook. It pauses any scale up/down operations on the entire PetSet. If you'd like to pause a petset after each pet, set it to `false` in the template, wait for each pet to come up, verify it has initialized correctly, and then set it to `true` using `kubectl edit` on the pet (setting it to `false` on *any pet* is enough to pause the PetSet). If you don't need it, create the PetSet with it set to `true` as shown. This is surprisingly useful in debugging bootstrapping race conditions.
## Future Work
There are a LOT of planned improvements since PetSet is still in alpha.
* Data gravity and local storage
* Richer notification events
* Public network identities
* WAN cluster deployments (multi-AZ/region/cloud provider)
* Image and node upgrades
This list goes on. If you have examples, ideas, or thoughts, please contribute.
## Alternatives
Deploying one RC of size 1/Service per pod is a popular alternative, as is simply deploying a DaemonSet that utilizes the identity of a Node.
## Next steps
* Learn about [StatefulSet](/docs/concepts/abstractions/controllers/statefulsets/),
the replacement for PetSet introduced in Kubernetes version 1.5.
* [Migrate your existing PetSets to StatefulSets](/docs/tasks/manage-stateful-set/upgrade-pet-set-to-stateful-set/)
when upgrading to Kubernetes version 1.5 or higher.
{% include user-guide-content-moved.md %}
[PetSets](/docs/concepts/workloads/controllers/petset/)

View File

@ -3,194 +3,6 @@ assignees:
title: Pods
---
* TOC
{:toc}
{% include user-guide-content-moved.md %}
_Pods_ are the smallest deployable units of computing that can be created and
managed in Kubernetes.
## What is a Pod?
A _pod_ (as in a pod of whales or pea pod) is a group of one or more containers
(such as Docker containers), the shared storage for those containers, and
options about how to run the containers. Pods are always co-located and
co-scheduled, and run in a shared context. A pod models an
application-specific "logical host" - it contains one or more application
containers which are relatively tightly coupled &mdash; in a pre-container
world, they would have executed on the same physical or virtual machine.
While Kubernetes supports more container runtimes than just Docker, Docker is
the most commonly known runtime, and it helps to describe pods in Docker terms.
The shared context of a pod is a set of Linux namespaces, cgroups, and
potentially other facets of isolation - the same things that isolate a Docker
container. Within a pod's context, the individual applications may have
further sub-isolations applied.
Containers within a pod share an IP address and port space, and
can find each other via `localhost`. They can also communicate with each
other using standard inter-process communications like SystemV semaphores or
POSIX shared memory. Containers in different pods have distinct IP addresses
and cannot communicate by IPC.
Applications within a pod also have access to shared volumes, which are defined
as part of a pod and are made available to be mounted into each application's
filesystem.
In terms of [Docker](https://www.docker.com/) constructs, a pod is modelled as
a group of Docker containers with shared namespaces and shared
[volumes](/docs/concepts/storage/volumes/). PID namespace sharing is not yet implemented in Docker.
Like individual application containers, pods are considered to be relatively
ephemeral (rather than durable) entities. As discussed in [life of a
pod](/docs/user-guide/pod-states/), pods are created, assigned a unique ID (UID), and
scheduled to nodes where they remain until termination (according to restart
policy) or deletion. If a node dies, the pods scheduled to that node are
scheduled for deletion, after a timeout period. A given pod (as defined by a UID) is not
"rescheduled" to a new node; instead, it can be replaced by an identical pod,
with even the same name if desired, but with a new UID (see [replication
controller](/docs/user-guide/replication-controller/) for more details). (In the future, a
higher-level API may support pod migration.)
When something is said to have the same lifetime as a pod, such as a volume,
that means that it exists as long as that pod (with that UID) exists. If that
pod is deleted for any reason, even if an identical replacement is created, the
related thing (e.g. volume) is also destroyed and created anew.
![pod diagram](/images/docs/pod.svg){: style="max-width: 50%" }
*A multi-container pod that contains a file puller and a
web server that uses a persistent volume for shared storage between the containers.*
## Motivation for pods
### Management
Pods are a model of the pattern of multiple cooperating processes which form a
cohesive unit of service. They simplify application deployment and management
by providing a higher-level abstraction than the set of their constituent
applications. Pods serve as the unit of deployment, horizontal scaling, and
replication. Colocation (co-scheduling), shared fate (e.g. termination),
coordinated replication, resource sharing, and dependency management are
handled automatically for containers in a pod.
### Resource sharing and communication
Pods enable data sharing and communication among their constituents.
The applications in a pod all use the same network namespace (same IP and port
space), and can thus "find" each other and communicate using `localhost`.
Because of this, applications in a pod must coordinate their usage of ports.
Each pod has an IP address in a flat shared networking space that has full
communication with other physical computers and pods across the network.
The hostname is set to the pod's Name for the application containers within the
pod. [More details on networking](/docs/admin/networking/).
In addition to defining the application containers that run in the pod, the pod
specifies a set of shared storage volumes. Volumes enable data to survive
container restarts and to be shared among the applications within the pod.
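As a sketch (the names, images, and command are illustrative), a pod whose containers share an `emptyDir` volume might look like this:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: two-containers
spec:
  volumes:
    - name: shared-data
      emptyDir: {}                     # shared scratch space that lives as long as the pod
  containers:
    - name: web
      image: nginx
      volumeMounts:
        - name: shared-data
          mountPath: /usr/share/nginx/html
    - name: content-writer
      image: busybox
      command: ["sh", "-c", "echo hello from the pod > /pod-data/index.html && sleep 3600"]
      volumeMounts:
        - name: shared-data
          mountPath: /pod-data
```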
## Uses of pods
Pods can be used to host vertically integrated application stacks (e.g. LAMP),
but their primary motivation is to support co-located, co-managed helper
programs, such as:
* content management systems, file and data loaders, local cache managers, etc.
* log and checkpoint backup, compression, rotation, snapshotting, etc.
* data change watchers, log tailers, logging and monitoring adapters, event publishers, etc.
* proxies, bridges, and adapters
* controllers, managers, configurators, and updaters
Individual pods are not intended to run multiple instances of the same
application, in general.
For a longer explanation, see [The Distributed System ToolKit: Patterns for
Composite
Containers](http://blog.kubernetes.io/2015/06/the-distributed-system-toolkit-patterns.html).
## Alternatives considered
_Why not just run multiple programs in a single (Docker) container?_
1. Transparency. Making the containers within the pod visible to the
infrastructure enables the infrastructure to provide services to those
containers, such as process management and resource monitoring. This
facilitates a number of conveniences for users.
2. Decoupling software dependencies. The individual containers may be
versioned, rebuilt and redeployed independently. Kubernetes may even support
live updates of individual containers someday.
3. Ease of use. Users don't need to run their own process managers, worry about
signal and exit-code propagation, etc.
4. Efficiency. Because the infrastructure takes on more responsibility,
containers can be lighter weight.
_Why not support affinity-based co-scheduling of containers?_
That approach would provide co-location, but would not provide most of the
benefits of pods, such as resource sharing, IPC, guaranteed fate sharing, and
simplified management.
## Durability of pods (or lack thereof)
Pods aren't intended to be treated as durable entities. They won't survive scheduling failures, node failures, or other evictions, such as due to lack of resources, or in the case of node maintenance.
In general, users shouldn't need to create pods directly. They should almost always use controllers (e.g., [Deployments](/docs/user-guide/deployments/)), even for singletons. Controllers provide self-healing with a cluster scope, as well as replication and rollout management.
The use of collective APIs as the primary user-facing primitive is relatively common among cluster scheduling systems, including [Borg](https://research.google.com/pubs/pub43438.html), [Marathon](https://mesosphere.github.io/marathon/docs/rest-api.html), [Aurora](http://aurora.apache.org/documentation/latest/configuration-reference/#job-schema), and [Tupperware](http://www.slideshare.net/Docker/aravindnarayanan-facebook140613153626phpapp02-37588997).
Pod is exposed as a primitive in order to facilitate:
* scheduler and controller pluggability
* support for pod-level operations without the need to "proxy" them via controller APIs
* decoupling of pod lifetime from controller lifetime, such as for bootstrapping
* decoupling of controllers and services &mdash; the endpoint controller just watches pods
* clean composition of Kubelet-level functionality with cluster-level functionality &mdash; Kubelet is effectively the "pod controller"
* high-availability applications, which will expect pods to be replaced in advance of their termination and certainly in advance of deletion, such as in the case of planned evictions, image prefetching, or live pod migration [#3949](http://issue.k8s.io/3949)
There is new first-class support for stateful pods with the [StatefulSet](/docs/concepts/abstractions/controllers/statefulsets/) controller (currently in beta). The feature was alpha in 1.4 and was called [PetSet](/docs/user-guide/petset/). For prior versions of Kubernetes, best practice for having stateful pods is to create a replication controller with `replicas` equal to `1` and a corresponding service, see [this MySQL deployment example](/docs/tutorials/stateful-application/run-stateful-application/).
## Termination of Pods
Because pods represent running processes on nodes in the cluster, it is important to allow those processes to gracefully terminate when they are no longer needed (vs being violently killed with a KILL signal and having no chance to clean up). Users should be able to request deletion and know when processes terminate, but also be able to ensure that deletes eventually complete. When a user requests deletion of a pod the system records the intended grace period before the pod is allowed to be forcefully killed, and a TERM signal is sent to the main process in each container. Once the grace period has expired the KILL signal is sent to those processes and the pod is then deleted from the API server. If the Kubelet or the container manager is restarted while waiting for processes to terminate, the termination will be retried with the full grace period.
An example flow:
1. User sends command to delete Pod, with default grace period (30s)
2. The Pod in the API server is updated with the time beyond which the Pod is considered "dead" along with the grace period.
3. Pod shows up as "Terminating" when listed in client commands
4. (simultaneous with 3) When the Kubelet sees that a Pod has been marked as terminating because the time in 2 has been set, it begins the pod shutdown process.
1. If the pod has defined a [preStop hook](/docs/concepts/containers/container-lifecycle-hooks/#hook-details), it is invoked inside of the pod. If the `preStop` hook is still running after the grace period expires, step 2 is then invoked with a small (2 second) extended grace period.
2. The processes in the Pod are sent the TERM signal.
5. (simultaneous with 3), Pod is removed from endpoints list for service, and are no longer considered part of the set of running pods for replication controllers. Pods that shutdown slowly can continue to serve traffic as load balancers (like the service proxy) remove them from their rotations.
6. When the grace period expires, any processes still running in the Pod are killed with SIGKILL.
7. The Kubelet will finish deleting the Pod on the API server by setting grace period 0 (immediate deletion). The Pod disappears from the API and is no longer visible from the client.
By default, all deletes are graceful within 30 seconds. The `kubectl delete` command supports the `--grace-period=<seconds>` option which allows a user to override the default and specify their own value. The value `0` [force deletes](/docs/user-guide/pods/#force-termination-of-pods) the pod. In kubectl version >= 1.5, you must specify an additional flag `--force` along with `--grace-period=0` in order to perform force deletions.
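A minimal sketch of a pod that lengthens its grace period and runs a `preStop` hook before the TERM signal is sent; the image and command are illustrative:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: graceful-shutdown-example
spec:
  terminationGracePeriodSeconds: 60    # overrides the 30 second default
  containers:
    - name: app
      image: nginx
      lifecycle:
        preStop:
          exec:
            command: ["/usr/sbin/nginx", "-s", "quit"]   # runs inside the container before TERM is sent
```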
### Force deletion of pods
Force deletion of a pod is defined as deletion of a pod from the cluster state and etcd immediately. When a force deletion is performed, the apiserver does not wait for confirmation from the kubelet that the pod has been terminated on the node it was running on. It removes the pod in the API immediately so a new pod can be created with the same name. On the node, pods that are set to terminate immediately will still be given a small grace period before being force killed.
Force deletions can be potentially dangerous for some pods and should be performed with caution. In case of StatefulSet pods, please refer to the task documentation for [deleting Pods from a StatefulSet](/docs/tasks/manage-stateful-set/delete-pods/#deleting-pods).
## Privileged mode for pod containers
From Kubernetes v1.1, any container in a pod can enable privileged mode, using the `privileged` flag on the `SecurityContext` of the container spec. This is useful for containers that want to use linux capabilities like manipulating the network stack and accessing devices. Processes within the container get almost the same privileges that are available to processes outside a container. With privileged mode, it should be easier to write network and volume plugins as separate pods that don't need to be compiled into the kubelet.
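For example, a single container in a pod spec can request privileged mode like this (the container name, image, and command are illustrative):

```yaml
containers:
  - name: privileged-net-tool
    image: busybox
    command: ["sleep", "3600"]
    securityContext:
      privileged: true   # grants the container near-host-level privileges
```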
If the master is running Kubernetes v1.1 or higher, and the nodes are running a version lower than v1.1, then new privileged pods will be accepted by the API server, but will not be launched. They will remain in the pending state.
If a user calls `kubectl describe pod FooPodName`, they can see the reason why the pod is in the pending state. The events table in the describe command output will say:
`Error validating pod "FooPodName"."FooPodNamespace" from api, ignoring: spec.containers[0].securityContext.privileged: forbidden '<*>(0xc2089d3248)true'`
If the master is running a version lower than v1.1, then privileged pods cannot be created. If a user attempts to create a pod that has a privileged container, they will get the following error:
`The Pod "FooPodName" is invalid.
spec.containers[0].securityContext.privileged: forbidden '<*>(0xc20b222db0)true'`
## API Object
Pod is a top-level resource in the Kubernetes REST API. More details about the
API object can be found at: [Pod API
object](/docs/api-reference/v1.6/#pod-v1-core).
[Pods](/docs/concepts/workloads/pods/pod/)

View File

@ -6,97 +6,6 @@ assignees:
title: Replica Sets
---
* TOC
{:toc}
{% include user-guide-content-moved.md %}
## What is a ReplicaSet?
ReplicaSet is the next-generation Replication Controller. The only difference
between a _ReplicaSet_ and a
[_Replication Controller_](/docs/user-guide/replication-controller/) right now is
the selector support. ReplicaSet supports the new set-based selector requirements
as described in the [labels user guide](/docs/user-guide/labels/#label-selectors)
whereas a Replication Controller only supports equality-based selector requirements.
Most [`kubectl`](/docs/user-guide/kubectl/) commands that support
Replication Controllers also support ReplicaSets. One exception is the
[`rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update/) command. If
you want the rolling update functionality please consider using Deployments
instead. Also, the
[`rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update/) command is
imperative whereas Deployments are declarative, so we recommend using Deployments
through the [`rollout`](/docs/user-guide/kubectl/kubectl_rollout/) command.
While ReplicaSets can be used independently, today they are mainly used by
[Deployments](/docs/user-guide/deployments/) as a mechanism to orchestrate pod
creation, deletion and updates. When you use Deployments you don't have to worry
about managing the ReplicaSets that they create. Deployments own and manage
their ReplicaSets.
## When to use a ReplicaSet?
A ReplicaSet ensures that a specified number of pod “replicas” are running at any given
time. However, a Deployment is a higher-level concept that manages ReplicaSets and
provides declarative updates to pods along with a lot of other useful features.
Therefore, we recommend using Deployments instead of directly using ReplicaSets, unless
you require custom update orchestration or don't require updates at all.
This actually means that you may never need to manipulate ReplicaSet objects:
use a Deployment directly and define your application in the spec section.
## Example
{% include code.html language="yaml" file="replicasets/frontend.yaml" ghlink="/docs/user-guide/replicasets/frontend.yaml" %}
Saving this config into `frontend.yaml` and submitting it to a Kubernetes cluster should
create the defined ReplicaSet and the pods that it manages.
```shell
$ kubectl create -f frontend.yaml
replicaset "frontend" created
$ kubectl describe rs/frontend
Name: frontend
Namespace: default
Image(s): gcr.io/google_samples/gb-frontend:v3
Selector: tier=frontend,tier in (frontend)
Labels: app=guestbook,tier=frontend
Replicas: 3 current / 3 desired
Pods Status: 3 Running / 0 Waiting / 0 Succeeded / 0 Failed
No volumes.
Events:
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
--------- -------- ----- ---- ------------- -------- ------ -------
1m 1m 1 {replicaset-controller } Normal SuccessfulCreate Created pod: frontend-qhloh
1m 1m 1 {replicaset-controller } Normal SuccessfulCreate Created pod: frontend-dnjpy
1m 1m 1 {replicaset-controller } Normal SuccessfulCreate Created pod: frontend-9si5l
$ kubectl get pods
NAME READY STATUS RESTARTS AGE
frontend-9si5l 1/1 Running 0 1m
frontend-dnjpy 1/1 Running 0 1m
frontend-qhloh 1/1 Running 0 1m
```
## ReplicaSet as a Horizontal Pod Autoscaler target
A ReplicaSet can also be a target for
[Horizontal Pod Autoscalers (HPA)](/docs/user-guide/horizontal-pod-autoscaling/),
i.e. a ReplicaSet can be auto-scaled by an HPA. Here is an example HPA targeting
the ReplicaSet we created in the previous example.
{% include code.html language="yaml" file="replicasets/hpa-rs.yaml" ghlink="/docs/user-guide/replicasets/hpa-rs.yaml" %}
Saving this config into `hpa-rs.yaml` and submitting it to a Kubernetes cluster should
create the defined HPA that autoscales the target ReplicaSet depending on the CPU usage
of the replicated pods.
```shell
kubectl create -f hpa-rs.yaml
```
Alternatively, you can just use the `kubectl autoscale` command to accomplish the same
(and it's easier!)
```shell
kubectl autoscale rs frontend --max=10
```
[ReplicaSets](/docs/concepts/workloads/controllers/replicaset/)

View File

@ -5,257 +5,6 @@ assignees:
title: Replication Controller
---
* TOC
{:toc}
{% include user-guide-content-moved.md %}
## What is a ReplicationController?
A _ReplicationController_ ensures that a specified number of pod "replicas" are running at any one
time. In other words, a ReplicationController makes sure that a pod or homogeneous set of pods are
always up and available.
If there are too many pods, it will kill some. If there are too few, the
ReplicationController will start more. Unlike manually created pods, the pods maintained by a
ReplicationController are automatically replaced if they fail, get deleted, or are terminated.
For example, your pods get re-created on a node after disruptive maintenance such as a kernel upgrade.
For this reason, we recommend that you use a ReplicationController even if your application requires
only a single pod. You can think of a ReplicationController as something similar to a process supervisor,
but rather than individual processes on a single node, the ReplicationController supervises multiple pods
across multiple nodes.
ReplicationController is often abbreviated to "rc" or "rcs" in discussion, and as a shortcut in
kubectl commands.
A simple case is to create 1 ReplicationController object in order to reliably run one instance of
a Pod indefinitely. A more complex use case is to run several identical replicas of a replicated
service, such as web servers.
## Running an example ReplicationController
Here is an example ReplicationController config. It runs 3 copies of the nginx web server.
{% include code.html language="yaml" file="replication.yaml" ghlink="/docs/user-guide/replication.yaml" %}
Run the example by downloading the example file and then running this command:
```shell
$ kubectl create -f ./replication.yaml
replicationcontroller "nginx" created
```
Check on the status of the ReplicationController using this command:
```shell
$ kubectl describe replicationcontrollers/nginx
Name: nginx
Namespace: default
Image(s): nginx
Selector: app=nginx
Labels: app=nginx
Replicas: 3 current / 3 desired
Pods Status: 0 Running / 3 Waiting / 0 Succeeded / 0 Failed
Events:
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
--------- -------- ----- ---- ------------- ---- ------ -------
20s 20s 1 {replication-controller } Normal SuccessfulCreate Created pod: nginx-qrm3m
20s 20s 1 {replication-controller } Normal SuccessfulCreate Created pod: nginx-3ntk0
20s 20s 1 {replication-controller } Normal SuccessfulCreate Created pod: nginx-4ok8v
```
Here, 3 pods have been made, but none are running yet, perhaps because the image is being pulled.
A little later, the same command may show:
```shell
Pods Status: 3 Running / 0 Waiting / 0 Succeeded / 0 Failed
```
To list all the pods that belong to the rc in a machine readable form, you can use a command like this:
```shell
$ pods=$(kubectl get pods --selector=app=nginx --output=jsonpath={.items..metadata.name})
echo $pods
nginx-3ntk0 nginx-4ok8v nginx-qrm3m
```
Here, the selector is the same as the selector for the ReplicationController (seen in the
`kubectl describe` output, and in a different form in `replication.yaml`). The `--output=jsonpath` option
specifies an expression that just gets the name from each pod in the returned list.
## Writing a ReplicationController Spec
As with all other Kubernetes config, a ReplicationController needs `apiVersion`, `kind`, and `metadata` fields. For
general information about working with config files, see [here](/docs/user-guide/simple-yaml/),
[here](/docs/user-guide/configuring-containers/), and [here](/docs/user-guide/working-with-resources/).
A ReplicationController also needs a [`.spec` section](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status).
### Pod Template
The `.spec.template` is the only required field of the `.spec`.
The `.spec.template` is a [pod template](#pod-template). It has exactly
the same schema as a [pod](/docs/user-guide/pods/), except it is nested and does not have an `apiVersion` or
`kind`.
In addition to required fields for a Pod, a pod template in a ReplicationController must specify appropriate
labels (i.e. don't overlap with other controllers, see [pod selector](#pod-selector)) and an appropriate restart policy.
Only a [`.spec.template.spec.restartPolicy`](/docs/user-guide/pod-states/) equal to `Always` is allowed, which is the default
if not specified.
For local container restarts, ReplicationControllers delegate to an agent on the node,
for example the [Kubelet](/docs/admin/kubelet/) or Docker.
### Labels on the ReplicationController
The ReplicationController can itself have labels (`.metadata.labels`). Typically, you
would set these the same as the `.spec.template.metadata.labels`; if `.metadata.labels` is not specified
then it is defaulted to `.spec.template.metadata.labels`. However, they are allowed to be
different, and the `.metadata.labels` do not affect the behavior of the ReplicationController.
### Pod Selector
The `.spec.selector` field is a [label selector](/docs/user-guide/labels/#label-selectors). A replication
controller manages all the pods with labels which match the selector. It does not distinguish
between pods which it created or deleted versus pods which some other person or process created or
deleted. This allows the ReplicationController to be replaced without affecting the running pods.
If specified, the `.spec.template.metadata.labels` must be equal to the `.spec.selector`, or it will
be rejected by the API. If `.spec.selector` is unspecified, it will be defaulted to
`.spec.template.metadata.labels`.
Also you should not normally create any pods whose labels match this selector, either directly, via
another ReplicationController or via another controller such as Job. Otherwise, the
ReplicationController will think that those pods were created by it. Kubernetes will not stop you
from doing this.
If you do end up with multiple controllers that have overlapping selectors, you
will have to manage the deletion yourself (see [below](#updating-a-replication-controller)).
### Multiple Replicas
You can specify how many pods should run concurrently by setting `.spec.replicas` to the number
of pods you would like to have running concurrently. The number running at any time may be higher
or lower, such as if the replica count was just increased or decreased, or if a pod is gracefully
shut down and a replacement starts early.
If you do not specify `.spec.replicas`, then it defaults to 1.
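Putting the preceding fields together, the relevant portion of a ReplicationController `.spec` might look like the sketch below; the names and labels are illustrative:

```yaml
spec:
  replicas: 3
  selector:
    app: nginx                # must equal .spec.template.metadata.labels
  template:
    metadata:
      labels:
        app: nginx
    spec:
      restartPolicy: Always   # the only allowed value, and the default
      containers:
        - name: nginx
          image: nginx
```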
## Working with ReplicationControllers
### Deleting a ReplicationController and its Pods
To delete a ReplicationController and all its pods, use [`kubectl
delete`](/docs/user-guide/kubectl/kubectl_delete/). Kubectl will scale the ReplicationController to zero and wait
for it to delete each pod before deleting the ReplicationController itself. If this kubectl
command is interrupted, it can be restarted.
When using the REST API or go client library, you need to do the steps explicitly (scale replicas to
0, wait for pod deletions, then delete the ReplicationController).
### Deleting just a ReplicationController
You can delete a ReplicationController without affecting any of its pods.
Using kubectl, specify the `--cascade=false` option to [`kubectl delete`](/docs/user-guide/kubectl/kubectl_delete/).
When using the REST API or go client library, simply delete the ReplicationController object.
Once the original is deleted, you can create a new ReplicationController to replace it. As long
as the old and new `.spec.selector` are the same, then the new one will adopt the old pods.
However, it will not make any effort to make existing pods match a new, different pod template.
To update pods to a new spec in a controlled way, use a [rolling update](#rolling-updates).
### Isolating pods from a ReplicationController
Pods may be removed from a ReplicationController's target set by changing their labels. This technique may be used to remove pods from service for debugging, data recovery, etc. Pods that are removed in this way will be replaced automatically (assuming that the number of replicas is not also changed).
## Common usage patterns
### Rescheduling
As mentioned above, whether you have 1 pod you want to keep running, or 1000, a ReplicationController will ensure that the specified number of pods exists, even in the event of node failure or pod termination (e.g., due to an action by another control agent).
### Scaling
The ReplicationController makes it easy to scale the number of replicas up or down, either manually or by an auto-scaling control agent, by simply updating the `replicas` field.
### Rolling updates
The ReplicationController is designed to facilitate rolling updates to a service by replacing pods one-by-one.
As explained in [#1353](http://issue.k8s.io/1353), the recommended approach is to create a new ReplicationController with 1 replica, scale the new (+1) and old (-1) controllers one by one, and then delete the old controller after it reaches 0 replicas. This predictably updates the set of pods regardless of unexpected failures.
Ideally, the rolling update controller would take application readiness into account, and would ensure that a sufficient number of pods were productively serving at any given time.
The two ReplicationControllers would need to create pods with at least one differentiating label, such as the image tag of the primary container of the pod, since it is typically image updates that motivate rolling updates.
Rolling update is implemented in the client tool
[`kubectl rolling-update`](/docs/user-guide/kubectl/kubectl_rolling-update). Visit [`kubectl rolling-update` task](/docs/tasks/run-application/rolling-update-replication-controller/) for more concrete examples.
### Multiple release tracks
In addition to running multiple releases of an application while a rolling update is in progress, it's common to run multiple releases for an extended period of time, or even continuously, using multiple release tracks. The tracks would be differentiated by labels.
For instance, a service might target all pods with `tier in (frontend), environment in (prod)`. Now say you have 10 replicated pods that make up this tier. But you want to be able to 'canary' a new version of this component. You could set up a ReplicationController with `replicas` set to 9 for the bulk of the replicas, with labels `tier=frontend, environment=prod, track=stable`, and another ReplicationController with `replicas` set to 1 for the canary, with labels `tier=frontend, environment=prod, track=canary`. Now the service is covering both the canary and non-canary pods. But you can mess with the ReplicationControllers separately to test things out, monitor the results, etc.
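As a sketch, the canary controller for this setup might look like the following; the names are illustrative, and the stable controller would be identical apart from its name, `replicas: 9`, the `track: stable` label, and the image of the current release:

```yaml
apiVersion: v1
kind: ReplicationController
metadata:
  name: frontend-canary        # illustrative name
spec:
  replicas: 1                  # a single canary replica
  selector:
    tier: frontend
    environment: prod
    track: canary
  template:
    metadata:
      labels:
        tier: frontend
        environment: prod
        track: canary
    spec:
      containers:
        - name: frontend
          image: gcr.io/google_samples/gb-frontend:v3   # the new version under test
```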
### Using ReplicationControllers with Services
Multiple ReplicationControllers can sit behind a single service, so that, for example, some traffic
goes to the old version, and some goes to the new version.
A ReplicationController will never terminate on its own, but it isn't expected to be as long-lived as services. Services may be composed of pods controlled by multiple ReplicationControllers, and it is expected that many ReplicationControllers may be created and destroyed over the lifetime of a service (for instance, to perform an update of pods that run the service). Both services themselves and their clients should remain oblivious to the ReplicationControllers that maintain the pods of the services.
## Writing programs for Replication
Pods created by a ReplicationController are intended to be fungible and semantically identical, though their configurations may become heterogeneous over time. This is an obvious fit for replicated stateless servers, but ReplicationControllers can also be used to maintain availability of master-elected, sharded, and worker-pool applications. Such applications should use dynamic work assignment mechanisms, such as the [etcd lock module](https://coreos.com/docs/distributed-configuration/etcd-modules/) or [RabbitMQ work queues](https://www.rabbitmq.com/tutorials/tutorial-two-python.html), as opposed to static/one-time customization of the configuration of each pod, which is considered an anti-pattern. Any pod customization performed, such as vertical auto-sizing of resources (e.g., cpu or memory), should be performed by another online controller process, not unlike the ReplicationController itself.
## Responsibilities of the ReplicationController
The ReplicationController simply ensures that the desired number of pods matches its label selector and are operational. Currently, only terminated pods are excluded from its count. In the future, [readiness](http://issue.k8s.io/620) and other information available from the system may be taken into account, we may add more controls over the replacement policy, and we plan to emit events that could be used by external clients to implement arbitrarily sophisticated replacement and/or scale-down policies.
The ReplicationController is forever constrained to this narrow responsibility. It itself will not perform readiness nor liveness probes. Rather than performing auto-scaling, it is intended to be controlled by an external auto-scaler (as discussed in [#492](http://issue.k8s.io/492)), which would change its `replicas` field. We will not add scheduling policies (e.g., [spreading](http://issue.k8s.io/367#issuecomment-48428019)) to the ReplicationController. Nor should it verify that the pods controlled match the currently specified template, as that would obstruct auto-sizing and other automated processes. Similarly, completion deadlines, ordering dependencies, configuration expansion, and other features belong elsewhere. We even plan to factor out the mechanism for bulk pod creation ([#170](http://issue.k8s.io/170)).
The ReplicationController is intended to be a composable building-block primitive. We expect higher-level APIs and/or tools to be built on top of it and other complementary primitives for user convenience in the future. The "macro" operations currently supported by kubectl (run, stop, scale, rolling-update) are proof-of-concept examples of this. For instance, we could imagine something like [Asgard](http://techblog.netflix.com/2012/06/asgard-web-based-cloud-management-and.html) managing ReplicationControllers, auto-scalers, services, scheduling policies, canaries, etc.
## API Object
Replication controller is a top-level resource in the Kubernetes REST API. More details about the
API object can be found at: [ReplicationController API
object](/docs/api-reference/v1.6/#replicationcontroller-v1-core).
## Alternatives to ReplicationController
### ReplicaSet
[`ReplicaSet`](/docs/user-guide/replicasets/) is the next-generation ReplicationController that supports the new [set-based label selector](/docs/user-guide/labels/#set-based-requirement).
It's mainly used by [`Deployment`](/docs/user-guide/deployments/) as a mechanism to orchestrate pod creation, deletion and updates.
Note that we recommend using Deployments instead of directly using Replica Sets, unless you require custom update orchestration or don't require updates at all.
### Deployment (Recommended)
[`Deployment`](/docs/user-guide/deployments/) is a higher-level API object that updates its underlying Replica Sets and their Pods
in a similar fashion as `kubectl rolling-update`. Deployments are recommended if you want this rolling update functionality,
because unlike `kubectl rolling-update`, they are declarative, server-side, and have additional features.
### Bare Pods
Unlike in the case where a user directly created pods, a ReplicationController replaces pods that are deleted or terminated for any reason, such as in the case of node failure or disruptive node maintenance, such as a kernel upgrade. For this reason, we recommend that you use a ReplicationController even if your application requires only a single pod. Think of it similarly to a process supervisor, only it supervises multiple pods across multiple nodes instead of individual processes on a single node. A ReplicationController delegates local container restarts to some agent on the node (e.g., Kubelet or Docker).
### Job
Use a [`Job`](/docs/concepts/jobs/run-to-completion-finite-workloads/) instead of a ReplicationController for pods that are expected to terminate on their own
(i.e. batch jobs).
### DaemonSet
Use a [`DaemonSet`](/docs/admin/daemons/) instead of a ReplicationController for pods that provide a
machine-level function, such as machine monitoring or machine logging. These pods have a lifetime that is tied
to a machine lifetime: the pod needs to be running on the machine before other pods start, and is
safe to terminate when the machine is otherwise ready to be rebooted or shut down.
## For more information
Read [Run Stateless AP Replication Controller](/docs/tutorials/stateless-application/run-stateless-ap-replication-controller/).
[ReplicationControllers](/docs/concepts/workloads/controllers/replicationcontroller/)

View File

@ -4,792 +4,6 @@ assignees:
title: Secrets
---
Objects of type `secret` are intended to hold sensitive information, such as
passwords, OAuth tokens, and ssh keys. Putting this information in a `secret`
is safer and more flexible than putting it verbatim in a `pod` definition or in
a docker image. See [Secrets design document](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/secrets.md) for more information.
{% include user-guide-content-moved.md %}
* TOC
{:toc}
## Overview of Secrets
A Secret is an object that contains a small amount of sensitive data such as
a password, a token, or a key. Such information might otherwise be put in a
Pod specification or in an image; putting it in a Secret object allows for
more control over how it is used, and reduces the risk of accidental exposure.
Users can create secrets, and the system also creates some secrets.
To use a secret, a pod needs to reference the secret.
A secret can be used with a pod in two ways: as files in a [volume](/docs/concepts/storage/volumes/) mounted on one or more of
its containers, or used by kubelet when pulling images for the pod.
### Built-in Secrets
#### Service Accounts Automatically Create and Attach Secrets with API Credentials
Kubernetes automatically creates secrets which contain credentials for
accessing the API and it automatically modifies your pods to use this type of
secret.
The automatic creation and use of API credentials can be disabled or overridden
if desired. However, if all you need to do is securely access the apiserver,
this is the recommended workflow.
See the [Service Account](/docs/user-guide/service-accounts) documentation for more
information on how Service Accounts work.
### Creating your own Secrets
#### Creating a Secret Using kubectl create secret
Say that some pods need to access a database. The
username and password that the pods should use are in the files
`./username.txt` and `./password.txt` on your local machine.
```shell
# Create files needed for rest of example.
$ echo -n "admin" > ./username.txt
$ echo -n "1f2d1e2e67df" > ./password.txt
```
The `kubectl create secret` command
packages these files into a Secret and creates
the object on the API server.
```shell
$ kubectl create secret generic db-user-pass --from-file=./username.txt --from-file=./password.txt
secret "db-user-pass" created
```
You can check that the secret was created like this:
```shell
$ kubectl get secrets
NAME TYPE DATA AGE
db-user-pass Opaque 2 51s
$ kubectl describe secrets/db-user-pass
Name: db-user-pass
Namespace: default
Labels: <none>
Annotations: <none>
Type: Opaque
Data
====
password.txt: 12 bytes
username.txt: 5 bytes
```
Note that neither `get` nor `describe` shows the contents of the file by default.
This is to protect the secret from being exposed accidentally to an onlooker,
or from being stored in a terminal log.
See [decoding a secret](#decoding-a-secret) for how to see the contents.
#### Creating a Secret Manually
You can also create a secret object in a file first,
in JSON or YAML format, and then create that object.
Each item must be base64 encoded:
```shell
$ echo -n "admin" | base64
YWRtaW4=
$ echo -n "1f2d1e2e67df" | base64
MWYyZDFlMmU2N2Rm
```
Now write a secret object that looks like this:
```yaml
apiVersion: v1
kind: Secret
metadata:
name: mysecret
type: Opaque
data:
username: YWRtaW4=
password: MWYyZDFlMmU2N2Rm
```
The data field is a map. Its keys must match
[`DNS_SUBDOMAIN`](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/design/identifiers.md), except that leading dots are also
allowed. The values are arbitrary data, encoded using base64.
Create the secret using [`kubectl create`](/docs/user-guide/kubectl/kubectl_create/):
```shell
$ kubectl create -f ./secret.yaml
secret "mysecret" created
```
**Encoding Note:** The serialized JSON and YAML values of secret data are
encoded as base64 strings. Newlines are not valid within these strings and must
be omitted. When using the `base64` utility on Darwin/OS X, users should avoid
using the `-b` option to split long lines. Conversely, Linux users *should* add
the option `-w 0` to `base64` commands.
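For example, a minimal sketch of producing an unwrapped encoding on each platform
(reusing the sample `admin` username from above):

```shell
# Linux: disable line wrapping explicitly.
$ echo -n "admin" | base64 -w 0
YWRtaW4=

# Darwin/OS X: output is a single line by default; just avoid the -b option.
$ echo -n "admin" | base64
YWRtaW4=
```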
#### Decoding a Secret
Get back the secret created in the previous section:
```shell
$ kubectl get secret mysecret -o yaml
apiVersion: v1
data:
username: YWRtaW4=
password: MWYyZDFlMmU2N2Rm
kind: Secret
metadata:
creationTimestamp: 2016-01-22T18:41:56Z
name: mysecret
namespace: default
resourceVersion: "164619"
selfLink: /api/v1/namespaces/default/secrets/mysecret
uid: cfee02d6-c137-11e5-8d73-42010af00002
type: Opaque
```
Decode the password field:
```shell
$ echo "MWYyZDFlMmU2N2Rm" | base64 --decode
1f2d1e2e67df
```
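Alternatively, a hedged one-liner that extracts and decodes a single key
(assuming the `mysecret` object created above):

```shell
$ kubectl get secret mysecret -o jsonpath='{.data.password}' | base64 --decode
1f2d1e2e67df
```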
### Using Secrets
Secrets can be mounted as data volumes or be exposed as environment variables to
be used by a container in a pod. They can also be used by other parts of the
system, without being directly exposed to the pod. For example, they can hold
credentials that other parts of the system should use to interact with external
systems on your behalf.
#### Using Secrets as Files from a Pod
To consume a Secret in a volume in a Pod:
1. Create a secret or use an existing one. Multiple pods can reference the same secret.
1. Modify your Pod definition to add a volume under `spec.volumes[]`. Name the volume anything, and have a `spec.volumes[].secret.secretName` field equal to the name of the secret object.
1. Add a `spec.containers[].volumeMounts[]` to each container that needs the secret. Specify `spec.containers[].volumeMounts[].readOnly = true` and `spec.containers[].volumeMounts[].mountPath` to an unused directory name where you would like the secrets to appear.
1. Modify your image and/or command line so that the program looks for files in that directory. Each key in the secret `data` map becomes the filename under `mountPath`.
This is an example of a pod that mounts a secret in a volume:
```json
{
"apiVersion": "v1",
"kind": "Pod",
"metadata": {
"name": "mypod",
"namespace": "myns"
},
"spec": {
"containers": [{
"name": "mypod",
"image": "redis",
"volumeMounts": [{
"name": "foo",
"mountPath": "/etc/foo",
"readOnly": true
}]
}],
"volumes": [{
"name": "foo",
"secret": {
"secretName": "mysecret"
}
}]
}
}
```
Each secret you want to use needs to be referred to in `spec.volumes`.
If there are multiple containers in the pod, then each container needs its
own `volumeMounts` block, but only one `spec.volumes` is needed per secret.
You can package many files into one secret, or use many secrets, whichever is convenient.
**Projection of secret keys to specific paths**
You can also control the paths within the volume where Secret keys are projected.
Use the `spec.volumes[].secret.items` field to change the target path of each key:
```json
{
"apiVersion": "v1",
"kind": "Pod",
"metadata": {
"name": "mypod",
"namespace": "myns"
},
"spec": {
"containers": [{
"name": "mypod",
"image": "redis",
"volumeMounts": [{
"name": "foo",
"mountPath": "/etc/foo",
"readOnly": true
}]
}],
"volumes": [{
"name": "foo",
"secret": {
"secretName": "mysecret",
"items": [{
"key": "username",
"path": "my-group/my-username"
}]
}
}]
}
}
```
What will happen:
* The `username` key is stored in the file `/etc/foo/my-group/my-username` instead of `/etc/foo/username`.
* The `password` key is not projected.
If `spec.volumes[].secret.items` is used, only keys specified in `items` are projected.
To consume all keys from the secret, all of them must be listed in the `items` field.
All listed keys must exist in the corresponding secret. Otherwise, the volume is not created.
**Secret files permissions**
You can also specify the permission mode bits that the files holding the secret data will have.
If you don't specify any, `0644` is used by default. You can specify a default
mode for the whole secret volume and override per key if needed.
For example, you can specify a default mode like this:
```json
{
"apiVersion": "v1",
"kind": "Pod",
"metadata": {
"name": "mypod",
"namespace": "myns"
},
"spec": {
"containers": [{
"name": "mypod",
"image": "redis",
"volumeMounts": [{
"name": "foo",
"mountPath": "/etc/foo"
}]
}],
"volumes": [{
"name": "foo",
"secret": {
"secretName": "mysecret",
"defaultMode": 256
}
}]
}
}
```
Then, the secret will be mounted on `/etc/foo` and all the files created by the
secret volume mount will have permission `0400`.
Note that the JSON spec doesn't support octal notation, so use the value 256 for
0400 permissions. If you use YAML instead of JSON for the pod, you can use octal
notation to specify permissions in a more natural way.
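As a sketch of that (the pod name `secret-mode-pod` is made up here; everything else
mirrors the JSON example above), the same default mode can be written in octal when
the pod is defined in YAML:

```shell
$ cat <<EOF | kubectl create -f -
apiVersion: v1
kind: Pod
metadata:
  name: secret-mode-pod
spec:
  containers:
  - name: mypod
    image: redis
    volumeMounts:
    - name: foo
      mountPath: /etc/foo
      readOnly: true
  volumes:
  - name: foo
    secret:
      secretName: mysecret
      defaultMode: 0400   # octal is accepted in YAML; equivalent to 256 in JSON
EOF
```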
You can also use mapping, as in the projection example above, and specify different
permissions for different files like this:
```json
{
"apiVersion": "v1",
"kind": "Pod",
"metadata": {
"name": "mypod",
"namespace": "myns"
},
"spec": {
"containers": [{
"name": "mypod",
"image": "redis",
"volumeMounts": [{
"name": "foo",
"mountPath": "/etc/foo"
}]
}],
"volumes": [{
"name": "foo",
"secret": {
"secretName": "mysecret",
"items": [{
"key": "username",
"path": "my-group/my-username",
"mode": 511
}]
}
}]
}
}
```
In this case, the resulting file `/etc/foo/my-group/my-username` will have a
permission value of `0777`. Owing to JSON limitations, you must specify the mode
in decimal notation.
Note that this permission value might be displayed in decimal notation if you
read it later.
**Consuming Secret Values from Volumes**
Inside the container that mounts a secret volume, the secret keys appear as
files and the secret values are base-64 decoded and stored inside these files.
This is the result of commands
executed inside the container from the example above:
```shell
$ ls /etc/foo/
username
password
$ cat /etc/foo/username
admin
$ cat /etc/foo/password
1f2d1e2e67df
```
The program in a container is responsible for reading the secrets from the
files.
**Mounted Secrets are updated automatically**
When a secret that is already consumed in a volume is updated, the projected keys are eventually updated as well.
The kubelet checks whether the mounted secret is fresh on every periodic sync.
However, it uses its local TTL-based cache to get the current value of the secret.
As a result, the total delay from the moment the secret is updated to the moment new keys are
projected to the pod can be as long as the kubelet sync period plus the TTL of the secrets cache in the kubelet.
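A hedged way to watch this happen (it assumes the `mysecret` and `mypod` objects from the
volume example are still present; add namespace flags if your pod lives in `myns` as in
that example):

```shell
# Replace the secret with a new username value.
$ kubectl create secret generic mysecret --from-literal=username=admin2 \
    --from-literal=password=1f2d1e2e67df --dry-run -o yaml | kubectl replace -f -

# After the kubelet sync period plus the cache TTL, the projected file changes.
$ kubectl exec mypod -- cat /etc/foo/username
admin2
```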
#### Using Secrets as Environment Variables
To use a secret in an environment variable in a pod:
1. Create a secret or use an existing one. Multiple pods can reference the same secret.
1. Modify your Pod definition in each container that you wish to consume the value of a secret key to add an environment variable for each secret key you wish to consume. The environment variable that consumes the secret key should populate the secret's name and key in `env[x].valueFrom.secretKeyRef`.
1. Modify your image and/or command line so that the program looks for values in the specified environment variables.
This is an example of a pod that uses secrets from environment variables:
```yaml
apiVersion: v1
kind: Pod
metadata:
name: secret-env-pod
spec:
containers:
- name: mycontainer
image: redis
env:
- name: SECRET_USERNAME
valueFrom:
secretKeyRef:
name: mysecret
key: username
- name: SECRET_PASSWORD
valueFrom:
secretKeyRef:
name: mysecret
key: password
restartPolicy: Never
```
**Consuming Secret Values from Environment Variables**
Inside a container that consumes a secret in environment variables, the secret keys appear as
normal environment variables containing the base-64 decoded values of the secret data.
This is the result of commands executed inside the container from the example above:
```shell
$ echo $SECRET_USERNAME
admin
$ echo $SECRET_PASSWORD
1f2d1e2e67df
```
#### Using imagePullSecrets
An imagePullSecret is a way to pass a secret that contains a Docker (or other) image registry
password to the Kubelet so it can pull a private image on behalf of your Pod.
**Manually specifying an imagePullSecret**
Use of imagePullSecrets is described in the [images documentation](/docs/concepts/containers/images/#specifying-imagepullsecrets-on-a-pod).
**Arranging for imagePullSecrets to be Automatically Attached**
You can manually create an imagePullSecret, and reference it from
a serviceAccount. Any pods created with that serviceAccount,
or that default to use that serviceAccount, will get their imagePullSecrets
field set to that of the service account.
See [here](/docs/user-guide/service-accounts/#adding-imagepullsecrets-to-a-service-account)
for a detailed explanation of that process.
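A minimal sketch of that workflow (the registry address, credentials, and secret name
below are placeholders):

```shell
# Create a docker-registry secret with placeholder credentials.
$ kubectl create secret docker-registry myregistrykey \
    --docker-server=registry.example.com \
    --docker-username=janedoe \
    --docker-password=xxxxxxxx \
    --docker-email=jane@example.com

# Attach it to the default service account; pods that use this service
# account will have the secret added to their imagePullSecrets automatically.
$ kubectl patch serviceaccount default \
    -p '{"imagePullSecrets": [{"name": "myregistrykey"}]}'
```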
#### Automatic Mounting of Manually Created Secrets
We plan to extend the service account behavior so that manually created
secrets (e.g. one containing a token for accessing a github account)
can be automatically attached to pods based on their service account.
*This is not implemented yet. See [issue 9902](http://issue.k8s.io/9902).*
## Details
### Restrictions
Secret volume sources are validated to ensure that the specified object
reference actually points to an object of type `Secret`. Therefore, a secret
needs to be created before any pods that depend on it.
Secret API objects reside in a namespace. They can only be referenced by pods
in that same namespace.
Individual secrets are limited to 1MB in size. This is to discourage creation
of very large secrets which would exhaust apiserver and kubelet memory.
However, creation of many smaller secrets could also exhaust memory. More
comprehensive limits on memory usage due to secrets are a planned feature.
Kubelet only supports use of secrets for Pods it gets from the API server.
This includes any pods created using kubectl, or indirectly via a replication
controller. It does not include pods created via the kubelet's
`--manifest-url` flag, its `--config` flag, or its REST API (these are
not common ways to create pods).
### Secret and Pod Lifetime interaction
When a pod is created via the API, there is no check whether a referenced
secret exists. Once a pod is scheduled, the kubelet will try to fetch the
secret value. If the secret cannot be fetched because it does not exist or
because of a temporary lack of connection to the API server, kubelet will
periodically retry. It will report an event about the pod explaining the
reason it is not started yet. Once the secret is fetched, the kubelet will
create and mount a volume containing it. None of the pod's containers will
start until all the pod's volumes are mounted.
## Use cases
### Use-Case: Pod with ssh keys
Create a secret containing some ssh keys:
```shell
$ kubectl create secret generic ssh-key-secret --from-file=ssh-privatekey=/path/to/.ssh/id_rsa --from-file=ssh-publickey=/path/to/.ssh/id_rsa.pub
```
**Security Note:** Think carefully before sending your own ssh keys: other users of the cluster may have access to the secret. Use a service account that you are willing to make accessible to all the users with whom you share the Kubernetes cluster, and that you can revoke if the keys are compromised.
Now we can create a pod which references the secret with the ssh key and
consumes it in a volume:
```json
{
"kind": "Pod",
"apiVersion": "v1",
"metadata": {
"name": "secret-test-pod",
"labels": {
"name": "secret-test"
}
},
"spec": {
"volumes": [
{
"name": "secret-volume",
"secret": {
"secretName": "ssh-key-secret"
}
}
],
"containers": [
{
"name": "ssh-test-container",
"image": "mySshImage",
"volumeMounts": [
{
"name": "secret-volume",
"readOnly": true,
"mountPath": "/etc/secret-volume"
}
]
}
]
}
}
```
When the container's command runs, the pieces of the key will be available in:
```shell
/etc/secret-volume/ssh-publickey
/etc/secret-volume/ssh-privatekey
```
The container is then free to use the secret data to establish an ssh connection.
### Use-Case: Pods with prod / test credentials
This example illustrates a pod which consumes a secret containing prod
credentials and another pod which consumes a secret with test environment
credentials.
Make the secrets:
```shell
$ kubectl create secret generic prod-db-secret --from-literal=username=produser --from-literal=password=Y4nys7f11
secret "prod-db-secret" created
$ kubectl create secret generic test-db-secret --from-literal=username=testuser --from-literal=password=iluvtests
secret "test-db-secret" created
```
Now make the pods:
```json
{
"apiVersion": "v1",
"kind": "List",
"items":
[{
"kind": "Pod",
"apiVersion": "v1",
"metadata": {
"name": "prod-db-client-pod",
"labels": {
"name": "prod-db-client"
}
},
"spec": {
"volumes": [
{
"name": "secret-volume",
"secret": {
"secretName": "prod-db-secret"
}
}
],
"containers": [
{
"name": "db-client-container",
"image": "myClientImage",
"volumeMounts": [
{
"name": "secret-volume",
"readOnly": true,
"mountPath": "/etc/secret-volume"
}
]
}
]
}
},
{
"kind": "Pod",
"apiVersion": "v1",
"metadata": {
"name": "test-db-client-pod",
"labels": {
"name": "test-db-client"
}
},
"spec": {
"volumes": [
{
"name": "secret-volume",
"secret": {
"secretName": "test-db-secret"
}
}
],
"containers": [
{
"name": "db-client-container",
"image": "myClientImage",
"volumeMounts": [
{
"name": "secret-volume",
"readOnly": true,
"mountPath": "/etc/secret-volume"
}
]
}
]
}
}]
}
```
Both containers will have the following files present on their filesystems:
```shell
/etc/secret-volume/username
/etc/secret-volume/password
```
Note how the specs for the two pods differ only in one field; this facilitates
creating pods with different capabilities from a common pod config template.
You could further simplify the base pod specification by using two Service Accounts:
one called, say, `prod-db-client` with the `prod-db-secret`, and one called, say,
`test-db-client` with the `test-db-secret`. Then, the pod spec can be shortened to, for example:
```json
{
"kind": "Pod",
"apiVersion": "v1",
"metadata": {
"name": "prod-db-client-pod",
"labels": {
"name": "prod-db-client"
}
},
"spec": {
"serviceAccount": "prod-db-client",
"containers": [
{
"name": "db-client-container",
"image": "myClientImage"
}
]
}
}
```
### Use-case: Dotfiles in secret volume
In order to make a piece of data 'hidden' (i.e., in a file whose name begins with a dot character), simply
make that key begin with a dot. For example, when the following secret is mounted into a volume:
```json
{
"kind": "Secret",
"apiVersion": "v1",
"metadata": {
"name": "dotfile-secret"
},
"data": {
".secret-file": "dmFsdWUtMg0KDQo="
}
}
{
"kind": "Pod",
"apiVersion": "v1",
"metadata": {
"name": "secret-dotfiles-pod"
},
"spec": {
"volumes": [
{
"name": "secret-volume",
"secret": {
"secretName": "dotfile-secret"
}
}
],
"containers": [
{
"name": "dotfile-test-container",
"image": "gcr.io/google_containers/busybox",
"command": [ "ls", "-l", "/etc/secret-volume" ],
"volumeMounts": [
{
"name": "secret-volume",
"readOnly": true,
"mountPath": "/etc/secret-volume"
}
]
}
]
}
}
```
The `secret-volume` will contain a single file, called `.secret-file`, and
the `dotfile-test-container` will have this file present at the path
`/etc/secret-volume/.secret-file`.
**NOTE**
Files beginning with dot characters are hidden from the output of `ls -l`;
you must use `ls -la` to see them when listing directory contents.
### Use-case: Secret visible to one container in a pod
Consider a program that needs to handle HTTP requests, do some complex business
logic, and then sign some messages with an HMAC. Because it has complex
application logic, there might be an unnoticed remote file reading exploit in
the server, which could expose the private key to an attacker.
This could be divided into two processes in two containers: a frontend container
which handles user interaction and business logic, but which cannot see the
private key; and a signer container that can see the private key, and responds
to simple signing requests from the frontend (e.g. over localhost networking).
With this partitioned approach, an attacker now has to trick the application
server into doing something rather arbitrary, which may be harder than getting
it to read a file.
<!-- TODO: explain how to do this while still using automation. -->
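A hedged sketch of such a partition (the image names and the `hmac-key` Secret are
hypothetical): only the signer container mounts the secret volume, so the frontend
container never sees the private key.

```shell
$ cat <<EOF | kubectl create -f -
apiVersion: v1
kind: Pod
metadata:
  name: partitioned-signer-pod
spec:
  volumes:
  - name: key-volume
    secret:
      secretName: hmac-key
  containers:
  - name: frontend
    image: myFrontendImage
    # no volumeMounts here: the signing key is not visible to this container
  - name: signer
    image: mySignerImage
    volumeMounts:
    - name: key-volume
      readOnly: true
      mountPath: /etc/signing-key
EOF
```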
## Security Properties
### Protections
Because `secret` objects can be created independently of the `pods` that use
them, there is less risk of the secret being exposed during the workflow of
creating, viewing, and editing pods. The system can also take additional
precautions with `secret` objects, such as avoiding writing them to disk where
possible.
A secret is only sent to a node if a pod on that node requires it. It is not
written to disk; it is stored in a tmpfs and is deleted once the pod that
depends on it is deleted.
On most Kubernetes-project-maintained distributions, communication from the user
to the apiserver, and from the apiserver to the kubelets, is protected by SSL/TLS.
Secrets are protected when transmitted over these channels.
Secret data on nodes is stored in tmpfs volumes and thus does not come to rest
on the node.
There may be secrets for several pods on the same node. However, only the
secrets that a pod requests are potentially visible within its containers.
Therefore, one Pod does not have access to the secrets of another pod.
There may be several containers in a pod. However, each container in a pod has
to request the secret volume in its `volumeMounts` for it to be visible within
the container. This can be used to construct useful [security partitions at the
Pod level](#use-case-secret-visible-to-one-container-in-a-pod).
### Risks
- In the API server secret data is stored as plaintext in etcd; therefore:
- Administrators should limit access to etcd to admin users
- Secret data in the API server is at rest on the disk that etcd uses; admins may want to wipe/shred disks
used by etcd when no longer in use
- Applications still need to protect the value of the secret after reading it from the volume,
such as not accidentally logging it or transmitting it to an untrusted party.
- A user who can create a pod that uses a secret can also see the value of that secret. Even
if apiserver policy does not allow that user to read the secret object, the user could
run a pod which exposes the secret.
- If multiple replicas of etcd are run, then the secrets will be shared between them.
By default, etcd does not secure peer-to-peer communication with SSL/TLS, though this can be configured.
- Currently, anyone with root on any node can read any secret from the apiserver,
by impersonating the kubelet. It is a planned feature to only send secrets to
nodes that actually require them, to restrict the impact of a root exploit on a
single node.
[Secrets](/docs/concepts/configuration/secret/)

View File

@ -1,6 +1,18 @@
$( document ).ready(function() {
var oldURLs = ["/README.md","/README.html","/index.md",".html",".md","/v1.1/","/v1.0/"];
var fwdDirs = ["examples/","cluster/","docs/devel","docs/design"];
var forwardingRules = [{
"from":"/docs/api-reference/v1/definitions",
"pattern":"#_v1_(\\w+)",
"to":"/docs/api-reference/v1.6",
"postfix":"/#<token>-v1-core"
},
{
"from":"/docs/user-guide/kubectl/kubectl_",
"pattern":"kubectl_(\\w+)",
"to":"/docs/user-guide/kubectl/v1.6",
"postfix":"/#<token>"
}];
var doRedirect = false;
var notHere = false;
var forwardingURL = window.location.href;
@ -26,6 +38,20 @@ $( document ).ready(function() {
"to": "http://kubernetes.io/docs/whatisk8s/"
}];
forwardingRules.forEach(function(rule) {
if (forwardingURL.indexOf(rule.from) > -1) {
var re = new RegExp(rule.pattern, 'g');
var matchary = re.exec(forwardingURL);
var newURL = rule.to;
if (matchary !== null) {
newURL += rule.postfix.replace("<token>", matchary[1]);
}
notHere = true;
window.location.replace(newURL);
}
});
for (var i = 0; i < redirects.length; i++) {
if (forwardingURL.indexOf(redirects[i].from) > -1){
notHere = true;
@ -41,6 +67,7 @@ $( document ).ready(function() {
window.location.replace(newURL);
}
}
if (!notHere) {
for (var i = 0; i < oldURLs.length; i++) {
if (forwardingURL.indexOf(oldURLs[i]) > -1 &&

View File

@ -2,7 +2,5 @@
title: Kubernetes API Swagger Spec
---
---
Kubernetes swagger UI has now been replaced by our generated API reference docs
which can be accessed at http://kubernetes.io/docs/api-reference/README/.
which can be accessed at [http://kubernetes.io/docs/api-reference/{{page.version}}/](/docs/api-reference/{{page.version}}/).

View File

@ -51,7 +51,6 @@
REPO_TMPL = "https://github.com/kubernetes/kubernetes/tree/%s/%s/:splat"
fixed_redirects = """# 301 redirects (301 is the default status when no other one is provided for each line)
/third_party/swagger-ui /kubernetes/third_party/swagger-ui/
/resource-quota /docs/admin/resourcequota/
/horizontal-pod-autoscaler /docs/user-guide/horizontal-pod-autoscaling/
/docs/user-guide/overview /docs/whatisk8s/

View File

@ -10,7 +10,6 @@ Disallow: /docs/user-guide/configuring-containers
Disallow: /docs/user-guide/containers
Disallow: /docs/user-guide/deploying-applications
Disallow: /docs/user-guide/getting-into-containers
>>>>>>> fb2ab359... Remove from TOC/Search: pods/init-containers ...
Disallow: /docs/user-guide/liveness/index
Disallow: /docs/user-guide/pod-states
Disallow: /docs/user-guide/simple-nginx