Merge branch 'master' into kubectrlprereq
commit
7085715c47
|
@ -24,3 +24,4 @@ Session.vim
|
|||
tags
|
||||
|
||||
kubernetes.github.io.iml
|
||||
_redirects
|
||||
|
|
22
.travis.yml
22
.travis.yml
|
@ -7,15 +7,33 @@ install:
|
|||
- export PATH=$GOPATH/bin:$PATH
|
||||
- mkdir -p $HOME/gopath/src/k8s.io
|
||||
- mv $TRAVIS_BUILD_DIR $HOME/gopath/src/k8s.io/kubernetes.github.io
|
||||
|
||||
# (1) Fetch dependencies for us to run the tests in test/examples_test.go
|
||||
- go get -t -v k8s.io/kubernetes.github.io/test
|
||||
- git clone --depth=50 --branch=master https://github.com/kubernetes/md-check $HOME/gopath/src/k8s.io/md-check
|
||||
- go get -t -v k8s.io/md-check
|
||||
|
||||
# The dependencies are complicated for test/examples_test.go
|
||||
# k8s.io/kubernetes/pkg is a dependency, which in turn depends on apimachinery
|
||||
# but we also have apimachinery directly as one of our dependencies, which causes a conflict.
|
||||
# Additionally, we get symlinks when we clone the directory. The below steps do the following:
|
||||
|
||||
# (a) Replace the symlink with the actual dependencies from kubernetes/staging/src/
|
||||
# (b) copy all the vendored files to $GOPATH/src
|
||||
- rm $GOPATH/src/k8s.io/kubernetes/vendor/k8s.io/apimachinery
|
||||
- rm $GOPATH/src/k8s.io/kubernetes/vendor/k8s.io/apiserver
|
||||
- rm $GOPATH/src/k8s.io/kubernetes/vendor/k8s.io/client-go
|
||||
- rm $GOPATH/src/k8s.io/kubernetes/vendor/k8s.io/sample-apiserver
|
||||
- rm $GOPATH/src/k8s.io/kubernetes/vendor/k8s.io/kube-aggregator
|
||||
- cp -r $GOPATH/src/k8s.io/kubernetes/vendor/* $GOPATH/src/
|
||||
- rm -rf $GOPATH/src/k8s.io/kubernetes/vendor/*
|
||||
- cp -r $GOPATH/src/k8s.io/kubernetes/staging/src/* $GOPATH/src/
|
||||
- cp -r $GOPATH/src/k8s.io/apimachinery/vendor/* $GOPATH/src/
|
||||
- rm -rf $GOPATH/src/k8s.io/apimachinery/vendor/*
|
||||
|
||||
# (2) Fetch md-check along with all its dependencies.
|
||||
- git clone --depth=50 --branch=master https://github.com/kubernetes/md-check $HOME/gopath/src/k8s.io/md-check
|
||||
- go get -t -v k8s.io/md-check
|
||||
|
||||
# (3) Fetch mungedocs
|
||||
- go get -v k8s.io/kubernetes/cmd/mungedocs
|
||||
|
||||
script:
|
||||
|
|
5
Makefile
5
Makefile
|
@ -1,4 +1,4 @@
|
|||
.PHONY: all build build-preview help serve
|
||||
.PHONY: all build build-preview generate-redirects help serve
|
||||
|
||||
help: ## Show this help.
|
||||
@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {sub("\\\\n",sprintf("\n%22c"," "), $$2);printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)
|
||||
|
@ -11,5 +11,8 @@ build: ## Build site with production settings and put deliverables in _site.
|
|||
build-preview: ## Build site with drafts and future posts enabled.
|
||||
jekyll build --drafts --future
|
||||
|
||||
generate-redirects: ## Generate a redirects file and copy it into the _site directory.
|
||||
mkdir -p _site && REDIRECTS_PATH=_site/_redirects ruby redirects.rb
|
||||
|
||||
serve: ## Boot the development server.
|
||||
jekyll serve
|
||||
|
|
|
@ -3,30 +3,62 @@ abstract: "Detailed explanations of Kubernetes system concepts and abstractions.
|
|||
toc:
|
||||
- docs/concepts/index.md
|
||||
|
||||
- title: Kubectl Command Line
|
||||
- title: Overview
|
||||
section:
|
||||
- docs/concepts/tools/kubectl/object-management-overview.md
|
||||
- docs/concepts/tools/kubectl/object-management-using-imperative-commands.md
|
||||
- docs/concepts/tools/kubectl/object-management-using-imperative-config.md
|
||||
- docs/concepts/tools/kubectl/object-management-using-declarative-config.md
|
||||
|
||||
- title: Kubernetes Objects
|
||||
section:
|
||||
- docs/concepts/abstractions/overview.md
|
||||
- docs/concepts/abstractions/pod.md
|
||||
- docs/concepts/overview/what-is-kubernetes.md
|
||||
- docs/concepts/overview/components.md
|
||||
- title: Working with Kubernetes Objects
|
||||
section:
|
||||
- docs/concepts/overview/working-with-objects/kubernetes-objects.md
|
||||
- docs/concepts/overview/working-with-objects/labels.md
|
||||
- docs/concepts/overview/working-with-objects/annotations.md
|
||||
- docs/concepts/overview/kubernetes-api.md
|
||||
|
||||
- title: Workloads
|
||||
section:
|
||||
- title: Pods
|
||||
section:
|
||||
- docs/concepts/workloads/pods/pod-overview.md
|
||||
- docs/concepts/workloads/pods/pod-lifecycle.md
|
||||
- docs/concepts/workloads/pods/init-containers.md
|
||||
- title: Controllers
|
||||
section:
|
||||
- docs/concepts/abstractions/controllers/statefulsets.md
|
||||
- docs/concepts/workloads/controllers/statefulset.md
|
||||
- docs/concepts/workloads/controllers/petset.md
|
||||
- docs/concepts/workloads/controllers/garbage-collection.md
|
||||
- title: Jobs
|
||||
section:
|
||||
- docs/concepts/jobs/run-to-completion-finite-workloads.md
|
||||
|
||||
- title: Object Metadata
|
||||
- title: Cluster Administration
|
||||
section:
|
||||
- docs/concepts/object-metadata/annotations.md
|
||||
- docs/concepts/cluster-administration/manage-deployment.md
|
||||
- docs/concepts/cluster-administration/networking.md
|
||||
- docs/concepts/cluster-administration/network-plugins.md
|
||||
- docs/concepts/cluster-administration/logging.md
|
||||
- docs/concepts/cluster-administration/audit.md
|
||||
- docs/concepts/cluster-administration/out-of-resource.md
|
||||
- docs/concepts/cluster-administration/multiple-clusters.md
|
||||
- docs/concepts/cluster-administration/federation.md
|
||||
- docs/concepts/cluster-administration/federation-service-discovery.md
|
||||
- docs/concepts/cluster-administration/guaranteed-scheduling-critical-addon-pods.md
|
||||
- docs/concepts/cluster-administration/static-pod.md
|
||||
- docs/concepts/cluster-administration/sysctl-cluster.md
|
||||
- docs/concepts/cluster-administration/access-cluster.md
|
||||
- docs/concepts/cluster-administration/authenticate-across-clusters-kubeconfig.md
|
||||
|
||||
- title: Services, Load Balancing, and Networking
|
||||
section:
|
||||
- docs/concepts/services-networking/dns-pod-service.md
|
||||
- docs/concepts/services-networking/connect-applications-service.md
|
||||
|
||||
- title: Configuration
|
||||
section:
|
||||
- docs/concepts/configuration/overview.md
|
||||
- docs/concepts/configuration/container-command-args.md
|
||||
- docs/concepts/configuration/manage-compute-resources-container.md
|
||||
|
||||
- title: Policies
|
||||
section:
|
||||
- docs/concepts/policy/container-capabilities.md
|
||||
- docs/concepts/policy/resource-quotas.md
|
||||
|
|
|
@ -5,6 +5,8 @@ toc:
|
|||
|
||||
- docs/whatisk8s.md
|
||||
|
||||
- docs/user-guide/index.md
|
||||
|
||||
- title: Accessing the Cluster
|
||||
section:
|
||||
- docs/user-guide/prereqs.md
|
||||
|
@ -12,20 +14,15 @@ toc:
|
|||
- docs/user-guide/sharing-clusters.md
|
||||
- docs/user-guide/kubeconfig-file.md
|
||||
|
||||
- docs/user-guide/index.md
|
||||
|
||||
- docs/user-guide/ui.md
|
||||
|
||||
- title: Workload Deployment and Management
|
||||
section:
|
||||
- docs/user-guide/quick-start.md
|
||||
- docs/user-guide/deploying-applications.md
|
||||
- docs/user-guide/managing-deployments.md
|
||||
- docs/user-guide/replication-controller/operations.md
|
||||
- docs/user-guide/resizing-a-replication-controller.md
|
||||
- docs/user-guide/rolling-updates.md
|
||||
- docs/user-guide/update-demo/index.md
|
||||
- docs/user-guide/secrets/walkthrough.md
|
||||
- docs/user-guide/configmap/index.md
|
||||
- docs/user-guide/horizontal-pod-autoscaling/walkthrough.md
|
||||
- docs/user-guide/config-best-practices.md
|
||||
|
@ -56,14 +53,9 @@ toc:
|
|||
|
||||
- title: Containers and Pods
|
||||
section:
|
||||
- docs/user-guide/simple-nginx.md
|
||||
- docs/user-guide/pods/single-container.md
|
||||
- docs/user-guide/pods/multi-container.md
|
||||
- docs/user-guide/pods/init-container.md
|
||||
- docs/user-guide/configuring-containers.md
|
||||
- docs/user-guide/pod-templates.md
|
||||
- docs/user-guide/production-pods.md
|
||||
- docs/user-guide/containers.md
|
||||
- docs/user-guide/environment-guide/index.md
|
||||
- docs/user-guide/compute-resources.md
|
||||
- docs/user-guide/pod-states.md
|
||||
|
@ -72,7 +64,6 @@ toc:
|
|||
- docs/user-guide/node-selection/index.md
|
||||
- docs/user-guide/downward-api/index.md
|
||||
- docs/user-guide/downward-api/volume/index.md
|
||||
- docs/user-guide/persistent-volumes/walkthrough.md
|
||||
- docs/user-guide/petset/bootstrapping/index.md
|
||||
|
||||
- title: Monitoring, Logging, and Debugging Containers
|
||||
|
@ -178,6 +169,7 @@ toc:
|
|||
section:
|
||||
- docs/admin/index.md
|
||||
- docs/admin/cluster-management.md
|
||||
- docs/admin/upgrade-1-6.md
|
||||
- docs/admin/kubeadm.md
|
||||
- docs/admin/addons.md
|
||||
- docs/admin/audit.md
|
||||
|
@ -228,5 +220,5 @@ toc:
|
|||
- title: Federation Components
|
||||
section:
|
||||
- docs/admin/federation-apiserver.md
|
||||
- title : federation-controller-mananger
|
||||
- title : federation-controller-manager
|
||||
path: /docs/admin/federation-controller-manager
|
||||
|
|
|
@ -3,28 +3,54 @@ abstract: "Step-by-step instructions for performing operations with Kubernetes."
|
|||
toc:
|
||||
- docs/tasks/index.md
|
||||
|
||||
- title: Using the kubectl Command-Line
|
||||
section:
|
||||
- docs/tasks/kubectl/install.md
|
||||
- docs/tasks/kubectl/list-all-running-container-images.md
|
||||
- docs/tasks/kubectl/get-shell-running-container.md
|
||||
|
||||
- title: Configuring Pods and Containers
|
||||
section:
|
||||
- docs/tasks/configure-pod-container/define-environment-variable-container.md
|
||||
- docs/tasks/configure-pod-container/define-command-argument-container.md
|
||||
- docs/tasks/configure-pod-container/assign-cpu-ram-container.md
|
||||
- docs/tasks/configure-pod-container/limit-range.md
|
||||
- docs/tasks/configure-pod-container/apply-resource-quota-limit.md
|
||||
- docs/tasks/configure-pod-container/configure-volume-storage.md
|
||||
- docs/tasks/configure-pod-container/configure-persistent-volume-storage.md
|
||||
- docs/tasks/configure-pod-container/environment-variable-expose-pod-information.md
|
||||
- docs/tasks/configure-pod-container/downward-api-volume-expose-pod-information.md
|
||||
- docs/tasks/configure-pod-container/distribute-credentials-secure.md
|
||||
- docs/tasks/configure-pod-container/pull-image-private-registry.md
|
||||
- docs/tasks/configure-pod-container/configure-liveness-readiness-probes.md
|
||||
- docs/tasks/configure-pod-container/communicate-containers-same-pod.md
|
||||
- docs/tasks/configure-pod-container/configure-pod-initialization.md
|
||||
- docs/tasks/configure-pod-container/attach-handler-lifecycle-event.md
|
||||
- docs/tasks/configure-pod-container/configure-pod-disruption-budget.md
|
||||
|
||||
- title: Running Applications
|
||||
section:
|
||||
- docs/tasks/run-application/rolling-update-replication-controller.md
|
||||
|
||||
- title: Running Jobs
|
||||
section:
|
||||
- docs/tasks/job/parallel-processing-expansion.md
|
||||
- docs/tasks/job/coarse-parallel-processing-work-queue/index.md
|
||||
- docs/tasks/job/fine-parallel-processing-work-queue/index.md
|
||||
|
||||
- title: Accessing Applications in a Cluster
|
||||
section:
|
||||
- docs/tasks/access-application-cluster/port-forward-access-application-cluster.md
|
||||
- docs/tasks/access-application-cluster/load-balance-access-application-cluster.md
|
||||
- docs/tasks/access-application-cluster/configure-cloud-provider-firewall.md
|
||||
|
||||
- title: Debugging Applications in a Cluster
|
||||
- title: Monitoring, Logging, and Debugging
|
||||
section:
|
||||
- docs/tasks/debug-application-cluster/determine-reason-pod-failure.md
|
||||
- docs/tasks/debug-application-cluster/debug-init-containers.md
|
||||
- docs/tasks/debug-application-cluster/logging-stackdriver.md
|
||||
- docs/tasks/debug-application-cluster/monitor-node-health.md
|
||||
- docs/tasks/debug-application-cluster/logging-elasticsearch-kibana.md
|
||||
|
||||
- title: Accessing the Kubernetes API
|
||||
section:
|
||||
|
@ -36,6 +62,19 @@ toc:
|
|||
- docs/tasks/administer-cluster/dns-horizontal-autoscaling.md
|
||||
- docs/tasks/administer-cluster/safely-drain-node.md
|
||||
- docs/tasks/administer-cluster/change-pv-reclaim-policy.md
|
||||
- docs/tasks/administer-cluster/limit-storage-consumption.md
|
||||
- docs/tasks/administer-cluster/share-configuration.md
|
||||
|
||||
- title: Administering Federation
|
||||
section:
|
||||
- docs/tasks/administer-federation/configmap.md
|
||||
- docs/tasks/administer-federation/daemonset.md
|
||||
- docs/tasks/administer-federation/deployment.md
|
||||
- docs/tasks/administer-federation/events.md
|
||||
- docs/tasks/administer-federation/ingress.md
|
||||
- docs/tasks/administer-federation/namespaces.md
|
||||
- docs/tasks/administer-federation/replicaset.md
|
||||
- docs/tasks/administer-federation/secret.md
|
||||
|
||||
- title: Managing Stateful Applications
|
||||
section:
|
||||
|
@ -44,8 +83,3 @@ toc:
|
|||
- docs/tasks/manage-stateful-set/deleting-a-statefulset.md
|
||||
- docs/tasks/manage-stateful-set/debugging-a-statefulset.md
|
||||
- docs/tasks/manage-stateful-set/delete-pods.md
|
||||
|
||||
- title: Troubleshooting
|
||||
section:
|
||||
- docs/tasks/troubleshoot/debug-init-containers.md
|
||||
- docs/tasks/administer-cluster/access-control-identity-management/
|
||||
|
|
|
@ -32,11 +32,18 @@ toc:
|
|||
- title: Online Training Course
|
||||
path: https://www.udacity.com/course/scalable-microservices-with-kubernetes--ud615
|
||||
- docs/tutorials/stateless-application/hello-minikube.md
|
||||
- title: Object Management Using kubectl
|
||||
section:
|
||||
- docs/tutorials/object-management-kubectl/object-management.md
|
||||
- docs/tutorials/object-management-kubectl/imperative-object-management-command.md
|
||||
- docs/tutorials/object-management-kubectl/imperative-object-management-configuration.md
|
||||
- docs/tutorials/object-management-kubectl/declarative-object-management-configuration.md
|
||||
- title: Stateless Applications
|
||||
section:
|
||||
- docs/tutorials/stateless-application/run-stateless-application-deployment.md
|
||||
- docs/tutorials/stateless-application/expose-external-ip-address-service.md
|
||||
- docs/tutorials/stateless-application/expose-external-ip-address.md
|
||||
- docs/tutorials/stateless-application/run-stateless-ap-replication-controller.md
|
||||
- title: Stateful Applications
|
||||
section:
|
||||
- docs/tutorials/stateful-application/basic-stateful-set.md
|
||||
|
@ -46,6 +53,12 @@ toc:
|
|||
- title: Connecting Applications
|
||||
section:
|
||||
- docs/tutorials/connecting-apps/connecting-frontend-backend.md
|
||||
- title: Clusters
|
||||
section:
|
||||
- docs/tutorials/clusters/apparmor.md
|
||||
- title: Services
|
||||
section:
|
||||
- docs/tutorials/services/source-ip.md
|
||||
- title: Federated Cluster Administration
|
||||
section:
|
||||
- docs/tutorials/federation/set-up-cluster-federation-kubefed.md
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
***NOTE: This feature is beta in Kubernetes 1.5.***
|
|
@ -279,14 +279,120 @@
|
|||
logo: 'harbur',
|
||||
link: 'https://harbur.io/',
|
||||
blurb: 'Based in Barcelona, Harbur is a consulting firm that helps companies deploy self-healing solutions empowered by Container technologies'
|
||||
},
|
||||
},
|
||||
{
|
||||
type: 1,
|
||||
name: 'Endocode',
|
||||
logo: 'endocode',
|
||||
link: 'https://endocode.com/kubernetes/',
|
||||
blurb: 'Endocode practices and teaches the open source way. Kernel to cluster - Dev to Ops. We offer Kubernetes trainings, services and support.'
|
||||
}
|
||||
},
|
||||
{
|
||||
type: 0,
|
||||
name: 'Spotinst',
|
||||
logo: 'spotinst',
|
||||
link: 'http://blog.spotinst.com/2016/08/04/elastigroup-kubernetes-minions-steroids/',
|
||||
blurb: 'Spotinst uses a prediction algorithm in the Amazon EC2 Spot allowing k8s clusters to increase performance and lower the infrastructure costs'
|
||||
},
|
||||
{
|
||||
type: 1,
|
||||
name: 'inwinSTACK',
|
||||
logo: 'inwinstack',
|
||||
link: 'http://www.inwinstack.com/index.php/en/solutions-en/',
|
||||
blurb: 'Our container service leverages OpenStack-based infrastructure and its container orchestration engine Magnum to manage Kubernetes clusters.'
|
||||
},
|
||||
{
|
||||
type: 1,
|
||||
name: 'Semantix',
|
||||
logo: 'semantix',
|
||||
link: 'http://www.semantix.com.br/',
|
||||
blurb: 'Semantix is a company that works with data analytics and distributed systems. Kubernetes is used to orchestrate services for our customers.'
|
||||
},
|
||||
{
|
||||
type: 0,
|
||||
name: 'ASM Technologies Limited',
|
||||
logo: 'asm',
|
||||
link: 'http://www.asmtech.com/',
|
||||
blurb: 'Our technology supply chain portfolio enables your software products to be accessible, viable and available more effectively.'
|
||||
},
|
||||
{
|
||||
type: 1,
|
||||
name: 'InfraCloud Technologies',
|
||||
logo: 'infracloud',
|
||||
link: 'http://blog.infracloud.io/state-of-kubernetes/',
|
||||
blurb: 'InfraCloud Technologies is software consultancy which provides services in Containers, Cloud and DevOps.'
|
||||
},
|
||||
{
|
||||
type: 0,
|
||||
name: 'SignalFx',
|
||||
logo: 'signalfx',
|
||||
link: 'https://github.com/signalfx/integrations/tree/master/kubernetes',
|
||||
blurb: 'Gain real-time visibility across metrics & the most intelligent alerts for todays architectures, including deep integration with Kubernetes'
|
||||
},
|
||||
{
|
||||
type: 0,
|
||||
name: 'NATS',
|
||||
logo: 'nats',
|
||||
link: 'https://github.com/pires/kubernetes-nats-cluster',
|
||||
blurb: 'NATS is a simple, secure, and scalable cloud native messaging system.'
|
||||
},
|
||||
{
|
||||
type: 1,
|
||||
name: 'RX-M',
|
||||
logo: 'rxm',
|
||||
link: 'http://rx-m.com/training/kubernetes-training/',
|
||||
blurb: 'Market neutral Kubernetes Dev, DevOps and Production training and consulting services'
|
||||
},
|
||||
{
|
||||
type: 1,
|
||||
name: 'Emerging Technology Advisors',
|
||||
logo: 'eta',
|
||||
link: 'https://www.emergingtechnologyadvisors.com/services/kubernetes.html',
|
||||
blurb: 'ETA helps companies architect, implement, and manage scalable applications using Kubernetes on on public or private cloud.'
|
||||
},
|
||||
{
|
||||
type: 0,
|
||||
name: 'CloudPlex.io',
|
||||
logo: 'cloudplex',
|
||||
link: 'http://www.cloudplex.io',
|
||||
blurb: 'CloudPlex enables operations teams to visually deploy, orchestrate, manage, and monitor infrastructure, applications, and services in public or private cloud.'
|
||||
},
|
||||
{
|
||||
type: 1,
|
||||
name: 'Kumina',
|
||||
logo: 'kumina',
|
||||
link: 'https://www.kumina.nl/managed_kubernetes',
|
||||
blurb: 'Kumina creates Kubernetes solutions on your choice of infrastructure with around-the-clock management and unlimited support.'
|
||||
},
|
||||
{
|
||||
type: 0,
|
||||
name: 'CA Technologies',
|
||||
logo: 'ca',
|
||||
link: 'https://www.ca.com/us/products/application-deployment.html',
|
||||
blurb: 'The RA CDE Kubernetes plugin enables an automated process for pushing changes to production by applying standard Kubernetes YAML files'
|
||||
},
|
||||
{
|
||||
type: 0,
|
||||
name: 'CoScale',
|
||||
logo: 'coscale',
|
||||
link: 'http://www.coscale.com/blog/how-to-monitor-your-kubernetes-cluster',
|
||||
blurb: 'Full stack monitoring of containers and microservices orchestrated by Kubernetes. Powered by anomaly detection to find problems faster.'
|
||||
},
|
||||
{
|
||||
type: 0,
|
||||
name: 'Supergiant.io',
|
||||
logo: 'supergiant',
|
||||
link: 'https://supergiant.io/blog/supergiant-packing-algorithm-unique-save-money',
|
||||
blurb: 'Supergiant autoscales hardware for Kubernetes. Open-source, it makes HA, distributed, stateful apps easy to deploy, manage, and scale.'
|
||||
},
|
||||
{
|
||||
type: 0,
|
||||
name: 'Avi Networks',
|
||||
logo: 'avinetworks',
|
||||
link: 'https://kb.avinetworks.com/avi-vantage-openshift-installation-guide/',
|
||||
blurb: 'Avis elastic application services fabric provides scalable, feature rich & integrated L4-7 networking for K8S environments.'
|
||||
}
|
||||
|
||||
]
|
||||
|
||||
var isvContainer = document.getElementById('isvContainer')
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
|
||||
|
||||
<table style="background-color:#eeeeee">
|
||||
<tr>
|
||||
<td>
|
||||
<p><b>NOTICE</b></p>
|
||||
<p>As of March 14, 2017, the <a href="https://github.com/orgs/kubernetes/teams/sig-docs-maintainers">@kubernetes/sig-docs-maintainers</a> have begun migration of the User Guide content as announced previously to the <a href="https://github.com/kubernetes/community/tree/master/sig-docs">SIG Docs community</a> through the <a href="https://groups.google.com/forum/#!forum/kubernetes-sig-docs">kubernetes-sig-docs</a> group and <a href="https://kubernetes.slack.com/messages/sig-docs/">kubernetes.slack.com #sig-docs</a> channel.</p>
|
||||
<p>The user guides within this section are being refactored into topics within Tutorials, Tasks, and Concepts. Anything that has been moved will have a notice placed in its previous location as well as a link to its new location. The reorganization implements the table of contents (ToC) outlined in the <a href="https://docs.google.com/a/google.com/document/d/18hRCIorVarExB2eBVHTUR6eEJ2VVk5xq1iBmkQv8O6I/edit?usp=sharing">kubernetes-docs-toc</a> document and should improve the documentation's findability and readability for a wider range of audiences.</p>
|
||||
<p>For any questions, please contact: <a href="mailto:kubernetes-sig-docs@googlegroups.com">kubernetes-sig-docs@googlegroups.com</a></p>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
|
@ -3453,7 +3453,7 @@ Populated by the system when a graceful deletion is requested. Read-only. More i
|
|||
</tr>
|
||||
<tr>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">nodeSelector</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node’s labels for the pod to be scheduled on that node. More info: <a href="http://kubernetes.io/docs/user-guide/node-selection/README">http://kubernetes.io/docs/user-guide/node-selection/README</a></p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node’s labels for the pod to be scheduled on that node. More info: <a href="http://kubernetes.io/docs/user-guide/node-selection">http://kubernetes.io/docs/user-guide/node-selection</a></p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">object</p></td>
|
||||
<td class="tableblock halign-left valign-top"></td>
|
||||
|
|
|
@ -4172,7 +4172,7 @@ The resulting set of endpoints can be viewed as:<br>
|
|||
</tr>
|
||||
<tr>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">nodeSelector</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node’s labels for the pod to be scheduled on that node. More info: <a href="http://kubernetes.io/docs/user-guide/node-selection/README">http://kubernetes.io/docs/user-guide/node-selection/README</a></p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node’s labels for the pod to be scheduled on that node. More info: <a href="http://kubernetes.io/docs/user-guide/node-selection">http://kubernetes.io/docs/user-guide/node-selection</a></p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">object</p></td>
|
||||
<td class="tableblock halign-left valign-top"></td>
|
||||
|
@ -8146,7 +8146,7 @@ The resulting set of endpoints can be viewed as:<br>
|
|||
</tr>
|
||||
<tr>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">unschedulable</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">Unschedulable controls node schedulability of new pods. By default, node is schedulable. More info: <a href="http://releases.k8s.io/HEAD/docs/admin/node.md#manual-node-administration"">http://releases.k8s.io/HEAD/docs/admin/node.md#manual-node-administration"</a></p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">Unschedulable controls node schedulability of new pods. By default, node is schedulable. More info: <a href="http://releases.k8s.io/HEAD/docs/admin/node.md#manual-node-administration">http://releases.k8s.io/HEAD/docs/admin/node.md#manual-node-administration</a></p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">boolean</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
|
||||
|
@ -8263,4 +8263,4 @@ Last updated 2016-11-17 06:26:10 UTC
|
|||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
<li><a href="/docs/troubleshooting/" {% if toc.bigheader == "Support" %}class="YAH"{% endif %}>SUPPORT</a></li>
|
||||
</ul>
|
||||
<div id="searchBox">
|
||||
<input type="text" id="search" placeholder="Search" onkeydown="if (event.keyCode==13) window.location.replace('/docs/search/?q=' + this.value)">
|
||||
<input type="text" id="search" placeholder="Search" onkeydown="if (event.keyCode==13) window.location.replace('/docs/search/?q=' + this.value)" autofocus="autofocus">
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
|
|
@ -816,9 +816,9 @@ dd
|
|||
font-weight: 500
|
||||
|
||||
p
|
||||
font-size: 14px
|
||||
font-size: 16px
|
||||
font-weight: 300
|
||||
line-height: 1.25em
|
||||
line-height: 1.75em
|
||||
|
||||
p + p
|
||||
margin-top: 10px
|
||||
|
@ -856,6 +856,7 @@ dd
|
|||
display: block
|
||||
margin: 20px 0
|
||||
padding: 15px
|
||||
position: relative
|
||||
overflow-x: auto
|
||||
|
||||
h1 code, h2 code, h3 code, h4 code, h5 code, h6 code
|
||||
|
@ -893,6 +894,8 @@ dd
|
|||
|
||||
li
|
||||
margin-bottom: 0.75em
|
||||
font-size: 16px
|
||||
line-height: 1.75em
|
||||
|
||||
table
|
||||
width: 100%
|
||||
|
@ -1277,7 +1280,7 @@ $feature-box-div-margin-bottom: 40px
|
|||
background-color: $white
|
||||
box-shadow: 0 5px 5px rgba(0,0,0,.24),0 0 5px rgba(0,0,0,.12)
|
||||
|
||||
#calendarWrapper
|
||||
#calendarMeetings
|
||||
position: relative
|
||||
width: 80vw
|
||||
height: 60vw
|
||||
|
@ -1285,6 +1288,14 @@ $feature-box-div-margin-bottom: 40px
|
|||
max-height: 900px
|
||||
margin: 20px auto
|
||||
|
||||
#calendarEvents
|
||||
position: relative
|
||||
width: 80vw
|
||||
height: 30vw
|
||||
max-width: 1200px
|
||||
max-height: 450px
|
||||
margin: 20px auto
|
||||
|
||||
iframe
|
||||
position: absolute
|
||||
border: 0
|
||||
|
|
|
@ -94,6 +94,6 @@ cid: caseStudies
|
|||
|
||||
<div id="videoPlayer">
|
||||
<!--<iframe data-url="https://www.youtube.com/watch?v=B0_5Nms8sD0" frameborder="0" allowfullscreen></iframe>-->
|
||||
<iframe data-url="https://www.youtube.com/embed/4gyeixJLabo?autoplay=1" frameborder="0" allowfullscreen></iframe>
|
||||
<iframe data-url="https://www.youtube.com/embed/4gyeixJLabo?autoplay=1" frameborder="0" allowfullscreen="true"></iframe>
|
||||
<button id="closeButton"></button>
|
||||
</div>
|
||||
|
|
|
@ -27,10 +27,17 @@ cid: community
|
|||
<a href="https://github.com/kubernetes/kubernetes/wiki/Special-Interest-Groups-(SIGs)">lists of SIGs</a>,
|
||||
from AWS and Openstack to Big Data and Scalability, there's a place for you to contribute and instructions
|
||||
for forming a new SIG if your special interest isn't covered (yet).</p>
|
||||
|
||||
<p>As a member of the Kubernetes community, you are welcome to join any of the SIG meetings
|
||||
you are interested in. No registration required.</p>
|
||||
<div id="calendarMeetings">
|
||||
<iframe src="https://calendar.google.com/calendar/embed?src=cgnt364vd8s86hr2phapfjc6uk%40group.calendar.google.com&ctz=America/Los_Angeles"
|
||||
frameborder="0" scrolling="no"></iframe>
|
||||
</div>
|
||||
</div>
|
||||
<div class="content">
|
||||
<h3>Events</h3>
|
||||
<div id="calendarWrapper">
|
||||
<div id="calendarEvents">
|
||||
<iframe src="https://calendar.google.com/calendar/embed?src=nt2tcnbtbied3l6gi2h29slvc0%40group.calendar.google.com&ctz=America/Los_Angeles"
|
||||
frameborder="0" scrolling="no"></iframe>
|
||||
</div>
|
||||
|
|
|
@ -14,6 +14,7 @@ Add-ons in each section are sorted alphabetically - the ordering does not imply
|
|||
|
||||
* [Calico](http://docs.projectcalico.org/v2.0/getting-started/kubernetes/installation/hosted/) is a secure L3 networking and network policy provider.
|
||||
* [Canal](https://github.com/tigera/canal/tree/master/k8s-install/kubeadm) unites Flannel and Calico, providing networking and network policy.
|
||||
* [Contiv](http://contiv.github.io) provides configurable networking (native L3 using BGP, overlay using vxlan, classic L2, and Cisco-SDN/ACI) for various use cases and a rich policy framework. Contiv project is fully [open sourced](http://github.com/contiv). The [installer](http://github.com/contiv/install) provides both kubeadm and non-kubeadm based installation options.
|
||||
* [Flannel](https://github.com/coreos/flannel/blob/master/Documentation/kube-flannel.yml) is an overlay network provider that can be used with Kubernetes.
|
||||
* [Romana](http://romana.io) is a Layer 3 networking solution for pod networks that also supports the [NetworkPolicy API](/docs/user-guide/networkpolicies/). Kubeadm add-on installation details available [here](https://github.com/romana/romana/tree/master/containerize).
|
||||
* [Weave Net](https://www.weave.works/docs/net/latest/kube-addon/) provides networking and network policy, will carry on working on both sides of a network partition, and does not require an external database.
|
||||
|
|
|
@ -87,7 +87,7 @@ The ImagePolicyWebhook plug-in allows a backend webhook to make admission decisi
|
|||
```
|
||||
|
||||
#### Configuration File Format
|
||||
ImagePolicyWebhook uses the admission controller config file (`--admission-controller-config-file`) to set configuration options for the behavior of the backend. This file may be json or yaml and has the following format:
|
||||
ImagePolicyWebhook uses the admission config file `--admission-controller-config-file` to set configuration options for the behavior of the backend. This file may be json or yaml and has the following format:
|
||||
|
||||
```javascript
|
||||
{
|
||||
|
@ -101,7 +101,7 @@ ImagePolicyWebhook uses the admission controller config file (`--admission-contr
|
|||
}
|
||||
```
|
||||
|
||||
The config file must reference a [kubeconfig](/docs/user-guide/kubeconfig-file/) formatted file which sets up the connection to the backend. It is required that the backend communicate over TLS.
|
||||
The config file must reference a [kubeconfig](/docs/concepts/cluster-administration/authenticate-across-clusters-kubeconfig/) formatted file which sets up the connection to the backend. It is required that the backend communicate over TLS.
|
||||
|
||||
The kubeconfig file's cluster field must point to the remote service, and the user field must contain the returned authorizer.
|
||||
|
||||
|
@ -120,7 +120,7 @@ users:
|
|||
client-certificate: /path/to/cert.pem # cert for the webhook plugin to use
|
||||
client-key: /path/to/key.pem # key matching the cert
|
||||
```
|
||||
For additional HTTP configuration, refer to the [kubeconfig](/docs/user-guide/kubeconfig-file/) documentation.
|
||||
For additional HTTP configuration, refer to the [kubeconfig](/docs/concepts/cluster-administration/authenticate-across-clusters-kubeconfig/) documentation.
|
||||
|
||||
#### Request Payloads
|
||||
|
||||
|
@ -252,6 +252,11 @@ This plugin ignores any `PersistentVolumeClaim` updates, it acts only on creatio
|
|||
See [persistent volume](/docs/user-guide/persistent-volumes) documentation about persistent volume claims and
|
||||
storage classes and how to mark a storage class as default.
|
||||
|
||||
### DefaultTolerationSeconds
|
||||
|
||||
This plug-in sets the default forgiveness toleration for pods, which have no forgiveness tolerations, to tolerate
|
||||
the taints `notready:NoExecute` and `unreachable:NoExecute` for 5 minutes.
|
||||
|
||||
## Is there a recommended set of plug-ins to use?
|
||||
|
||||
Yes.
|
||||
|
|
|
@ -4,389 +4,6 @@ assignees:
|
|||
title: AppArmor
|
||||
---
|
||||
|
||||
AppArmor is a Linux kernel enhancement that can reduce the potential attack surface of an
|
||||
application and provide greater defense in depth for Applications. Beta support for AppArmor was
|
||||
added in Kubernetes v1.4.
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
## What is AppArmor
|
||||
|
||||
AppArmor is a Linux kernel security module that supplements the standard Linux user and group based
|
||||
permissions to confine programs to a limited set of resources. AppArmor can be configured for any
|
||||
application to reduce its potential attack surface and provide greater defense in depth. It is
|
||||
configured through profiles tuned to whitelist the access needed by a specific program or container,
|
||||
such as Linux capabilities, network access, file permissions, etc. Each profile can be run in either
|
||||
enforcing mode, which blocks access to disallowed resources, or complain mode, which only reports
|
||||
violations.
|
||||
|
||||
AppArmor can help you to run a more secure deployment by restricting what containers are allowed to
|
||||
do, and /or providing better auditing through system logs. However, it is important to keep in mind
|
||||
that AppArmor is not a silver bullet, and can only do so much to protect against exploits in your
|
||||
application code. It is important to provide good, restrictive profiles, and harden your
|
||||
applications and cluster from other angles as well.
|
||||
|
||||
AppArmor support in Kubernetes is currently in beta.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Kubernetes version is at least v1.4**. Kubernetes support for AppArmor was added in
|
||||
v1.4. Kubernetes components older than v1.4 are not aware of the new AppArmor annotations, and
|
||||
will **silently ignore** any AppArmor settings that are provided. To ensure that your Pods are
|
||||
receiving the expected protections, it is important to verify the Kubelet version of your nodes:
|
||||
|
||||
$ kubectl get nodes -o=jsonpath=$'{range .items[*]}{@.metadata.name}: {@.status.nodeInfo.kubeletVersion}\n{end}'
|
||||
gke-test-default-pool-239f5d02-gyn2: v1.4.0
|
||||
gke-test-default-pool-239f5d02-x1kf: v1.4.0
|
||||
gke-test-default-pool-239f5d02-xwux: v1.4.0
|
||||
|
||||
2. **AppArmor kernel module is enabled**. For the Linux kernel to enforce an AppArmor profile, the
|
||||
AppArmor kernel module must be installed and enabled. Several distributions enable the module by
|
||||
default, such as Ubuntu and SUSE, and many others provide optional support. To check whether the
|
||||
module is enabled, check the `/sys/module/apparmor/parameters/enabled` file:
|
||||
|
||||
$ cat /sys/module/apparmor/parameters/enabled
|
||||
Y
|
||||
|
||||
If the Kubelet contains AppArmor support (>= v1.4), it will refuse to run a Pod with AppArmor
|
||||
options if the kernel module is not enabled.
|
||||
|
||||
*Note: Ubuntu carries many AppArmor patches that have not been merged into the upstream Linux
|
||||
kernel, including patches that add additional hooks and features. Kubernetes has only been
|
||||
tested with the upstream version, and does not promise support for other features.*
|
||||
|
||||
3. **Container runtime is Docker**. Currently the only Kubernetes-supported container runtime that
|
||||
also supports AppArmor is Docker. As more runtimes add AppArmor support, the options will be
|
||||
expanded. You can verify that your nodes are running docker with:
|
||||
|
||||
$ kubectl get nodes -o=jsonpath=$'{range .items[*]}{@.metadata.name}: {@.status.nodeInfo.containerRuntimeVersion}\n{end}'
|
||||
gke-test-default-pool-239f5d02-gyn2: docker://1.11.2
|
||||
gke-test-default-pool-239f5d02-x1kf: docker://1.11.2
|
||||
gke-test-default-pool-239f5d02-xwux: docker://1.11.2
|
||||
|
||||
If the Kubelet contains AppArmor support (>= v1.4), it will refuse to run a Pod with AppArmor
|
||||
options if the runtime is not Docker.
|
||||
|
||||
4. **Profile is loaded**. AppArmor is applied to a Pod by specifying an AppArmor profile that each
|
||||
container should be run with. If any of the specified profiles is not already loaded in the
|
||||
kernel, the Kubelet (>= v1.4) will reject the Pod. You can view which profiles are loaded on a
|
||||
node by checking the `/sys/kernel/security/apparmor/profiles` file. For example:
|
||||
|
||||
$ ssh gke-test-default-pool-239f5d02-gyn2 "sudo cat /sys/kernel/security/apparmor/profiles | sort"
|
||||
apparmor-test-deny-write (enforce)
|
||||
apparmor-test-audit-write (enforce)
|
||||
docker-default (enforce)
|
||||
k8s-nginx (enforce)
|
||||
|
||||
For more details on loading profiles on nodes, see
|
||||
[Setting up nodes with profiles](#setting-up-nodes-with-profiles).
|
||||
|
||||
As long as the Kubelet version includes AppArmor support (>= v1.4), the Kubelet will reject a Pod
|
||||
with AppArmor options if any of the prerequisites are not met. You can also verify AppArmor support
|
||||
on nodes by checking the node ready condition message (though this is likely to be removed in a
|
||||
later release):
|
||||
|
||||
$ kubectl get nodes -o=jsonpath=$'{range .items[*]}{@.metadata.name}: {.status.conditions[?(@.reason=="KubeletReady")].message}\n{end}'
|
||||
gke-test-default-pool-239f5d02-gyn2: kubelet is posting ready status. AppArmor enabled
|
||||
gke-test-default-pool-239f5d02-x1kf: kubelet is posting ready status. AppArmor enabled
|
||||
gke-test-default-pool-239f5d02-xwux: kubelet is posting ready status. AppArmor enabled
|
||||
|
||||
## Securing a Pod
|
||||
|
||||
*Note: AppArmor is currently in beta, so options are specified as annotations. Once support graduates to
|
||||
general availability, the annotations will be replaced with first-class fields (more details in
|
||||
[Upgrade path to GA](#upgrade-path-to-general-availability)).*
|
||||
|
||||
AppArmor profiles are specified *per-container*. To specify the AppArmor profile to run a Pod
|
||||
container with, add an annotation to the Pod's metadata:
|
||||
|
||||
container.apparmor.security.beta.kubernetes.io/<container_name>: <profile_ref>
|
||||
|
||||
Where `<container_name>` is the name of the container to apply the profile to, and `<profile_ref>`
|
||||
specifies the profile to apply. The `profile_ref` can be one of:
|
||||
|
||||
- `runtime/default` to apply the runtime's default profile.
|
||||
- `localhost/<profile_name>` to apply the profile loaded on the host with the name `<profile_name>`
|
||||
|
||||
See the [API Reference](#api-reference) for the full details on the annotation and profile name formats.
|
||||
|
||||
The Kubernetes AppArmor enforcement works by first checking that all the prerequisites have been
|
||||
met, and then forwarding the profile selection to the container runtime for enforcement. If the
|
||||
prerequisites have not been met, the Pod will be rejected, and will not run.
|
||||
|
||||
To verify that the profile was applied, you can expect to see the AppArmor security option listed in the container created event:
|
||||
|
||||
$ kubectl get events | grep Created
|
||||
22s 22s 1 hello-apparmor Pod spec.containers{hello} Normal Created {kubelet e2e-test-stclair-minion-group-31nt} Created container with docker id 269a53b202d3; Security:[seccomp=unconfined apparmor=k8s-apparmor-example-deny-write]
|
||||
|
||||
You can also verify directly that the container's root process is running with the correct profile by checking its proc attr:
|
||||
|
||||
$ kubectl exec <pod_name> cat /proc/1/attr/current
|
||||
k8s-apparmor-example-deny-write (enforce)
|
||||
|
||||
## Example
|
||||
|
||||
In this example you'll see:
|
||||
|
||||
- One way to load a profile on a node
|
||||
- How to enforce the profile on a Pod
|
||||
- How to check that the profile is loaded
|
||||
- What happens when a profile is violated
|
||||
- What happens when a profile cannot be loaded
|
||||
|
||||
*This example assumes you have already set up a cluster with AppArmor support.*
|
||||
|
||||
First, we need to load the profile we want to use onto our nodes. The profile we'll use simply
|
||||
denies all file writes:
|
||||
|
||||
{% include code.html language="text" file="deny-write.profile" ghlink="/docs/admin/apparmor/deny-write.profile" %}
|
||||
|
||||
Since we don't know where the Pod will be scheduled, we'll need to load the profile on all our
|
||||
nodes. For this example we'll just use SSH to install the profiles, but other approaches are
|
||||
discussed in [Setting up nodes with profiles](#setting-up-nodes-with-profiles).
|
||||
|
||||
$ NODES=(
|
||||
# The SSH-accessible domain names of your nodes
|
||||
gke-test-default-pool-239f5d02-gyn2.us-central1-a.my-k8s
|
||||
gke-test-default-pool-239f5d02-x1kf.us-central1-a.my-k8s
|
||||
gke-test-default-pool-239f5d02-xwux.us-central1-a.my-k8s)
|
||||
$ for NODE in ${NODES[*]}; do ssh $NODE 'sudo apparmor_parser -q <<EOF
|
||||
#include <tunables/global>
|
||||
|
||||
profile k8s-apparmor-example-deny-write flags=(attach_disconnected) {
|
||||
#include <abstractions/base>
|
||||
|
||||
file,
|
||||
|
||||
# Deny all file writes.
|
||||
deny /** w,
|
||||
}
|
||||
EOF'
|
||||
done
|
||||
|
||||
Next, we'll run a simple "Hello AppArmor" pod with the deny-write profile:
|
||||
|
||||
{% include code.html language="yaml" file="hello-apparmor-pod.yaml" ghlink="/docs/admin/apparmor/hello-apparmor-pod.yaml" %}
|
||||
|
||||
$ kubectl create -f /dev/stdin <<EOF
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: hello-apparmor
|
||||
annotations:
|
||||
container.apparmor.security.beta.kubernetes.io/hello: localhost/k8s-apparmor-example-deny-write
|
||||
spec:
|
||||
containers:
|
||||
- name: hello
|
||||
image: busybox
|
||||
command: [ "sh", "-c", "echo 'Hello AppArmor!' && sleep 1h" ]
|
||||
EOF
|
||||
pod "hello-apparmor" created
|
||||
|
||||
If we look at the pod events, we can see that the Pod container was created with the AppArmor
|
||||
profile "k8s-apparmor-example-deny-write":
|
||||
|
||||
$ kubectl get events | grep hello-apparmor
|
||||
14s 14s 1 hello-apparmor Pod Normal Scheduled {default-scheduler } Successfully assigned hello-apparmor to gke-test-default-pool-239f5d02-gyn2
|
||||
14s 14s 1 hello-apparmor Pod spec.containers{hello} Normal Pulling {kubelet gke-test-default-pool-239f5d02-gyn2} pulling image "busybox"
|
||||
13s 13s 1 hello-apparmor Pod spec.containers{hello} Normal Pulled {kubelet gke-test-default-pool-239f5d02-gyn2} Successfully pulled image "busybox"
|
||||
13s 13s 1 hello-apparmor Pod spec.containers{hello} Normal Created {kubelet gke-test-default-pool-239f5d02-gyn2} Created container with docker id 06b6cd1c0989; Security:[seccomp=unconfined apparmor=k8s-apparmor-example-deny-write]
|
||||
13s 13s 1 hello-apparmor Pod spec.containers{hello} Normal Started {kubelet gke-test-default-pool-239f5d02-gyn2} Started container with docker id 06b6cd1c0989
|
||||
|
||||
We can verify that the container is actually running with that profile by checking its proc attr:
|
||||
|
||||
$ kubectl exec hello-apparmor cat /proc/1/attr/current
|
||||
k8s-apparmor-example-deny-write (enforce)
|
||||
|
||||
Finally, we can see what happens if we try to violate the profile by writing to a file:
|
||||
|
||||
$ kubectl exec hello-apparmor touch /tmp/test
|
||||
touch: /tmp/test: Permission denied
|
||||
error: error executing remote command: command terminated with non-zero exit code: Error executing in Docker Container: 1
|
||||
|
||||
To wrap up, let's look at what happens if we try to specify a profile that hasn't been loaded:
|
||||
|
||||
$ kubectl create -f /dev/stdin <<EOF
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: hello-apparmor-2
|
||||
annotations:
|
||||
container.apparmor.security.beta.kubernetes.io/hello: localhost/k8s-apparmor-example-allow-write
|
||||
spec:
|
||||
containers:
|
||||
- name: hello
|
||||
image: busybox
|
||||
command: [ "sh", "-c", "echo 'Hello AppArmor!' && sleep 1h" ]
|
||||
EOF
|
||||
pod "hello-apparmor-2" created
|
||||
|
||||
$ kubectl describe pod hello-apparmor-2
|
||||
Name: hello-apparmor-2
|
||||
Namespace: default
|
||||
Node: gke-test-default-pool-239f5d02-x1kf/
|
||||
Start Time: Tue, 30 Aug 2016 17:58:56 -0700
|
||||
Labels: <none>
|
||||
Status: Failed
|
||||
Reason: AppArmor
|
||||
Message: Pod Cannot enforce AppArmor: profile "k8s-apparmor-example-allow-write" is not loaded
|
||||
IP:
|
||||
Controllers: <none>
|
||||
Containers:
|
||||
hello:
|
||||
Image: busybox
|
||||
Port:
|
||||
Command:
|
||||
sh
|
||||
-c
|
||||
echo 'Hello AppArmor!' && sleep 1h
|
||||
Requests:
|
||||
cpu: 100m
|
||||
Environment Variables: <none>
|
||||
Volumes:
|
||||
default-token-dnz7v:
|
||||
Type: Secret (a volume populated by a Secret)
|
||||
SecretName: default-token-dnz7v
|
||||
QoS Tier: Burstable
|
||||
Events:
|
||||
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
|
||||
--------- -------- ----- ---- ------------- -------- ------ -------
|
||||
23s 23s 1 {default-scheduler } Normal Scheduled Successfully assigned hello-apparmor-2 to e2e-test-stclair-minion-group-t1f5
|
||||
23s 23s 1 {kubelet e2e-test-stclair-minion-group-t1f5} Warning AppArmor Cannot enforce AppArmor: profile "k8s-apparmor-example-allow-write" is not loaded
|
||||
|
||||
Note the pod status is Failed, with a helpful error message: `Pod Cannot enforce AppArmor: profile
|
||||
"k8s-apparmor-example-allow-write" is not loaded`. An event was also recorded with the same message.
|
||||
|
||||
## Administration
|
||||
|
||||
### Setting up nodes with profiles
|
||||
|
||||
Kubernetes does not currently provide any native mechanisms for loading AppArmor profiles onto
|
||||
nodes. There are lots of ways to setup the profiles though, such as:
|
||||
|
||||
- Through a [DaemonSet](../daemons/) that runs a Pod on each node to
|
||||
ensure the correct profiles are loaded. An example implementation can be found
|
||||
[here](https://github.com/kubernetes/contrib/tree/master/apparmor/loader).
|
||||
- At node initialization time, using your node initialization scripts (e.g. Salt, Ansible, etc.) or
|
||||
image.
|
||||
- By copying the profiles to each node and loading them through SSH, as demonstrated in the
|
||||
[Example](#example).
|
||||
|
||||
The scheduler is not aware of which profiles are loaded onto which node, so the full set of profiles
|
||||
must be loaded onto every node. An alternative approach is to add a node label for each profile (or
|
||||
class of profiles) on the node, and use a
|
||||
[node selector](../../user-guide/node-selection/) to ensure the Pod is run on a
|
||||
node with the required profile.
|
||||
|
||||
### Restricting profiles with the PodSecurityPolicy
|
||||
|
||||
If the PodSecurityPolicy extension is enabled, cluster-wide AppArmor restrictions can be applied. To
|
||||
enable the PodSecurityPolicy, two flags must be set on the `apiserver`:
|
||||
|
||||
--admission-control=PodSecurityPolicy[,others...]
|
||||
--runtime-config=extensions/v1beta1/podsecuritypolicy[,others...]
|
||||
|
||||
With the extension enabled, the AppArmor options can be specified as annotations on the PodSecurityPolicy:
|
||||
|
||||
apparmor.security.beta.kubernetes.io/defaultProfileName: <profile_ref>
|
||||
apparmor.security.beta.kubernetes.io/allowedProfileNames: <profile_ref>[,others...]
|
||||
|
||||
The default profile name option specifies the profile to apply to containers by default when none is
|
||||
specified. The allowed profile names option specifies a list of profiles that Pod containers are
|
||||
allowed to be run with. If both options are provided, the default must be allowed. The profiles are
|
||||
specified in the same format as on containers. See the [API Reference](#api-reference) for the full
|
||||
specification.
|
||||
|
||||
### Disabling AppArmor
|
||||
|
||||
If you do not want AppArmor to be available on your cluster, it can be disabled by a command-line flag:
|
||||
|
||||
--feature-gates=AppArmor=false
|
||||
|
||||
When disabled, any Pod that includes an AppArmor profile will fail validation with a "Forbidden"
|
||||
error. Note that by default docker always enables the "docker-default" profile on non-privileged
|
||||
pods (if the AppArmor kernel module is enabled), and will continue to do so even if the feature-gate
|
||||
is disabled. The option to disable AppArmor will be removed when AppArmor graduates to general
|
||||
availability (GA).
|
||||
|
||||
### Upgrading to Kubernetes v1.4 with AppArmor
|
||||
|
||||
No action is required with respect to AppArmor to upgrade your cluster to v1.4. However, if any
|
||||
existing pods had an AppArmor annotation, they will not go through validation (or PodSecurityPolicy
|
||||
admission). If permissive profiles are loaded on the nodes, a malicious user could pre-apply a
|
||||
permissive profile to escalate the pod privileges above the docker-default. If this is a concern, it
|
||||
is recommended to scrub the cluster of any pods containing an annotation with
|
||||
`apparmor.security.beta.kubernetes.io`.
|
||||
|
||||
### Upgrade path to General Availability
|
||||
|
||||
When AppArmor is ready to be graduated to general availability (GA), the options currently specified
|
||||
through annotations will be converted to fields. Supporting all the upgrade and downgrade paths
|
||||
through the transition is very nuanced, and will be explained in detail when the transition
|
||||
occurs. We will commit to supporting both fields and annotations for at least 2 releases, and will
|
||||
explicitly reject the annotations for at least 2 releases after that.
|
||||
|
||||
## Authoring Profiles
|
||||
|
||||
Getting AppArmor profiles specified correctly can be a tricky business. Fortunately there are some
|
||||
tools to help with that:
|
||||
|
||||
- `aa-genprof` and `aa-logprof` generate profile rules by monitoring an application's activity and
|
||||
logs, and admitting the actions it takes. Further instructions are provided by the
|
||||
[AppArmor documentation](http://wiki.apparmor.net/index.php/Profiling_with_tools).
|
||||
- [bane](https://github.com/jfrazelle/bane) is an AppArmor profile generator for Docker that uses a
|
||||
simplified profile language.
|
||||
|
||||
It is recommended to run your application through Docker on a development workstation to generate
|
||||
the profiles, but there is nothing preventing running the tools on the Kubernetes node where your
|
||||
Pod is running.
|
||||
|
||||
To debug problems with AppArmor, you can check the system logs to see what, specifically, was
|
||||
denied. AppArmor logs verbose messages to `dmesg`, and errors can usually be found in the system
|
||||
logs or through `journalctl`. More information is provided in
|
||||
[AppArmor failures](http://wiki.apparmor.net/index.php/AppArmor_Failures).
|
||||
|
||||
Additional resources:
|
||||
|
||||
- [Quick guide to the AppArmor profile language](http://wiki.apparmor.net/index.php/QuickProfileLanguage)
|
||||
- [AppArmor core policy reference](http://wiki.apparmor.net/index.php/ProfileLanguage)
|
||||
|
||||
## API Reference
|
||||
|
||||
**Pod Annotation**:
|
||||
|
||||
Specifying the profile a container will run with:
|
||||
|
||||
- **key**: `container.apparmor.security.beta.kubernetes.io/<container_name>`
|
||||
Where `<container_name>` matches the name of a container in the Pod.
|
||||
A separate profile can be specified for each container in the Pod.
|
||||
- **value**: a profile reference, described below
|
||||
|
||||
**Profile Reference**:
|
||||
|
||||
- `runtime/default`: Refers to the default runtime profile.
|
||||
- Equivalent to not specifying a profile (without a PodSecurityPolicy default), except it still
|
||||
requires AppArmor to be enabled.
|
||||
- For Docker, this resolves to the
|
||||
[`docker-default`](https://docs.docker.com/engine/security/apparmor/) profile for non-privileged
|
||||
containers, and unconfined (no profile) for privileged containers.
|
||||
- `localhost/<profile_name>`: Refers to a profile loaded on the node (localhost) by name.
|
||||
- The possible profile names are detailed in the
|
||||
[core policy reference](http://wiki.apparmor.net/index.php/AppArmor_Core_Policy_Reference#Profile_names_and_attachment_specifications)
|
||||
|
||||
Any other profile reference format is invalid.
|
||||
|
||||
**PodSecurityPolicy Annotations**
|
||||
|
||||
Specifying the default profile to apply to containers when none is provided:
|
||||
|
||||
- **key**: `apparmor.security.beta.kubernetes.io/defaultProfileName`
|
||||
- **value**: a profile reference, described above
|
||||
|
||||
Specifying the list of profiles Pod containers is allowed to specify:
|
||||
|
||||
- **key**: `apparmor.security.beta.kubernetes.io/allowedProfileNames`
|
||||
- **value**: a comma-separated list of profile references (described above)
|
||||
- Although an escaped comma is a legal character in a profile name, it cannot be explicitly
|
||||
allowed here
|
||||
[AppArmor](/docs/tutorials/clusters/apparmor/)
|
||||
|
|
|
@ -5,63 +5,6 @@ assignees:
title: Audit in Kubernetes
---

* TOC
{:toc}
{% include user-guide-content-moved.md %}

Kubernetes Audit provides a security-relevant chronological set of records documenting
the sequence of activities that have affected the system, generated by individual users, administrators
or other components of the system. It allows the cluster administrator to
answer the following questions:
- what happened?
- when did it happen?
- who initiated it?
- on what did it happen?
- where was it observed?
- from where was it initiated?
- to where was it going?

NOTE: Currently, Kubernetes provides only basic audit capabilities; there is still a lot
of work going on to provide fully featured auditing capabilities (see [this issue](https://github.com/kubernetes/features/issues/22)).

Kubernetes audit is part of [kube-apiserver](/docs/admin/kube-apiserver), logging all requests
coming to the server. Each audit log contains two entries:

1. The request line containing:
   - a unique id that allows matching it with the response line (see 2)
   - source ip of the request
   - HTTP method being invoked
   - original user invoking the operation
   - impersonated user for the operation
   - namespace of the request or <none>
   - URI as requested
2. The response line containing:
   - the unique id from 1
   - response code

Example output for user `admin` asking for a list of pods:

```
2016-09-07T13:03:57.400333046Z AUDIT: id="5c3b8227-4af9-4322-8a71-542231c3887b" ip="127.0.0.1" method="GET" user="admin" as="<self>" namespace="default" uri="/api/v1/namespaces/default/pods"
2016-09-07T13:03:57.400710987Z AUDIT: id="5c3b8227-4af9-4322-8a71-542231c3887b" response="200"
```

NOTE: The audit capabilities are available *only* for the secured endpoint of the API server.

## Configuration

[Kube-apiserver](/docs/admin/kube-apiserver) provides the following options, which are responsible
for configuring where and how audit logs are handled:

- `audit-log-path` - enables the audit log, pointing to a file where the requests are being logged to.
- `audit-log-maxage` - specifies the maximum number of days to retain old audit log files based on the timestamp encoded in their filename.
- `audit-log-maxbackup` - specifies the maximum number of old audit log files to retain.
- `audit-log-maxsize` - specifies the maximum size in megabytes of the audit log file before it gets rotated. Defaults to 100MB.

If an audit log file already exists, Kubernetes appends new audit logs to that file.
Otherwise, Kubernetes creates an audit log file at the location you specified in
`audit-log-path`. If the audit log file exceeds the size you specify in `audit-log-maxsize`,
Kubernetes will rename the current log file by appending the current timestamp on
the file name (before the file extension) and create a new audit log file.
Kubernetes may delete old log files when creating a new log file; you can configure
how many files are retained and how old they can be by specifying the `audit-log-maxbackup`
and `audit-log-maxage` options.
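As a rough sketch of how these options might be wired up on a cluster where the API server runs as a static pod, the fragment below passes all four flags; the manifest location, image tag, mount paths, and the chosen flag values are assumptions for illustration only:

```yaml
# Hypothetical static pod fragment (e.g. /etc/kubernetes/manifests/kube-apiserver.yaml);
# only the audit-related flags are the point here, everything else is an assumption.
apiVersion: v1
kind: Pod
metadata:
  name: kube-apiserver
  namespace: kube-system
spec:
  hostNetwork: true
  containers:
  - name: kube-apiserver
    image: gcr.io/google_containers/kube-apiserver:v1.5.3   # illustrative image tag
    command:
    - kube-apiserver
    - --audit-log-path=/var/log/kube-apiserver-audit.log    # enable audit logging to this file
    - --audit-log-maxage=30                                  # keep rotated files for 30 days
    - --audit-log-maxbackup=10                               # keep at most 10 rotated files
    - --audit-log-maxsize=100                                # rotate after 100 MB
    volumeMounts:
    - name: logs
      mountPath: /var/log
  volumes:
  - name: logs
    hostPath:
      path: /var/log
```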
[Auditing](/docs/concepts/cluster-administration/audit/)
@ -85,9 +85,9 @@ See [APPENDIX](#appendix) for how to generate a client cert.
The API server reads bearer tokens from a file when given the `--token-auth-file=SOMEFILE` option on the command line. Currently, tokens last indefinitely, and the token list cannot be
changed without restarting the API server.

The token file format is implemented in `plugin/pkg/auth/authenticator/token/tokenfile/...`
and is a csv file with a minimum of 3 columns: token, user name, user uid, followed by
optional group names. Note, if you have more than one group the column must be double quoted e.g.
The token file is a csv file with a minimum of 3 columns: token, user name, user uid,
followed by optional group names. Note, if you have more than one group the column must be
double quoted e.g.

```conf
token,user,uid,"group1,group2,group3"
@ -115,9 +115,9 @@ and the password cannot be changed without restarting API server. Note that basi
authentication is currently supported for convenience while we finish making the
more secure modes described above easier to use.

The basic auth file format is implemented in `plugin/pkg/auth/authenticator/password/passwordfile/...`
and is a csv file with a minimum of 3 columns: password, user name, user id, followed by
optional group names. Note, if you have more than one group the column must be double quoted e.g.
The basic auth file is a csv file with a minimum of 3 columns: password,
user name, user id, followed by optional group names. Note, if you have more than
one group the column must be double quoted e.g.

```conf
password,user,uid,"group1,group2,group3"
@ -346,7 +346,7 @@ Webhook authentication is a hook for verifying bearer tokens.
* `--authentication-token-webhook-config-file` a kubeconfig file describing how to access the remote webhook service.
* `--authentication-token-webhook-cache-ttl` how long to cache authentication decisions. Defaults to two minutes.

The configuration file uses the [kubeconfig](/docs/user-guide/kubeconfig-file/)
The configuration file uses the [kubeconfig](/docs/concepts/cluster-administration/authenticate-across-clusters-kubeconfig/)
file format. Within the file "users" refers to the API server webhook and
"clusters" refers to the remote service. An example would be:
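The diff cuts the example off at this point. A minimal sketch of what such a kubeconfig-format file can look like follows; the server URL, certificate paths, and entry names are assumptions:

```yaml
# Hypothetical webhook kubeconfig; URL, file paths, and names are assumptions.
apiVersion: v1
kind: Config
clusters:
  - name: remote-authn-service
    cluster:
      certificate-authority: /path/to/ca.pem          # CA used to validate the remote service
      server: https://authn.example.com/authenticate  # URL of the remote webhook endpoint
users:
  - name: api-server
    user:
      client-certificate: /path/to/cert.pem           # cert the API server presents to the webhook
      client-key: /path/to/key.pem
contexts:
  - name: webhook
    context:
      cluster: remote-authn-service
      user: api-server
current-context: webhook
```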
@ -541,8 +541,8 @@ Finally, add the following parameters into API server start parameters:
1.  Generate server certificate and key.
    (build-server-full [filename]: Generate a keypair and sign locally for a client or server)

        ./easyrsa --subject-alt-name="IP:${MASTER_IP}" build-server-full kubernetes-master nopass
1.  Copy `pki/ca.crt`, `pki/issued/kubernetes-master.crt`, and `pki/private/kubernetes-master.key` to your directory.
        ./easyrsa --subject-alt-name="IP:${MASTER_IP}" build-server-full server nopass
1.  Copy `pki/ca.crt`, `pki/issued/server.crt`, and `pki/private/server.key` to your directory.
1.  Fill in and add the following parameters into the API server start parameters:

        --client-ca-file=/yourdirectory/ca.crt
@ -87,15 +87,25 @@ properties:
- Subject-matching properties:
  - `user`, type string; the user-string from `--token-auth-file`. If you specify `user`, it must match the username of the authenticated user.
  - `group`, type string; if you specify `group`, it must match one of the groups of the authenticated user. `system:authenticated` matches all authenticated requests. `system:unauthenticated` matches all unauthenticated requests.
  - `readonly`, type boolean, when true, means that the policy only applies to get, list, and watch operations.
- Resource-matching properties:
  - `apiGroup`, type string; an API group, such as `extensions`. `*` matches all API groups.
  - `namespace`, type string; a namespace string. `*` matches all resource requests.
  - `resource`, type string; a resource, such as `pods`. `*` matches all resource requests.
  - `apiGroup`, type string; an API group.
    - Ex: `extensions`
    - Wildcard: `*` matches all API groups.
  - `namespace`, type string; a namespace.
    - Ex: `kube-system`
    - Wildcard: `*` matches all resource requests.
  - `resource`, type string; a resource type
    - Ex: `pods`
    - Wildcard: `*` matches all resource requests.
- Non-resource-matching properties:
  - `nonResourcePath`, type string; matches the non-resource request paths (like `/version` and `/apis`). `*` matches all non-resource requests. `/foo/*` matches `/foo/` and all of its subpaths.
  - `nonResourcePath`, type string; non-resource request paths.
    - Ex: `/version` or `/apis`
    - Wildcard:
      - `*` matches all non-resource requests.
      - `/foo/*` matches `/foo/` and all of its subpaths.
  - `readonly`, type boolean, when true, means that the policy only applies to get, list, and watch operations.

An unset property is the same as a property set to the zero value for its type
**NOTES:** An unset property is the same as a property set to the zero value for its type
(e.g. empty string, 0, false). However, unset should be preferred for
readability.
@ -221,20 +231,20 @@ don't already have even when the RBAC authorizer it disabled__. If "user-1"
does not have the ability to read secrets in "namespace-a", they cannot create
a binding that would grant that permission to themselves or any other user.

For bootstrapping the first roles, it becomes necessary for someone to get
around these limitations. For the alpha release of RBAC, an API Server flag was
added to allow one user to step around all RBAC authorization and privilege
escalation checks. NOTE: _This is subject to change with future releases._
When bootstrapping, superuser credentials should include the `system:masters`
group, for example by creating a client cert with `/O=system:masters`. This
gives those credentials full access to the API and allows an admin to then set
up bindings for other users.

In Kubernetes versions 1.4 and 1.5, there was a similar flag that gave a user
full access:

```
--authorization-rbac-super-user=admin
```

Once set, the specified super user, in this case "admin", can be used to create
the roles and role bindings to initialize the system.

This flag is optional and once the initial bootstrapping is performed can be
unset.
__This flag will be removed in 1.6__. Admins should prefer the `system:masters`
group when setting up clusters.
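For illustration only, the first object such a bootstrap super user might create is a binding that hands full access to another identity; the binding name and the choice of the built-in `cluster-admin` role below are assumptions, and the RBAC apiVersion varies by release (for example `rbac.authorization.k8s.io/v1alpha1` on 1.5 clusters):

```yaml
# Hypothetical bootstrap binding created by the super user; names are assumptions.
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: admin-cluster-admin
subjects:
- kind: User
  name: admin                      # the "admin" super user from the flag example above
  apiGroup: rbac.authorization.k8s.io
roleRef:
  kind: ClusterRole
  name: cluster-admin              # built-in role granting full access
  apiGroup: rbac.authorization.k8s.io
```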
### Roles, RoleBindings, ClusterRoles, and ClusterRoleBindings
@ -445,6 +455,7 @@ subjects:
```

For all authenticated users:

```yaml
subjects:
- kind: Group
@ -452,6 +463,7 @@ subjects:
```

For all unauthenticated users:

```yaml
subjects:
- kind: Group
@ -459,6 +471,7 @@ subjects:
```

For all users:

```yaml
subjects:
- kind: Group
@ -477,7 +490,7 @@ service when determining user privileges.
Mode `Webhook` requires a file for HTTP configuration, specified by the
`--authorization-webhook-config-file=SOME_FILENAME` flag.

The configuration file uses the [kubeconfig](/docs/user-guide/kubeconfig-file/)
The configuration file uses the [kubeconfig](/docs/concepts/cluster-administration/authenticate-across-clusters-kubeconfig/)
file format. Within the file "users" refers to the API Server webhook and
"clusters" refers to the remote service.
@ -4,133 +4,6 @@ assignees:
title: Kubernetes Components
---

This document outlines the various binary components that need to run to
deliver a functioning Kubernetes cluster.
{% include user-guide-content-moved.md %}

## Master Components

Master components are those that provide the cluster's control plane. For
example, master components are responsible for making global decisions about the
cluster (e.g., scheduling), and detecting and responding to cluster events
(e.g., starting up a new pod when a replication controller's 'replicas' field is
unsatisfied).

Master components could in theory be run on any node in the cluster. However,
for simplicity, current setup scripts typically start all master components on
the same VM, and do not run user containers on this VM. See
[high-availability.md](/docs/admin/high-availability) for an example multi-master-VM setup.

Even in the future, when Kubernetes is fully self-hosting, it will probably be
wise to only allow master components to schedule on a subset of nodes, to limit
co-running with user-run pods, reducing the possible scope of a
node-compromising security exploit.

### kube-apiserver

[kube-apiserver](/docs/admin/kube-apiserver) exposes the Kubernetes API; it is the front-end for the
Kubernetes control plane. It is designed to scale horizontally (i.e., one scales
it by running more of them; see [high-availability.md](/docs/admin/high-availability)).

### etcd

[etcd](/docs/admin/etcd) is used as Kubernetes' backing store. All cluster data is stored here.
Proper administration of a Kubernetes cluster includes a backup plan for etcd's
data.

### kube-controller-manager

[kube-controller-manager](/docs/admin/kube-controller-manager) is a binary that runs controllers, which are the
background threads that handle routine tasks in the cluster. Logically, each
controller is a separate process, but to reduce the number of moving pieces in
the system, they are all compiled into a single binary and run in a single
process.

These controllers include:

* Node Controller: Responsible for noticing & responding when nodes go down.
* Replication Controller: Responsible for maintaining the correct number of pods for every replication
  controller object in the system.
* Endpoints Controller: Populates the Endpoints object (i.e., joins Services & Pods).
* Service Account & Token Controllers: Create default accounts and API access tokens for new namespaces.
* ... and others.

### kube-scheduler

[kube-scheduler](/docs/admin/kube-scheduler) watches newly created pods that have no node assigned, and
selects a node for them to run on.

### addons

Addons are pods and services that implement cluster features. The pods may be managed
by Deployments, ReplicationControllers, etc. Namespaced addon objects are created in
the "kube-system" namespace.

Addon manager takes the responsibility for creating and maintaining addon resources.
See [here](http://releases.k8s.io/HEAD/cluster/addons) for more details.

#### DNS

While the other addons are not strictly required, all Kubernetes
clusters should have [cluster DNS](/docs/admin/dns/), as many examples rely on it.

Cluster DNS is a DNS server, in addition to the other DNS server(s) in your
environment, which serves DNS records for Kubernetes services.

Containers started by Kubernetes automatically include this DNS server
in their DNS searches.

#### User interface

The kube-ui provides a read-only overview of the cluster state. Access
[the UI using kubectl proxy](/docs/user-guide/connecting-to-applications-proxy/#connecting-to-the-kube-ui-service-from-your-local-workstation).

#### Container Resource Monitoring

[Container Resource Monitoring](/docs/user-guide/monitoring) records generic time-series metrics
about containers in a central database, and provides a UI for browsing that data.

#### Cluster-level Logging

A [Cluster-level logging](/docs/user-guide/logging/overview) mechanism is responsible for
saving container logs to a central log store with a search/browsing interface.

## Node components

Node components run on every node, maintaining running pods and providing them
the Kubernetes runtime environment.

### kubelet

[kubelet](/docs/admin/kubelet) is the primary node agent. It:

* Watches for pods that have been assigned to its node (either by apiserver
  or via local configuration file) and:
  * Mounts the pod's required volumes
  * Downloads the pod's secrets
  * Runs the pod's containers via docker (or, experimentally, rkt).
  * Periodically executes any requested container liveness probes.
  * Reports the status of the pod back to the rest of the system, by creating a
    "mirror pod" if necessary.
* Reports the status of the node back to the rest of the system.

### kube-proxy

[kube-proxy](/docs/admin/kube-proxy) enables the Kubernetes service abstraction by maintaining
network rules on the host and performing connection forwarding.

### docker

`docker` is of course used for actually running containers.

### rkt

`rkt` is supported experimentally as an alternative to docker.

### supervisord

`supervisord` is a lightweight process babysitting system for keeping kubelet and docker
running.

### fluentd

`fluentd` is a daemon which helps provide [cluster-level logging](#cluster-level-logging).
[Kubernetes Components](/docs/concepts/overview/components/)
@ -19,7 +19,9 @@ To install Kubernetes on a set of machines, consult one of the existing [Getting

## Upgrading a cluster

The current state of cluster upgrades is provider dependent.
The current state of cluster upgrades is provider dependent, and some releases may require special care when upgrading. It is recommended that administrators consult both the [release notes](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG.md) and the version-specific upgrade notes prior to upgrading their clusters.

* [Upgrading to 1.6](/docs/admin/upgrade)

### Upgrading Google Compute Engine clusters

@ -56,8 +58,12 @@ The node upgrade process is user-initiated and is described in the [GKE document

### Upgrading clusters on other platforms

The `cluster/kube-push.sh` script will do a rudimentary update. This process is still quite experimental; we
recommend testing the upgrade on an experimental cluster before performing the update on a production cluster.
Different providers and tools will manage upgrades differently. It is recommended that you consult their main documentation regarding upgrades.

* [kops](https://github.com/kubernetes/kops)
* [kargo](https://github.com/kubernetes-incubator/kargo)
* [CoreOS Tectonic](https://coreos.com/tectonic/docs/latest/admin/upgrade.html)
* ...

## Resizing a cluster
@ -92,7 +98,7 @@ an extended period of time (10min but it may change in the future).
Cluster autoscaler is configured per instance group (GCE) or node pool (GKE).

If you are using GCE, you can enable it while creating a cluster with the kube-up.sh script.
To configure cluster autoscaler you have to set 3 environment variables:
To configure cluster autoscaler you have to set three environment variables:

* `KUBE_ENABLE_CLUSTER_AUTOSCALER` - it enables cluster autoscaler if set to true.
* `KUBE_AUTOSCALER_MIN_NODES` - minimum number of nodes in the cluster.
@ -51,7 +51,7 @@ A pod template in a DaemonSet must have a [`RestartPolicy`](/docs/user-guide/pod
### Pod Selector

The `.spec.selector` field is a pod selector. It works the same as the `.spec.selector` of
a [Job](/docs/user-guide/jobs/) or other new resources.
a [Job](/docs/concepts/jobs/run-to-completion-finite-workloads/) or other new resources.

The `spec.selector` is an object consisting of two fields:
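The diff truncates the list of the two fields at this point. As a sketch of how a selector typically combines them in practice (the label keys, values, and the surrounding DaemonSet are assumptions):

```yaml
# Hypothetical DaemonSet fragment; labels and names are assumptions.
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: node-agent
spec:
  selector:
    matchLabels:                 # exact key/value matches
      app: node-agent
    matchExpressions:            # set-based requirements
    - key: tier
      operator: In
      values: ["monitoring"]
  template:
    metadata:
      labels:                    # must satisfy the selector above
        app: node-agent
        tier: monitoring
    spec:
      containers:
      - name: agent
        image: busybox           # placeholder image
        command: ["sleep", "3600"]
```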
@ -3,93 +3,7 @@ assignees:
- davidopp
title: Pod Disruption Budget
---
This guide is for anyone wishing to specify safety constraints on pods or anyone
wishing to write software (typically automation software) that respects those
constraints.

* TOC
{:toc}
{% include user-guide-content-moved.md %}

## Rationale

Various cluster management operations may voluntarily evict pods. "Voluntary"
means an eviction can be safely delayed for a reasonable period of time. The
principal examples today are draining a node for maintenance or upgrade
(`kubectl drain`), and cluster autoscaling down. In the future the
[rescheduler](https://github.com/kubernetes/kubernetes/blob/master/docs/proposals/rescheduling.md)
may also perform voluntary evictions. By contrast, something like evicting pods
because a node has become unreachable or reports `NotReady`, is not "voluntary."

For voluntary evictions, it can be useful for applications to be able to limit
the number of pods that are down simultaneously. For example, a quorum-based application would
like to ensure that the number of replicas running is never brought below the
number needed for a quorum, even temporarily. Or a web front end might want to
ensure that the number of replicas serving load never falls below a certain
percentage of the total, even briefly. `PodDisruptionBudget` is an API object
that specifies the minimum number or percentage of replicas of a collection that
must be up at a time. Components that wish to evict a pod subject to disruption
budget use the `/eviction` subresource; unlike a regular pod deletion, this
operation may be rejected by the API server if the eviction would cause a
disruption budget to be violated.

## Specifying a PodDisruptionBudget

A `PodDisruptionBudget` has two components: a label selector `selector` to specify the set of
pods to which it applies, and `minAvailable` which is a description of the number of pods from that
set that must still be available after the eviction, i.e. even in the absence
of the evicted pod. `minAvailable` can be either an absolute number or a percentage.
So for example, 100% means no voluntary evictions from the set are permitted. In
typical usage, a single budget would be used for a collection of pods managed by
a controller; for example, the pods in a single ReplicaSet.
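To make the two components concrete, a minimal sketch of such an object follows; the name, label, and `minAvailable` value are assumptions:

```yaml
# Hypothetical PodDisruptionBudget; name, selector, and minAvailable are assumptions.
apiVersion: policy/v1beta1
kind: PodDisruptionBudget
metadata:
  name: zookeeper-pdb
spec:
  minAvailable: 2            # an absolute number; a percentage such as "60%" also works
  selector:
    matchLabels:
      app: zookeeper         # the set of pods the budget applies to
```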

Note that a disruption budget does not truly guarantee that the specified
number/percentage of pods will always be up. For example, a node that hosts a
pod from the collection may fail when the collection is at the minimum size
specified in the budget, thus bringing the number of available pods from the
collection below the specified size. The budget can only protect against
voluntary evictions, not all causes of unavailability.

## Requesting an eviction

If you are writing infrastructure software that wants to produce these voluntary
evictions, you will need to use the eviction API. The eviction subresource of a
pod can be thought of as a kind of policy-controlled DELETE operation on the pod
itself. To attempt an eviction (perhaps more REST-precisely, to attempt to
*create* an eviction), you POST an attempted operation. Here's an example:

```json
{
  "apiVersion": "policy/v1beta1",
  "kind": "Eviction",
  "metadata": {
    "name": "quux",
    "namespace": "default"
  }
}
```

You can attempt an eviction using `curl`:

```bash
$ curl -v -H 'Content-type: application/json' http://127.0.0.1:8080/api/v1/namespaces/default/pods/quux/eviction -d @eviction.json
```

The API can respond in one of three ways.

1. If the eviction is granted, then the pod is deleted just as if you had sent
   a `DELETE` request to the pod's URL and you get back `200 OK`.
2. If the current state of affairs wouldn't allow an eviction by the rules set
   forth in the budget, you get back `429 Too Many Requests`. This is
   typically used for generic rate limiting of *any* requests, but here we mean
   that this request isn't allowed *right now* but it may be allowed later.
   Currently, callers do not get any `Retry-After` advice, but they may in
   future versions.
3. If there is some kind of misconfiguration, like multiple budgets pointing at
   the same pod, you will get `500 Internal Server Error`.

For a given eviction request, there are two cases.

1. There is no budget that matches this pod. In this case, the server always
   returns `200 OK`.
2. There is at least one budget. In this case, any of the three above responses may
   apply.
[Configuring a Pod Disruption Budget](/docs/tasks/configure-pod-container/configure-pod-disruption-budget/)
@ -5,385 +5,6 @@ assignees:
title: Using DNS Pods and Services
---

## Introduction
{% include user-guide-content-moved.md %}

As of Kubernetes 1.3, DNS is a built-in service launched automatically using the addon manager [cluster add-on](http://releases.k8s.io/{{page.githubbranch}}/cluster/addons/README.md).

Kubernetes DNS schedules a DNS Pod and Service on the cluster, and configures
the kubelets to tell individual containers to use the DNS Service's IP to
resolve DNS names.

## What things get DNS names?

Every Service defined in the cluster (including the DNS server itself) is
assigned a DNS name. By default, a client Pod's DNS search list will
include the Pod's own namespace and the cluster's default domain. This is best
illustrated by example:

Assume a Service named `foo` in the Kubernetes namespace `bar`. A Pod running
in namespace `bar` can look up this service by simply doing a DNS query for
`foo`. A Pod running in namespace `quux` can look up this service by doing a
DNS query for `foo.bar`.

## Supported DNS schema

The following sections detail the supported record types and layout. Any
other layout, names, or queries that happen to work are
considered implementation details and are subject to change without warning.

### Services

#### A records

"Normal" (not headless) Services are assigned a DNS A record for a name of the
form `my-svc.my-namespace.svc.cluster.local`. This resolves to the cluster IP
of the Service.

"Headless" (without a cluster IP) Services are also assigned a DNS A record for
a name of the form `my-svc.my-namespace.svc.cluster.local`. Unlike normal
Services, this resolves to the set of IPs of the pods selected by the Service.
Clients are expected to consume the set or else use standard round-robin
selection from the set.
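For reference, a minimal sketch of a "normal" Service that would receive such an A record; the name, namespace, selector, and ports are assumptions:

```yaml
# Hypothetical Service; resolvable as my-svc.my-namespace.svc.cluster.local
apiVersion: v1
kind: Service
metadata:
  name: my-svc
  namespace: my-namespace
spec:
  selector:
    app: my-app          # pods backing the service
  ports:
  - name: http           # a named port also yields an SRV record (see the next section)
    port: 80
    targetPort: 8080
```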

### SRV records

SRV Records are created for named ports that are part of normal or [Headless
Services](http://releases.k8s.io/docs/user-guide/services/#headless-services).
For each named port, the SRV record would have the form
`_my-port-name._my-port-protocol.my-svc.my-namespace.svc.cluster.local`.
For a regular service, this resolves to the port number and the CNAME:
`my-svc.my-namespace.svc.cluster.local`.
For a headless service, this resolves to multiple answers, one for each pod
that is backing the service, and contains the port number and a CNAME of the pod
of the form `auto-generated-name.my-svc.my-namespace.svc.cluster.local`.

### Backwards compatibility

Previous versions of kube-dns made names of the form
`my-svc.my-namespace.cluster.local` (the 'svc' level was added later). This
is no longer supported.

### Pods

#### A Records

When enabled, pods are assigned a DNS A record in the form of `pod-ip-address.my-namespace.pod.cluster.local`.

For example, a pod with IP `1.2.3.4` in the namespace `default` with a DNS name of `cluster.local` would have an entry: `1-2-3-4.default.pod.cluster.local`.

#### A Records and hostname based on Pod's hostname and subdomain fields

Currently when a pod is created, its hostname is the Pod's `metadata.name` value.

With v1.2, users can specify a Pod annotation, `pod.beta.kubernetes.io/hostname`, to specify what the Pod's hostname should be.
The Pod annotation, if specified, takes precedence over the Pod's name, to be the hostname of the pod.
For example, given a Pod with annotation `pod.beta.kubernetes.io/hostname: my-pod-name`, the Pod will have its hostname set to "my-pod-name".

With v1.3, the PodSpec has a `hostname` field, which can be used to specify the Pod's hostname. This field value takes precedence over the
`pod.beta.kubernetes.io/hostname` annotation value.

v1.2 introduces a beta feature where the user can specify a Pod annotation, `pod.beta.kubernetes.io/subdomain`, to specify the Pod's subdomain.
The final domain will be "<hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>".
For example, a Pod with the hostname annotation set to "foo", and the subdomain annotation set to "bar", in namespace "my-namespace", will have the FQDN "foo.bar.my-namespace.svc.cluster.local"

With v1.3, the PodSpec has a `subdomain` field, which can be used to specify the Pod's subdomain. This field value takes precedence over the
`pod.beta.kubernetes.io/subdomain` annotation value.

Example:

```yaml
apiVersion: v1
kind: Service
metadata:
  name: default-subdomain
spec:
  selector:
    name: busybox
  clusterIP: None
  ports:
  - name: foo # Actually, no port is needed.
    port: 1234
    targetPort: 1234
---
apiVersion: v1
kind: Pod
metadata:
  name: busybox1
  labels:
    name: busybox
spec:
  hostname: busybox-1
  subdomain: default-subdomain
  containers:
  - image: busybox
    command:
      - sleep
      - "3600"
    name: busybox
---
apiVersion: v1
kind: Pod
metadata:
  name: busybox2
  labels:
    name: busybox
spec:
  hostname: busybox-2
  subdomain: default-subdomain
  containers:
  - image: busybox
    command:
      - sleep
      - "3600"
    name: busybox
```

If there exists a headless service in the same namespace as the pod and with the same name as the subdomain, the cluster's KubeDNS Server also returns an A record for the Pod's fully qualified hostname.
Given a Pod with the hostname set to "busybox-1" and the subdomain set to "default-subdomain", and a headless Service named "default-subdomain" in the same namespace, the pod will see its own FQDN as "busybox-1.default-subdomain.my-namespace.svc.cluster.local". DNS serves an A record at that name, pointing to the Pod's IP. Both pods "busybox1" and "busybox2" can have their distinct A records.

As of Kubernetes v1.2, the Endpoints object also has the annotation `endpoints.beta.kubernetes.io/hostnames-map`. Its value is the json representation of map[string(IP)][endpoints.HostRecord], for example: '{"10.245.1.6":{HostName: "my-webserver"}}'.
If the Endpoints are for a headless service, an A record is created with the format <hostname>.<service name>.<pod namespace>.svc.<cluster domain>
For the example json, if endpoints are for a headless service named "bar", and one of the endpoints has IP "10.245.1.6", an A record is created with the name "my-webserver.bar.my-namespace.svc.cluster.local" and the A record lookup would return "10.245.1.6".
This endpoints annotation generally does not need to be specified by end-users, but can be used by the internal service controller to deliver the aforementioned feature.

With v1.3, the Endpoints object can specify the `hostname` for any endpoint, along with its IP. The hostname field takes precedence over the hostname value
that might have been specified via the `endpoints.beta.kubernetes.io/hostnames-map` annotation.
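As a sketch of where that field sits on an Endpoints object, the fragment below sets a per-address hostname; the service name, namespace, IPs, and port are assumptions, and manually managed Endpoints like this are only needed for services without a selector:

```yaml
# Hypothetical Endpoints object for a headless Service named "bar"; values are assumptions.
apiVersion: v1
kind: Endpoints
metadata:
  name: bar
  namespace: my-namespace
subsets:
- addresses:
  - ip: 10.245.1.6
    hostname: my-webserver   # yields my-webserver.bar.my-namespace.svc.cluster.local
  ports:
  - port: 80
```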

With v1.3, the following annotations are deprecated: `pod.beta.kubernetes.io/hostname`, `pod.beta.kubernetes.io/subdomain`, `endpoints.beta.kubernetes.io/hostnames-map`

## How do I test if it is working?

### Create a simple Pod to use as a test environment

Create a file named busybox.yaml with the
following contents:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: busybox
  namespace: default
spec:
  containers:
  - image: busybox
    command:
      - sleep
      - "3600"
    imagePullPolicy: IfNotPresent
    name: busybox
  restartPolicy: Always
```

Then create a pod using this file:

```
kubectl create -f busybox.yaml
```

### Wait for this pod to go into the running state

You can get its status with:
```
kubectl get pods busybox
```

You should see:

```
NAME      READY     STATUS    RESTARTS   AGE
busybox   1/1       Running   0          <some-time>
```

### Validate that DNS is working

Once that pod is running, you can exec nslookup in that environment:

```
kubectl exec -ti busybox -- nslookup kubernetes.default
```

You should see something like:

```
Server:    10.0.0.10
Address 1: 10.0.0.10

Name:      kubernetes.default
Address 1: 10.0.0.1
```

If you see that, DNS is working correctly.

### Troubleshooting Tips

If the nslookup command fails, check the following:

#### Check the local DNS configuration first
Take a look inside the resolv.conf file. (See "Inheriting DNS from the node" and "Known issues" below for more information.)

```
kubectl exec busybox cat /etc/resolv.conf
```

Verify that the search path and name server are set up like the following (note that the search path may vary for different cloud providers):

```
search default.svc.cluster.local svc.cluster.local cluster.local google.internal c.gce_project_id.internal
nameserver 10.0.0.10
options ndots:5
```

#### Quick diagnosis

Errors such as the following indicate a problem with the kube-dns add-on or associated Services:

```
$ kubectl exec -ti busybox -- nslookup kubernetes.default
Server:    10.0.0.10
Address 1: 10.0.0.10

nslookup: can't resolve 'kubernetes.default'
```

or

```
$ kubectl exec -ti busybox -- nslookup kubernetes.default
Server:    10.0.0.10
Address 1: 10.0.0.10 kube-dns.kube-system.svc.cluster.local

nslookup: can't resolve 'kubernetes.default'
```

#### Check if the DNS pod is running

Use the kubectl get pods command to verify that the DNS pod is running.

```
kubectl get pods --namespace=kube-system -l k8s-app=kube-dns
```

You should see something like:

```
NAME                    READY     STATUS    RESTARTS   AGE
...
kube-dns-v19-ezo1y      3/3       Running   0           1h
...
```

If you see that no pod is running or that the pod has failed/completed, the DNS add-on may not be deployed by default in your current environment and you will have to deploy it manually.

#### Check for Errors in the DNS pod

Use the `kubectl logs` command to see logs for the DNS daemons.

```
kubectl logs --namespace=kube-system $(kubectl get pods --namespace=kube-system -l k8s-app=kube-dns -o name) -c kubedns
kubectl logs --namespace=kube-system $(kubectl get pods --namespace=kube-system -l k8s-app=kube-dns -o name) -c dnsmasq
kubectl logs --namespace=kube-system $(kubectl get pods --namespace=kube-system -l k8s-app=kube-dns -o name) -c healthz
```

See if there are any suspicious log entries. The letters W, E, and F at the beginning of a line indicate Warning, Error, and Failure, respectively. Please search for entries with these logging levels and use [kubernetes issues](https://github.com/kubernetes/kubernetes/issues) to report unexpected errors.
#### Is DNS service up?

Verify that the DNS service is up by using the `kubectl get service` command.

```
kubectl get svc --namespace=kube-system
```

You should see:

```
NAME          CLUSTER-IP     EXTERNAL-IP   PORT(S)         AGE
...
kube-dns      10.0.0.10      <none>        53/UDP,53/TCP   1h
...
```

If you have created the service, or if it should have been created by default but does not appear, see this [debugging services page](http://kubernetes.io/docs/user-guide/debugging-services/) for more information.

#### Are DNS endpoints exposed?

You can verify that DNS endpoints are exposed by using the `kubectl get endpoints` command.

```
kubectl get ep kube-dns --namespace=kube-system
```

You should see something like:
```
NAME       ENDPOINTS                       AGE
kube-dns   10.180.3.17:53,10.180.3.17:53    1h
```

If you do not see the endpoints, see the endpoints section in the [debugging services documentation](http://kubernetes.io/docs/user-guide/debugging-services/).

For additional Kubernetes DNS examples, see the [cluster-dns examples](https://github.com/kubernetes/kubernetes/tree/master/examples/cluster-dns) in the Kubernetes GitHub repository.

## Kubernetes Federation (Multiple Zone support)

Release 1.3 introduced Cluster Federation support for multi-site
Kubernetes installations. This required some minor
(backward-compatible) changes to the way
the Kubernetes cluster DNS server processes DNS queries, to facilitate
the lookup of federated services (which span multiple Kubernetes clusters).
See the [Cluster Federation Administrators' Guide](/docs/admin/federation) for more
details on Cluster Federation and multi-site support.

## How it Works

The running Kubernetes DNS pod holds 3 containers: kubedns, dnsmasq and a health check called healthz.
The kubedns process watches the Kubernetes master for changes in Services and Endpoints, and maintains
in-memory lookup structures to service DNS requests. The dnsmasq container adds DNS caching to improve
performance. The healthz container provides a single health check endpoint while performing dual healthchecks
(for dnsmasq and kubedns).

The DNS pod is exposed as a Kubernetes Service with a static IP. Once assigned, the
kubelet passes the DNS server to each container via the `--cluster-dns=10.0.0.10`
flag.

DNS names also need domains. The local domain is configurable in the kubelet using
the flag `--cluster-domain=<default local domain>`.

The Kubernetes cluster DNS server (based on the [SkyDNS](https://github.com/skynetservices/skydns) library)
supports forward lookups (A records), service lookups (SRV records) and reverse IP address lookups (PTR records).

## Inheriting DNS from the node
When running a pod, kubelet will prepend the cluster DNS server and search
paths to the node's own DNS settings. If the node is able to resolve DNS names
specific to the larger environment, pods should be able to, also. See "Known
issues" below for a caveat.

If you don't want this, or if you want a different DNS config for pods, you can
use the kubelet's `--resolv-conf` flag. Setting it to "" means that pods will
not inherit DNS. Setting it to a valid file path means that kubelet will use
this file instead of `/etc/resolv.conf` for DNS inheritance.

## Known issues
Kubernetes installs do not configure the nodes' resolv.conf files to use the
cluster DNS by default, because that process is inherently distro-specific.
This should probably be implemented eventually.

Linux's libc is impossibly stuck ([see this bug from
2005](https://bugzilla.redhat.com/show_bug.cgi?id=168253)) with limits of just
3 DNS `nameserver` records and 6 DNS `search` records. Kubernetes needs to
consume 1 `nameserver` record and 3 `search` records. This means that if a
local installation already uses 3 `nameserver`s or uses more than 3 `search`es,
some of those settings will be lost. As a partial workaround, the node can run
`dnsmasq` which will provide more `nameserver` entries, but not more `search`
entries. You can also use kubelet's `--resolv-conf` flag.

If you are using Alpine version 3.3 or earlier as your base image, DNS may not
work properly owing to a known issue with Alpine. Check [here](https://github.com/kubernetes/kubernetes/issues/30215)
for more information.

## References

- [Docs for the DNS cluster addon](http://releases.k8s.io/{{page.githubbranch}}/cluster/addons/dns/README.md)

## What's next
- [Autoscaling the DNS Service in a Cluster](/docs/tasks/administer-cluster/dns-horizontal-autoscaling/).
[DNS Pods and Services](/docs/concepts/services-networking/dns-pod-service/)
@ -20,7 +20,7 @@ Data Reliability: for reasonable safety, either etcd needs to be run as a
etcd) or etcd's data directory should be located on durable storage (e.g., GCE's
persistent disk). In either case, if high availability is required--as it might
be in a production cluster--the data directory ought to be [backed up
periodically](https://coreos.com/etcd/docs/2.2.1/admin_guide.html#disaster-recovery),
periodically](https://coreos.com/etcd/docs/latest/op-guide/recovery.html),
to reduce downtime in case of corruption.

## Default configuration
@ -1,5 +1,5 @@
---
title: federation-controller-mananger
title: federation-controller-manager
notitle: true
---
@ -4,205 +4,6 @@ assignees:
title: Setting up Cluster Federation with Kubefed
---

* TOC
{:toc}
{% include user-guide-content-moved.md %}

Kubernetes version 1.5 includes a new command line tool called
`kubefed` to help you administrate your federated clusters.
`kubefed` helps you to deploy a new Kubernetes cluster federation
control plane, and to add clusters to or remove clusters from an
existing federation control plane.

This guide explains how to administer a Kubernetes Cluster Federation
using `kubefed`.

> Note: `kubefed` is an alpha feature in Kubernetes 1.5.

## Prerequisites

This guide assumes that you have a running Kubernetes cluster. Please
see one of the [getting started](/docs/getting-started-guides/) guides
for installation instructions for your platform.

## Getting `kubefed`

Download the client tarball corresponding to Kubernetes version 1.5
or later
[from the release page](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG.md),
extract the binaries in the tarball to one of the directories
in your `$PATH` and set the executable permission on those binaries.

Note: The URL in the curl command below downloads the binaries for
Linux amd64. If you are on a different platform, please use the URL
for the binaries appropriate for your platform. You can find the list
of available binaries on the [release page](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG.md#client-binaries-3).

```shell
curl -O https://storage.googleapis.com/kubernetes-release/release/v1.5.0/kubernetes-client-linux-amd64.tar.gz
tar -xzvf kubernetes-client-linux-amd64.tar.gz
sudo cp kubernetes/client/bin/kubefed /usr/local/bin
sudo chmod +x /usr/local/bin/kubefed
sudo cp kubernetes/client/bin/kubectl /usr/local/bin
sudo chmod +x /usr/local/bin/kubectl
```

## Choosing a host cluster.

You'll need to choose one of your Kubernetes clusters to be the
*host cluster*. The host cluster hosts the components that make up
your federation control plane. Ensure that you have a `kubeconfig`
entry in your local `kubeconfig` that corresponds to the host cluster.
You can verify that you have the required `kubeconfig` entry by
running:

```shell
kubectl config get-contexts
```

The output should contain an entry corresponding to your host cluster,
similar to the following:

```
CURRENT   NAME                                          CLUSTER                                       AUTHINFO                                      NAMESPACE
          gke_myproject_asia-east1-b_gce-asia-east1     gke_myproject_asia-east1-b_gce-asia-east1     gke_myproject_asia-east1-b_gce-asia-east1
```

You'll need to provide the `kubeconfig` context (called name in the
entry above) for your host cluster when you deploy your federation
control plane.

## Deploying a federation control plane.

To deploy a federation control plane on your host cluster, run the
`kubefed init` command. When you use `kubefed init`, you must provide
the following:

* Federation name
* `--host-cluster-context`, the `kubeconfig` context for the host cluster
* `--dns-zone-name`, a domain name suffix for your federated services

The following example command deploys a federation control plane with
the name `fellowship`, a host cluster context `rivendell`, and the
domain suffix `example.com`:

```shell
kubefed init fellowship --host-cluster-context=rivendell --dns-zone-name="example.com"
```

The domain suffix you specify in `--dns-zone-name` must be an existing
domain that you control, and that is programmable by your DNS provider.

`kubefed init` sets up the federation control plane in the host
cluster and also adds an entry for the federation API server in your
local kubeconfig. Note that in the alpha release in Kubernetes 1.5,
`kubefed init` does not automatically set the current context to the
newly deployed federation. You can set the current context manually by
running:

```shell
kubectl config use-context fellowship
```

where `fellowship` is the name of your federation.
## Adding a cluster to a federation

Once you've deployed a federation control plane, you'll need to make
that control plane aware of the clusters it should manage. You can add
a cluster to your federation by using the `kubefed join` command.

To use `kubefed join`, you'll need to provide the name of the cluster
you want to add to the federation, and the `--host-cluster-context`
for the federation control plane's host cluster.

The following example command adds the cluster `gondor` to the
federation with host cluster `rivendell`:

```
kubefed join gondor --host-cluster-context=rivendell
```

> Note: Kubernetes requires that you manually join clusters to a
federation because the federation control plane manages only those
clusters that it is responsible for managing. Adding a cluster tells
the federation control plane that it is responsible for managing that
cluster.

### Naming rules and customization

The cluster name you supply to `kubefed join` must be a valid RFC 1035
label.

Furthermore, the federation control plane requires credentials of the
joined clusters to operate on them. These credentials are obtained
from the local kubeconfig. `kubefed join` uses the cluster name
specified as the argument to look for the cluster's context in the
local kubeconfig. If it fails to find a matching context, it exits
with an error.

This might cause issues in cases where context names for each cluster
in the federation don't follow
[RFC 1035](https://www.ietf.org/rfc/rfc1035.txt) label naming rules.
In such cases, you can specify a cluster name that conforms to the
[RFC 1035](https://www.ietf.org/rfc/rfc1035.txt) label naming rules
and specify the cluster context using the `--cluster-context` flag.
For example, if the context of the cluster you are joining is
`gondor_needs-no_king`, then you can join the cluster by running:

```shell
kubefed join gondor --host-cluster-context=rivendell --cluster-context=gondor_needs-no_king
```

#### Secret name

Cluster credentials required by the federation control plane as
described above are stored as a secret in the host cluster. The name
of the secret is also derived from the cluster name.

However, the name of a secret object in Kubernetes should conform
to the DNS subdomain name specification described in
[RFC 1123](https://tools.ietf.org/html/rfc1123). If this isn't the
case, you can pass the secret name to `kubefed join` using the
`--secret-name` flag. For example, if the cluster name is `noldor` and
the secret name is `11kingdom`, you can join the cluster by
running:

```shell
kubefed join noldor --host-cluster-context=rivendell --secret-name=11kingdom
```

Note: If your cluster name does not conform to the DNS subdomain name
specification, all you need to do is supply the secret name via the
`--secret-name` flag. `kubefed join` automatically creates the secret
for you.

## Removing a cluster from a federation

To remove a cluster from a federation, run the `kubefed unjoin`
command with the cluster name and the federation's
`--host-cluster-context`:

```
kubefed unjoin gondor --host-cluster-context=rivendell
```

## Turning down the federation control plane:

Proper cleanup of the federation control plane is not fully implemented in
this alpha release of `kubefed`. However, for the time being, deleting
the federation system namespace should remove all the resources except
the persistent storage volume dynamically provisioned for the
federation control plane's etcd. You can delete the federation
namespace by running the following command:

```
$ kubectl delete ns federation-system
```
[Setting up Cluster Federation with kubefed](/docs/tutorials/federation/set-up-cluster-federation-kubefed/)
@ -23,10 +23,10 @@ threshold has been met.

### Container Collection

The policy for garbage collecting containers considers three user-defined variables. `MinAge` is the minimum age at which a container can be garbage collected. `MaxPerPodContainer` is the maximum number of dead containers any single
pod (UID, container name) pair is allowed to have. `MaxContainers` is the maximum number of total dead containers. These variables can be individually disabled by setting 'MinAge' to zero and setting 'MaxPerPodContainer' and 'MaxContainers' respectively to less than zero.
The policy for garbage collecting containers considers three user-defined variables. `MinAge` is the minimum age at which a container can be garbage collected. `MaxPerPodContainer` is the maximum number of dead containers every single
pod (UID, container name) pair is allowed to have. `MaxContainers` is the maximum number of total dead containers. These variables can be individually disabled by setting `MinAge` to zero and setting `MaxPerPodContainer` and `MaxContainers` respectively to less than zero.

Kubelet will act on containers that are unidentified, deleted, or outside of the boundaries set by the previously mentioned flags. The oldest containers will generally be removed first. 'MaxPerPodContainer' and 'MaxContainer' may potentially conflict with each other in situations where retaining the maximum number of containers per pod ('MaxPerPodContainer') would go outside the allowable range of global dead containers ('MaxContainers'). 'MaxPerPodContainer' would be adjusted in this situation: A worst case scenario would be to downgrade 'MaxPerPodContainer' to 1 and evict the oldest containers. Additionally, containers owned by pods that have been deleted are removed once they are older than `MinAge`.
Kubelet will act on containers that are unidentified, deleted, or outside of the boundaries set by the previously mentioned flags. The oldest containers will generally be removed first. `MaxPerPodContainer` and `MaxContainer` may potentially conflict with each other in situations where retaining the maximum number of containers per pod (`MaxPerPodContainer`) would go outside the allowable range of global dead containers (`MaxContainers`). `MaxPerPodContainer` would be adjusted in this situation: A worst case scenario would be to downgrade `MaxPerPodContainer` to 1 and evict the oldest containers. Additionally, containers owned by pods that have been deleted are removed once they are older than `MinAge`.

Containers that are not managed by kubelet are not subject to container garbage collection.

@ -42,15 +42,34 @@ to free. Default is 80%.
We also allow users to customize garbage collection policy through the following kubelet flags:

1. `minimum-container-ttl-duration`, minimum age for a finished container before it is
garbage collected. Default is 1 minute.
2. `maximum-dead-containers-per-container`, maximum number of old instances to retain
per container. Default is 2.
garbage collected. Default is 0 minutes, which means every finished container will be garbage collected.
2. `maximum-dead-containers-per-container`, maximum number of old instances to be retained
per container. Default is 1.
3. `maximum-dead-containers`, maximum number of old instances of containers to retain globally.
Default is 100.
Default is -1, which means there is no global limit.

Containers can potentially be garbage collected before their usefulness has expired. These containers
can contain logs and other data that can be useful for troubleshooting. A sufficiently large value for
`maximum-dead-containers-per-container` is highly recommended to allow at least 2 dead containers to be
`maximum-dead-containers-per-container` is highly recommended to allow at least 1 dead container to be
retained per expected container. A higher value for `maximum-dead-containers` is also recommended for a
similar reason.
See [this issue](https://github.com/kubernetes/kubernetes/issues/13287) for more details.

### Deprecation

Some kubelet Garbage Collection features in this doc will be replaced by kubelet eviction in the future.

Including:

| Existing Flag | New Flag | Rationale |
| ------------- | -------- | --------- |
| `--image-gc-high-threshold` | `--eviction-hard` or `--eviction-soft` | existing eviction signals can trigger image garbage collection |
| `--image-gc-low-threshold` | `--eviction-minimum-reclaim` | eviction reclaims achieve the same behavior |
| `--maximum-dead-containers` | | deprecated once old logs are stored outside of container's context |
| `--maximum-dead-containers-per-container` | | deprecated once old logs are stored outside of container's context |
| `--minimum-container-ttl-duration` | | deprecated once old logs are stored outside of container's context |
| `--low-diskspace-threshold-mb` | `--eviction-hard` or `eviction-soft` | eviction generalizes disk thresholds to other resources |
| `--outofdisk-transition-frequency` | `--eviction-pressure-transition-period` | eviction generalizes disk pressure transition to other resources |

See [Configuring Out Of Resource Handling](/docs/admin/out-of-resource/) for more details.
@ -107,7 +107,7 @@ This operation may be sped up by migrating etcd data directory, as described [he

## Implementation notes

![](ha-master-gce.png)
![ha-master-gce](/images/docs/ha-master-gce.png)

### Overview
@ -71,10 +71,17 @@ DynamicKubeletConfig=true|false (ALPHA - default=false)
DynamicVolumeProvisioning=true|false (ALPHA - default=true)
ExperimentalHostUserNamespaceDefaulting=true|false (ALPHA - default=false)
StreamingProxyRedirects=true|false (ALPHA - default=false)
      --google-json-key string                   The Google Cloud Platform Service Account JSON Key to use for authentication.
      --insecure-allow-any-token username/group1,group2   If set, your server will be INSECURE. Any token will be allowed and user information will be parsed from the token as username/group1,group2
      --insecure-bind-address ip                 The IP address on which to serve the --insecure-port (set to 0.0.0.0 for all interfaces). Defaults to localhost. (default 127.0.0.1)
      --insecure-port int                        The port on which to serve unsecured, unauthenticated access. Default 8080. It is assumed that firewall rules are set up such that this port is not reachable from outside of the cluster and that port 443 on the cluster's public address is proxied to this port. This is performed by nginx in the default setup. (default 8080)
      --ir-data-source string                    Data source used by InitialResources. Supported options: influxdb, gcm. (default "influxdb")
      --ir-dbname string                         InfluxDB database name which contains metrics required by InitialResources. (default "k8s")
      --ir-hawkular string                       Hawkular configuration URL.
      --ir-influxdb-host string                  Address of InfluxDB which contains metrics required by InitialResources. (default "localhost:8080/api/v1/proxy/namespaces/kube-system/services/monitoring-influxdb:api")
      --ir-namespace-only                        Whether the estimation should be made only based on data from the same namespace.
      --ir-password string                       Password used for connecting to InfluxDB. (default "root")
      --ir-percentile int                        Which percentile of samples should InitialResources use when estimating resources. For experiment purposes. (default 90)
      --ir-user string                           User used for connecting to InfluxDB. (default "root")
      --kubelet-certificate-authority string     Path to a cert file for the certificate authority.
      --kubelet-client-certificate string        Path to a client cert file for TLS.
      --kubelet-client-key string                Path to a client key file for TLS.
@ -28,6 +28,7 @@ kube-controller-manager
|
|||
```
|
||||
--address ip The IP address to serve on (set to 0.0.0.0 for all interfaces) (default 0.0.0.0)
|
||||
--allocate-node-cidrs Should CIDRs for Pods be allocated and set on the cloud provider.
|
||||
--allow-verification-with-non-compliant-keys Allow a SignatureVerifier to use keys which are technically non-compliant with RFC6962.
|
||||
--cloud-config string The path to the cloud provider configuration file. Empty string for no configuration file.
|
||||
--cloud-provider string The provider for cloud services. Empty string for no provider.
|
||||
--cluster-cidr string CIDR Range for Pods in cluster.
|
||||
|
@ -59,7 +60,6 @@ DynamicVolumeProvisioning=true|false (ALPHA - default=true)
|
|||
ExperimentalHostUserNamespaceDefaulting=true|false (ALPHA - default=false)
|
||||
StreamingProxyRedirects=true|false (ALPHA - default=false)
|
||||
--flex-volume-plugin-dir string Full path of the directory in which the flex volume plugin should search for additional third party volume plugins. (default "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/")
|
||||
--google-json-key string The Google Cloud Platform Service Account JSON Key to use for authentication.
|
||||
--horizontal-pod-autoscaler-sync-period duration The period for syncing the number of pods in horizontal pod autoscaler. (default 30s)
|
||||
--insecure-experimental-approve-all-kubelet-csrs-for-group string The group for which the controller-manager will auto approve all CSRs for kubelet client certificates.
|
||||
--kube-api-burst int32 Burst to use while talking with Kubernetes apiserver (default 30)
|
||||
|
|
|
@ -41,7 +41,6 @@ DynamicKubeletConfig=true|false (ALPHA - default=false)
|
|||
DynamicVolumeProvisioning=true|false (ALPHA - default=true)
|
||||
ExperimentalHostUserNamespaceDefaulting=true|false (ALPHA - default=false)
|
||||
StreamingProxyRedirects=true|false (ALPHA - default=false)
|
||||
--google-json-key string The Google Cloud Platform Service Account JSON Key to use for authentication.
|
||||
--healthz-bind-address ip The IP address for the health check server to serve on, defaulting to 127.0.0.1 (set to 0.0.0.0 for all interfaces) (default 127.0.0.1)
|
||||
--healthz-port int32 The port to bind the health check server. Use 0 to disable. (default 10249)
|
||||
--hostname-override string If non-empty, will use this string as identification instead of the actual hostname.
|
||||
|
|
|
@ -36,7 +36,6 @@ DynamicKubeletConfig=true|false (ALPHA - default=false)
|
|||
DynamicVolumeProvisioning=true|false (ALPHA - default=true)
|
||||
ExperimentalHostUserNamespaceDefaulting=true|false (ALPHA - default=false)
|
||||
StreamingProxyRedirects=true|false (ALPHA - default=false)
|
||||
--google-json-key string The Google Cloud Platform Service Account JSON Key to use for authentication.
|
||||
--hard-pod-affinity-symmetric-weight int RequiredDuringScheduling affinity is not symmetric, but there is an implicit PreferredDuringScheduling affinity rule corresponding to every RequiredDuringScheduling affinity rule. --hard-pod-affinity-symmetric-weight represents the weight of implicit PreferredDuringScheduling affinity rule. (default 1)
|
||||
--kube-api-burst int32 Burst to use while talking with Kubernetes apiserver (default 100)
|
||||
--kube-api-content-type string Content type of requests sent to apiserver. (default "application/vnd.kubernetes.protobuf")
|
||||
|
|
|
@ -31,10 +31,10 @@ server, as well as an additional kubeconfig file for administration.
|
|||
controller manager and scheduler, and placing them in
|
||||
`/etc/kubernetes/manifests`. The kubelet watches this directory for static
|
||||
resources to create on startup. These are the core components of Kubernetes, and
|
||||
once they are up and running we can use `kubectl` to set up/manage any
|
||||
once they are up and running we can use `kubectl` to set up or manage any
|
||||
additional components.
|
||||
|
||||
1. kubeadm installs any add-on components, such as DNS or discovery, via the API
|
||||
1. kubeadm installs some add-on components, such as DNS or discovery, via the API
|
||||
server.
|
||||
|
||||
Running `kubeadm join` on each node in the cluster consists of the following steps:
|
||||
|
@ -180,48 +180,49 @@ available as configuration file options.
|
|||
|
||||
### Sample Master Configuration
|
||||
|
||||
```yaml
|
||||
apiVersion: kubeadm.k8s.io/v1alpha1
|
||||
kind: MasterConfiguration
|
||||
api:
|
||||
advertiseAddresses:
|
||||
- <address1|string>
|
||||
- <address2|string>
|
||||
bindPort: <int>
|
||||
externalDNSNames:
|
||||
- <dnsname1|string>
|
||||
- <dnsname2|string>
|
||||
cloudProvider: <string>
|
||||
discovery:
|
||||
bindPort: <int>
|
||||
etcd:
|
||||
endpoints:
|
||||
- <endpoint1|string>
|
||||
- <endpoint2|string>
|
||||
caFile: <path|string>
|
||||
certFile: <path|string>
|
||||
keyFile: <path|string>
|
||||
kubernetesVersion: <string>
|
||||
networking:
|
||||
dnsDomain: <string>
|
||||
serviceSubnet: <cidr>
|
||||
podSubnet: <cidr>
|
||||
secrets:
|
||||
givenToken: <token|string>
|
||||
```
|
||||
```yaml
|
||||
apiVersion: kubeadm.k8s.io/v1alpha1
|
||||
kind: MasterConfiguration
|
||||
api:
|
||||
advertiseAddresses:
|
||||
- <address1|string>
|
||||
- <address2|string>
|
||||
bindPort: <int>
|
||||
externalDNSNames:
|
||||
- <dnsname1|string>
|
||||
- <dnsname2|string>
|
||||
authorizationMode: <string>
|
||||
cloudProvider: <string>
|
||||
discovery:
|
||||
bindPort: <int>
|
||||
etcd:
|
||||
endpoints:
|
||||
- <endpoint1|string>
|
||||
- <endpoint2|string>
|
||||
caFile: <path|string>
|
||||
certFile: <path|string>
|
||||
keyFile: <path|string>
|
||||
kubernetesVersion: <string>
|
||||
networking:
|
||||
dnsDomain: <string>
|
||||
serviceSubnet: <cidr>
|
||||
podSubnet: <cidr>
|
||||
secrets:
|
||||
givenToken: <token|string>
|
||||
```
|
||||
|
||||
### Sample Node Configuration
|
||||
|
||||
```yaml
|
||||
apiVersion: kubeadm.k8s.io/v1alpha1
|
||||
kind: NodeConfiguration
|
||||
apiPort: <int>
|
||||
discoveryPort: <int>
|
||||
masterAddresses:
|
||||
- <master1>
|
||||
secrets:
|
||||
givenToken: <token|string>
|
||||
```
|
||||
```yaml
|
||||
apiVersion: kubeadm.k8s.io/v1alpha1
|
||||
kind: NodeConfiguration
|
||||
apiPort: <int>
|
||||
discoveryPort: <int>
|
||||
masterAddresses:
|
||||
- <master1>
|
||||
secrets:
|
||||
givenToken: <token|string>
|
||||
```
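Assuming these samples are saved locally (for example as `master-config.yaml` and `node-config.yaml`, hypothetical file names), they would typically be passed to kubeadm with a `--config` flag. This is a sketch only; check `kubeadm init --help` and `kubeadm join --help` on your version to confirm the flag.

```shell
# On the master (file name is an assumption for this example):
kubeadm init --config=master-config.yaml

# On each node:
kubeadm join --config=node-config.yaml
```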
|
||||
|
||||
## Automating kubeadm
|
||||
|
||||
|
@ -257,6 +258,24 @@ These environment variables are a short-term solution, eventually they will be i
|
|||
| `KUBE_ETCD_IMAGE` | `gcr.io/google_containers/etcd-<arch>:2.2.5` | The etcd container image to use. |
|
||||
| `KUBE_REPO_PREFIX` | `gcr.io/google_containers` | The image prefix for all images that are used. |
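For example, to pull the control plane images from a different registry prefix, you might export these variables before running kubeadm. The registry URL below is a hypothetical placeholder; only the variable names come from the table above.

```shell
# Hypothetical private mirror of the default gcr.io/google_containers images.
export KUBE_REPO_PREFIX=registry.example.com/google_containers
export KUBE_ETCD_IMAGE=registry.example.com/google_containers/etcd-amd64:2.2.5
kubeadm init
```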
|
||||
|
||||
If you want to use kubeadm with an http proxy, you may need to configure it to support http_proxy, https_proxy, or no_proxy.
|
||||
|
||||
For example, if your kube master node IP address is 10.18.17.16 and your proxy supports both http and https on 10.18.17.16 port 8080, you can use the following commands:
|
||||
|
||||
```bash
|
||||
export PROXY_PORT=8080
|
||||
export PROXY_IP=10.18.17.16
|
||||
export http_proxy=http://$PROXY_IP:$PROXY_PORT
|
||||
export HTTP_PROXY=$http_proxy
|
||||
export https_proxy=$http_proxy
|
||||
export HTTPS_PROXY=$http_proxy
|
||||
export no_proxy="localhost,127.0.0.1,localaddress,.localdomain.com,example.com,10.18.17.16"
|
||||
```
|
||||
|
||||
Remember to change `PROXY_IP` and `PROXY_PORT` to match your environment, and add your kube master node IP address to `no_proxy`.
|
||||
|
||||
## Releases and release notes
|
||||
|
||||
If you already have kubeadm installed and want to upgrade, run `apt-get update && apt-get upgrade` or `yum update` to get the latest version of kubeadm.
|
||||
|
|
|
@ -96,7 +96,7 @@ StreamingProxyRedirects=true|false (ALPHA - default=false)
|
|||
--google-json-key string The Google Cloud Platform Service Account JSON Key to use for authentication.
|
||||
--hairpin-mode string How should the kubelet setup hairpin NAT. This allows endpoints of a Service to loadbalance back to themselves if they should try to access their own Service. Valid values are "promiscuous-bridge", "hairpin-veth" and "none". (default "promiscuous-bridge")
|
||||
--healthz-bind-address ip The IP address for the healthz server to serve on, defaulting to 127.0.0.1 (set to 0.0.0.0 for all interfaces) (default 127.0.0.1)
|
||||
--healthz-port int32 The port of the localhost healthz endpoint (default 10248)
|
||||
--healthz-port int32 (Deprecated) The port of the localhost healthz endpoint (default 10248)
|
||||
--host-ipc-sources stringSlice Comma-separated list of sources from which the Kubelet allows pods to use the host ipc namespace. [default="*"] (default [*])
|
||||
--host-network-sources stringSlice Comma-separated list of sources from which the Kubelet allows pods to use of host network. [default="*"] (default [*])
|
||||
--host-pid-sources stringSlice Comma-separated list of sources from which the Kubelet allows pods to use the host pid namespace. [default="*"] (default [*])
|
||||
|
@ -137,7 +137,7 @@ StreamingProxyRedirects=true|false (ALPHA - default=false)
|
|||
--pods-per-core int32 Number of Pods per core that can run on this Kubelet. The total number of Pods on this Kubelet cannot exceed max-pods, so max-pods will be used if this calculation results in a larger number of Pods allowed on the Kubelet. A value of 0 disables this limit.
|
||||
--port int32 The port for the Kubelet to serve on. (default 10250)
|
||||
--protect-kernel-defaults Default kubelet behaviour for kernel tuning. If set, kubelet errors if any of kernel tunables is different than kubelet defaults.
|
||||
--read-only-port int32 The read-only port for the Kubelet to serve on with no authentication/authorization (set to 0 to disable) (default 10255)
|
||||
--read-only-port int32 The read-only port for the Kubelet to serve on with no authentication/authorization, and for localhost healthz endpoint (set to 0 to disable) (default 10255)
|
||||
--really-crash-for-testing If true, when panics occur crash. Intended for testing.
|
||||
--register-node Register the node with the apiserver (defaults to true if --api-servers is set) (default true)
|
||||
--register-schedulable Register the node as schedulable. Won't have any effect if register-node is false. [default=true] (default true)
|
||||
|
|
|
@ -5,210 +5,6 @@ assignees:
|
|||
title: Setting Pod CPU and Memory Limits
|
||||
---
|
||||
|
||||
By default, pods run with unbounded CPU and memory limits. This means that any pod in the
|
||||
system will be able to consume as much CPU and memory on the node that executes the pod.
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
Users may want to impose restrictions on the amount of resources a single pod in the system may consume
|
||||
for a variety of reasons.
|
||||
|
||||
For example:
|
||||
|
||||
1. Each node in the cluster has 2GB of memory. The cluster operator does not want to accept pods
|
||||
that require more than 2GB of memory since no node in the cluster can support the requirement. To prevent a
|
||||
pod from being permanently unscheduled to a node, the operator instead chooses to reject pods that exceed 2GB
|
||||
of memory as part of admission control.
|
||||
2. A cluster is shared by two communities in an organization that runs production and development workloads
|
||||
respectively. Production workloads may consume up to 8GB of memory, but development workloads may consume up
|
||||
to 512MB of memory. The cluster operator creates a separate namespace for each workload, and applies limits to
|
||||
each namespace.
|
||||
3. Users may create a pod which consumes resources just below the capacity of a machine. The left over space
|
||||
may be too small to be useful, but big enough for the waste to be costly over the entire cluster. As a result,
|
||||
the cluster operator may want to set limits that a pod must consume at least 20% of the memory and CPU of their
|
||||
average node size in order to provide for more uniform scheduling and to limit waste.
|
||||
|
||||
This example demonstrates how limits can be applied to a Kubernetes [namespace](/docs/admin/namespaces/walkthrough/) to control
|
||||
min/max resource limits per pod. In addition, this example demonstrates how you can
|
||||
apply default resource limits to pods in the absence of an end-user specified value.
|
||||
|
||||
See [LimitRange design doc](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/admission_control_limit_range.md) for more information. For a detailed description of the Kubernetes resource model, see [Resources](/docs/user-guide/compute-resources/)
|
||||
|
||||
## Step 0: Prerequisites
|
||||
|
||||
This example requires a running Kubernetes cluster. See the [Getting Started guides](/docs/getting-started-guides/) for how to get started.
|
||||
|
||||
Change to the `<kubernetes>` directory if you're not already there.
|
||||
|
||||
## Step 1: Create a namespace
|
||||
|
||||
This example will work in a custom namespace to demonstrate the concepts involved.
|
||||
|
||||
Let's create a new namespace called limit-example:
|
||||
|
||||
```shell
|
||||
$ kubectl create namespace limit-example
|
||||
namespace "limit-example" created
|
||||
```
|
||||
|
||||
Note that `kubectl` commands will print the type and name of the resource created or mutated, which can then be used in subsequent commands:
|
||||
|
||||
```shell
|
||||
$ kubectl get namespaces
|
||||
NAME STATUS AGE
|
||||
default Active 51s
|
||||
limit-example Active 45s
|
||||
```
|
||||
|
||||
## Step 2: Apply a limit to the namespace
|
||||
|
||||
Let's create a simple limit in our namespace.
|
||||
|
||||
```shell
|
||||
$ kubectl create -f docs/admin/limitrange/limits.yaml --namespace=limit-example
|
||||
limitrange "mylimits" created
|
||||
```
|
||||
|
||||
Let's describe the limits that we have imposed in our namespace.
|
||||
|
||||
```shell
|
||||
$ kubectl describe limits mylimits --namespace=limit-example
|
||||
Name: mylimits
|
||||
Namespace: limit-example
|
||||
Type Resource Min Max Default Request Default Limit Max Limit/Request Ratio
|
||||
---- -------- --- --- --------------- ------------- -----------------------
|
||||
Pod cpu 200m 2 - - -
|
||||
Pod memory 6Mi 1Gi - - -
|
||||
Container cpu 100m 2 200m 300m -
|
||||
Container memory 3Mi 1Gi 100Mi 200Mi -
|
||||
```
|
||||
|
||||
In this scenario, we have said the following:
|
||||
|
||||
1. If a max constraint is specified for a resource (2 CPU and 1Gi memory in this case), then a limit
|
||||
must be specified for that resource across all containers. Failure to specify a limit will result in
|
||||
a validation error when attempting to create the pod. Note that a default value of limit is set by
|
||||
*default* in file `limits.yaml` (300m CPU and 200Mi memory).
|
||||
2. If a min constraint is specified for a resource (100m CPU and 3Mi memory in this case), then a
|
||||
request must be specified for that resource across all containers. Failure to specify a request will
|
||||
result in a validation error when attempting to create the pod. Note that a default value of request is
|
||||
set by *defaultRequest* in file `limits.yaml` (200m CPU and 100Mi memory).
|
||||
3. For any pod, the sum of all containers memory requests must be >= 6Mi and the sum of all containers
|
||||
memory limits must be <= 1Gi; the sum of all containers CPU requests must be >= 200m and the sum of all
|
||||
containers CPU limits must be <= 2.
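The `limits.yaml` file itself is not reproduced on this page. A sketch consistent with the `kubectl describe` output above would look roughly like the following (field values are reconstructed from that output); you could apply it directly from stdin instead of the file path used above:

```shell
cat <<EOF | kubectl create -f - --namespace=limit-example
apiVersion: v1
kind: LimitRange
metadata:
  name: mylimits
spec:
  limits:
  - type: Pod
    min:
      cpu: 200m
      memory: 6Mi
    max:
      cpu: "2"
      memory: 1Gi
  - type: Container
    min:
      cpu: 100m
      memory: 3Mi
    max:
      cpu: "2"
      memory: 1Gi
    defaultRequest:
      cpu: 200m
      memory: 100Mi
    default:
      cpu: 300m
      memory: 200Mi
EOF
```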
|
||||
|
||||
## Step 3: Enforcing limits at point of creation
|
||||
|
||||
The limits enumerated in a namespace are only enforced when a pod is created or updated in
|
||||
the cluster. If you change the limits to a different value range, it does not affect pods that
|
||||
were previously created in a namespace.
|
||||
|
||||
If a resource (CPU or memory) is being restricted by a limit, the user will get an error at time
|
||||
of creation explaining why.
|
||||
|
||||
Let's first spin up a [Deployment](/docs/user-guide/deployments) that creates a single container Pod to demonstrate
|
||||
how default values are applied to each pod.
|
||||
|
||||
```shell
|
||||
$ kubectl run nginx --image=nginx --replicas=1 --namespace=limit-example
|
||||
deployment "nginx" created
|
||||
```
|
||||
|
||||
Note that `kubectl run` creates a Deployment named "nginx" on Kubernetes clusters >= v1.2. If you are running an older version, it creates replication controllers instead.
|
||||
If you want to obtain the old behavior, use `--generator=run/v1` to create replication controllers. See [`kubectl run`](/docs/user-guide/kubectl/kubectl_run/) for more details.
|
||||
The Deployment manages 1 replica of a single-container Pod. Let's take a look at the Pod it manages. First, find the name of the Pod:
|
||||
|
||||
```shell
|
||||
$ kubectl get pods --namespace=limit-example
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
nginx-2040093540-s8vzu 1/1 Running 0 11s
|
||||
```
|
||||
|
||||
Let's print this Pod in yaml output format (using the `-o yaml` flag), and then `grep` the `resources` field. Note that your pod name will be different.
|
||||
|
||||
```shell
|
||||
$ kubectl get pods nginx-2040093540-s8vzu --namespace=limit-example -o yaml | grep resources -C 8
|
||||
resourceVersion: "57"
|
||||
selfLink: /api/v1/namespaces/limit-example/pods/nginx-2040093540-ivimu
|
||||
uid: 67b20741-f53b-11e5-b066-64510658e388
|
||||
spec:
|
||||
containers:
|
||||
- image: nginx
|
||||
imagePullPolicy: Always
|
||||
name: nginx
|
||||
resources:
|
||||
limits:
|
||||
cpu: 300m
|
||||
memory: 200Mi
|
||||
requests:
|
||||
cpu: 200m
|
||||
memory: 100Mi
|
||||
terminationMessagePath: /dev/termination-log
|
||||
volumeMounts:
|
||||
```
|
||||
|
||||
Note that our nginx container has picked up the namespace default CPU and memory resource *limits* and *requests*.
|
||||
|
||||
Let's create a pod that exceeds our allowed limits by giving it a container that requests 3 CPU cores.
|
||||
|
||||
```shell
|
||||
$ kubectl create -f docs/admin/limitrange/invalid-pod.yaml --namespace=limit-example
|
||||
Error from server: error when creating "docs/admin/limitrange/invalid-pod.yaml": Pod "invalid-pod" is forbidden: [Maximum cpu usage per Pod is 2, but limit is 3., Maximum cpu usage per Container is 2, but limit is 3.]
|
||||
```
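The `invalid-pod.yaml` file is also not shown here. A sketch that would trigger the same rejection is below; the container name and image are assumptions for illustration, while the pod name and the 3-CPU limit come from the error message above.

```shell
cat <<EOF | kubectl create -f - --namespace=limit-example
apiVersion: v1
kind: Pod
metadata:
  name: invalid-pod
spec:
  containers:
  - name: kubernetes-serve-hostname   # container name is an assumption
    image: gcr.io/google_containers/serve_hostname
    resources:
      limits:
        cpu: "3"        # exceeds the namespace maximum of 2 CPUs, so admission rejects it
        memory: 100Mi
EOF
```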
|
||||
|
||||
Let's create a pod that falls within the allowed limit boundaries.
|
||||
|
||||
```shell
|
||||
$ kubectl create -f docs/admin/limitrange/valid-pod.yaml --namespace=limit-example
|
||||
pod "valid-pod" created
|
||||
```
|
||||
|
||||
Now look at the Pod's resources field:
|
||||
|
||||
```shell
|
||||
$ kubectl get pods valid-pod --namespace=limit-example -o yaml | grep -C 6 resources
|
||||
uid: 3b1bfd7a-f53c-11e5-b066-64510658e388
|
||||
spec:
|
||||
containers:
|
||||
- image: gcr.io/google_containers/serve_hostname
|
||||
imagePullPolicy: Always
|
||||
name: kubernetes-serve-hostname
|
||||
resources:
|
||||
limits:
|
||||
cpu: "1"
|
||||
memory: 512Mi
|
||||
requests:
|
||||
cpu: "1"
|
||||
memory: 512Mi
|
||||
```
|
||||
|
||||
Note that this pod specifies explicit resource *limits* and *requests* so it did not pick up the namespace
|
||||
default values.
|
||||
|
||||
Note: In the default Kubernetes setup, CPU resource *limits* are enforced on the physical node
|
||||
that runs the container, unless the administrator deploys the kubelet with the following flag set to `false`:
|
||||
|
||||
```shell
|
||||
$ kubelet --help
|
||||
Usage of kubelet
|
||||
....
|
||||
--cpu-cfs-quota[=true]: Enable CPU CFS quota enforcement for containers that specify CPU limits
|
||||
$ kubelet --cpu-cfs-quota=false ...
|
||||
```
|
||||
|
||||
## Step 4: Cleanup
|
||||
|
||||
To remove the resources used by this example, you can just delete the limit-example namespace.
|
||||
|
||||
```shell
|
||||
$ kubectl delete namespace limit-example
|
||||
namespace "limit-example" deleted
|
||||
$ kubectl get namespaces
|
||||
NAME STATUS AGE
|
||||
default Active 12m
|
||||
```
|
||||
|
||||
## Summary
|
||||
|
||||
Cluster operators that want to restrict the amount of resources a single container or pod may consume
|
||||
are able to define allowable ranges per Kubernetes namespace. In the absence of any explicit assignments,
|
||||
the Kubernetes system is able to apply default resource *limits* and *requests* if desired in order to
|
||||
constrain the amount of resource a pod consumes on a node.
|
||||
[Setting Pod CPU and Memory Limits](/docs/tasks/configure-pod-container/limit-range/)
|
||||
|
|
|
@ -4,63 +4,6 @@ assignees:
|
|||
title: Using Multiple Clusters
|
||||
---
|
||||
|
||||
You may want to set up multiple Kubernetes clusters, both to
|
||||
have clusters in different regions to be nearer to your users, and to tolerate failures and/or invasive maintenance.
|
||||
This document describes some of the issues to consider when making a decision about doing so.
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
If you decide to have multiple clusters, Kubernetes provides a way to [federate them](/docs/admin/federation/).
|
||||
|
||||
## Scope of a single cluster
|
||||
|
||||
On IaaS providers such as Google Compute Engine or Amazon Web Services, a VM exists in a
|
||||
[zone](https://cloud.google.com/compute/docs/zones) or [availability
|
||||
zone](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html).
|
||||
We suggest that all the VMs in a Kubernetes cluster should be in the same availability zone, because:
|
||||
|
||||
- compared to having a single global Kubernetes cluster, there are fewer single-points of failure
|
||||
- compared to a cluster that spans availability zones, it is easier to reason about the availability properties of a
|
||||
single-zone cluster.
|
||||
- when the Kubernetes developers are designing the system (e.g. making assumptions about latency, bandwidth, or
|
||||
correlated failures) they are assuming all the machines are in a single data center, or otherwise closely connected.
|
||||
|
||||
It is okay to have multiple clusters per availability zone, though on balance we think fewer is better.
|
||||
Reasons to prefer fewer clusters are:
|
||||
|
||||
- improved bin packing of Pods in some cases with more nodes in one cluster (less resource fragmentation)
|
||||
- reduced operational overhead (though the advantage is diminished as ops tooling and processes mature)
|
||||
- reduced costs for per-cluster fixed resource costs, e.g. apiserver VMs (but small as a percentage
|
||||
of overall cluster cost for medium to large clusters).
|
||||
|
||||
Reasons to have multiple clusters include:
|
||||
|
||||
- strict security policies requiring isolation of one class of work from another (but, see Partitioning Clusters
|
||||
below).
|
||||
- test clusters to canary new Kubernetes releases or other cluster software.
|
||||
|
||||
## Selecting the right number of clusters
|
||||
|
||||
The selection of the number of Kubernetes clusters may be a relatively static choice, only revisited occasionally.
|
||||
By contrast, the number of nodes in a cluster and the number of pods in a service may change frequently according to
|
||||
load and growth.
|
||||
|
||||
To pick the number of clusters, first, decide which regions you need to be in to have adequate latency to all your end users, for services that will run
|
||||
on Kubernetes (if you use a Content Distribution Network, the latency requirements for the CDN-hosted content need not
|
||||
be considered). Legal issues might influence this as well. For example, a company with a global customer base might decide to have clusters in US, EU, AP, and SA regions.
|
||||
Call the number of regions to be in `R`.
|
||||
|
||||
Second, decide how many clusters should be able to be unavailable at the same time, while still being available. Call
|
||||
the number that can be unavailable `U`. If you are not sure, then 1 is a fine choice.
|
||||
|
||||
If it is allowable for load-balancing to direct traffic to any region in the event of a cluster failure, then
|
||||
you need at least the larger of `R` or `U + 1` clusters. If it is not (e.g. you want to ensure low latency for all
|
||||
users in the event of a cluster failure), then you need to have `R * (U + 1)` clusters
|
||||
(`U + 1` in each of `R` regions). In any case, try to put each cluster in a different zone.
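For a concrete example, suppose `R = 3` regions and you want to tolerate `U = 1` unavailable cluster: if traffic may be redirected to any surviving region, you need the larger of `R` or `U + 1`, which is 3 clusters; if every region must keep serving locally during a failure, you need `R * (U + 1) = 6` clusters, two per region.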
|
||||
|
||||
Finally, if any of your clusters would need more than the maximum recommended number of nodes for a Kubernetes cluster, then
|
||||
you may need even more clusters. Kubernetes v1.3 supports clusters up to 1000 nodes in size.
|
||||
|
||||
## Working with multiple clusters
|
||||
|
||||
When you have multiple clusters, you would typically create services with the same config in each cluster and put each of those
|
||||
service instances behind a load balancer (AWS Elastic Load Balancer, GCE Forwarding Rule or HTTP Load Balancer) spanning all of them, so that
|
||||
failures of a single cluster are not visible to end users.
|
||||
[Using Multiple Clusters](/docs/concepts/cluster-administration/multiple-clusters/)
|
||||
|
|
|
@ -34,7 +34,7 @@ to build the image:
|
|||
|
||||
```docker
|
||||
FROM busybox
|
||||
ADD _output/local/go/bin/kube-scheduler /usr/local/bin/kube-scheduler
|
||||
ADD ./_output/dockerized/bin/linux/amd64/kube-scheduler /usr/local/bin/kube-scheduler
|
||||
```
|
||||
|
||||
Save the file as `Dockerfile`, build the image and push it to a registry. This example
|
||||
|
@ -45,7 +45,7 @@ For more details, please read the GCR
|
|||
|
||||
```shell
|
||||
docker build -t my-kube-scheduler:1.0 .
|
||||
gcloud docker push gcr.io/my-gcp-project/my-kube-scheduler:1.0
|
||||
gcloud docker -- push gcr.io/my-gcp-project/my-kube-scheduler:1.0
|
||||
```
|
||||
|
||||
### 2. Define a Kubernetes Deployment for the scheduler
|
||||
|
@ -61,7 +61,7 @@ config. Save it as `my-scheduler.yaml`:
|
|||
{% include code.html language="yaml" file="multiple-schedulers/my-scheduler.yaml" ghlink="/docs/admin/multiple-schedulers/my-scheduler.yaml" %}
|
||||
|
||||
An important thing to note here is that the name of the scheduler specified as an
|
||||
argument to the scheduler command in the container spec should be unique. This is the name that is matched against the value of the optional `scheduler.alpha.kubernetes.io/name` annotation on pods, to determine whether this scheduler is responsible for scheduling a particular pod.
|
||||
argument to the scheduler command in the container spec should be unique. This is the name that is matched against the value of the optional `spec.schedulername` on pods, to determine whether this scheduler is responsible for scheduling a particular pod.
|
||||
|
||||
Please see the
|
||||
[kube-scheduler documentation](/docs/admin/kube-scheduler/) for
|
||||
|
@ -92,14 +92,14 @@ pod in this list.
|
|||
### 4. Specify schedulers for pods
|
||||
|
||||
Now that our second scheduler is running, let's create some pods, and direct them to be scheduled by either the default scheduler or the one we just deployed. In order to schedule a given pod using a specific scheduler, we specify the name of the
|
||||
scheduler as an annotation in that pod spec. Let's look at three examples.
|
||||
scheduler in that pod spec. Let's look at three examples.
|
||||
|
||||
|
||||
1. Pod spec without any scheduler annotation
|
||||
- Pod spec without any scheduler name
|
||||
|
||||
{% include code.html language="yaml" file="multiple-schedulers/pod1.yaml" ghlink="/docs/admin/multiple-schedulers/pod1.yaml" %}
|
||||
|
||||
When no scheduler annotation is supplied, the pod is automatically scheduled using the
|
||||
When no scheduler name is supplied, the pod is automatically scheduled using the
|
||||
default-scheduler.
|
||||
|
||||
Save this file as `pod1.yaml` and submit it to the Kubernetes cluster.
|
||||
|
@ -108,12 +108,11 @@ scheduler as an annotation in that pod spec. Let's look at three examples.
|
|||
kubectl create -f pod1.yaml
|
||||
```
|
||||
|
||||
2. Pod spec with `default-scheduler` annotation
|
||||
- Pod spec with `default-scheduler`
|
||||
|
||||
{% include code.html language="yaml" file="multiple-schedulers/pod2.yaml" ghlink="/docs/admin/multiple-schedulers/pod2.yaml" %}
|
||||
|
||||
A scheduler is specified by supplying the scheduler name as a value to the annotation
|
||||
with key `scheduler.alpha.kubernetes.io/name`. In this case, we supply the name of the
|
||||
A scheduler is specified by supplying the scheduler name as a value to `spec.schedulername`. In this case, we supply the name of the
|
||||
default scheduler which is `default-scheduler`.
|
||||
|
||||
Save this file as `pod2.yaml` and submit it to the Kubernetes cluster.
|
||||
|
@ -122,26 +121,25 @@ scheduler as an annotation in that pod spec. Let's look at three examples.
|
|||
kubectl create -f pod2.yaml
|
||||
```
|
||||
|
||||
3. Pod spec with `my-scheduler` annotation
|
||||
- Pod spec with `my-scheduler`
|
||||
|
||||
{% include code.html language="yaml" file="multiple-schedulers/pod3.yaml" ghlink="/docs/admin/multiple-schedulers/pod3.yaml" %}
|
||||
|
||||
In this case, we specify that this pod should be scheduled using the scheduler that we
|
||||
deployed - `my-scheduler`. Note that the value of the annotation with key
|
||||
`scheduler.alpha.kubernetes.io/name` should match the name supplied to the scheduler
|
||||
deployed - `my-scheduler`. Note that the value of `spec.schedulername` should match the name supplied to the scheduler
|
||||
command as an argument in the deployment config for the scheduler.
|
||||
|
||||
Save this file as `pod3.yaml` and submit it to the Kubernetes cluster.
|
||||
|
||||
```shell
|
||||
kubectl create -f pod3.yaml
|
||||
```
|
||||
```shell
|
||||
kubectl create -f pod3.yaml
|
||||
```
|
||||
|
||||
Verify that all three pods are running.
|
||||
|
||||
```shell
|
||||
kubectl get pods
|
||||
```
|
||||
```shell
|
||||
kubectl get pods
|
||||
```
|
||||
|
||||
### Verifying that the pods were scheduled using the desired schedulers
|
||||
|
||||
|
@ -149,9 +147,9 @@ In order to make it easier to work through these examples, we did not verify tha
|
|||
pods were actually scheduled using the desired schedulers. We can verify that by
|
||||
changing the order of pod and deployment config submissions above. If we submit all the
|
||||
pod configs to a Kubernetes cluster before submitting the scheduler deployment config,
|
||||
we see that the pod `annotation-second-scheduler` remains in "Pending" state forever
|
||||
we see that the pod `second-scheduler` remains in "Pending" state forever
|
||||
while the other two pods get scheduled. Once we submit the scheduler deployment config
|
||||
and our new scheduler starts running, the `annotation-second-scheduler` pod gets
|
||||
and our new scheduler starts running, the `second-scheduler` pod gets
|
||||
scheduled as well.
|
||||
|
||||
Alternatively, one could just look at the "Scheduled" entries in the event logs to
|
||||
|
|
|
@ -16,8 +16,11 @@ spec:
|
|||
version: second
|
||||
spec:
|
||||
containers:
|
||||
- command: [/usr/local/bin/kube-scheduler, --address=0.0.0.0,
|
||||
--scheduler-name=my-scheduler]
|
||||
- command:
|
||||
- /usr/local/bin/kube-scheduler
|
||||
- --address=0.0.0.0
|
||||
- --leader-elect=false
|
||||
- --scheduler-name=my-scheduler
|
||||
image: gcr.io/my-gcp-project/my-kube-scheduler:1.0
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
|
@ -37,4 +40,4 @@ spec:
|
|||
volumeMounts: []
|
||||
hostNetwork: false
|
||||
hostPID: false
|
||||
volumes: []
|
||||
volumes: []
|
||||
|
|
|
@ -2,11 +2,10 @@ apiVersion: v1
|
|||
kind: Pod
|
||||
metadata:
|
||||
name: annotation-default-scheduler
|
||||
annotations:
|
||||
scheduler.alpha.kubernetes.io/name: default-scheduler
|
||||
labels:
|
||||
name: multischeduler-example
|
||||
spec:
|
||||
schedulername: default-scheduler
|
||||
containers:
|
||||
- name: pod-with-default-annotation-container
|
||||
image: gcr.io/google_containers/pause:2.0
|
|
@ -2,11 +2,10 @@ apiVersion: v1
|
|||
kind: Pod
|
||||
metadata:
|
||||
name: annotation-second-scheduler
|
||||
annotations:
|
||||
scheduler.alpha.kubernetes.io/name: my-scheduler
|
||||
labels:
|
||||
name: multischeduler-example
|
||||
spec:
|
||||
schedulername: my-scheduler
|
||||
containers:
|
||||
- name: pod-with-second-annotation-container
|
||||
image: gcr.io/google_containers/pause:2.0
|
|
@ -44,9 +44,9 @@ You can list the current namespaces in a cluster using:
|
|||
|
||||
```shell
|
||||
$ kubectl get namespaces
|
||||
NAME LABELS STATUS
|
||||
default <none> Active
|
||||
kube-system <none> Active
|
||||
NAME STATUS AGE
|
||||
default Active 11d
|
||||
kube-system Active 11d
|
||||
```
|
||||
|
||||
Kubernetes starts with two initial namespaces:
|
||||
|
|
|
@ -145,7 +145,7 @@ dev
|
|||
|
||||
At this point, all requests we make to the Kubernetes cluster from the command line are scoped to the development namespace.
|
||||
|
||||
Let's create some content.
|
||||
Let's create some contents.
|
||||
|
||||
```shell
|
||||
$ kubectl run snowflake --image=kubernetes/serve_hostname --replicas=2
|
||||
|
|
|
@ -6,68 +6,6 @@ assignees:
|
|||
title: Network Plugins
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
__Disclaimer__: Network plugins are in alpha. Their contents will change rapidly.
|
||||
|
||||
Network plugins in Kubernetes come in a few flavors:
|
||||
|
||||
* CNI plugins: adhere to the appc/CNI specification, designed for interoperability.
|
||||
* Kubenet plugin: implements basic `cbr0` using the `bridge` and `host-local` CNI plugins
|
||||
|
||||
## Installation
|
||||
|
||||
The kubelet has a single default network plugin, and a default network common to the entire cluster. It probes for plugins when it starts up, remembers what it found, and executes the selected plugin at appropriate times in the pod lifecycle (this is only true for docker, as rkt manages its own CNI plugins). There are two Kubelet command line parameters to keep in mind when using plugins:
|
||||
|
||||
* `network-plugin-dir`: Kubelet probes this directory for plugins on startup
|
||||
* `network-plugin`: The network plugin to use from `network-plugin-dir`. It must match the name reported by a plugin probed from the plugin directory. For CNI plugins, this is simply "cni".
|
||||
|
||||
## Network Plugin Requirements
|
||||
|
||||
Besides providing the [`NetworkPlugin` interface](https://github.com/kubernetes/kubernetes/tree/{{page.version}}/pkg/kubelet/network/plugins.go) to configure and clean up pod networking, the plugin may also need specific support for kube-proxy. The iptables proxy obviously depends on iptables, and the plugin may need to ensure that container traffic is made available to iptables. For example, if the plugin connects containers to a Linux bridge, the plugin must set the `net/bridge/bridge-nf-call-iptables` sysctl to `1` to ensure that the iptables proxy functions correctly. If the plugin does not use a Linux bridge (but instead something like Open vSwitch or some other mechanism) it should ensure container traffic is appropriately routed for the proxy.
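For example, a bridge-based plugin (or an operator verifying one) might ensure the sysctl is enabled with something like the following; this is just the standard sysctl invocation, not anything plugin-specific:

```shell
# Make bridged container traffic visible to iptables (1 = on).
sysctl net.bridge.bridge-nf-call-iptables=1
```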
|
||||
|
||||
By default if no kubelet network plugin is specified, the `noop` plugin is used, which sets `net/bridge/bridge-nf-call-iptables=1` to ensure simple configurations (like docker with a bridge) work correctly with the iptables proxy.
|
||||
|
||||
### CNI
|
||||
|
||||
The CNI plugin is selected by passing Kubelet the `--network-plugin=cni` command-line option. Kubelet reads a file from `--cni-conf-dir` (default `/etc/cni/net.d`) and uses the CNI configuration from that file to set up each pod's network. The CNI configuration file must match the [CNI specification](https://github.com/containernetworking/cni/blob/master/SPEC.md#network-configuration), and any required CNI plugins referenced by the configuration must be present in `--cni-bin-dir` (default `/opt/cni/bin`).
|
||||
|
||||
If there are multiple CNI configuration files in the directory, the first one in lexicographic order of file name is used.
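As an illustration only (not tied to any particular vendor), a minimal bridge-based configuration dropped into the default `--cni-conf-dir` might look like this; the file name, network name, and subnet are assumptions, and the referenced `bridge` and `host-local` binaries must exist in `--cni-bin-dir`:

```shell
# Hypothetical example CNI network configuration.
cat <<EOF > /etc/cni/net.d/10-mynet.conf
{
  "cniVersion": "0.2.0",
  "name": "mynet",
  "type": "bridge",
  "bridge": "cni0",
  "isGateway": true,
  "ipMasq": true,
  "ipam": {
    "type": "host-local",
    "subnet": "10.22.0.0/16",
    "routes": [
      { "dst": "0.0.0.0/0" }
    ]
  }
}
EOF
```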
|
||||
|
||||
In addition to the CNI plugin specified by the configuration file, Kubernetes requires the standard CNI [`lo`](https://github.com/containernetworking/cni/blob/master/plugins/main/loopback/loopback.go) plugin, at minimum version 0.2.0
|
||||
|
||||
Limitation: Due to [#31307](https://github.com/kubernetes/kubernetes/issues/31307), `HostPort` won't work with CNI networking plugin at the moment. That means all `hostPort` attribute in pod would be simply ignored.
|
||||
|
||||
### kubenet
|
||||
|
||||
Kubenet is a very basic, simple network plugin, on Linux only. It does not, of itself, implement more advanced features like cross-node networking or network policy. It is typically used together with a cloud provider that sets up routing rules for communication between nodes, or in single-node environments.
|
||||
|
||||
Kubenet creates a Linux bridge named `cbr0` and creates a veth pair for each pod with the host end of each pair connected to `cbr0`. The pod end of the pair is assigned an IP address allocated from a range assigned to the node either through configuration or by the controller-manager. `cbr0` is assigned an MTU matching the smallest MTU of an enabled normal interface on the host.
|
||||
|
||||
The plugin requires a few things:
|
||||
|
||||
* The standard CNI `bridge`, `lo` and `host-local` plugins are required, at minimum version 0.2.0. Kubenet will first search for them in `/opt/cni/bin`. Specify `network-plugin-dir` to supply an additional search path. The first match found will take effect.
|
||||
* Kubelet must be run with the `--network-plugin=kubenet` argument to enable the plugin
|
||||
* Kubelet should also be run with the `--non-masquerade-cidr=<clusterCidr>` argument to ensure traffic to IPs outside this range will use IP masquerade.
|
||||
* The node must be assigned an IP subnet through either the `--pod-cidr` kubelet command-line option or the `--allocate-node-cidrs=true --cluster-cidr=<cidr>` controller-manager command-line options.
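Putting those requirements together, a kubelet invocation using kubenet might look roughly like this. The CIDRs are placeholder assumptions; in many clusters the pod CIDR is assigned by the controller-manager via `--allocate-node-cidrs=true --cluster-cidr=<cidr>` instead of being passed to the kubelet directly.

```shell
# Sketch only; flag values are examples, not recommendations.
kubelet \
  --network-plugin=kubenet \
  --non-masquerade-cidr=10.0.0.0/8 \
  --pod-cidr=10.244.1.0/24
```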
|
||||
|
||||
### Customizing the MTU (with kubenet)
|
||||
|
||||
The MTU should always be configured correctly to get the best networking performance. Network plugins will usually try
|
||||
to infer a sensible MTU, but sometimes the logic will not result in an optimal MTU. For example, if the
|
||||
Docker bridge or another interface has a small MTU, kubenet will currently select that MTU. Or if you are
|
||||
using IPSEC encapsulation, the MTU must be reduced, and this calculation is out-of-scope for
|
||||
most network plugins.
|
||||
|
||||
Where needed, you can specify the MTU explicitly with the `network-plugin-mtu` kubelet option. For example,
|
||||
on AWS the `eth0` MTU is typically 9001, so you might specify `--network-plugin-mtu=9001`. If you're using IPSEC you
|
||||
might reduce it to allow for encapsulation overhead e.g. `--network-plugin-mtu=8873`.
|
||||
|
||||
This option is provided to the network-plugin; currently **only kubenet supports `network-plugin-mtu`**.
|
||||
|
||||
## Usage Summary
|
||||
|
||||
* `--network-plugin=cni` specifies that we use the `cni` network plugin with actual CNI plugin binaries located in `--cni-bin-dir` (default `/opt/cni/bin`) and CNI plugin configuration located in `--cni-conf-dir` (default `/etc/cni/net.d`).
|
||||
* `--network-plugin=kubenet` specifies that we use the `kubenet` network plugin with CNI `bridge` and `host-local` plugins placed in `/opt/cni/bin` or `network-plugin-dir`.
|
||||
* `--network-plugin-mtu=9001` specifies the MTU to use, currently only used by the `kubenet` network plugin.
|
||||
[Network Plugins](/docs/concepts/cluster-administration/network-plugins/)
|
||||
|
|
|
@ -4,212 +4,6 @@ assignees:
|
|||
title: Networking in Kubernetes
|
||||
---
|
||||
|
||||
Kubernetes approaches networking somewhat differently than Docker does by
|
||||
default. There are 4 distinct networking problems to solve:
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
1. Highly-coupled container-to-container communications: this is solved by
|
||||
[pods](/docs/user-guide/pods/) and `localhost` communications.
|
||||
2. Pod-to-Pod communications: this is the primary focus of this document.
|
||||
3. Pod-to-Service communications: this is covered by [services](/docs/user-guide/services/).
|
||||
4. External-to-Service communications: this is covered by [services](/docs/user-guide/services/).
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
|
||||
## Summary
|
||||
|
||||
Kubernetes assumes that pods can communicate with other pods, regardless of
|
||||
which host they land on. We give every pod its own IP address so you do not
|
||||
need to explicitly create links between pods and you almost never need to deal
|
||||
with mapping container ports to host ports. This creates a clean,
|
||||
backwards-compatible model where pods can be treated much like VMs or physical
|
||||
hosts from the perspectives of port allocation, naming, service discovery, load
|
||||
balancing, application configuration, and migration.
|
||||
|
||||
To achieve this we must impose some requirements on how you set up your cluster
|
||||
networking.
|
||||
|
||||
## Docker model
|
||||
|
||||
Before discussing the Kubernetes approach to networking, it is worthwhile to
|
||||
review the "normal" way that networking works with Docker. By default, Docker
|
||||
uses host-private networking. It creates a virtual bridge, called `docker0` by
|
||||
default, and allocates a subnet from one of the private address blocks defined
|
||||
in [RFC1918](https://tools.ietf.org/html/rfc1918) for that bridge. For each
|
||||
container that Docker creates, it allocates a virtual ethernet device (called
|
||||
`veth`) which is attached to the bridge. The veth is mapped to appear as `eth0`
|
||||
in the container, using Linux namespaces. The in-container `eth0` interface is
|
||||
given an IP address from the bridge's address range.
|
||||
|
||||
The result is that Docker containers can talk to other containers only if they
|
||||
are on the same machine (and thus the same virtual bridge). Containers on
|
||||
different machines can not reach each other - in fact they may end up with the
|
||||
exact same network ranges and IP addresses.
|
||||
|
||||
In order for Docker containers to communicate across nodes, they must be
|
||||
allocated ports on the machine's own IP address, which are then forwarded or
|
||||
proxied to the containers. This obviously means that containers must either
|
||||
coordinate which ports they use very carefully or else be allocated ports
|
||||
dynamically.
|
||||
|
||||
## Kubernetes model
|
||||
|
||||
Coordinating ports across multiple developers is very difficult to do at
|
||||
scale and exposes users to cluster-level issues outside of their control.
|
||||
Dynamic port allocation brings a lot of complications to the system - every
|
||||
application has to take ports as flags, the API servers have to know how to
|
||||
insert dynamic port numbers into configuration blocks, services have to know
|
||||
how to find each other, etc. Rather than deal with this, Kubernetes takes a
|
||||
different approach.
|
||||
|
||||
Kubernetes imposes the following fundamental requirements on any networking
|
||||
implementation (barring any intentional network segmentation policies):
|
||||
|
||||
* all containers can communicate with all other containers without NAT
|
||||
* all nodes can communicate with all containers (and vice-versa) without NAT
|
||||
* the IP that a container sees itself as is the same IP that others see it as
|
||||
|
||||
What this means in practice is that you can not just take two computers
|
||||
running Docker and expect Kubernetes to work. You must ensure that the
|
||||
fundamental requirements are met.
|
||||
|
||||
This model is not only less complex overall, but it is principally compatible
|
||||
with the desire for Kubernetes to enable low-friction porting of apps from VMs
|
||||
to containers. If your job previously ran in a VM, your VM had an IP and could
|
||||
talk to other VMs in your project. This is the same basic model.
|
||||
|
||||
Until now this document has talked about containers. In reality, Kubernetes
|
||||
applies IP addresses at the `Pod` scope - containers within a `Pod` share their
|
||||
network namespaces - including their IP address. This means that containers
|
||||
within a `Pod` can all reach each other's ports on `localhost`. This does imply
|
||||
that containers within a `Pod` must coordinate port usage, but this is no
|
||||
different than processes in a VM. We call this the "IP-per-pod" model. This
|
||||
is implemented in Docker as a "pod container" which holds the network namespace
|
||||
open while "app containers" (the things the user specified) join that namespace
|
||||
with Docker's `--net=container:<id>` function.
|
||||
|
||||
As with Docker, it is possible to request host ports, but this is reduced to a
|
||||
very niche operation. In this case a port will be allocated on the host `Node`
|
||||
and traffic will be forwarded to the `Pod`. The `Pod` itself is blind to the
|
||||
existence or non-existence of host ports.
|
||||
|
||||
## How to achieve this
|
||||
|
||||
There are a number of ways that this network model can be implemented. This
|
||||
document is not an exhaustive study of the various methods, but hopefully serves
|
||||
as an introduction to various technologies and serves as a jumping-off point.
|
||||
|
||||
The following networking options are sorted alphabetically - the order does not
|
||||
imply any preferential status.
|
||||
|
||||
### Contiv
|
||||
|
||||
[Contiv](https://github.com/contiv/netplugin) provides configurable networking (native l3 using BGP, overlay using vxlan, classic l2, or Cisco-SDN/ACI) for various use cases. [Contiv](http://contiv.io) is fully open source.
|
||||
|
||||
### Flannel
|
||||
|
||||
[Flannel](https://github.com/coreos/flannel#flannel) is a very simple overlay
|
||||
network that satisfies the Kubernetes requirements. Many
|
||||
people have reported success with Flannel and Kubernetes.
|
||||
|
||||
### Google Compute Engine (GCE)
|
||||
|
||||
For the Google Compute Engine cluster configuration scripts, we use [advanced
|
||||
routing](https://cloud.google.com/compute/docs/networking#routing) to
|
||||
assign each VM a subnet (default is `/24` - 254 IPs). Any traffic bound for that
|
||||
subnet will be routed directly to the VM by the GCE network fabric. This is in
|
||||
addition to the "main" IP address assigned to the VM, which is NAT'ed for
|
||||
outbound internet access. A linux bridge (called `cbr0`) is configured to exist
|
||||
on that subnet, and is passed to docker's `--bridge` flag.
|
||||
|
||||
We start Docker with:
|
||||
|
||||
```shell
|
||||
DOCKER_OPTS="--bridge=cbr0 --iptables=false --ip-masq=false"
|
||||
```
|
||||
|
||||
This bridge is created by Kubelet (controlled by the `--network-plugin=kubenet`
|
||||
flag) according to the `Node`'s `spec.podCIDR`.
|
||||
|
||||
Docker will now allocate IPs from the `cbr-cidr` block. Containers can reach
|
||||
each other and `Nodes` over the `cbr0` bridge. Those IPs are all routable
|
||||
within the GCE project network.
|
||||
|
||||
GCE itself does not know anything about these IPs, though, so it will not NAT
|
||||
them for outbound internet traffic. To achieve that we use an iptables rule to
|
||||
masquerade (aka SNAT - to make it seem as if packets came from the `Node`
|
||||
itself) traffic that is bound for IPs outside the GCE project network
|
||||
(10.0.0.0/8).
|
||||
|
||||
```shell
|
||||
iptables -t nat -A POSTROUTING ! -d 10.0.0.0/8 -o eth0 -j MASQUERADE
|
||||
```
|
||||
|
||||
Lastly we enable IP forwarding in the kernel (so the kernel will process
|
||||
packets for bridged containers):
|
||||
|
||||
```shell
|
||||
sysctl net.ipv4.ip_forward=1
|
||||
```
|
||||
|
||||
The result of all this is that all `Pods` can reach each other and can egress
|
||||
traffic to the internet.
|
||||
|
||||
### L2 networks and linux bridging
|
||||
|
||||
If you have a "dumb" L2 network, such as a simple switch in a "bare-metal"
|
||||
environment, you should be able to do something similar to the above GCE setup.
|
||||
Note that these instructions have only been tried very casually - it seems to
|
||||
work, but has not been thoroughly tested. If you use this technique and
|
||||
perfect the process, please let us know.
|
||||
|
||||
Follow the "With Linux Bridge devices" section of [this very nice
|
||||
tutorial](http://blog.oddbit.com/2014/08/11/four-ways-to-connect-a-docker/) from
|
||||
Lars Kellogg-Stedman.
|
||||
|
||||
### Nuage Networks VCS (Virtualized Cloud Services)
|
||||
|
||||
[Nuage](http://www.nuagenetworks.net) provides a highly scalable policy-based Software-Defined Networking (SDN) platform. Nuage uses the open source Open vSwitch for the data plane along with a feature rich SDN Controller built on open standards.
|
||||
|
||||
The Nuage platform uses overlays to provide seamless policy-based networking between Kubernetes Pods and non-Kubernetes environments (VMs and bare metal servers). Nuage's policy abstraction model is designed with applications in mind and makes it easy to declare fine-grained policies for applications. The platform's real-time analytics engine enables visibility and security monitoring for Kubernetes applications.
|
||||
|
||||
### OpenVSwitch
|
||||
|
||||
[OpenVSwitch](/docs/admin/ovs-networking) is a somewhat more mature but also
|
||||
complicated way to build an overlay network. This is endorsed by several of the
|
||||
"Big Shops" for networking.
|
||||
|
||||
### OVN (Open Virtual Networking)
|
||||
|
||||
OVN is an opensource network virtualization solution developed by the
|
||||
Open vSwitch community. It lets one create logical switches, logical routers,
|
||||
stateful ACLs, load-balancers etc to build different virtual networking
|
||||
topologies. The project has a specific Kubernetes plugin and documentation
|
||||
at [ovn-kubernetes](https://github.com/openvswitch/ovn-kubernetes).
|
||||
|
||||
### Project Calico
|
||||
|
||||
[Project Calico](http://docs.projectcalico.org/) is an open source container networking provider and network policy engine.
|
||||
|
||||
Calico provides a highly scalable networking and network policy solution for connecting Kubernetes pods based on the same IP networking principles as the internet. Calico can be deployed without encapsulation or overlays to provide high-performance, high-scale data center networking. Calico also provides fine-grained, intent based network security policy for Kubernetes pods via its distributed firewall.
|
||||
|
||||
Calico can also be run in policy enforcement mode in conjunction with other networking solutions such as Flannel, aka [canal](https://github.com/tigera/canal), or native GCE networking.
|
||||
|
||||
### Romana
|
||||
|
||||
[Romana](http://romana.io) is an open source network and security automation solution that lets you deploy Kubernetes without an overlay network. Romana supports Kubernetes [Network Policy](/docs/user-guide/networkpolicies/) to provide isolation across network namespaces.
|
||||
|
||||
### Weave Net from Weaveworks
|
||||
|
||||
[Weave Net](https://www.weave.works/products/weave-net/) is a
|
||||
resilient and simple to use network for Kubernetes and its hosted applications.
|
||||
Weave Net runs as a [CNI plug-in](https://www.weave.works/docs/net/latest/cni-plugin/)
|
||||
or stand-alone. In either version, it doesn't require any configuration or extra code
|
||||
to run, and in both cases, the network provides one IP address per pod - as is standard for Kubernetes.
|
||||
|
||||
## Other reading
|
||||
|
||||
The early design of the networking model and its rationale, and some future
|
||||
plans are described in more detail in the [networking design
|
||||
document](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/networking.md).
|
||||
[Cluster Networking](/docs/concepts/cluster-administration/networking/)
|
||||
|
|
|
@ -5,244 +5,6 @@ assignees:
|
|||
title: Monitoring Node Health
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
## Node Problem Detector
|
||||
|
||||
*Node problem detector* is a [DaemonSet](/docs/admin/daemons/) monitoring the
|
||||
node health. It collects node problems from various daemons and reports them
|
||||
to the apiserver as [NodeCondition](/docs/admin/node/#node-condition) and
|
||||
[Event](/docs/api-reference/v1/definitions/#_v1_event).
|
||||
|
||||
It currently supports detection of some known kernel issues, and will detect more
|
||||
node problems over time.
|
||||
|
||||
Currently Kubernetes won't take any action on the node conditions and events
|
||||
generated by node problem detector. In the future, a remedy system could be
|
||||
introduced to deal with node problems.
|
||||
|
||||
See more information
|
||||
[here](https://github.com/kubernetes/node-problem-detector).
|
||||
|
||||
## Limitations
|
||||
|
||||
* Kernel issue detection in node problem detector currently only supports file-based
|
||||
kernel logs. It doesn't support log tools like journald.
|
||||
|
||||
* Kernel issue detection in node problem detector makes assumptions about the kernel
|
||||
log format, and currently only works on Ubuntu and Debian. However, it is easy to extend
|
||||
it to [support other log formats](/docs/admin/node-problem/#support-other-log-format).
|
||||
|
||||
## Enable/Disable in GCE cluster
|
||||
|
||||
Node problem detector is [running as a cluster addon](cluster-large.md/#addon-resources) enabled by default in the
|
||||
GCE cluster.
|
||||
|
||||
You can enable/disable it by setting the environment variable
|
||||
`KUBE_ENABLE_NODE_PROBLEM_DETECTOR` before `kube-up.sh`.
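For example (a sketch only; the exact `kube-up.sh` path depends on your checkout layout, and the variable name comes from the text above):

```shell
# Disable the addon for a GCE cluster brought up with kube-up.sh.
KUBE_ENABLE_NODE_PROBLEM_DETECTOR=false ./cluster/kube-up.sh
```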
|
||||
|
||||
## Use in Other Environment
|
||||
|
||||
To enable node problem detector in environments outside of GCE, you can use
|
||||
either `kubectl` or an addon pod.
|
||||
|
||||
### Kubectl
|
||||
|
||||
This is the recommended way to start node problem detector outside of GCE. It
|
||||
provides more flexible management, such as overwriting the default
|
||||
configuration to fit it into your environment or detect
|
||||
customized node problems.
|
||||
|
||||
* **Step 1:** Create `node-problem-detector.yaml`:
|
||||
|
||||
```yaml
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: node-problem-detector-v0.1
|
||||
namespace: kube-system
|
||||
labels:
|
||||
k8s-app: node-problem-detector
|
||||
version: v0.1
|
||||
kubernetes.io/cluster-service: "true"
|
||||
spec:
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
k8s-app: node-problem-detector
|
||||
version: v0.1
|
||||
kubernetes.io/cluster-service: "true"
|
||||
spec:
|
||||
hostNetwork: true
|
||||
containers:
|
||||
- name: node-problem-detector
|
||||
image: gcr.io/google_containers/node-problem-detector:v0.1
|
||||
securityContext:
|
||||
privileged: true
|
||||
resources:
|
||||
limits:
|
||||
cpu: "200m"
|
||||
memory: "100Mi"
|
||||
requests:
|
||||
cpu: "20m"
|
||||
memory: "20Mi"
|
||||
volumeMounts:
|
||||
- name: log
|
||||
mountPath: /log
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: log
|
||||
hostPath:
|
||||
path: /var/log/
|
||||
```
|
||||
|
||||
***Notice that you should make sure the system log directory is right for your
|
||||
OS distro.***
|
||||
|
||||
* **Step 2:** Start node problem detector with `kubectl`:
|
||||
|
||||
```shell
|
||||
kubectl create -f node-problem-detector.yaml
|
||||
```
|
||||
|
||||
### Addon Pod
|
||||
|
||||
This is for users who have their own cluster bootstrap solution and don't need
to overwrite the default configuration. They can leverage the addon pod to
further automate the deployment.

Just create `node-problem-detector.yaml`, and put it in the addon pods directory
`/etc/kubernetes/addons/node-problem-detector` on the master node.
|
||||
|
||||
## Overwrite the Configuration
|
||||
|
||||
The [default configuration](https://github.com/kubernetes/node-problem-detector/tree/v0.1/config)
is embedded into the Docker image when node problem detector is built.

However, you can use a [ConfigMap](/docs/user-guide/configmap/) to overwrite it
by following these steps:
|
||||
|
||||
* **Step 1:** Change the config files in `config/`.
|
||||
* **Step 2:** Create the ConfigMap `node-problem-detector-config` with `kubectl create configmap
|
||||
node-problem-detector-config --from-file=config/`.
|
||||
* **Step 3:** Change the `node-problem-detector.yaml` to use the ConfigMap:
|
||||
|
||||
```yaml
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: node-problem-detector-v0.1
|
||||
namespace: kube-system
|
||||
labels:
|
||||
k8s-app: node-problem-detector
|
||||
version: v0.1
|
||||
kubernetes.io/cluster-service: "true"
|
||||
spec:
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
k8s-app: node-problem-detector
|
||||
version: v0.1
|
||||
kubernetes.io/cluster-service: "true"
|
||||
spec:
|
||||
hostNetwork: true
|
||||
containers:
|
||||
- name: node-problem-detector
|
||||
image: gcr.io/google_containers/node-problem-detector:v0.1
|
||||
securityContext:
|
||||
privileged: true
|
||||
resources:
|
||||
limits:
|
||||
cpu: "200m"
|
||||
memory: "100Mi"
|
||||
requests:
|
||||
cpu: "20m"
|
||||
memory: "20Mi"
|
||||
volumeMounts:
|
||||
- name: log
|
||||
mountPath: /log
|
||||
readOnly: true
|
||||
- name: config # Overwrite the config/ directory with ConfigMap volume
|
||||
mountPath: /config
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: log
|
||||
hostPath:
|
||||
path: /var/log/
|
||||
- name: config # Define ConfigMap volume
|
||||
configMap:
|
||||
name: node-problem-detector-config
|
||||
```
|
||||
|
||||
* **Step 4:** Re-create the node problem detector with the new yaml file:
|
||||
|
||||
```shell
|
||||
kubectl delete -f node-problem-detector.yaml # If you have a node-problem-detector running
|
||||
kubectl create -f node-problem-detector.yaml
|
||||
```
|
||||
|
||||
***Notice that this approach only applies to a node problem detector started with `kubectl`.***

For a node problem detector running as a cluster addon, overwriting the configuration is not
currently supported, because the addon manager does not support ConfigMap.
|
||||
|
||||
## Kernel Monitor
|
||||
|
||||
*Kernel monitor* is a problem daemon in node problem detector. It monitors the kernel log
and detects known kernel issues following predefined rules.

The kernel monitor matches kernel issues against a predefined rule list in
[`config/kernel-monitor.json`](https://github.com/kubernetes/node-problem-detector/blob/v0.1/config/kernel-monitor.json).
The rule list is extensible, and you can extend it by [overwriting the
configuration](/docs/admin/node-problem/#overwrite-the-configuration).
|
||||
|
||||
### Add New NodeConditions
|
||||
|
||||
To support new node conditions, you can extend the `conditions` field in
`config/kernel-monitor.json` with a new condition definition:
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "NodeConditionType",
|
||||
"reason": "CamelCaseDefaultNodeConditionReason",
|
||||
"message": "arbitrary default node condition message"
|
||||
}
|
||||
```
|
||||
|
||||
### Detect New Problems
|
||||
|
||||
To detect new problems, you can extend the `rules` field in `config/kernel-monitor.json`
with a new rule definition:
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "temporary/permanent",
|
||||
"condition": "NodeConditionOfPermanentIssue",
|
||||
"reason": "CamelCaseShortReason",
|
||||
"message": "regexp matching the issue in the kernel log"
|
||||
}
|
||||
```
|
||||
|
||||
### Change Log Path
|
||||
|
||||
The kernel log may be located at a different path in different OS distributions. The `log`
field in `config/kernel-monitor.json` is the log path inside the container.
You can configure it to match your OS distribution.
|
||||
|
||||
### Support Other Log Format
|
||||
|
||||
Kernel monitor uses a [`Translator`](https://github.com/kubernetes/node-problem-detector/blob/v0.1/pkg/kernelmonitor/translator/translator.go)
plugin to translate the kernel log into an internal data structure. It is easy to
implement a new translator for a new log format.
|
||||
|
||||
## Caveats
|
||||
|
||||
It is recommended to run the node problem detector in your cluster to monitor
node health. However, be aware that this introduces extra
resource overhead on each node. Usually this is fine, because:
|
||||
|
||||
* The kernel log grows relatively slowly.
* A resource limit is set for node problem detector.
* Even under high load, the resource usage is acceptable
(see the [benchmark result](https://github.com/kubernetes/node-problem-detector/issues/2#issuecomment-220255629)).
|
||||
[Monitoring Node Health](/docs/tasks/debug-application-cluster/monitor-node-health/)
|
||||
|
|
|
@ -172,10 +172,13 @@ register itself with the API server. This is the preferred pattern, used by mos
|
|||
|
||||
For self-registration, the kubelet is started with the following options
(an illustrative invocation follows the list):
|
||||
|
||||
- `--api-servers` - Location of the apiservers.
|
||||
- `--kubeconfig` - Path to credentials to authenticate itself to the apiserver.
|
||||
- `--cloud-provider` - How to talk to a cloud provider to read metadata about itself.
|
||||
- `--register-node` - Automatically register with the API server.
|
||||
- `--node-ip` - IP address of the node.
|
||||
- `--node-labels` - Labels to add when registering the node in the cluster.
|
||||
- `--node-status-update-frequency` - Specifies how often kubelet posts node status to master.
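The apiserver address, file paths, node IP, and label value in this sketch are placeholders,
not values from this document:

```shell
# Placeholder values throughout; substitute your own apiserver address,
# kubeconfig path, cloud provider, node IP, and labels.
kubelet \
  --api-servers=https://192.0.2.10:6443 \
  --kubeconfig=/var/lib/kubelet/kubeconfig \
  --cloud-provider=gce \
  --register-node=true \
  --node-ip=192.0.2.21 \
  --node-labels=role=worker \
  --node-status-update-frequency=10s
```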
|
||||
|
||||
Currently, any kubelet is authorized to create/modify any node resource, but in practice it only creates/modifies
|
||||
its own. (In the future, we plan to only allow a kubelet to modify its own node resource.)
|
||||
|
|
|
@ -6,364 +6,6 @@ assignees:
|
|||
title: Configuring Out Of Resource Handling
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
The `kubelet` needs to preserve node stability when available compute resources are low.
|
||||
|
||||
This is especially important when dealing with incompressible resources such as memory or disk.
|
||||
|
||||
If either resource is exhausted, the node would become unstable.
|
||||
|
||||
## Eviction Policy
|
||||
|
||||
The `kubelet` can proactively monitor for and prevent total starvation of a compute resource. In
cases where starvation appears imminent, the `kubelet` can proactively fail one or more pods in order to reclaim
the starved resource. When the `kubelet` fails a pod, it terminates all containers in the pod, and the `PodPhase`
is transitioned to `Failed`.
|
||||
|
||||
### Eviction Signals
|
||||
|
||||
The `kubelet` supports triggering eviction decisions on the signals described in the
table below. The value of each signal is described in the description column, based on the `kubelet`
summary API.
|
||||
|
||||
| Eviction Signal | Description |
|
||||
|----------------------------|-----------------------------------------------------------------------|
|
||||
| `memory.available` | `memory.available` := `node.status.capacity[memory]` - `node.stats.memory.workingSet` |
|
||||
| `nodefs.available` | `nodefs.available` := `node.stats.fs.available` |
|
||||
| `nodefs.inodesFree` | `nodefs.inodesFree` := `node.stats.fs.inodesFree` |
|
||||
| `imagefs.available` | `imagefs.available` := `node.stats.runtime.imagefs.available` |
|
||||
| `imagefs.inodesFree` | `imagefs.inodesFree` := `node.stats.runtime.imagefs.inodesFree` |
|
||||
|
||||
Each of the above signals supports either a literal or percentage-based value. The percentage-based value
is calculated relative to the total capacity associated with each signal.
|
||||
|
||||
`kubelet` supports only two filesystem partitions.
|
||||
|
||||
1. The `nodefs` filesystem that kubelet uses for volumes, daemon logs, etc.
|
||||
1. The `imagefs` filesystem that container runtimes use for storing images and container writable layers.
|
||||
|
||||
`imagefs` is optional. `kubelet` auto-discovers these filesystems using cAdvisor. `kubelet` does not care about any
other filesystems. Other configurations are not currently supported by the kubelet. For example, it is
*not OK* to store volumes and logs in a dedicated filesystem.
|
||||
|
||||
In future releases, the `kubelet` will deprecate the existing [garbage collection](/docs/admin/garbage-collection/)
|
||||
support in favor of eviction in response to disk pressure.
|
||||
|
||||
### Eviction Thresholds
|
||||
|
||||
The `kubelet` supports the ability to specify eviction thresholds that trigger the `kubelet` to reclaim resources.
|
||||
|
||||
Each threshold is of the following form:
|
||||
|
||||
`<eviction-signal><operator><quantity>`
|
||||
|
||||
* valid `eviction-signal` tokens are as defined above.
* the only valid `operator` token is `<`.
* valid `quantity` tokens must match the quantity representation used by Kubernetes.
* an eviction threshold can be expressed as a percentage if it ends with a `%` token.
|
||||
|
||||
For example, if a node has `10Gi` of memory, and the desire is to induce eviction
|
||||
if available memory falls below `1Gi`, an eviction threshold can be specified as either
|
||||
of the following (but not both).
|
||||
|
||||
* `memory.available<10%`
|
||||
* `memory.available<1Gi`
|
||||
|
||||
#### Soft Eviction Thresholds
|
||||
|
||||
A soft eviction threshold pairs an eviction threshold with a required
|
||||
administrator specified grace period. No action is taken by the `kubelet`
|
||||
to reclaim resources associated with the eviction signal until that grace
|
||||
period has been exceeded. If no grace period is provided, the `kubelet` will
|
||||
error on startup.
|
||||
|
||||
In addition, if a soft eviction threshold has been met, an operator can
|
||||
specify a maximum allowed pod termination grace period to use when evicting
|
||||
pods from the node. If specified, the `kubelet` will use the lesser value among
|
||||
the `pod.Spec.TerminationGracePeriodSeconds` and the max allowed grace period.
|
||||
If not specified, the `kubelet` will kill pods immediately with no graceful
|
||||
termination.
|
||||
|
||||
To configure soft eviction thresholds, the following flags are supported
(a combined example follows the list):
|
||||
|
||||
* `eviction-soft` describes a set of eviction thresholds (e.g. `memory.available<1.5Gi`) that if met over a
|
||||
corresponding grace period would trigger a pod eviction.
|
||||
* `eviction-soft-grace-period` describes a set of eviction grace periods (e.g. `memory.available=1m30s`) that
|
||||
correspond to how long a soft eviction threshold must hold before triggering a pod eviction.
|
||||
* `eviction-max-pod-grace-period` describes the maximum allowed grace period (in seconds) to use when terminating
|
||||
pods in response to a soft eviction threshold being met.
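This minimal sketch reuses the illustrative values from the bullets above plus an assumed
30-second maximum pod grace period:

```shell
# Evict after memory.available has stayed below 1.5Gi for 1m30s, giving
# evicted pods at most 30 seconds of graceful termination.
kubelet \
  --eviction-soft="memory.available<1.5Gi" \
  --eviction-soft-grace-period="memory.available=1m30s" \
  --eviction-max-pod-grace-period=30
```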
|
||||
|
||||
#### Hard Eviction Thresholds
|
||||
|
||||
A hard eviction threshold has no grace period, and if observed, the `kubelet`
|
||||
will take immediate action to reclaim the associated starved resource. If a
|
||||
hard eviction threshold is met, the `kubelet` will kill the pod immediately
|
||||
with no graceful termination.
|
||||
|
||||
To configure hard eviction thresholds, the following flag is supported:
|
||||
|
||||
* `eviction-hard` describes a set of eviction thresholds (e.g. `memory.available<1Gi`) that if met
|
||||
would trigger a pod eviction.
|
||||
|
||||
The `kubelet` has the following default hard eviction thresholds:
|
||||
|
||||
* `--eviction-hard=memory.available<100Mi`
|
||||
|
||||
### Eviction Monitoring Interval
|
||||
|
||||
The `kubelet` evaluates eviction thresholds per its configured housekeeping interval.
|
||||
|
||||
* `housekeeping-interval` is the interval between container housekeepings.
|
||||
|
||||
### Node Conditions
|
||||
|
||||
The `kubelet` will map one or more eviction signals to a corresponding node condition.
|
||||
|
||||
If a hard eviction threshold has been met, or a soft eviction threshold has been met
|
||||
independent of its associated grace period, the `kubelet` will report a condition that
|
||||
reflects the node is under pressure.
|
||||
|
||||
The following node conditions are defined that correspond to the specified eviction signal.
|
||||
|
||||
| Node Condition | Eviction Signal | Description |
|
||||
|-------------------------|-------------------------------|--------------------------------------------|
|
||||
| `MemoryPressure` | `memory.available` | Available memory on the node has satisfied an eviction threshold |
|
||||
| `DiskPressure`           | `nodefs.available`, `nodefs.inodesFree`, `imagefs.available`, or `imagefs.inodesFree` | Available disk space and inodes on either the node's root filesystem or image filesystem has satisfied an eviction threshold |
|
||||
|
||||
The `kubelet` will continue to report node status updates at the frequency specified by
|
||||
`--node-status-update-frequency` which defaults to `10s`.
|
||||
|
||||
### Oscillation of node conditions
|
||||
|
||||
If a node is oscillating above and below a soft eviction threshold, but not exceeding
|
||||
its associated grace period, it would cause the corresponding node condition to
|
||||
constantly oscillate between true and false, and could cause poor scheduling decisions
|
||||
as a consequence.
|
||||
|
||||
To protect against this oscillation, the following flag is defined to control how
|
||||
long the `kubelet` must wait before transitioning out of a pressure condition.
|
||||
|
||||
* `eviction-pressure-transition-period` is the duration for which the `kubelet` has
|
||||
to wait before transitioning out of an eviction pressure condition.
|
||||
|
||||
The `kubelet` would ensure that it has not observed an eviction threshold being met
|
||||
for the specified pressure condition for the period specified before toggling the
|
||||
condition back to `false`.
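As a hedged sketch (the five-minute value is purely illustrative, not a recommendation from
this page):

```shell
# Wait five minutes of sustained recovery before clearing a pressure condition.
kubelet \
  --eviction-hard="memory.available<100Mi" \
  --eviction-pressure-transition-period=5m0s
```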
|
||||
|
||||
### Reclaiming node level resources
|
||||
|
||||
If an eviction threshold has been met and the grace period has passed,
|
||||
the `kubelet` will initiate the process of reclaiming the pressured resource
|
||||
until it has observed the signal has gone below its defined threshold.
|
||||
|
||||
The `kubelet` attempts to reclaim node level resources prior to evicting end-user pods. If
|
||||
disk pressure is observed, the `kubelet` reclaims node level resources differently if the
|
||||
machine has a dedicated `imagefs` configured for the container runtime.
|
||||
|
||||
#### With Imagefs
|
||||
|
||||
If `nodefs` filesystem has met eviction thresholds, `kubelet` will free up disk space in the following order:
|
||||
|
||||
1. Delete dead pods/containers
|
||||
|
||||
If `imagefs` filesystem has met eviction thresholds, `kubelet` will free up disk space in the following order:
|
||||
|
||||
1. Delete all unused images
|
||||
|
||||
#### Without Imagefs
|
||||
|
||||
If `nodefs` filesystem has met eviction thresholds, `kubelet` will free up disk space in the following order:
|
||||
|
||||
1. Delete dead pods/containers
|
||||
1. Delete all unused images
|
||||
|
||||
### Evicting end-user pods
|
||||
|
||||
If the `kubelet` is unable to reclaim sufficient resource on the node,
|
||||
it will begin evicting pods.
|
||||
|
||||
The `kubelet` ranks pods for eviction as follows:
|
||||
|
||||
* by their quality of service
|
||||
* by the consumption of the starved compute resource relative to the pods scheduling request.
|
||||
|
||||
As a result, pod eviction occurs in the following order:
|
||||
|
||||
* `BestEffort` pods that consume the most of the starved resource are failed
|
||||
first.
|
||||
* `Burstable` pods that consume the greatest amount of the starved resource
|
||||
relative to their request for that resource are killed first. If no pod
|
||||
has exceeded its request, the strategy targets the largest consumer of the
|
||||
starved resource.
|
||||
* `Guaranteed` pods that consume the greatest amount of the starved resource
|
||||
relative to their request are killed first. If no pod has exceeded its request,
|
||||
the strategy targets the largest consumer of the starved resource.
|
||||
|
||||
A `Guaranteed` pod is guaranteed to never be evicted because of another pod's
|
||||
resource consumption. If a system daemon (i.e. `kubelet`, `docker`, `journald`, etc.)
|
||||
is consuming more resources than were reserved via `system-reserved` or `kube-reserved` allocations,
|
||||
and the node only has `Guaranteed` pod(s) remaining, then the node must choose to evict a
|
||||
`Guaranteed` pod in order to preserve node stability, and to limit the impact
|
||||
of the unexpected consumption to other `Guaranteed` pod(s).
|
||||
|
||||
Local disk is a `BestEffort` resource. If necessary, `kubelet` will evict pods one at a time to reclaim
|
||||
disk when `DiskPressure` is encountered. The `kubelet` will rank pods by quality of service. If the `kubelet`
|
||||
is responding to `inode` starvation, it will reclaim `inodes` by evicting pods with the lowest quality of service
|
||||
first. If the `kubelet` is responding to lack of available disk, it will rank pods within a quality of service
|
||||
that consumes the largest amount of disk and kill those first.
|
||||
|
||||
#### With Imagefs
|
||||
|
||||
If `nodefs` is triggering evictions, `kubelet` will sort pods based on the usage on `nodefs`
|
||||
- local volumes + logs of all its containers.
|
||||
|
||||
If `imagefs` is triggering evictions, `kubelet` will sort pods based on the writable layer usage of all its containers.
|
||||
|
||||
#### Without Imagefs
|
||||
|
||||
If `nodefs` is triggering evictions, `kubelet` will sort pods based on their total disk usage
|
||||
- local volumes + logs & writable layer of all its containers.
|
||||
|
||||
### Minimum eviction reclaim
|
||||
|
||||
In certain scenarios, eviction of pods could result in reclamation of only a small amount of resources. This can result in
`kubelet` hitting eviction thresholds in repeated succession. In addition, eviction of resources like `disk`
is time consuming.
|
||||
|
||||
To mitigate these issues, `kubelet` can have a per-resource `minimum-reclaim`. Whenever `kubelet` observes
|
||||
resource pressure, `kubelet` will attempt to reclaim at least `minimum-reclaim` amount of resource below
|
||||
the configured eviction threshold.
|
||||
|
||||
For example, with the following configuration:
|
||||
|
||||
```
|
||||
--eviction-hard=memory.available<500Mi,nodefs.available<1Gi,imagefs.available<100Gi
|
||||
--eviction-minimum-reclaim="memory.available=0Mi,nodefs.available=500Mi,imagefs.available=2Gi"
|
||||
```
|
||||
|
||||
If an eviction threshold is triggered for `memory.available`, the `kubelet` will work to ensure
|
||||
that `memory.available` is at least `500Mi`. For `nodefs.available`, the `kubelet` will work
|
||||
to ensure that `nodefs.available` is at least `1.5Gi`, and for `imagefs.available` it will
|
||||
work to ensure that `imagefs.available` is at least `102Gi` before no longer reporting pressure
|
||||
on their associated resources.
|
||||
|
||||
The default `eviction-minimum-reclaim` is `0` for all resources.
|
||||
|
||||
### Scheduler
|
||||
|
||||
The node will report a condition when a compute resource is under pressure. The
|
||||
scheduler views that condition as a signal to dissuade placing additional
|
||||
pods on the node.
|
||||
|
||||
| Node Condition | Scheduler Behavior |
|
||||
| ---------------- | ------------------------------------------------ |
|
||||
| `MemoryPressure` | No new `BestEffort` pods are scheduled to the node. |
|
||||
| `DiskPressure` | No new pods are scheduled to the node. |
|
||||
|
||||
## Node OOM Behavior
|
||||
|
||||
If the node experiences a system OOM (out of memory) event before the `kubelet` is able to reclaim memory,
the node depends on the [oom_killer](https://lwn.net/Articles/391222/) to respond.
|
||||
|
||||
The `kubelet` sets an `oom_score_adj` value for each container based on the quality of service for the pod.
|
||||
|
||||
| Quality of Service | oom_score_adj |
|
||||
|----------------------------|-----------------------------------------------------------------------|
|
||||
| `Guaranteed` | -998 |
|
||||
| `BestEffort` | 1000 |
|
||||
| `Burstable` | min(max(2, 1000 - (1000 * memoryRequestBytes) / machineMemoryCapacityBytes), 999) |
|
||||
|
||||
For example, a `Burstable` pod container that requests 4Gi of memory on a node with 16Gi of capacity
gets `oom_score_adj = min(max(2, 1000 - (1000 * 4) / 16), 999) = 750`.

If the `kubelet` is unable to reclaim memory prior to the node experiencing system OOM, the `oom_killer` will calculate
an `oom_score` based on the percentage of memory each container is using on the node, add the `oom_score_adj` to get an
effective `oom_score` for the container, and then kill the container with the highest score.

The intended behavior is that containers with the lowest quality of service that
are consuming the largest amount of memory relative to the scheduling request should be killed first in order
to reclaim memory.
|
||||
|
||||
Unlike pod eviction, if a pod container is OOM killed, it may be restarted by the `kubelet` based on its `RestartPolicy`.
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Schedulable resources and eviction policies
|
||||
|
||||
Let's imagine the following scenario:
|
||||
|
||||
* Node memory capacity: `10Gi`
|
||||
* Operator wants to reserve 10% of memory capacity for system daemons (kernel, `kubelet`, etc.)
|
||||
* Operator wants to evict pods at 95% memory utilization to reduce thrashing and incidence of system OOM.
|
||||
|
||||
To facilitate this scenario, the `kubelet` would be launched as follows:
|
||||
|
||||
```
|
||||
--eviction-hard=memory.available<500Mi
|
||||
--system-reserved=memory=1.5Gi
|
||||
```
|
||||
|
||||
Implicit in this configuration is the understanding that "system reserved" should include the amount of memory
covered by the eviction threshold (here, the 10% daemon reservation of `1Gi` plus the `500Mi` eviction threshold, giving `1.5Gi`).

To reach the eviction threshold, either some pod is using more than its request, or the system is using more than `500Mi`.
|
||||
|
||||
This configuration ensures that the scheduler does not place pods on a node that would immediately induce memory pressure
and trigger eviction, assuming those pods use less than their configured request.
|
||||
|
||||
### DaemonSet
|
||||
|
||||
It is never desired for a `kubelet` to evict a pod that was derived from
|
||||
a `DaemonSet` since the pod will immediately be recreated and rescheduled
|
||||
back to the same node.
|
||||
|
||||
At the moment, the `kubelet` has no ability to distinguish a pod created
|
||||
from `DaemonSet` versus any other object. If/when that information is
|
||||
available, the `kubelet` could pro-actively filter those pods from the
|
||||
candidate set of pods provided to the eviction strategy.
|
||||
|
||||
In general, it is strongly recommended that `DaemonSet` not
|
||||
create `BestEffort` pods to avoid being identified as a candidate pod
|
||||
for eviction. Instead `DaemonSet` should ideally launch `Guaranteed` pods.
|
||||
|
||||
## Deprecation of existing feature flags to reclaim disk
|
||||
|
||||
`kubelet` has been freeing up disk space on demand to keep the node stable.
|
||||
|
||||
As disk based eviction matures, the following `kubelet` flags will be marked for deprecation
|
||||
in favor of the simpler configuration supported around eviction.
|
||||
|
||||
| Existing Flag | New Flag |
|
||||
| ------------- | -------- |
|
||||
| `--image-gc-high-threshold` | `--eviction-hard` or `eviction-soft` |
|
||||
| `--image-gc-low-threshold` | `--eviction-minimum-reclaim` |
|
||||
| `--maximum-dead-containers` | deprecated |
|
||||
| `--maximum-dead-containers-per-container` | deprecated |
|
||||
| `--minimum-container-ttl-duration` | deprecated |
|
||||
| `--low-diskspace-threshold-mb` | `--eviction-hard` or `eviction-soft` |
|
||||
| `--outofdisk-transition-frequency` | `--eviction-pressure-transition-period` |
|
||||
|
||||
## Known issues
|
||||
|
||||
### kubelet may not observe memory pressure right away
|
||||
|
||||
The `kubelet` currently polls `cAdvisor` to collect memory usage stats at a regular interval. If memory usage
|
||||
increases within that window rapidly, the `kubelet` may not observe `MemoryPressure` fast enough, and the `OOMKiller`
|
||||
will still be invoked. We intend to integrate with the `memcg` notification API in a future release to reduce this
|
||||
latency, and instead have the kernel tell us when a threshold has been crossed immediately.
|
||||
|
||||
If you are not trying to achieve extreme utilization, but a sensible measure of overcommit, a viable workaround for
|
||||
this issue is to set eviction thresholds at approximately 75% capacity. This increases the ability of this feature
|
||||
to prevent system OOMs, and promote eviction of workloads so cluster state can rebalance.
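For instance, a sketch of that workaround using the percentage form of an eviction signal
(the 25% figure simply restates "approximately 75% capacity" and is not a tuned recommendation):

```shell
# Evict when less than 25% of memory remains available, i.e. at roughly 75% utilization.
kubelet --eviction-hard="memory.available<25%"
```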
|
||||
|
||||
### kubelet may evict more pods than needed
|
||||
|
||||
Pod eviction may evict more pods than needed due to a stats collection timing gap. This can be mitigated by adding
the ability to get root container stats on an on-demand basis (https://github.com/google/cadvisor/issues/1247) in the future.
|
||||
|
||||
### How kubelet ranks pods for eviction in response to inode exhaustion
|
||||
|
||||
At this time, it is not possible to know how many inodes were consumed by a particular container. If the `kubelet` observes
|
||||
inode exhaustion, it will evict pods by ranking them by quality of service. The following issue has been opened in cadvisor
|
||||
to track per container inode consumption (https://github.com/google/cadvisor/issues/1422) which would allow us to rank pods
|
||||
by inode consumption. For example, this would let us identify a container that created large numbers of 0 byte files, and evict
|
||||
that pod over others.
|
||||
[Configuring Out of Resource Handling](/docs/concepts/cluster-administration/out-of-resource/)
|
||||
|
|
|
@ -6,52 +6,6 @@ assignees:
|
|||
title: Guaranteed Scheduling For Critical Add-On Pods
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
## Overview
|
||||
|
||||
In addition to Kubernetes core components like the api-server, scheduler, and controller-manager running on a master machine,
there are a number of add-ons which, for various reasons, must run on a regular cluster node (rather than the Kubernetes master).
Some of these add-ons are critical to a fully functional cluster, such as Heapster, DNS, and UI.
A cluster may stop working properly if a critical add-on is evicted (either manually or as a side effect of another operation like an upgrade)
and becomes pending (for example, when the cluster is highly utilized and either other pending pods schedule into the space
vacated by the evicted critical add-on pod, or the amount of resources available on the node has changed for some other reason).
|
||||
|
||||
## Rescheduler: guaranteed scheduling of critical add-ons
|
||||
|
||||
The rescheduler ensures that critical add-ons are always scheduled
(assuming the cluster has enough resources to run the critical add-on pods in the absence of regular pods).
If the scheduler determines that no node has enough free resources to run the critical add-on pod
given the pods that are already running in the cluster
(indicated by the critical add-on pod's condition `PodScheduled` set to false with the reason `Unschedulable`),
the rescheduler tries to free up space for the add-on by evicting some pods; then the scheduler will schedule the add-on pod.
|
||||
|
||||
To avoid a situation where another pod is scheduled into the space prepared for the critical add-on,
the chosen node gets a temporary taint `CriticalAddonsOnly` before the eviction(s)
(see [more details](https://github.com/kubernetes/kubernetes/blob/master/docs/design/taint-toleration-dedicated.md)).
Each critical add-on has to tolerate this taint,
while other pods shouldn't tolerate it. The taint is removed once the add-on is successfully scheduled.
|
||||
|
||||
*Warning:* currently there is no guarantee which node is chosen and which pods are killed
in order to schedule critical pods, so if the rescheduler is enabled, your pods might occasionally be
killed for this purpose.
|
||||
|
||||
## Config
|
||||
|
||||
The rescheduler doesn't have any user-facing configuration (component config) or API.
It's enabled by default. It can be disabled, as shown in the sketch below:

* during cluster setup, by setting the `ENABLE_RESCHEDULER` flag to `false`
* on a running cluster, by deleting its manifest from the master node
(default path `/etc/kubernetes/manifests/rescheduler.manifest`)
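For example (the `kube-up.sh` invocation and the way you reach the master node are illustrative):

```shell
# During cluster setup:
ENABLE_RESCHEDULER=false ./cluster/kube-up.sh

# On a running cluster, on the master node:
sudo rm /etc/kubernetes/manifests/rescheduler.manifest
```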
|
||||
|
||||
### Marking add-on as critical
|
||||
|
||||
To be critical, an add-on has to run in the `kube-system` namespace (configurable via a flag)
and have the following annotations specified:
|
||||
|
||||
* `scheduler.alpha.kubernetes.io/critical-pod` set to empty string
|
||||
* `scheduler.alpha.kubernetes.io/tolerations` set to `[{"key":"CriticalAddonsOnly", "operator":"Exists"}]`
|
||||
|
||||
The first annotation marks a pod as critical. The second one is required by the rescheduler algorithm. A sketch of such a pod follows.
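The pod name, container name, and image in this sketch are placeholders, not values from this page:

```shell
cat <<EOF | kubectl create -f -
apiVersion: v1
kind: Pod
metadata:
  name: my-critical-addon        # placeholder name
  namespace: kube-system         # critical add-ons must run here (configurable via flag)
  annotations:
    scheduler.alpha.kubernetes.io/critical-pod: ""
    scheduler.alpha.kubernetes.io/tolerations: '[{"key":"CriticalAddonsOnly", "operator":"Exists"}]'
spec:
  containers:
  - name: addon                                      # placeholder container name
    image: gcr.io/google_containers/pause-amd64:3.0  # placeholder image
EOF
```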
|
||||
[Guaranteed Scheduling for Critical Add-On Pods](/docs/concepts/cluster-administration/guaranteed-scheduling-critical-addon-pods/)
|
||||
|
|
|
@ -4,237 +4,6 @@ assignees:
|
|||
title: Resource Quotas
|
||||
---
|
||||
|
||||
When several users or teams share a cluster with a fixed number of nodes,
|
||||
there is a concern that one team could use more than its fair share of resources.
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
Resource quotas are a tool for administrators to address this concern.
|
||||
|
||||
A resource quota, defined by a `ResourceQuota` object, provides constraints that limit
aggregate resource consumption per namespace. It can limit the quantity of objects that can
be created in a namespace by type, as well as the total amount of compute resources that may
be consumed by resources in that namespace.
|
||||
|
||||
Resource quotas work like this:
|
||||
|
||||
- Different teams work in different namespaces. Currently this is voluntary, but
|
||||
support for making this mandatory via ACLs is planned.
|
||||
- The administrator creates one or more Resource Quota objects for each namespace.
|
||||
- Users create resources (pods, services, etc.) in the namespace, and the quota system
|
||||
tracks usage to ensure it does not exceed hard resource limits defined in a Resource Quota.
|
||||
- If creating or updating a resource violates a quota constraint, the request will fail with HTTP
|
||||
status code `403 FORBIDDEN` with a message explaining the constraint that would have been violated.
|
||||
- If quota is enabled in a namespace for compute resources like `cpu` and `memory`, users must specify
|
||||
requests or limits for those values; otherwise, the quota system may reject pod creation. Hint: Use
|
||||
the LimitRange admission controller to force defaults for pods that make no compute resource requirements.
|
||||
See the [walkthrough](/docs/admin/resourcequota/walkthrough/) for an example to avoid this problem.
|
||||
|
||||
Examples of policies that could be created using namespaces and quotas are:
|
||||
|
||||
- In a cluster with a capacity of 32GiB RAM and 16 cores, let team A use 20GiB and 10 cores,
let team B use 10GiB and 4 cores, and hold 2GiB and 2 cores in reserve for future allocation.
|
||||
- Limit the "testing" namespace to using 1 core and 1GiB RAM. Let the "production" namespace
|
||||
use any amount.
|
||||
|
||||
In the case where the total capacity of the cluster is less than the sum of the quotas of the namespaces,
|
||||
there may be contention for resources. This is handled on a first-come-first-served basis.
|
||||
|
||||
Neither contention nor changes to quota will affect already created resources.
|
||||
|
||||
## Enabling Resource Quota
|
||||
|
||||
Resource Quota support is enabled by default for many Kubernetes distributions. It is
|
||||
enabled when the apiserver's `--admission-control=` flag has `ResourceQuota` as
one of its arguments, as sketched below.
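As a hedged sketch, the relevant part of the API server command line might look like the
following; the other admission plugins shown are a common companion set, not a requirement
stated on this page:

```shell
kube-apiserver --admission-control=NamespaceLifecycle,LimitRanger,ServiceAccount,DefaultStorageClass,ResourceQuota
```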
|
||||
|
||||
Resource Quota is enforced in a particular namespace when there is a
|
||||
`ResourceQuota` object in that namespace. There should be at most one
|
||||
`ResourceQuota` object in a namespace.
|
||||
|
||||
## Compute Resource Quota
|
||||
|
||||
You can limit the total sum of [compute resources](/docs/user-guide/compute-resources) that can be requested in a given namespace.
|
||||
|
||||
The following resource types are supported:
|
||||
|
||||
| Resource Name | Description |
|
||||
| --------------------- | ----------------------------------------------------------- |
|
||||
| `cpu` | Across all pods in a non-terminal state, the sum of CPU requests cannot exceed this value. |
|
||||
| `limits.cpu` | Across all pods in a non-terminal state, the sum of CPU limits cannot exceed this value. |
|
||||
| `limits.memory` | Across all pods in a non-terminal state, the sum of memory limits cannot exceed this value. |
|
||||
| `memory` | Across all pods in a non-terminal state, the sum of memory requests cannot exceed this value. |
|
||||
| `requests.cpu` | Across all pods in a non-terminal state, the sum of CPU requests cannot exceed this value. |
|
||||
| `requests.memory` | Across all pods in a non-terminal state, the sum of memory requests cannot exceed this value. |
|
||||
|
||||
## Storage Resource Quota
|
||||
|
||||
You can limit the total sum of [storage resources](/docs/user-guide/persistent-volumes) that can be requested in a given namespace.
|
||||
|
||||
In addition, you can limit consumption of storage resources based on associated storage-class.
|
||||
|
||||
| Resource Name | Description |
|
||||
| --------------------- | ----------------------------------------------------------- |
|
||||
| `requests.storage` | Across all persistent volume claims, the sum of storage requests cannot exceed this value. |
|
||||
| `persistentvolumeclaims` | The total number of [persistent volume claims](/docs/user-guide/persistent-volumes/#persistentvolumeclaims) that can exist in the namespace. |
|
||||
| `<storage-class-name>.storageclass.storage.k8s.io/requests.storage` | Across all persistent volume claims associated with the storage-class-name, the sum of storage requests cannot exceed this value. |
|
||||
| `<storage-class-name>.storageclass.storage.k8s.io/persistentvolumeclaims` | Across all persistent volume claims associated with the storage-class-name, the total number of [persistent volume claims](/docs/user-guide/persistent-volumes/#persistentvolumeclaims) that can exist in the namespace. |
|
||||
|
||||
For example, if an operator wants to quota storage with the `gold` storage class separately from the `bronze` storage class, the operator can
define a quota as follows (a sketch combining them into a single `ResourceQuota` object follows the list):
|
||||
|
||||
* `gold.storageclass.storage.k8s.io/requests.storage: 500Gi`
|
||||
* `bronze.storageclass.storage.k8s.io/requests.storage: 100Gi`
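In the sketch below, the quota name and namespace are placeholders:

```shell
cat <<EOF | kubectl create -f - --namespace=myspace
apiVersion: v1
kind: ResourceQuota
metadata:
  name: storage-by-class   # placeholder name
spec:
  hard:
    gold.storageclass.storage.k8s.io/requests.storage: 500Gi
    bronze.storageclass.storage.k8s.io/requests.storage: 100Gi
EOF
```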
|
||||
|
||||
## Object Count Quota
|
||||
|
||||
The number of objects of a given type can be restricted. The following types
|
||||
are supported:
|
||||
|
||||
| Resource Name | Description |
|
||||
| ------------------------------- | ------------------------------------------------- |
|
||||
| `configmaps` | The total number of config maps that can exist in the namespace. |
|
||||
| `persistentvolumeclaims` | The total number of [persistent volume claims](/docs/user-guide/persistent-volumes/#persistentvolumeclaims) that can exist in the namespace. |
|
||||
| `pods` | The total number of pods in a non-terminal state that can exist in the namespace. A pod is in a terminal state if `status.phase in (Failed, Succeeded)` is true. |
|
||||
| `replicationcontrollers` | The total number of replication controllers that can exist in the namespace. |
|
||||
| `resourcequotas` | The total number of [resource quotas](/docs/admin/admission-controllers/#resourcequota) that can exist in the namespace. |
|
||||
| `services` | The total number of services that can exist in the namespace. |
|
||||
| `services.loadbalancers` | The total number of services of type load balancer that can exist in the namespace. |
|
||||
| `services.nodeports` | The total number of services of type node port that can exist in the namespace. |
|
||||
| `secrets` | The total number of secrets that can exist in the namespace. |
|
||||
|
||||
For example, `pods` quota counts and enforces a maximum on the number of `pods`
|
||||
created in a single namespace.
|
||||
|
||||
You might want to set a pods quota on a namespace
|
||||
to avoid the case where a user creates many small pods and exhausts the cluster's
|
||||
supply of Pod IPs.
|
||||
|
||||
## Quota Scopes
|
||||
|
||||
Each quota can have an associated set of scopes. A quota will only measure usage for a resource if it matches
the intersection of enumerated scopes.

When a scope is added to the quota, it limits the number of resources it supports to those that pertain to the scope.
Specifying a resource on the quota outside of the allowed set results in a validation error.
A sketch of a scoped quota follows the resource lists below.
|
||||
|
||||
| Scope | Description |
|
||||
| ----- | ----------- |
|
||||
| `Terminating` | Match pods where `spec.activeDeadlineSeconds >= 0` |
|
||||
| `NotTerminating` | Match pods where `spec.activeDeadlineSeconds is nil` |
|
||||
| `BestEffort` | Match pods that have best effort quality of service. |
|
||||
| `NotBestEffort` | Match pods that do not have best effort quality of service. |
|
||||
|
||||
The `BestEffort` scope restricts a quota to tracking the following resource: `pods`
|
||||
|
||||
The `Terminating`, `NotTerminating`, and `NotBestEffort` scopes restrict a quota to tracking the following resources:
|
||||
|
||||
* `cpu`
|
||||
* `limits.cpu`
|
||||
* `limits.memory`
|
||||
* `memory`
|
||||
* `pods`
|
||||
* `requests.cpu`
|
||||
* `requests.memory`
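As a sketch, a quota with the `BestEffort` scope that tracks only `pods`; the name and
namespace are placeholders:

```shell
cat <<EOF | kubectl create -f - --namespace=myspace
apiVersion: v1
kind: ResourceQuota
metadata:
  name: best-effort-pods   # placeholder name
spec:
  hard:
    pods: "10"
  scopes:
  - BestEffort
EOF
```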
|
||||
|
||||
## Requests vs Limits
|
||||
|
||||
When allocating compute resources, each container may specify a request and a limit value for either CPU or memory.
|
||||
The quota can be configured to quota either value.
|
||||
|
||||
If the quota has a value specified for `requests.cpu` or `requests.memory`, then it requires that every incoming
|
||||
container makes an explicit request for those resources. If the quota has a value specified for `limits.cpu` or `limits.memory`,
|
||||
then it requires that every incoming container specifies an explicit limit for those resources.
|
||||
|
||||
## Viewing and Setting Quotas
|
||||
|
||||
Kubectl supports creating, updating, and viewing quotas:
|
||||
|
||||
```shell
|
||||
$ kubectl create namespace myspace
|
||||
|
||||
$ cat <<EOF > compute-resources.yaml
|
||||
apiVersion: v1
|
||||
kind: ResourceQuota
|
||||
metadata:
|
||||
name: compute-resources
|
||||
spec:
|
||||
hard:
|
||||
pods: "4"
|
||||
requests.cpu: "1"
|
||||
requests.memory: 1Gi
|
||||
limits.cpu: "2"
|
||||
limits.memory: 2Gi
|
||||
EOF
|
||||
$ kubectl create -f ./compute-resources.yaml --namespace=myspace
|
||||
|
||||
$ cat <<EOF > object-counts.yaml
|
||||
apiVersion: v1
|
||||
kind: ResourceQuota
|
||||
metadata:
|
||||
name: object-counts
|
||||
spec:
|
||||
hard:
|
||||
configmaps: "10"
|
||||
persistentvolumeclaims: "4"
|
||||
replicationcontrollers: "20"
|
||||
secrets: "10"
|
||||
services: "10"
|
||||
services.loadbalancers: "2"
|
||||
EOF
|
||||
$ kubectl create -f ./object-counts.yaml --namespace=myspace
|
||||
|
||||
$ kubectl get quota --namespace=myspace
|
||||
NAME AGE
|
||||
compute-resources 30s
|
||||
object-counts 32s
|
||||
|
||||
$ kubectl describe quota compute-resources --namespace=myspace
|
||||
Name: compute-resources
|
||||
Namespace: myspace
|
||||
Resource Used Hard
|
||||
-------- ---- ----
|
||||
limits.cpu 0 2
|
||||
limits.memory 0 2Gi
|
||||
pods 0 4
|
||||
requests.cpu 0 1
|
||||
requests.memory 0 1Gi
|
||||
|
||||
$ kubectl describe quota object-counts --namespace=myspace
|
||||
Name: object-counts
|
||||
Namespace: myspace
|
||||
Resource Used Hard
|
||||
-------- ---- ----
|
||||
configmaps 0 10
|
||||
persistentvolumeclaims 0 4
|
||||
replicationcontrollers 0 20
|
||||
secrets 1 10
|
||||
services 0 10
|
||||
services.loadbalancers 0 2
|
||||
```
|
||||
|
||||
## Quota and Cluster Capacity
|
||||
|
||||
Resource Quota objects are independent of the Cluster Capacity. They are
|
||||
expressed in absolute units. So, if you add nodes to your cluster, this does *not*
|
||||
automatically give each namespace the ability to consume more resources.
|
||||
|
||||
Sometimes more complex policies may be desired, such as:
|
||||
|
||||
- proportionally divide total cluster resources among several teams.
|
||||
- allow each tenant to grow resource usage as needed, but have a generous
|
||||
limit to prevent accidental resource exhaustion.
|
||||
- detect demand from one namespace, add nodes, and increase quota.
|
||||
|
||||
Such policies could be implemented using ResourceQuota as a building-block, by
|
||||
writing a 'controller' which watches the quota usage and adjusts the quota
|
||||
hard limits of each namespace according to other signals.
|
||||
|
||||
Note that resource quota divides up aggregate cluster resources, but it creates no
|
||||
restrictions around nodes: pods from several namespaces may run on the same node.
|
||||
|
||||
## Example
|
||||
|
||||
See a [detailed example for how to use resource quota](/docs/admin/resourcequota/walkthrough/).
|
||||
|
||||
## Read More
|
||||
|
||||
See [ResourceQuota design doc](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/admission_control_resource_quota.md) for more information.
|
||||
[Resource Quotas](/docs/concepts/policy/resource-quotas/)
|
||||
|
|
|
@ -4,75 +4,8 @@ assignees:
|
|||
- janetkuo
|
||||
title: Limiting Storage Consumption
|
||||
---
|
||||
This example demonstrates an easy way to limit the amount of storage consumed in a namespace.
|
||||
|
||||
The following resources are used in the demonstration:
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
* [Resource Quota](/docs/admin/resourcequota/)
|
||||
* [Limit Range](/docs/admin/limitrange/)
|
||||
* [Persistent Volume Claim](/docs/user-guide/persistent-volumes/)
|
||||
[Limiting Storage Consumption](/docs/tasks/administer-cluster/limit-storage-consumption/)
|
||||
|
||||
This example assumes you have a functional Kubernetes setup.
|
||||
|
||||
## Limiting Storage Consumption
|
||||
|
||||
The cluster-admin is operating a cluster on behalf of a user population and the admin wants to control
|
||||
how much storage a single namespace can consume in order to control cost.
|
||||
|
||||
The admin would like to limit:
|
||||
|
||||
1. The number of persistent volume claims in a namespace
|
||||
2. The amount of storage each claim can request
|
||||
3. The amount of cumulative storage the namespace can have
|
||||
|
||||
|
||||
## LimitRange to limit requests for storage
|
||||
|
||||
Adding a `LimitRange` to a namespace enforces minimum and maximum storage request sizes. Storage is requested
via `PersistentVolumeClaim`. The admission controller that enforces limit ranges rejects any PVC that is above or below
the values set by the admin.
|
||||
|
||||
In this example, a PVC requesting 10Gi of storage would be rejected because it exceeds the 2Gi max.
|
||||
|
||||
```
|
||||
apiVersion: v1
|
||||
kind: LimitRange
|
||||
metadata:
|
||||
name: storagelimits
|
||||
spec:
|
||||
limits:
|
||||
- type: PersistentVolumeClaim
|
||||
max:
|
||||
storage: 2Gi
|
||||
min:
|
||||
storage: 1Gi
|
||||
```
|
||||
|
||||
Minimum storage requests are used when the underlying storage provider requires certain minimums. For example,
|
||||
AWS EBS volumes have a 1Gi minimum requirement.
|
||||
|
||||
## StorageQuota to limit PVC count and cumulative storage capacity
|
||||
|
||||
Admins can limit the number of PVCs in a namespace as well as the cumulative capacity of those PVCs. New PVCs that exceed
|
||||
either maximum value will be rejected.
|
||||
|
||||
In this example, a 6th PVC in the namespace would be rejected because it exceeds the maximum count of 5. Likewise,
a 5Gi maximum quota, when combined with the 2Gi max limit above, cannot accommodate 3 PVCs of 2Gi each; that would be 6Gi requested
for a namespace capped at 5Gi.
|
||||
|
||||
```
|
||||
apiVersion: v1
|
||||
kind: ResourceQuota
|
||||
metadata:
|
||||
name: storagequota
|
||||
spec:
|
||||
hard:
|
||||
persistentvolumeclaims: "5"
|
||||
requests.storage: "5Gi"
|
||||
```
|
||||
|
||||
## Summary
|
||||
|
||||
A limit range can put a ceiling on how much storage is requested, while a resource quota can effectively cap the storage
consumed by a namespace through claim counts and cumulative storage capacity. This allows a cluster-admin to plan their
cluster's storage budget without risk of any one project going over its allotment.
|
||||
|
|
|
@ -5,362 +5,6 @@ assignees:
|
|||
title: Applying Resource Quotas and Limits
|
||||
---
|
||||
|
||||
This example demonstrates a typical setup to control for resource usage in a namespace.
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
It demonstrates using the following resources:
|
||||
|
||||
* [Namespace](/docs/admin/namespaces)
|
||||
* [Resource Quota](/docs/admin/resourcequota/)
|
||||
* [Limit Range](/docs/admin/limitrange/)
|
||||
|
||||
This example assumes you have a functional Kubernetes setup.
|
||||
|
||||
## Scenario
|
||||
|
||||
The cluster-admin is operating a cluster on behalf of a user population and the cluster-admin
|
||||
wants to control the amount of resources that can be consumed in a particular namespace to promote
|
||||
fair sharing of the cluster and control cost.
|
||||
|
||||
The cluster-admin has the following goals:
|
||||
|
||||
* Limit the amount of compute resource for running pods
|
||||
* Limit the number of persistent volume claims to control access to storage
|
||||
* Limit the number of load balancers to control cost
|
||||
* Prevent the use of node ports to preserve scarce resources
|
||||
* Provide default compute resource requests to enable better scheduling decisions
|
||||
|
||||
## Step 1: Create a namespace
|
||||
|
||||
This example will work in a custom namespace to demonstrate the concepts involved.
|
||||
|
||||
Let's create a new namespace called quota-example:
|
||||
|
||||
```shell
|
||||
$ kubectl create -f docs/admin/resourcequota/namespace.yaml
|
||||
namespace "quota-example" created
|
||||
$ kubectl get namespaces
|
||||
NAME STATUS AGE
|
||||
default Active 2m
|
||||
kube-system Active 2m
|
||||
quota-example Active 39s
|
||||
```
|
||||
|
||||
## Step 2: Apply an object-count quota to the namespace
|
||||
|
||||
The cluster-admin wants to control the following resources:
|
||||
|
||||
* persistent volume claims
|
||||
* load balancers
|
||||
* node ports
|
||||
|
||||
Let's create a simple quota that controls object counts for those resource types in this namespace.
|
||||
|
||||
```shell
|
||||
$ kubectl create -f docs/admin/resourcequota/object-counts.yaml --namespace=quota-example
|
||||
resourcequota "object-counts" created
|
||||
```
|
||||
|
||||
The quota system will observe that a quota has been created, and will calculate consumption
|
||||
in the namespace in response. This should happen quickly.
|
||||
|
||||
Let's describe the quota to see what is currently being consumed in this namespace:
|
||||
|
||||
```shell
|
||||
$ kubectl describe quota object-counts --namespace=quota-example
|
||||
Name: object-counts
|
||||
Namespace: quota-example
|
||||
Resource Used Hard
|
||||
-------- ---- ----
|
||||
persistentvolumeclaims 0 2
|
||||
services.loadbalancers 0 2
|
||||
services.nodeports 0 0
|
||||
```
|
||||
|
||||
The quota system will now prevent users from creating more than the specified amount for each resource.
|
||||
|
||||
|
||||
## Step 3: Apply a compute-resource quota to the namespace
|
||||
|
||||
To limit the amount of compute resource that can be consumed in this namespace,
|
||||
let's create a quota that tracks compute resources.
|
||||
|
||||
```shell
|
||||
$ kubectl create -f docs/admin/resourcequota/compute-resources.yaml --namespace=quota-example
|
||||
resourcequota "compute-resources" created
|
||||
```
|
||||
|
||||
Let's describe the quota to see what is currently being consumed in this namespace:
|
||||
|
||||
```shell
|
||||
$ kubectl describe quota compute-resources --namespace=quota-example
|
||||
Name: compute-resources
|
||||
Namespace: quota-example
|
||||
Resource Used Hard
|
||||
-------- ---- ----
|
||||
limits.cpu 0 2
|
||||
limits.memory 0 2Gi
|
||||
pods 0 4
|
||||
requests.cpu 0 1
|
||||
requests.memory 0 1Gi
|
||||
```
|
||||
|
||||
The quota system will now prevent the namespace from having more than 4 non-terminal pods. In
|
||||
addition, it will enforce that each container in a pod makes a `request` and defines a `limit` for
|
||||
`cpu` and `memory`.
|
||||
|
||||
## Step 4: Applying default resource requests and limits
|
||||
|
||||
Pod authors rarely specify resource requests and limits for their pods.
|
||||
|
||||
Since we applied a quota to our project, let's see what happens when an end-user creates a pod that has unbounded
|
||||
cpu and memory by creating an nginx container.
|
||||
|
||||
To demonstrate, let's create a deployment that runs nginx:
|
||||
|
||||
```shell
|
||||
$ kubectl run nginx --image=nginx --replicas=1 --namespace=quota-example
|
||||
deployment "nginx" created
|
||||
```
|
||||
|
||||
Now let's look at the pods that were created.
|
||||
|
||||
```shell
|
||||
$ kubectl get pods --namespace=quota-example
|
||||
```
|
||||
|
||||
What happened? There are no pods! Let's describe the deployment to see what is happening.
|
||||
|
||||
```shell
|
||||
$ kubectl describe deployment nginx --namespace=quota-example
|
||||
Name: nginx
|
||||
Namespace: quota-example
|
||||
CreationTimestamp: Mon, 06 Jun 2016 16:11:37 -0400
|
||||
Labels: run=nginx
|
||||
Selector: run=nginx
|
||||
Replicas: 0 updated | 1 total | 0 available | 1 unavailable
|
||||
StrategyType: RollingUpdate
|
||||
MinReadySeconds: 0
|
||||
RollingUpdateStrategy: 1 max unavailable, 1 max surge
|
||||
OldReplicaSets: <none>
|
||||
NewReplicaSet: nginx-3137573019 (0/1 replicas created)
|
||||
...
|
||||
```
|
||||
|
||||
A deployment created a corresponding replica set and attempted to size it to create a single pod.
|
||||
|
||||
Let's look at the replica set to get more detail.
|
||||
|
||||
```shell
|
||||
$ kubectl describe rs nginx-3137573019 --namespace=quota-example
|
||||
Name: nginx-3137573019
|
||||
Namespace: quota-example
|
||||
Image(s): nginx
|
||||
Selector: pod-template-hash=3137573019,run=nginx
|
||||
Labels: pod-template-hash=3137573019
|
||||
run=nginx
|
||||
Replicas: 0 current / 1 desired
|
||||
Pods Status: 0 Running / 0 Waiting / 0 Succeeded / 0 Failed
|
||||
No volumes.
|
||||
Events:
|
||||
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
|
||||
--------- -------- ----- ---- ------------- -------- ------ -------
|
||||
4m 7s 11 {replicaset-controller } Warning FailedCreate Error creating: pods "nginx-3137573019-" is forbidden: Failed quota: compute-resources: must specify limits.cpu,limits.memory,requests.cpu,requests.memory
|
||||
```
|
||||
|
||||
The Kubernetes API server is rejecting the replica set requests to create a pod because our pods
|
||||
do not specify `requests` or `limits` for `cpu` and `memory`.
|
||||
|
||||
So let's set some default values for the amount of `cpu` and `memory` a pod can consume:
|
||||
|
||||
```shell
|
||||
$ kubectl create -f docs/admin/resourcequota/limits.yaml --namespace=quota-example
|
||||
limitrange "limits" created
|
||||
$ kubectl describe limits limits --namespace=quota-example
|
||||
Name: limits
|
||||
Namespace: quota-example
|
||||
Type Resource Min Max Default Request Default Limit Max Limit/Request Ratio
|
||||
---- -------- --- --- --------------- ------------- -----------------------
|
||||
Container memory - - 256Mi 512Mi -
|
||||
Container cpu - - 100m 200m -
|
||||
```
|
||||
|
||||
If the Kubernetes API server observes a request to create a pod in this namespace, and the containers
|
||||
in that pod do not make any compute resource requests, a default request and default limit will be applied
|
||||
as part of admission control.
|
||||
|
||||
In this example, each pod created will have compute resources equivalent to the following:
|
||||
|
||||
```shell
|
||||
$ kubectl run nginx \
|
||||
--image=nginx \
|
||||
--replicas=1 \
|
||||
--requests=cpu=100m,memory=256Mi \
|
||||
--limits=cpu=200m,memory=512Mi \
|
||||
--namespace=quota-example
|
||||
```
|
||||
|
||||
Now that we have applied default compute resources for our namespace, our replica set should be able to create
|
||||
its pods.
|
||||
|
||||
```shell
|
||||
$ kubectl get pods --namespace=quota-example
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
nginx-3137573019-fvrig 1/1 Running 0 6m
|
||||
```
|
||||
|
||||
And if we print out our quota usage in the namespace:
|
||||
|
||||
```shell
|
||||
$ kubectl describe quota --namespace=quota-example
|
||||
Name: compute-resources
|
||||
Namespace: quota-example
|
||||
Resource Used Hard
|
||||
-------- ---- ----
|
||||
limits.cpu 200m 2
|
||||
limits.memory 512Mi 2Gi
|
||||
pods 1 4
|
||||
requests.cpu 100m 1
|
||||
requests.memory 256Mi 1Gi
|
||||
|
||||
|
||||
Name: object-counts
|
||||
Namespace: quota-example
|
||||
Resource Used Hard
|
||||
-------- ---- ----
|
||||
persistentvolumeclaims 0 2
|
||||
services.loadbalancers 0 2
|
||||
services.nodeports 0 0
|
||||
```
|
||||
|
||||
As you can see, the pod that was created is consuming explicit amounts of compute resources, and the usage is being
|
||||
tracked by Kubernetes properly.
|
||||
|
||||
## Step 5: Advanced quota scopes
|
||||
|
||||
Let's imagine you do not want to specify default compute resource consumption in your namespace.
|
||||
|
||||
Instead, you want to let users run a specific number of `BestEffort` pods in their namespace to take
|
||||
advantage of slack compute resources, and then require that users make an explicit resource request for
|
||||
pods that require a higher quality of service.
|
||||
|
||||
Let's create a new namespace with two quotas to demonstrate this behavior:
|
||||
|
||||
```shell
|
||||
$ kubectl create namespace quota-scopes
|
||||
namespace "quota-scopes" created
|
||||
$ kubectl create -f docs/admin/resourcequota/best-effort.yaml --namespace=quota-scopes
|
||||
resourcequota "best-effort" created
|
||||
$ kubectl create -f docs/admin/resourcequota/not-best-effort.yaml --namespace=quota-scopes
|
||||
resourcequota "not-best-effort" created
|
||||
$ kubectl describe quota --namespace=quota-scopes
|
||||
Name: best-effort
|
||||
Namespace: quota-scopes
|
||||
Scopes: BestEffort
|
||||
* Matches all pods that have best effort quality of service.
|
||||
Resource Used Hard
|
||||
-------- ---- ----
|
||||
pods 0 10
|
||||
|
||||
|
||||
Name: not-best-effort
|
||||
Namespace: quota-scopes
|
||||
Scopes: NotBestEffort
|
||||
* Matches all pods that do not have best effort quality of service.
|
||||
Resource Used Hard
|
||||
-------- ---- ----
|
||||
limits.cpu 0 2
|
||||
limits.memory 0 2Gi
|
||||
pods 0 4
|
||||
requests.cpu 0 1
|
||||
requests.memory 0 1Gi
|
||||
```
|
||||
|
||||
In this scenario, a pod that makes no compute resource requests will be tracked by the `best-effort` quota.
|
||||
|
||||
A pod that does make compute resource requests will be tracked by the `not-best-effort` quota.
|
||||
|
||||
Let's demonstrate this by creating two deployments:
|
||||
|
||||
```shell
|
||||
$ kubectl run best-effort-nginx --image=nginx --replicas=8 --namespace=quota-scopes
|
||||
deployment "best-effort-nginx" created
|
||||
$ kubectl run not-best-effort-nginx \
|
||||
--image=nginx \
|
||||
--replicas=2 \
|
||||
--requests=cpu=100m,memory=256Mi \
|
||||
--limits=cpu=200m,memory=512Mi \
|
||||
--namespace=quota-scopes
|
||||
deployment "not-best-effort-nginx" created
|
||||
```
|
||||
|
||||
Even though no default limits were specified, the `best-effort-nginx` deployment will create
|
||||
all 8 pods. This is because it is tracked by the `best-effort` quota, and the `not-best-effort`
|
||||
quota will just ignore it. The `not-best-effort` quota will track the `not-best-effort-nginx`
|
||||
deployment since it creates pods with `Burstable` quality of service.
|
||||
|
||||
Let's list the pods in the namespace:
|
||||
|
||||
```shell
|
||||
$ kubectl get pods --namespace=quota-scopes
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
best-effort-nginx-3488455095-2qb41 1/1 Running 0 51s
|
||||
best-effort-nginx-3488455095-3go7n 1/1 Running 0 51s
|
||||
best-effort-nginx-3488455095-9o2xg 1/1 Running 0 51s
|
||||
best-effort-nginx-3488455095-eyg40 1/1 Running 0 51s
|
||||
best-effort-nginx-3488455095-gcs3v 1/1 Running 0 51s
|
||||
best-effort-nginx-3488455095-rq8p1 1/1 Running 0 51s
|
||||
best-effort-nginx-3488455095-udhhd 1/1 Running 0 51s
|
||||
best-effort-nginx-3488455095-zmk12 1/1 Running 0 51s
|
||||
not-best-effort-nginx-2204666826-7sl61 1/1 Running 0 23s
|
||||
not-best-effort-nginx-2204666826-ke746 1/1 Running 0 23s
|
||||
```
|
||||
|
||||
As you can see, all 10 pods have been allowed to be created.
|
||||
|
||||
Let's describe current quota usage in the namespace:
|
||||
|
||||
```shell
|
||||
$ kubectl describe quota --namespace=quota-scopes
|
||||
Name: best-effort
|
||||
Namespace: quota-scopes
|
||||
Scopes: BestEffort
|
||||
* Matches all pods that have best effort quality of service.
|
||||
Resource Used Hard
|
||||
-------- ---- ----
|
||||
pods 8 10
|
||||
|
||||
|
||||
Name: not-best-effort
|
||||
Namespace: quota-scopes
|
||||
Scopes: NotBestEffort
|
||||
* Matches all pods that do not have best effort quality of service.
|
||||
Resource Used Hard
|
||||
-------- ---- ----
|
||||
limits.cpu 400m 2
|
||||
limits.memory 1Gi 2Gi
|
||||
pods 2 4
|
||||
requests.cpu 200m 1
|
||||
requests.memory 512Mi 1Gi
|
||||
```
|
||||
|
||||
As you can see, the `best-effort` quota has tracked the usage for the 8 pods we created in
|
||||
the `best-effort-nginx` deployment, and the `not-best-effort` quota has tracked the usage for
|
||||
the 2 pods we created in the `not-best-effort-nginx` deployment.
|
||||
|
||||
Scopes provide a mechanism to subdivide the set of resources that are tracked by
|
||||
any quota document to allow greater flexibility in how operators deploy and track resource
|
||||
consumption.
|
||||
|
||||
In addition to `BestEffort` and `NotBestEffort` scopes, there are scopes to restrict
|
||||
long-running versus time-bound pods. The `Terminating` scope will match any pod
|
||||
where `spec.activeDeadlineSeconds` is not nil. The `NotTerminating` scope will match any pod
|
||||
where `spec.activeDeadlineSeconds` is nil. These scopes allow you to quota pods based on their
|
||||
anticipated permanence on a node in your cluster.
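As a sketch, a quota that only counts time-bound pods (those with `spec.activeDeadlineSeconds` set) could look like the following; the name and the hard limit are illustrative:

```yaml
apiVersion: v1
kind: ResourceQuota
metadata:
  name: terminating-pods        # illustrative name
spec:
  hard:
    pods: "2"                   # illustrative limit
  scopes:
  - Terminating                 # only pods with spec.activeDeadlineSeconds set count against this quota
```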
|
||||
|
||||
## Summary
|
||||
|
||||
Actions that consume node resources for cpu and memory can be subject to hard quota limits defined by the namespace quota.
|
||||
|
||||
Any action that consumes those resources can be tuned explicitly, or can pick up namespace-level defaults, to meet your end goal.
|
||||
|
||||
Quota can be apportioned based on quality of service and anticipated permanence on a node in your cluster.
|
||||
[Applying Resource Quotas and Limits](/docs/tasks/configure-pod-container/apply-resource-quota-limit/)
|
||||
|
|
|
@ -10,11 +10,11 @@ The Salt scripts are shared across multiple hosting providers and depending on w
|
|||
|
||||
## Salt cluster setup
|
||||
|
||||
The **salt-master** service runs on the kubernetes-master [(except on the default GCE setup)](#standalone-salt-configuration-on-gce).
|
||||
The **salt-master** service runs on the kubernetes-master [(except on the default GCE and OpenStack-Heat setup)](#standalone-salt-configuration-on-gce-and-others).
|
||||
|
||||
The **salt-minion** service runs on the kubernetes-master and each kubernetes-node in the cluster.
|
||||
|
||||
Each salt-minion service is configured to interact with the **salt-master** service hosted on the kubernetes-master via the **master.conf** file [(except on GCE)](#standalone-salt-configuration-on-gce).
|
||||
Each salt-minion service is configured to interact with the **salt-master** service hosted on the kubernetes-master via the **master.conf** file [(except on GCE and OpenStack-Heat)](#standalone-salt-configuration-on-gce-and-others).
|
||||
|
||||
```shell
|
||||
[root@kubernetes-master] $ cat /etc/salt/minion.d/master.conf
|
||||
|
@ -25,15 +25,15 @@ The salt-master is contacted by each salt-minion and depending upon the machine
|
|||
|
||||
If you are running the Vagrant based environment, the **salt-api** service is running on the kubernetes-master. It is configured to enable the vagrant user to introspect the salt cluster in order to find out about machines in the Vagrant environment via a REST API.
|
||||
|
||||
## Standalone Salt Configuration on GCE
|
||||
## Standalone Salt Configuration on GCE and others
|
||||
|
||||
On GCE, the master and nodes are all configured as [standalone minions](http://docs.saltstack.com/en/latest/topics/tutorials/standalone_minion.html). The configuration for each VM is derived from the VM's [instance metadata](https://cloud.google.com/compute/docs/metadata) and then stored in Salt grains (`/etc/salt/minion.d/grains.conf`) and pillars (`/srv/salt-overlay/pillar/cluster-params.sls`) that local Salt uses to enforce state.
|
||||
On GCE and OpenStack, using the OpenStack-Heat provider, the master and nodes are all configured as [standalone minions](http://docs.saltstack.com/en/latest/topics/tutorials/standalone_minion.html). The configuration for each VM is derived from the VM's [instance metadata](https://cloud.google.com/compute/docs/metadata) and then stored in Salt grains (`/etc/salt/minion.d/grains.conf`) and pillars (`/srv/salt-overlay/pillar/cluster-params.sls`) that local Salt uses to enforce state.
|
||||
|
||||
All remaining sections that refer to master/minion setups should be ignored for GCE. One fallout of the GCE setup is that the Salt mine doesn't exist - there is no sharing of configuration amongst nodes.
|
||||
All remaining sections that refer to master/minion setups should be ignored for GCE and OpenStack. One fallout of this setup is that the Salt mine doesn't exist - there is no sharing of configuration amongst nodes.
|
||||
|
||||
## Salt security
|
||||
|
||||
*(Not applicable on default GCE setup.)*
|
||||
*(Not applicable on default GCE and OpenStack-Heat setup.)*
|
||||
|
||||
Security is not enabled on the salt-master, and the salt-master is configured to auto-accept incoming requests from minions. It is not recommended to use this security configuration in production environments without deeper study. (In some environments this isn't as bad as it might sound if the salt master port isn't externally accessible and you trust everyone on your network.)
|
||||
|
||||
|
|
|
@ -71,8 +71,9 @@ account. To create additional API tokens for a service account, create a secret
|
|||
of type `ServiceAccountToken` with an annotation referencing the service
|
||||
account, and the controller will update it with a generated token:
|
||||
|
||||
```json
|
||||
secret.json:
|
||||
|
||||
```json
|
||||
{
|
||||
"kind": "Secret",
|
||||
"apiVersion": "v1",
|
||||
|
@ -100,4 +101,4 @@ kubectl delete secret mysecretname
|
|||
### Service Account Controller
|
||||
|
||||
Service Account Controller manages ServiceAccount inside namespaces, and ensures
|
||||
a ServiceAccount named "default" exists in every active namespace.
|
||||
a ServiceAccount named "default" exists in every active namespace.
|
||||
|
|
|
@ -4,123 +4,6 @@ assignees:
|
|||
title: Static Pods
|
||||
---
|
||||
|
||||
**If you are running clustered Kubernetes and are using static pods to run a pod on every node, you should probably be using a [DaemonSet](/docs/admin/daemons/)!**
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
*Static pods* are managed directly by the kubelet daemon on a specific node, without the API server observing them. They are not associated with any replication controller; the kubelet daemon itself watches each static pod and restarts it if it crashes. There is no health checking, though. Static pods are always bound to one kubelet daemon and always run on the same node as it.
|
||||
|
||||
The kubelet automatically creates a so-called *mirror pod* on the Kubernetes API server for each static pod, so the pods are visible there, but they cannot be controlled from the API server.
|
||||
|
||||
## Static pod creation
|
||||
|
||||
Static pods can be created in two ways: either by using configuration file(s) or over HTTP.
|
||||
|
||||
### Configuration files
|
||||
|
||||
The configuration files are just standard pod definitions in JSON or YAML format placed in a specific directory. Start the kubelet daemon with `kubelet --pod-manifest-path=<the directory>`; it periodically scans that directory and creates/deletes static pods as YAML/JSON files appear/disappear there.
|
||||
|
||||
For example, this is how to start a simple web server as a static pod:
|
||||
|
||||
1. Choose a node where we want to run the static pod. In this example, it's `my-node1`.
|
||||
|
||||
```shell
|
||||
[joe@host ~] $ ssh my-node1
|
||||
```
|
||||
|
||||
2. Choose a directory, say `/etc/kubelet.d`, and place a web server pod definition there, e.g. `/etc/kubelet.d/static-web.yaml`:
|
||||
|
||||
```shell
|
||||
    [root@my-node1 ~] $ mkdir /etc/kubelet.d/
|
||||
    [root@my-node1 ~] $ cat <<EOF >/etc/kubelet.d/static-web.yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: static-web
|
||||
labels:
|
||||
role: myrole
|
||||
spec:
|
||||
containers:
|
||||
- name: web
|
||||
image: nginx
|
||||
ports:
|
||||
- name: web
|
||||
containerPort: 80
|
||||
protocol: TCP
|
||||
EOF
|
||||
```
|
||||
|
||||
3. Configure your kubelet daemon on the node to use this directory by running it with the `--pod-manifest-path=/etc/kubelet.d/` argument. On Fedora, edit `/etc/kubernetes/kubelet` to include this line:
|
||||
|
||||
```conf
|
||||
KUBELET_ARGS="--cluster-dns=10.254.0.10 --cluster-domain=kube.local --pod-manifest-path=/etc/kubelet.d/"
|
||||
```
|
||||
|
||||
Instructions for other distributions or Kubernetes installations may vary.
|
||||
|
||||
4. Restart the kubelet. On Fedora, run:
|
||||
|
||||
```shell
|
||||
[root@my-node1 ~] $ systemctl restart kubelet
|
||||
```
|
||||
|
||||
## Pods created via HTTP
|
||||
|
||||
The kubelet periodically downloads a file specified by the `--manifest-url=<URL>` argument and interprets it as a JSON/YAML file containing a pod definition. It works the same way as `--pod-manifest-path=<directory>`: the file is re-read periodically and changes are applied to running static pods (see below).
|
||||
|
||||
## Behavior of static pods
|
||||
|
||||
When the kubelet starts, it automatically starts all pods defined in the directory specified by the `--pod-manifest-path=` or `--manifest-url=` argument, in this case our static-web pod. (It may take some time to pull the nginx image, so be patient…):
|
||||
|
||||
```shell
|
||||
[joe@my-node1 ~] $ docker ps
|
||||
CONTAINER ID IMAGE COMMAND CREATED STATUS NAMES
|
||||
f6d05272b57e nginx:latest "nginx" 8 minutes ago Up 8 minutes k8s_web.6f802af4_static-web-fk-node1_default_67e24ed9466ba55986d120c867395f3c_378e5f3c
|
||||
```
|
||||
|
||||
If we look at our Kubernetes API server (running on host `my-master`), we see that a new mirror-pod was created there too:
|
||||
|
||||
```shell
|
||||
[joe@host ~] $ ssh my-master
|
||||
[joe@my-master ~] $ kubectl get pods
|
||||
POD IP CONTAINER(S) IMAGE(S) HOST LABELS STATUS CREATED MESSAGE
|
||||
static-web-my-node1 172.17.0.3 my-node1/192.168.100.71 role=myrole Running 11 minutes
|
||||
web nginx Running 11 minutes
|
||||
```
|
||||
|
||||
Labels from the static pod are propagated into the mirror-pod and can be used as usual for filtering.
|
||||
|
||||
Notice that we cannot delete the pod via the API server (e.g. with the [`kubectl`](/docs/user-guide/kubectl/) command); the kubelet simply won't remove it.
|
||||
|
||||
```shell
|
||||
[joe@my-master ~] $ kubectl delete pod static-web-my-node1
|
||||
pods/static-web-my-node1
|
||||
[joe@my-master ~] $ kubectl get pods
|
||||
POD IP CONTAINER(S) IMAGE(S) HOST ...
|
||||
static-web-my-node1 172.17.0.3 my-node1/192.168.100.71 ...
|
||||
```
|
||||
|
||||
Back on our `my-node1` host, we can try to stop the container manually and see that the kubelet automatically restarts it after a short while:
|
||||
|
||||
```shell
|
||||
[joe@host ~] $ ssh my-node1
|
||||
[joe@my-node1 ~] $ docker stop f6d05272b57e
|
||||
[joe@my-node1 ~] $ sleep 20
|
||||
[joe@my-node1 ~] $ docker ps
|
||||
CONTAINER ID IMAGE COMMAND CREATED ...
|
||||
5b920cbaf8b1 nginx:latest "nginx -g 'daemon of 2 seconds ago ...
|
||||
```
|
||||
|
||||
## Dynamic addition and removal of static pods
|
||||
|
||||
The running kubelet periodically scans the configured directory (`/etc/kubelet.d` in our example) for changes and adds/removes pods as files appear/disappear in that directory.
|
||||
|
||||
```shell
|
||||
[joe@my-node1 ~] $ mv /etc/kubelet.d/static-web.yaml /tmp
|
||||
[joe@my-node1 ~] $ sleep 20
|
||||
[joe@my-node1 ~] $ docker ps
|
||||
// no nginx container is running
|
||||
[joe@my-node1 ~] $ mv /tmp/static-web.yaml /etc/kubelet.d/
|
||||
[joe@my-node1 ~] $ sleep 20
|
||||
[joe@my-node1 ~] $ docker ps
|
||||
CONTAINER ID IMAGE COMMAND CREATED ...
|
||||
e7a62e3427f1 nginx:latest "nginx -g 'daemon of 27 seconds ago
|
||||
```
|
||||
[Static Pods](/docs/concepts/cluster-administration/static-pod/)
|
||||
|
|
|
@ -4,119 +4,6 @@ assignees:
|
|||
title: Using Sysctls in a Kubernetes Cluster
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
This document describes how sysctls are used within a Kubernetes cluster.
|
||||
|
||||
## What is a Sysctl?
|
||||
|
||||
In Linux, the sysctl interface allows an administrator to modify kernel
|
||||
parameters at runtime. Parameters are available via the `/proc/sys/` virtual
|
||||
process file system. The parameters cover various subsystems such as:
|
||||
|
||||
- kernel (common prefix: `kernel.`)
|
||||
- networking (common prefix: `net.`)
|
||||
- virtual memory (common prefix: `vm.`)
|
||||
- MDADM (common prefix: `dev.`)
|
||||
- More subsystems are described in [Kernel docs](https://www.kernel.org/doc/Documentation/sysctl/README).
|
||||
|
||||
To get a list of all parameters, you can run
|
||||
|
||||
```
|
||||
$ sudo sysctl -a
|
||||
```
|
||||
|
||||
## Namespaced vs. Node-Level Sysctls
|
||||
|
||||
A number of sysctls are _namespaced_ in today's Linux kernels. This means that
|
||||
they can be set independently for each pod on a node. Being namespaced is a
|
||||
requirement for sysctls to be accessible in a pod context within Kubernetes.
|
||||
|
||||
The following sysctls are known to be _namespaced_:
|
||||
|
||||
- `kernel.shm*`,
|
||||
- `kernel.msg*`,
|
||||
- `kernel.sem`,
|
||||
- `fs.mqueue.*`,
|
||||
- `net.*`.
|
||||
|
||||
Sysctls which are not namespaced are called _node-level_ and must be set
|
||||
manually by the cluster admin, either by means of the underlying Linux
|
||||
distribution of the nodes (e.g. via `/etc/sysctl.conf`) or using a DaemonSet
|
||||
with privileged containers.
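A possible sketch of the DaemonSet approach, assuming a privileged container that applies the setting and then sleeps (the name, image, and the sysctl value are illustrative):

```yaml
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: node-sysctls            # illustrative name
spec:
  template:
    metadata:
      labels:
        app: node-sysctls
    spec:
      containers:
      - name: set-sysctls
        image: busybox          # any image that provides a shell and the sysctl binary
        securityContext:
          privileged: true      # needed to write node-level sysctls
        command:
        - sh
        - -c
        # vm.* sysctls are not namespaced; the value here is purely illustrative
        - sysctl -w vm.max_map_count=262144 && sleep 3600000
```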
|
||||
|
||||
**Note**: it is good practice to consider nodes with special sysctl settings as
|
||||
_tainted_ within a cluster, and to only schedule pods that need those special
|
||||
sysctl settings onto them. It is suggested to use the Kubernetes [_taints and tolerations_
|
||||
feature](/docs/user-guide/kubectl/kubectl_taint.md) to implement this.
|
||||
|
||||
## Safe vs. Unsafe Sysctls
|
||||
|
||||
Sysctls are grouped into _safe_ and _unsafe_ sysctls. In addition to proper
|
||||
namespacing a _safe_ sysctl must be properly _isolated_ between pods on the same
|
||||
node. This means that setting a _safe_ sysctl for one pod
|
||||
|
||||
- must not have any influence on any other pod on the node
|
||||
- must not allow the node's health to be harmed
|
||||
- must not allow gaining CPU or memory resources outside of the resource limits
|
||||
of a pod.
|
||||
|
||||
By far, most of the _namespaced_ sysctls are not necessarily considered _safe_.
|
||||
|
||||
For Kubernetes 1.4, the following sysctls are supported in the _safe_ set:
|
||||
|
||||
- `kernel.shm_rmid_forced`,
|
||||
- `net.ipv4.ip_local_port_range`,
|
||||
- `net.ipv4.tcp_syncookies`.
|
||||
|
||||
This list will be extended in future Kubernetes versions when the kubelet
|
||||
supports better isolation mechanisms.
|
||||
|
||||
All _safe_ sysctls are enabled by default.
|
||||
|
||||
All _unsafe_ sysctls are disabled by default and must be allowed manually by the
|
||||
cluster admin on a per-node basis. Pods with disabled unsafe sysctls will be
|
||||
scheduled, but will fail to launch.
|
||||
|
||||
**Warning**: Due to their nature of being _unsafe_, the use of _unsafe_ sysctls
|
||||
is at your own risk and can lead to severe problems such as misbehaving
|
||||
containers, resource shortages, or a completely broken node.
|
||||
|
||||
## Enabling Unsafe Sysctls
|
||||
|
||||
With the warning above in mind, the cluster admin can allow certain _unsafe_
|
||||
sysctls for very special situations like e.g. high-performance or real-time
|
||||
application tuning. _Unsafe_ sysctls are enabled on a node-by-node basis with a
|
||||
flag of the kubelet, e.g.:
|
||||
|
||||
```shell
|
||||
$ kubelet --experimental-allowed-unsafe-sysctls 'kernel.msg*,net.ipv4.route.min_pmtu' ...
|
||||
```
|
||||
|
||||
Only _namespaced_ sysctls can be enabled this way.
|
||||
|
||||
## Setting Sysctls for a Pod
|
||||
|
||||
The sysctl feature is an alpha API in Kubernetes 1.4. Therefore, sysctls are set
|
||||
using annotations on pods. They apply to all containers in the same pod.
|
||||
|
||||
Here is an example, with different annotations for _safe_ and _unsafe_ sysctls:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: sysctl-example
|
||||
annotations:
|
||||
security.alpha.kubernetes.io/sysctls: kernel.shm_rmid_forced=1
|
||||
security.alpha.kubernetes.io/unsafe-sysctls: net.ipv4.route.min_pmtu=1000,kernel.msgmax=1 2 3
|
||||
spec:
|
||||
...
|
||||
```
|
||||
|
||||
**Note**: a pod with the _unsafe_ sysctls specified above will fail to launch on
|
||||
any node which has not enabled those two _unsafe_ sysctls explicitly. As with
|
||||
_node-level_ sysctls, it is recommended to use the [_taints and tolerations_
|
||||
feature](/docs/user-guide/kubectl/kubectl_taint.md) or [labels on nodes](/docs/user-guide/labels.md)
|
||||
to schedule those pods onto the right nodes.
|
||||
[Using Sysctls in a Kubernetes Cluster](/docs/concepts/cluster-administration/sysctl-cluster/)
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
---
|
||||
assignees:
|
||||
- mml
|
||||
title: Cluster Management Guide
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
This document outlines the potentially disruptive changes that exist in the 1.6 release cycle. Operators, administrators, and developers should
|
||||
take note of the changes below in order to maintain continuity across their upgrade process.
|
||||
|
||||
## Cluster defaults set to etcd 3
|
||||
|
||||
In the 1.6 release cycle, the default backend storage layer has been upgraded to fully leverage [etcd 3 capabilities](https://coreos.com/blog/etcd3-a-new-etcd.html) by default.
|
||||
For new clusters, there is nothing an operator needs to do; it should "just work". However, if you are upgrading from a 1.5 cluster, care should be taken to ensure
|
||||
continuity.
|
||||
|
||||
It is possible to maintain v2 compatibility mode while running etcd 3 for an interim period of time. To do this, you will simply need to update an argument passed to your apiserver during
|
||||
startup:
|
||||
|
||||
```shell
|
||||
$ kube-apiserver --storage-backend='etcd2' $(EXISTING_ARGS)
|
||||
```
|
||||
|
||||
However, for long-term maintenance of the cluster, we recommend that the operator plan an outage window in order to perform a [v2->v3 data upgrade](https://coreos.com/etcd/docs/latest/upgrades/upgrade_3_0.html).
|
|
@ -3620,7 +3620,7 @@ The StatefulSet guarantees that a given network identity will always map to the
|
|||
</tr>
|
||||
<tr>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">nodeSelector</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node’s labels for the pod to be scheduled on that node. More info: <a href="http://kubernetes.io/docs/user-guide/node-selection/README">http://kubernetes.io/docs/user-guide/node-selection/README</a></p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node’s labels for the pod to be scheduled on that node. More info: <a href="http://kubernetes.io/docs/user-guide/node-selection">http://kubernetes.io/docs/user-guide/node-selection</a></p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">object</p></td>
|
||||
<td class="tableblock halign-left valign-top"></td>
|
||||
|
|
|
@ -3609,7 +3609,7 @@ Populated by the system when a graceful deletion is requested. Read-only. More i
|
|||
</tr>
|
||||
<tr>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">nodeSelector</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node’s labels for the pod to be scheduled on that node. More info: <a href="http://kubernetes.io/docs/user-guide/node-selection/README">http://kubernetes.io/docs/user-guide/node-selection/README</a></p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node’s labels for the pod to be scheduled on that node. More info: <a href="http://kubernetes.io/docs/user-guide/node-selection">http://kubernetes.io/docs/user-guide/node-selection</a></p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">object</p></td>
|
||||
<td class="tableblock halign-left valign-top"></td>
|
||||
|
|
|
@ -3457,7 +3457,7 @@ Populated by the system when a graceful deletion is requested. Read-only. More i
|
|||
</tr>
|
||||
<tr>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">nodeSelector</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node’s labels for the pod to be scheduled on that node. More info: <a href="http://kubernetes.io/docs/user-guide/node-selection/README">http://kubernetes.io/docs/user-guide/node-selection/README</a></p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node’s labels for the pod to be scheduled on that node. More info: <a href="http://kubernetes.io/docs/user-guide/node-selection">http://kubernetes.io/docs/user-guide/node-selection</a></p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">object</p></td>
|
||||
<td class="tableblock halign-left valign-top"></td>
|
||||
|
|
|
@ -8010,7 +8010,7 @@ Appears In <a href="#pod-v1">Pod</a> <a href="#podtemplatespec-v1">PodTemplateSp
|
|||
</tr>
|
||||
<tr>
|
||||
<td>nodeSelector <br /> <em>object</em></td>
|
||||
<td>NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: <a href="http://kubernetes.io/docs/user-guide/node-selection/README">http://kubernetes.io/docs/user-guide/node-selection/README</a></td>
|
||||
<td>NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: <a href="http://kubernetes.io/docs/user-guide/node-selection">http://kubernetes.io/docs/user-guide/node-selection</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>restartPolicy <br /> <em>string</em></td>
|
||||
|
|
102
docs/api.md
102
docs/api.md
|
@ -6,104 +6,6 @@ assignees:
|
|||
title: Kubernetes API Overview
|
||||
---
|
||||
|
||||
Primary system and API concepts are documented in the [User guide](/docs/user-guide/).
|
||||
{% include user-guide-content-moved.md %}
|
||||
|
||||
Overall API conventions are described in the [API conventions doc](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md).
|
||||
|
||||
Remote access to the API is discussed in the [access doc](/docs/admin/accessing-the-api).
|
||||
|
||||
The Kubernetes API also serves as the foundation for the declarative configuration schema for the system. The [Kubectl](/docs/user-guide/kubectl) command-line tool can be used to create, update, delete, and get API objects.
|
||||
|
||||
Kubernetes also stores its serialized state (currently in [etcd](https://coreos.com/docs/distributed-configuration/getting-started-with-etcd/)) in terms of the API resources.
|
||||
|
||||
Kubernetes itself is decomposed into multiple components, which interact through its API.
|
||||
|
||||
## API changes
|
||||
|
||||
In our experience, any system that is successful needs to grow and change as new use cases emerge or existing ones change. Therefore, we expect the Kubernetes API to continuously change and grow. However, we intend to not break compatibility with existing clients, for an extended period of time. In general, new API resources and new resource fields can be expected to be added frequently. Elimination of resources or fields will require following a deprecation process. The precise deprecation policy for eliminating features is TBD, but once we reach our 1.0 milestone, there will be a specific policy.
|
||||
|
||||
What constitutes a compatible change and how to change the API are detailed by the [API change document](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api_changes.md).
|
||||
|
||||
## OpenAPI and Swagger definitions
|
||||
|
||||
Complete API details are documented using [Swagger v1.2](http://swagger.io/) and [OpenAPI](https://www.openapis.org/). The Kubernetes apiserver (aka "master") exposes an API that can be used to retrieve the Swagger v1.2 Kubernetes API spec located at `/swaggerapi`. You can also enable a UI to browse the API documentation at `/swagger-ui` by passing the `--enable-swagger-ui=true` flag to apiserver.
|
||||
|
||||
We also host a version of the [latest v1.2 API documentation UI](http://kubernetes.io/kubernetes/third_party/swagger-ui/). This is updated with the latest release, so if you are using a different version of Kubernetes you will want to use the spec from your apiserver.
|
||||
|
||||
Starting with kubernetes 1.4, OpenAPI spec is also available at `/swagger.json`. While we are transitioning from Swagger v1.2 to OpenAPI (aka Swagger v2.0), some of the tools such as kubectl and swagger-ui are still using v1.2 spec. OpenAPI spec is in Beta as of Kubernetes 1.5.
|
||||
|
||||
Kubernetes implements an alternative Protobuf based serialization format for the API that is primarily intended for intra-cluster communication, documented in the [design proposal](https://github.com/kubernetes/kubernetes/blob/{{ page.githubbranch }}/docs/proposals/protobuf.md) and the IDL files for each schema are located in the Go packages that define the API objects.
|
||||
|
||||
## API versioning
|
||||
|
||||
To make it easier to eliminate fields or restructure resource representations, Kubernetes supports
|
||||
multiple API versions, each at a different API path, such as `/api/v1` or
|
||||
`/apis/extensions/v1beta1`.
|
||||
|
||||
We chose to version at the API level rather than at the resource or field level to ensure that the API presents a clear, consistent view of system resources and behavior, and to enable controlling access to end-of-lifed and/or experimental APIs. The JSON and Protobuf serialization schemas follow the same guidelines for schema changes - all descriptions below cover both formats.
|
||||
|
||||
Note that API versioning and Software versioning are only indirectly related. The [API and release
|
||||
versioning proposal](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/versioning.md) describes the relationship between API versioning and
|
||||
software versioning.
|
||||
|
||||
|
||||
Different API versions imply different levels of stability and support. The criteria for each level are described
|
||||
in more detail in the [API Changes documentation](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api_changes.md#alpha-beta-and-stable-versions). They are summarized here:
|
||||
|
||||
- Alpha level:
|
||||
- The version names contain `alpha` (e.g. `v1alpha1`).
|
||||
- May be buggy. Enabling the feature may expose bugs. Disabled by default.
|
||||
- Support for feature may be dropped at any time without notice.
|
||||
- The API may change in incompatible ways in a later software release without notice.
|
||||
- Recommended for use only in short-lived testing clusters, due to increased risk of bugs and lack of long-term support.
|
||||
- Beta level:
|
||||
- The version names contain `beta` (e.g. `v2beta3`).
|
||||
- Code is well tested. Enabling the feature is considered safe. Enabled by default.
|
||||
- Support for the overall feature will not be dropped, though details may change.
|
||||
- The schema and/or semantics of objects may change in incompatible ways in a subsequent beta or stable release. When this happens,
|
||||
we will provide instructions for migrating to the next version. This may require deleting, editing, and re-creating
|
||||
API objects. The editing process may require some thought. This may require downtime for applications that rely on the feature.
|
||||
- Recommended for only non-business-critical uses because of potential for incompatible changes in subsequent releases. If you have
|
||||
multiple clusters which can be upgraded independently, you may be able to relax this restriction.
|
||||
- **Please do try our beta features and give feedback on them! Once they exit beta, it may not be practical for us to make more changes.**
|
||||
- Stable level:
|
||||
- The version name is `vX` where `X` is an integer.
|
||||
- Stable versions of features will appear in released software for many subsequent versions.
|
||||
|
||||
## API groups
|
||||
|
||||
To make it easier to extend the Kubernetes API, we implemented [*API groups*](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/api-group.md).
|
||||
The API group is specified in a REST path and in the `apiVersion` field of a serialized object.
|
||||
|
||||
Currently there are several API groups in use:
|
||||
|
||||
1. the "core" (oftentimes called "legacy", due to not having explicit group name) group, which is at
|
||||
REST path `/api/v1` and is not specified as part of the `apiVersion` field, e.g. `apiVersion: v1`.
|
||||
1. the named groups are at REST path `/apis/$GROUP_NAME/$VERSION`, and use `apiVersion: $GROUP_NAME/$VERSION`
|
||||
(e.g. `apiVersion: batch/v1`). Full list of supported API groups can be seen in [Kubernetes API reference](/docs/reference/).
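In manifests, the difference is visible only in the `apiVersion` field. For example (both objects are illustrative):

```yaml
# Core ("legacy") group: no group name in apiVersion
apiVersion: v1
kind: Pod
metadata:
  name: group-example-pod       # illustrative name
spec:
  containers:
  - name: nginx
    image: nginx
---
# Named group: <group>/<version> in apiVersion
apiVersion: batch/v1
kind: Job
metadata:
  name: group-example-job       # illustrative name
spec:
  template:
    metadata:
      name: group-example-job
    spec:
      containers:
      - name: hello
        image: busybox
        command: ["echo", "hello"]
      restartPolicy: Never
```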
|
||||
|
||||
|
||||
There are two supported paths to extending the API.
|
||||
1. [Third Party Resources](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/extending-api.md)
|
||||
are for users with very basic CRUD needs.
|
||||
1. Coming soon: users needing the full set of Kubernetes API semantics can implement their own apiserver
|
||||
and use the [aggregator](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/aggregated-api-servers.md)
|
||||
to make it seamless for clients.
|
||||
|
||||
|
||||
## Enabling API groups
|
||||
|
||||
Certain resources and API groups are enabled by default. They can be enabled or disabled by setting `--runtime-config`
|
||||
on the apiserver. `--runtime-config` accepts comma-separated values. For example, to disable batch/v1, set
|
||||
`--runtime-config=batch/v1=false`, to enable batch/v2alpha1, set `--runtime-config=batch/v2alpha1`.
|
||||
The flag accepts a comma-separated set of key=value pairs describing the runtime configuration of the apiserver.
|
||||
|
||||
IMPORTANT: Enabling or disabling groups or resources requires restarting apiserver and controller-manager
|
||||
to pick up the `--runtime-config` changes.
|
||||
|
||||
## Enabling resources in the groups
|
||||
|
||||
DaemonSets, Deployments, HorizontalPodAutoscalers, Ingress, Jobs and ReplicaSets are enabled by default.
|
||||
Other extensions resources can be enabled by setting `--runtime-config` on
|
||||
the apiserver. `--runtime-config` accepts comma-separated values. For example, to disable deployments and jobs, set
|
||||
`--runtime-config=extensions/v1beta1/deployments=false,extensions/v1beta1/jobs=false`
|
||||
[The Kubernetes API](/docs/concepts/overview/kubernetes-api/)
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
apiVersion: extensions/v1beta1
|
||||
kind: ReplicaSet
|
||||
metadata:
|
||||
name: my-repset
|
||||
spec:
|
||||
replicas: 3
|
||||
selector:
|
||||
matchLabels:
|
||||
pod-is-for: garbage-collection-example
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
pod-is-for: garbage-collection-example
|
||||
spec:
|
||||
containers:
|
||||
- name: nginx
|
||||
image: nginx
|
|
@ -0,0 +1,325 @@
|
|||
---
|
||||
title: Accessing Clusters
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
## Accessing the cluster API
|
||||
|
||||
### Accessing for the first time with kubectl
|
||||
|
||||
When accessing the Kubernetes API for the first time, we suggest using the
|
||||
Kubernetes CLI, `kubectl`.
|
||||
|
||||
To access a cluster, you need to know the location of the cluster and have credentials
|
||||
to access it. Typically, this is automatically set up when you work through
|
||||
a [Getting started guide](/docs/getting-started-guides/),
|
||||
or when someone else set up the cluster and provided you with credentials and a location.
|
||||
|
||||
Check the location and credentials that kubectl knows about with this command:
|
||||
|
||||
```shell
|
||||
$ kubectl config view
|
||||
```
|
||||
|
||||
Many of the [examples](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/examples/) provide an introduction to using
|
||||
kubectl and complete documentation is found in the [kubectl manual](/docs/user-guide/kubectl/index).
|
||||
|
||||
### Directly accessing the REST API
|
||||
|
||||
Kubectl handles locating and authenticating to the apiserver.
|
||||
If you want to directly access the REST API with an http client like
|
||||
curl or wget, or a browser, there are several ways to locate and authenticate:
|
||||
|
||||
- Run kubectl in proxy mode.
|
||||
- Recommended approach.
|
||||
- Uses stored apiserver location.
|
||||
- Verifies identity of apiserver using self-signed cert. No MITM possible.
|
||||
- Authenticates to apiserver.
|
||||
- In future, may do intelligent client-side load-balancing and failover.
|
||||
- Provide the location and credentials directly to the http client.
|
||||
- Alternate approach.
|
||||
- Works with some types of client code that are confused by using a proxy.
|
||||
- Need to import a root cert into your browser to protect against MITM.
|
||||
|
||||
#### Using kubectl proxy
|
||||
|
||||
The following command runs kubectl in a mode where it acts as a reverse proxy. It handles
|
||||
locating the apiserver and authenticating.
|
||||
Run it like this:
|
||||
|
||||
```shell
|
||||
$ kubectl proxy --port=8080 &
|
||||
```
|
||||
|
||||
See [kubectl proxy](/docs/user-guide/kubectl/kubectl_proxy) for more details.
|
||||
|
||||
Then you can explore the API with curl, wget, or a browser, like so:
|
||||
|
||||
```shell
|
||||
$ curl http://localhost:8080/api/
|
||||
{
|
||||
"versions": [
|
||||
"v1"
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
#### Without kubectl proxy (before v1.3.x)
|
||||
|
||||
It is possible to avoid using kubectl proxy by passing an authentication token
|
||||
directly to the apiserver, like this:
|
||||
|
||||
```shell
|
||||
$ APISERVER=$(kubectl config view | grep server | cut -f 2- -d ":" | tr -d " ")
|
||||
$ TOKEN=$(kubectl config view | grep token | cut -f 2 -d ":" | tr -d " ")
|
||||
$ curl $APISERVER/api --header "Authorization: Bearer $TOKEN" --insecure
|
||||
{
|
||||
"versions": [
|
||||
"v1"
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
#### Without kubectl proxy (post v1.3.x)
|
||||
|
||||
In Kubernetes version 1.3 or later, `kubectl config view` no longer displays the token. Use `kubectl describe secret...` to get the token for the default service account, like this:
|
||||
|
||||
``` shell
|
||||
$ APISERVER=$(kubectl config view | grep server | cut -f 2- -d ":" | tr -d " ")
|
||||
$ TOKEN=$(kubectl describe secret $(kubectl get secrets | grep default | cut -f1 -d ' ') | grep -E '^token' | cut -f2 -d':' | tr -d '\t')
|
||||
$ curl $APISERVER/api --header "Authorization: Bearer $TOKEN" --insecure
|
||||
{
|
||||
"kind": "APIVersions",
|
||||
"versions": [
|
||||
"v1"
|
||||
],
|
||||
"serverAddressByClientCIDRs": [
|
||||
{
|
||||
"clientCIDR": "0.0.0.0/0",
|
||||
"serverAddress": "10.0.1.149:443"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The above examples use the `--insecure` flag. This leaves it subject to MITM
|
||||
attacks. When kubectl accesses the cluster it uses a stored root certificate
|
||||
and client certificates to access the server. (These are installed in the
|
||||
`~/.kube` directory). Since cluster certificates are typically self-signed, it
|
||||
may take special configuration to get your http client to use the root
|
||||
certificate.
|
||||
|
||||
On some clusters, the apiserver does not require authentication; it may serve
|
||||
on localhost, or be protected by a firewall. There is not a standard
|
||||
for this. [Configuring Access to the API](/docs/admin/accessing-the-api)
|
||||
describes how a cluster admin can configure this. Such approaches may conflict
|
||||
with future high-availability support.
|
||||
|
||||
### Programmatic access to the API
|
||||
|
||||
The Kubernetes project-supported Go client library is at [https://github.com/kubernetes/client-go](https://github.com/kubernetes/client-go).
|
||||
|
||||
To use it,
|
||||
|
||||
* To get the library, run the following command: `go get k8s.io/client-go/<version number>/kubernetes` See [https://github.com/kubernetes/client-go](https://github.com/kubernetes/client-go) to see which versions are supported.
|
||||
* Write an application atop the client-go clients. Note that client-go defines its own API objects, so if needed, please import API definitions from client-go rather than from the main repository, e.g., `import "k8s.io/client-go/1.4/pkg/api/v1"` is correct.
|
||||
|
||||
The Go client can use the same [kubeconfig file](/docs/concepts/cluster-administration/authenticate-across-clusters-kubeconfig/)
|
||||
as the kubectl CLI does to locate and authenticate to the apiserver. See this [example](https://github.com/kubernetes/client-go/blob/master/examples/out-of-cluster/main.go):
|
||||
|
||||
```golang
|
||||
import (
|
||||
"fmt"
|
||||
"k8s.io/client-go/1.4/kubernetes"
|
||||
"k8s.io/client-go/1.4/pkg/api/v1"
|
||||
"k8s.io/client-go/1.4/tools/clientcmd"
|
||||
)
|
||||
...
|
||||
// uses the current context in kubeconfig
|
||||
config, _ := clientcmd.BuildConfigFromFlags("", "path to kubeconfig")
|
||||
// creates the clientset
|
||||
clientset, _ := kubernetes.NewForConfig(config)
|
||||
// access the API to list pods
|
||||
pods, _ := clientset.Core().Pods("").List(v1.ListOptions{})
|
||||
fmt.Printf("There are %d pods in the cluster\n", len(pods.Items))
|
||||
...
|
||||
```
|
||||
|
||||
If the application is deployed as a Pod in the cluster, please refer to the [next section](#accessing-the-api-from-a-pod).
|
||||
|
||||
There are [client libraries](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/client-libraries.md) for accessing the API from other languages. See documentation for other libraries for how they authenticate.
|
||||
|
||||
### Accessing the API from a Pod
|
||||
|
||||
When accessing the API from a pod, locating and authenticating
|
||||
to the api server are somewhat different.
|
||||
|
||||
The recommended way to locate the apiserver within the pod is with
|
||||
the `kubernetes` DNS name, which resolves to a Service IP which in turn
|
||||
will be routed to an apiserver.
|
||||
|
||||
The recommended way to authenticate to the apiserver is with a
|
||||
[service account](/docs/user-guide/service-accounts) credential. By default, a pod
|
||||
is associated with a service account, and a credential (token) for that
|
||||
service account is placed into the filesystem tree of each container in that pod,
|
||||
at `/var/run/secrets/kubernetes.io/serviceaccount/token`.
|
||||
|
||||
If available, a certificate bundle is placed into the filesystem tree of each
|
||||
container at `/var/run/secrets/kubernetes.io/serviceaccount/ca.crt`, and should be
|
||||
used to verify the serving certificate of the apiserver.
|
||||
|
||||
Finally, the default namespace to be used for namespaced API operations is placed in a file
|
||||
at `/var/run/secrets/kubernetes.io/serviceaccount/namespace` in each container.
|
||||
|
||||
From within a pod the recommended ways to connect to API are:
|
||||
|
||||
- run a kubectl proxy as one of the containers in the pod, or as a background
|
||||
process within a container. This proxies the
|
||||
Kubernetes API to the localhost interface of the pod, so that other processes
|
||||
in any container of the pod can access it. See this [example of using kubectl proxy
|
||||
in a pod](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/examples/kubectl-container/). A sketch of this sidecar pattern is shown after this list.
|
||||
- use the Go client library, and create a client using the `rest.InClusterConfig()` and `kubernetes.NewForConfig()` functions.
|
||||
They handle locating and authenticating to the apiserver. [example](https://github.com/kubernetes/client-go/blob/master/examples/in-cluster/main.go)
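A sketch of the first option: a pod that runs `kubectl proxy` as a sidecar next to the application container. The image name is a placeholder for any image that ships the `kubectl` binary:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: app-with-kubectl-proxy  # illustrative name
spec:
  containers:
  - name: app
    image: nginx                # stands in for your application container
  - name: kubectl-proxy
    image: <your-kubectl-image> # placeholder: any image that contains the kubectl binary
    command:
    - kubectl
    - proxy
    - --port=8001
    # Other containers in this pod can now reach the API at http://localhost:8001/,
    # authenticated with the pod's service account credentials.
```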
|
||||
|
||||
In each case, the credentials of the pod are used to communicate securely with the apiserver.
|
||||
|
||||
|
||||
## Accessing services running on the cluster
|
||||
|
||||
The previous section was about connecting to the Kubernetes API server. This section is about
|
||||
connecting to other services running on Kubernetes cluster. In Kubernetes, the
|
||||
[nodes](/docs/admin/node), [pods](/docs/user-guide/pods) and [services](/docs/user-guide/services) all have
|
||||
their own IPs. In many cases, the node IPs, pod IPs, and some service IPs on a cluster will not be
|
||||
routable, so they will not be reachable from a machine outside the cluster,
|
||||
such as your desktop machine.
|
||||
|
||||
### Ways to connect
|
||||
|
||||
You have several options for connecting to nodes, pods and services from outside the cluster:
|
||||
|
||||
- Access services through public IPs.
|
||||
- Use a service with type `NodePort` or `LoadBalancer` to make the service reachable outside
|
||||
the cluster. See the [services](/docs/user-guide/services) and
|
||||
[kubectl expose](/docs/user-guide/kubectl/kubectl_expose) documentation.
|
||||
- Depending on your cluster environment, this may just expose the service to your corporate network,
|
||||
or it may expose it to the internet. Think about whether the service being exposed is secure.
|
||||
Does it do its own authentication?
|
||||
- Place pods behind services. To access one specific pod from a set of replicas, such as for debugging,
|
||||
place a unique label on the pod and create a new service which selects this label.
|
||||
- In most cases, it should not be necessary for an application developer to directly access
|
||||
nodes via their nodeIPs.
|
||||
- Access services, nodes, or pods using the Proxy Verb.
|
||||
- Does apiserver authentication and authorization prior to accessing the remote service.
|
||||
Use this if the services are not secure enough to expose to the internet, or to gain
|
||||
access to ports on the node IP, or for debugging.
|
||||
- Proxies may cause problems for some web applications.
|
||||
- Only works for HTTP/HTTPS.
|
||||
- Described [here](#manually-constructing-apiserver-proxy-urls).
|
||||
- Access from a node or pod in the cluster.
|
||||
- Run a pod, and then connect to a shell in it using [kubectl exec](/docs/user-guide/kubectl/kubectl_exec).
|
||||
Connect to other nodes, pods, and services from that shell.
|
||||
- Some clusters may allow you to ssh to a node in the cluster. From there you may be able to
|
||||
access cluster services. This is a non-standard method, and will work on some clusters but
|
||||
not others. Browsers and other tools may or may not be installed. Cluster DNS may not work.
|
||||
|
||||
### Discovering builtin services
|
||||
|
||||
Typically, there are several services which are started on a cluster in the kube-system namespace. Get a list of these
|
||||
with the `kubectl cluster-info` command:
|
||||
|
||||
```shell
|
||||
$ kubectl cluster-info
|
||||
|
||||
Kubernetes master is running at https://104.197.5.247
|
||||
elasticsearch-logging is running at https://104.197.5.247/api/v1/proxy/namespaces/kube-system/services/elasticsearch-logging
|
||||
kibana-logging is running at https://104.197.5.247/api/v1/proxy/namespaces/kube-system/services/kibana-logging
|
||||
kube-dns is running at https://104.197.5.247/api/v1/proxy/namespaces/kube-system/services/kube-dns
|
||||
grafana is running at https://104.197.5.247/api/v1/proxy/namespaces/kube-system/services/monitoring-grafana
|
||||
heapster is running at https://104.197.5.247/api/v1/proxy/namespaces/kube-system/services/monitoring-heapster
|
||||
```
|
||||
|
||||
This shows the proxy-verb URL for accessing each service.
|
||||
For example, this cluster has cluster-level logging enabled (using Elasticsearch), which can be reached
|
||||
at `https://104.197.5.247/api/v1/proxy/namespaces/kube-system/services/elasticsearch-logging/` if suitable credentials are passed, or through a kubectl proxy at, for example:
|
||||
`http://localhost:8080/api/v1/proxy/namespaces/kube-system/services/elasticsearch-logging/`.
|
||||
(See [above](#accessing-the-cluster-api) for how to pass credentials or use kubectl proxy.)
|
||||
|
||||
#### Manually constructing apiserver proxy URLs
|
||||
|
||||
As mentioned above, you use the `kubectl cluster-info` command to retrieve the service's proxy URL. To create proxy URLs that include service endpoints, suffixes, and parameters, you simply append to the service's proxy URL:
|
||||
`http://`*`kubernetes_master_address`*`/api/v1/proxy/namespaces/`*`namespace_name`*`/services/`*`service_name[:port_name]`*
|
||||
|
||||
If you haven't specified a name for your port, you don't have to specify *port_name* in the URL.
|
||||
|
||||
##### Examples
|
||||
|
||||
* To access the Elasticsearch service endpoint `_search?q=user:kimchy`, you would use: `http://104.197.5.247/api/v1/proxy/namespaces/kube-system/services/elasticsearch-logging/_search?q=user:kimchy`
|
||||
* To access the Elasticsearch cluster health information `_cluster/health?pretty=true`, you would use: `https://104.197.5.247/api/v1/proxy/namespaces/kube-system/services/elasticsearch-logging/_cluster/health?pretty=true`
|
||||
|
||||
```json
|
||||
{
|
||||
"cluster_name" : "kubernetes_logging",
|
||||
"status" : "yellow",
|
||||
"timed_out" : false,
|
||||
"number_of_nodes" : 1,
|
||||
"number_of_data_nodes" : 1,
|
||||
"active_primary_shards" : 5,
|
||||
"active_shards" : 5,
|
||||
"relocating_shards" : 0,
|
||||
"initializing_shards" : 0,
|
||||
"unassigned_shards" : 5
|
||||
}
|
||||
```
|
||||
|
||||
#### Using web browsers to access services running on the cluster
|
||||
|
||||
You may be able to put an apiserver proxy URL into the address bar of a browser. However:
|
||||
|
||||
- Web browsers cannot usually pass tokens, so you may need to use basic (password) auth. Apiserver can be configured to accept basic auth,
|
||||
but your cluster may not be configured to accept basic auth.
|
||||
- Some web apps may not work, particularly those with client-side JavaScript that construct URLs in a
|
||||
way that is unaware of the proxy path prefix.
|
||||
|
||||
## Requesting redirects
|
||||
|
||||
The redirect capabilities have been deprecated and removed. Please use a proxy (see below) instead.
|
||||
|
||||
## So Many Proxies
|
||||
|
||||
There are several different proxies you may encounter when using Kubernetes:
|
||||
|
||||
1. The [kubectl proxy](#directly-accessing-the-rest-api):
|
||||
- runs on a user's desktop or in a pod
|
||||
- proxies from a localhost address to the Kubernetes apiserver
|
||||
- client to proxy uses HTTP
|
||||
- proxy to apiserver uses HTTPS
|
||||
- locates apiserver
|
||||
- adds authentication headers
|
||||
1. The [apiserver proxy](#discovering-builtin-services):
|
||||
- is a bastion built into the apiserver
|
||||
- connects a user outside of the cluster to cluster IPs which otherwise might not be reachable
|
||||
- runs in the apiserver processes
|
||||
- client to proxy uses HTTPS (or http if apiserver so configured)
|
||||
- proxy to target may use HTTP or HTTPS as chosen by proxy using available information
|
||||
- can be used to reach a Node, Pod, or Service
|
||||
- does load balancing when used to reach a Service
|
||||
1. The [kube proxy](/docs/user-guide/services/#ips-and-vips):
|
||||
- runs on each node
|
||||
- proxies UDP and TCP
|
||||
- does not understand HTTP
|
||||
- provides load balancing
|
||||
- is just used to reach services
|
||||
1. A Proxy/Load-balancer in front of apiserver(s):
|
||||
- existence and implementation varies from cluster to cluster (e.g. nginx)
|
||||
- sits between all clients and one or more apiservers
|
||||
- acts as load balancer if there are several apiservers.
|
||||
1. Cloud Load Balancers on external services:
|
||||
- are provided by some cloud providers (e.g. AWS ELB, Google Cloud Load Balancer)
|
||||
- are created automatically when the Kubernetes service has type `LoadBalancer`
|
||||
- use UDP/TCP only
|
||||
- implementation varies by cloud provider.
|
||||
|
||||
Kubernetes users will typically not need to worry about anything other than the first two types. The cluster admin
|
||||
will typically ensure that the latter types are set up correctly.
|
|
@ -0,0 +1,67 @@
|
|||
---
|
||||
assignees:
|
||||
- soltysh
|
||||
- sttts
|
||||
title: Auditing
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
Kubernetes Audit provides a security-relevant chronological set of records documenting
|
||||
the sequence of activities that have affected the system, by individual users, administrators
|
||||
or other components of the system. It allows the cluster administrator to
|
||||
answer the following questions:
|
||||
- what happened?
|
||||
- when did it happen?
|
||||
- who initiated it?
|
||||
- on what did it happen?
|
||||
- where was it observed?
|
||||
- from where was it initiated?
|
||||
- to where was it going?
|
||||
|
||||
NOTE: Currently, Kubernetes provides only basic audit capabilities; there is still a lot
|
||||
of work going on to provide fully featured auditing capabilities (see [this issue](https://github.com/kubernetes/features/issues/22)).
|
||||
|
||||
Kubernetes audit is part of the [kube-apiserver](/docs/admin/kube-apiserver), logging all requests
|
||||
coming to the server. Each audited request produces two log lines:
|
||||
|
||||
1. The request line containing:
|
||||
- a unique id allowing the response line to be matched (see 2)
|
||||
- source ip of the request
|
||||
- HTTP method being invoked
|
||||
- original user invoking the operation
|
||||
- impersonated user for the operation
|
||||
- namespace of the request or `<none>`
|
||||
- URI as requested
|
||||
2. The response line containing:
|
||||
- the unique id from 1
|
||||
- response code
|
||||
|
||||
Example output for user `admin` asking for a list of pods:
|
||||
|
||||
```
|
||||
2016-09-07T13:03:57.400333046Z AUDIT: id="5c3b8227-4af9-4322-8a71-542231c3887b" ip="127.0.0.1" method="GET" user="admin" as="<self>" namespace="default" uri="/api/v1/namespaces/default/pods"
|
||||
2016-09-07T13:03:57.400710987Z AUDIT: id="5c3b8227-4af9-4322-8a71-542231c3887b" response="200"
|
||||
```
|
||||
|
||||
NOTE: The audit capabilities are available *only* for the secured endpoint of the API server.
|
||||
|
||||
## Configuration
|
||||
|
||||
[Kube-apiserver](/docs/admin/kube-apiserver) provides the following options, which are responsible
|
||||
for configuring where and how audit logs are handled:
|
||||
|
||||
- `audit-log-path` - enables the audit log pointing to a file where the requests are being logged to.
|
||||
- `audit-log-maxage` - specifies maximum number of days to retain old audit log files based on the timestamp encoded in their filename.
|
||||
- `audit-log-maxbackup` - specifies maximum number of old audit log files to retain.
|
||||
- `audit-log-maxsize` - specifies maximum size in megabytes of the audit log file before it gets rotated. Defaults to 100MB.
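For example, on clusters where the API server runs as a static pod, these flags are added to its command line. A hedged fragment of such a manifest (the log path and rotation values are illustrative):

```yaml
# Fragment of a kube-apiserver pod spec; only the audit-related flags are shown.
spec:
  containers:
  - name: kube-apiserver
    command:
    - kube-apiserver
    # ... existing flags ...
    - --audit-log-path=/var/log/kube-apiserver-audit.log  # illustrative path
    - --audit-log-maxage=30                                # keep logs for 30 days (illustrative)
    - --audit-log-maxbackup=10                             # keep at most 10 rotated files (illustrative)
    - --audit-log-maxsize=100                              # rotate at 100MB
```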
|
||||
|
||||
If an audit log file already exists, Kubernetes appends new audit logs to that file.
|
||||
Otherwise, Kubernetes creates an audit log file at the location you specified in
|
||||
`audit-log-path`. If the audit log file exceeds the size you specify in `audit-log-maxsize`,
|
||||
Kubernetes will rename the current log file by appending the current timestamp on
|
||||
the file name (before the file extension) and create a new audit log file.
|
||||
Kubernetes may delete old log files when creating a new log file; you can configure
|
||||
how many files are retained and how old they can be by specifying the `audit-log-maxbackup`
|
||||
and `audit-log-maxage` options.
|
|
@ -0,0 +1,314 @@
|
|||
---
|
||||
assignees:
|
||||
- mikedanese
|
||||
- thockin
|
||||
title: Authenticating Across Clusters with kubeconfig
|
||||
---
|
||||
|
||||
Authentication in Kubernetes can differ for different individuals.
|
||||
|
||||
- A running kubelet might have one way of authenticating (i.e. certificates).
|
||||
- Users might have a different way of authenticating (i.e. tokens).
|
||||
- Administrators might have a list of certificates which they provide individual users.
|
||||
- There may be multiple clusters, and we may want to define them all in one place - giving users the ability to use their own certificates and reusing the same global configuration.
|
||||
|
||||
So in order to easily switch between multiple clusters, for multiple users, a kubeconfig file was defined.
|
||||
|
||||
This file contains a series of authentication mechanisms and cluster connection information associated with nicknames. It also introduces the concept of a tuple of authentication information (user) and cluster connection information called a context that is also associated with a nickname.
|
||||
|
||||
Multiple kubeconfig files are allowed, if specified explicitly. At runtime they are loaded and merged along with override options specified from the command line (see [rules](#loading-and-merging-rules) below).
|
||||
|
||||
## Related discussion
|
||||
|
||||
http://issue.k8s.io/1755
|
||||
|
||||
## Components of a kubeconfig file
|
||||
|
||||
### Example kubeconfig file
|
||||
|
||||
```yaml
|
||||
current-context: federal-context
|
||||
apiVersion: v1
|
||||
clusters:
|
||||
- cluster:
|
||||
api-version: v1
|
||||
server: http://cow.org:8080
|
||||
name: cow-cluster
|
||||
- cluster:
|
||||
certificate-authority: path/to/my/cafile
|
||||
server: https://horse.org:4443
|
||||
name: horse-cluster
|
||||
- cluster:
|
||||
insecure-skip-tls-verify: true
|
||||
server: https://pig.org:443
|
||||
name: pig-cluster
|
||||
contexts:
|
||||
- context:
|
||||
cluster: horse-cluster
|
||||
namespace: chisel-ns
|
||||
user: green-user
|
||||
name: federal-context
|
||||
- context:
|
||||
cluster: pig-cluster
|
||||
namespace: saw-ns
|
||||
user: black-user
|
||||
name: queen-anne-context
|
||||
kind: Config
|
||||
preferences:
|
||||
colors: true
|
||||
users:
|
||||
- name: blue-user
|
||||
user:
|
||||
token: blue-token
|
||||
- name: green-user
|
||||
user:
|
||||
client-certificate: path/to/my/client/cert
|
||||
client-key: path/to/my/client/key
|
||||
```
|
||||
|
||||
### Breakdown/explanation of components
|
||||
|
||||
#### cluster
|
||||
|
||||
```yaml
|
||||
clusters:
|
||||
- cluster:
|
||||
certificate-authority: path/to/my/cafile
|
||||
server: https://horse.org:4443
|
||||
name: horse-cluster
|
||||
- cluster:
|
||||
insecure-skip-tls-verify: true
|
||||
server: https://pig.org:443
|
||||
name: pig-cluster
|
||||
```
|
||||
|
||||
A `cluster` contains endpoint data for a Kubernetes cluster. This includes the fully
qualified URL of the Kubernetes API server, as well as the cluster's certificate
authority, or `insecure-skip-tls-verify: true` if the cluster's serving
certificate is not signed by a system-trusted certificate authority.
|
||||
A `cluster` has a name (nickname) which acts as a dictionary key for the cluster
|
||||
within this kubeconfig file. You can add or modify `cluster` entries using
|
||||
[`kubectl config set-cluster`](/docs/user-guide/kubectl/kubectl_config_set-cluster/).
|
||||
|
||||
#### user
|
||||
|
||||
```yaml
|
||||
users:
|
||||
- name: blue-user
|
||||
user:
|
||||
token: blue-token
|
||||
- name: green-user
|
||||
user:
|
||||
client-certificate: path/to/my/client/cert
|
||||
client-key: path/to/my/client/key
|
||||
```
|
||||
|
||||
A `user` defines client credentials for authenticating to a Kubernetes cluster. A
|
||||
`user` has a name (nickname) which acts as its key within the list of user entries
|
||||
after kubeconfig is loaded/merged. Available credentials are `client-certificate`,
|
||||
`client-key`, `token`, and `username/password`. `username/password` and `token`
|
||||
are mutually exclusive, but client certs and keys can be combined with them.
|
||||
You can add or modify `user` entries using
|
||||
[`kubectl config set-credentials`](/docs/user-guide/kubectl/kubectl_config_set-credentials).
|
||||
|
||||
#### context
|
||||
|
||||
```yaml
|
||||
contexts:
|
||||
- context:
|
||||
cluster: horse-cluster
|
||||
namespace: chisel-ns
|
||||
user: green-user
|
||||
name: federal-context
|
||||
```
|
||||
|
||||
A `context` defines a named [`cluster`](#cluster), [`user`](#user), [`namespace`](/docs/user-guide/namespaces) tuple
|
||||
which is used to send requests to the specified cluster using the provided authentication info and
|
||||
namespace. Each of the three is optional; it is valid to specify a context with only one of `cluster`,
|
||||
`user`,`namespace`, or to specify none. Unspecified values, or named values that don't have corresponding
|
||||
entries in the loaded kubeconfig (e.g. if the context specified a `pink-user` for the above kubeconfig file)
|
||||
will be replaced with the default. See [Loading and merging rules](#loading-and-merging-rules) below for override/merge behavior.
|
||||
You can add or modify `context` entries with [`kubectl config set-context`](/docs/user-guide/kubectl/kubectl_config_set-context).
|
||||
|
||||
#### current-context
|
||||
|
||||
```yaml
|
||||
current-context: federal-context
|
||||
```
|
||||
|
||||
`current-context` is the nickname or 'key' for the cluster, user, namespace tuple that kubectl
|
||||
will use by default when loading config from this file. You can override any of the values in kubectl
|
||||
from the command line by passing `--context=CONTEXT`, `--cluster=CLUSTER`, `--user=USER`, and/or `--namespace=NAMESPACE` respectively.
|
||||
You can change the `current-context` with [`kubectl config use-context`](/docs/user-guide/kubectl/kubectl_config_use-context).
|
||||
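For example, to run a single command against a different context from the sample file above without changing `current-context`:

```shell
# Uses queen-anne-context (pig-cluster as black-user, namespace saw-ns) for this invocation only
$ kubectl get pods --context=queen-anne-context
```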
|
||||
#### miscellaneous
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Config
|
||||
preferences:
|
||||
colors: true
|
||||
```
|
||||
|
||||
`apiVersion` and `kind` identify the version and schema for the client parser and should not
|
||||
be edited manually.
|
||||
|
||||
`preferences` specify optional (and currently unused) kubectl preferences.
|
||||
|
||||
## Viewing kubeconfig files
|
||||
|
||||
`kubectl config view` will display the current kubeconfig settings. By default
|
||||
it will show you all loaded kubeconfig settings; you can filter the view to just
|
||||
the settings relevant to the `current-context` by passing `--minify`. See
|
||||
[`kubectl config view`](/docs/user-guide/kubectl/kubectl_config_view) for other options.
|
||||
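For example:

```shell
# Show all merged kubeconfig settings
$ kubectl config view

# Show only the settings relevant to the current-context
$ kubectl config view --minify
```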
|
||||
## Building your own kubeconfig file
|
||||
|
||||
NOTE: If you are deploying Kubernetes via kube-up.sh, you do not need to create your own kubeconfig files; the script will do it for you.
|
||||
|
||||
In any case, you can easily use this file as a template to create your own kubeconfig files.
|
||||
|
||||
So, let's do a quick walk through the basics of the above file so you can easily modify it as needed...
|
||||
|
||||
The above file would likely correspond to an api-server which was launched using the `--token-auth-file=tokens.csv` option, where the tokens.csv file looked something like this:
|
||||
|
||||
```conf
|
||||
blue-user,blue-user,1
|
||||
mister-red,mister-red,2
|
||||
```
|
||||
|
||||
Also, since we have other users who validate using **other** mechanisms, the api-server would probably have been launched with other authentication options as well (there are many such options; make sure you understand which ones YOU care about before crafting a kubeconfig file, as nobody needs to implement all the possible permutations of authentication schemes).
|
||||
|
||||
- Since the user for the current context is "green-user", any client of the api-server using this kubeconfig file would naturally be able to log in successfully, because we are providing the green-user's client credentials.
|
||||
- Similarly, we can operate as the "blue-user" if we choose to change the value of current-context.
|
||||
|
||||
In the above scenario, green-user would have to log in by providing certificates, whereas blue-user would just provide the token. All of this information would be handled for us by the kubeconfig file.
|
||||
|
||||
## Loading and merging rules
|
||||
|
||||
The rules for loading and merging the kubeconfig files are straightforward, but there are a lot of them. The final config is built in this order:
|
||||
|
||||
1. Get the kubeconfig from disk. This is done with the following hierarchy and merge rules:
|
||||
|
||||
|
||||
If the `CommandLineLocation` (the value of the `kubeconfig` command line option) is set, use this file only. No merging. Only one instance of this flag is allowed.
|
||||
|
||||
|
||||
Else, if `EnvVarLocation` (the value of `$KUBECONFIG`) is available, use it as a list of files that should be merged.
|
||||
Merge files together based on the following rules.
|
||||
Empty filenames are ignored. Files with non-deserializable content produce errors.
|
||||
The first file to set a particular value or map key wins and the value or map key is never changed.
|
||||
This means that the first file to set `CurrentContext` will have its context preserved. It also means that if two files specify a "red-user", only values from the first file's red-user are used. Even non-conflicting entries from the second file's "red-user" are discarded.
|
||||
|
||||
|
||||
Otherwise, use `HomeDirectoryLocation` (`~/.kube/config`) with no merging.
|
||||
1. Determine the context to use based on the first hit in this chain
|
||||
1. command line argument - the value of the `context` command line option
|
||||
1. `current-context` from the merged kubeconfig file
|
||||
1. Empty is allowed at this stage
|
||||
1. Determine the cluster info and user to use. At this point, we may or may not have a context. They are built based on the first hit in this chain. (run it twice, once for user, once for cluster)
|
||||
1. command line argument - `user` for user name and `cluster` for cluster name
|
||||
1. If context is present, then use the context's value
|
||||
1. Empty is allowed
|
||||
1. Determine the actual cluster info to use. At this point, we may or may not have a cluster info. Build each piece of the cluster info based on the chain (first hit wins):
|
||||
1. command line arguments - `server`, `api-version`, `certificate-authority`, and `insecure-skip-tls-verify`
|
||||
1. If cluster info is present and a value for the attribute is present, use it.
|
||||
1. If you don't have a server location, error.
|
||||
1. Determine the actual user info to use. User is built using the same rules as cluster info, EXCEPT that you can only have one authentication technique per user.
|
||||
1. Load precedence is 1) command line flag, 2) user fields from kubeconfig
|
||||
1. The command line flags are: `client-certificate`, `client-key`, `username`, `password`, and `token`.
|
||||
1. If there are two conflicting techniques, fail.
|
||||
1. For any information still missing, use default values and potentially prompt for authentication information
|
||||
1. All file references inside of a kubeconfig file are resolved relative to the location of the kubeconfig file itself. When file references are presented on the command line
|
||||
they are resolved relative to the current working directory. When paths are saved in the ~/.kube/config, relative paths are stored relatively while absolute paths are stored absolutely.
|
||||
|
||||
Any path in a kubeconfig file is resolved relative to the location of the kubeconfig file itself.
|
||||
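As a small sketch of rule 1 above (the file names are hypothetical):

```shell
# Use exactly one file; nothing is merged
$ kubectl config view --kubeconfig=/path/to/standalone-kubeconfig

# Merge two files; for any conflicting value or map key, the first file wins
$ export KUBECONFIG=$HOME/.kube/config:$HOME/.kube/team-config
$ kubectl config view
```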
|
||||
|
||||
## Manipulation of kubeconfig via `kubectl config <subcommand>`
|
||||
|
||||
In order to more easily manipulate kubeconfig files, there are a series of subcommands to `kubectl config` to help.
|
||||
See [kubectl/kubectl_config.md](/docs/user-guide/kubectl/kubectl_config) for help.
|
||||
|
||||
### Example
|
||||
|
||||
```shell
|
||||
$ kubectl config set-credentials myself --username=admin --password=secret
|
||||
$ kubectl config set-cluster local-server --server=http://localhost:8080
|
||||
$ kubectl config set-context default-context --cluster=local-server --user=myself
|
||||
$ kubectl config use-context default-context
|
||||
$ kubectl config set contexts.default-context.namespace the-right-prefix
|
||||
$ kubectl config view
|
||||
```
|
||||
|
||||
produces this output
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
clusters:
|
||||
- cluster:
|
||||
server: http://localhost:8080
|
||||
name: local-server
|
||||
contexts:
|
||||
- context:
|
||||
cluster: local-server
|
||||
namespace: the-right-prefix
|
||||
user: myself
|
||||
name: default-context
|
||||
current-context: default-context
|
||||
kind: Config
|
||||
preferences: {}
|
||||
users:
|
||||
- name: myself
|
||||
user:
|
||||
password: secret
|
||||
username: admin
|
||||
```
|
||||
|
||||
and a kubeconfig file that looks like this
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
clusters:
|
||||
- cluster:
|
||||
server: http://localhost:8080
|
||||
name: local-server
|
||||
contexts:
|
||||
- context:
|
||||
cluster: local-server
|
||||
namespace: the-right-prefix
|
||||
user: myself
|
||||
name: default-context
|
||||
current-context: default-context
|
||||
kind: Config
|
||||
preferences: {}
|
||||
users:
|
||||
- name: myself
|
||||
user:
|
||||
password: secret
|
||||
username: admin
|
||||
```
|
||||
|
||||
#### Commands for the example file
|
||||
|
||||
```shell
|
||||
$ kubectl config set preferences.colors true
|
||||
$ kubectl config set-cluster cow-cluster --server=http://cow.org:8080 --api-version=v1
|
||||
$ kubectl config set-cluster horse-cluster --server=https://horse.org:4443 --certificate-authority=path/to/my/cafile
|
||||
$ kubectl config set-cluster pig-cluster --server=https://pig.org:443 --insecure-skip-tls-verify=true
|
||||
$ kubectl config set-credentials blue-user --token=blue-token
|
||||
$ kubectl config set-credentials green-user --client-certificate=path/to/my/client/cert --client-key=path/to/my/client/key
|
||||
$ kubectl config set-context queen-anne-context --cluster=pig-cluster --user=black-user --namespace=saw-ns
|
||||
$ kubectl config set-context federal-context --cluster=horse-cluster --user=green-user --namespace=chisel-ns
|
||||
$ kubectl config use-context federal-context
|
||||
```
|
||||
|
||||
### Final notes for tying it all together
|
||||
|
||||
So, tying this all together, a quick start to create your own kubeconfig file:
|
||||
|
||||
- Take a good look and understand how your api-server is being launched: You need to know YOUR security requirements and policies before you can design a kubeconfig file for convenient authentication.
|
||||
|
||||
- Replace the snippet above with information for your cluster's api-server endpoint.
|
||||
|
||||
- Make sure your api-server is launched in such a way that at least one user's (e.g. green-user's) credentials are provided to it. You will of course have to look at api-server documentation in order to determine the current state-of-the-art in terms of providing authentication details.
|
|
@ -0,0 +1,382 @@
|
|||
---
|
||||
assignees:
|
||||
- bprashanth
|
||||
- quinton-hoole
|
||||
title: Cross-cluster Service Discovery using Federated Services
|
||||
---
|
||||
|
||||
This guide explains how to use Kubernetes Federated Services to deploy
|
||||
a common Service across multiple Kubernetes clusters. This makes it
|
||||
easy to achieve cross-cluster service discovery and availability zone
|
||||
fault tolerance for your Kubernetes applications.
|
||||
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
## Prerequisites
|
||||
|
||||
This guide assumes that you have a running Kubernetes Cluster
|
||||
Federation installation. If not, then head over to the
|
||||
[federation admin guide](/docs/admin/federation/) to learn how to
|
||||
bring up a cluster federation (or have your cluster administrator do
|
||||
this for you). Other tutorials, for example
|
||||
[this one](https://github.com/kelseyhightower/kubernetes-cluster-federation)
|
||||
by Kelsey Hightower, are also available to help you.
|
||||
|
||||
You are also expected to have a basic
|
||||
[working knowledge of Kubernetes](/docs/getting-started-guides/) in
|
||||
general, and [Services](/docs/user-guide/services/) in particular.
|
||||
|
||||
## Overview
|
||||
|
||||
Federated Services are created in much the same way as traditional
|
||||
[Kubernetes Services](/docs/user-guide/services/) by making an API
|
||||
call which specifies the desired properties of your service. In the
|
||||
case of Federated Services, this API call is directed to the
|
||||
Federation API endpoint, rather than a Kubernetes cluster API
|
||||
endpoint. The API for Federated Services is 100% compatible with the
|
||||
API for traditional Kubernetes Services.
|
||||
|
||||
Once created, the Federated Service automatically:
|
||||
|
||||
1. Creates matching Kubernetes Services in every cluster underlying your Cluster Federation,
|
||||
2. Monitors the health of those service "shards" (and the clusters in which they reside), and
|
||||
3. Manages a set of DNS records in a public DNS provider (like Google Cloud DNS, or AWS Route 53), thus ensuring that clients
|
||||
of your federated service can seamlessly locate an appropriate healthy service endpoint at all times, even in the event of cluster,
|
||||
availability zone or regional outages.
|
||||
|
||||
Clients inside your federated Kubernetes clusters (i.e. Pods) will
|
||||
automatically find the local shard of the Federated Service in their
|
||||
cluster if it exists and is healthy, or the closest healthy shard in a
|
||||
different cluster if it does not.
|
||||
|
||||
## Hybrid cloud capabilities
|
||||
|
||||
Federations of Kubernetes Clusters can include clusters running in
|
||||
different cloud providers (e.g. Google Cloud, AWS), and on-premises
|
||||
(e.g. on OpenStack). Simply create all of the clusters that you
|
||||
require, in the appropriate cloud providers and/or locations, and
|
||||
register each cluster's API endpoint and credentials with your
|
||||
Federation API Server (See the
|
||||
[federation admin guide](/docs/admin/federation/) for details).
|
||||
|
||||
Thereafter, your applications and services can span different clusters
|
||||
and cloud providers as described in more detail below.
|
||||
|
||||
## Creating a federated service
|
||||
|
||||
This is done in the usual way, for example:
|
||||
|
||||
``` shell
|
||||
kubectl --context=federation-cluster create -f services/nginx.yaml
|
||||
```
|
||||
|
||||
The '--context=federation-cluster' flag tells kubectl to submit the
|
||||
request to the Federation API endpoint, with the appropriate
|
||||
credentials. If you have not yet configured such a context, visit the
|
||||
[federation admin guide](/docs/admin/federation/) or one of the
|
||||
[administration tutorials](https://github.com/kelseyhightower/kubernetes-cluster-federation)
|
||||
to find out how to do so.
|
||||
|
||||
As described above, the Federated Service will automatically create
|
||||
and maintain matching Kubernetes services in all of the clusters
|
||||
underlying your federation.
|
||||
|
||||
You can verify this by checking in each of the underlying clusters, for example:
|
||||
|
||||
``` shell
|
||||
kubectl --context=gce-asia-east1a get services nginx
|
||||
NAME CLUSTER-IP EXTERNAL-IP PORT(S) AGE
|
||||
nginx 10.63.250.98 104.199.136.89 80/TCP 9m
|
||||
```
|
||||
|
||||
The above assumes that you have a context named 'gce-asia-east1a'
|
||||
configured in your client for your cluster in that zone. The name and
|
||||
namespace of the underlying services will automatically match those of
|
||||
the Federated Service that you created above (and if you happen to
|
||||
have had services of the same name and namespace already existing in
|
||||
any of those clusters, they will be automatically adopted by the
|
||||
Federation and updated to conform with the specification of your
|
||||
Federated Service - either way, the end result will be the same).
|
||||
|
||||
The status of your Federated Service will automatically reflect the
|
||||
real-time status of the underlying Kubernetes services, for example:
|
||||
|
||||
``` shell
|
||||
$ kubectl --context=federation-cluster describe services nginx
|
||||
|
||||
Name: nginx
|
||||
Namespace: default
|
||||
Labels: run=nginx
|
||||
Selector: run=nginx
|
||||
Type: LoadBalancer
|
||||
IP:
|
||||
LoadBalancer Ingress: 104.197.246.190, 130.211.57.243, 104.196.14.231, 104.199.136.89, ...
|
||||
Port: http 80/TCP
|
||||
Endpoints: <none>
|
||||
Session Affinity: None
|
||||
No events.
|
||||
```
|
||||
|
||||
Note the 'LoadBalancer Ingress' addresses of your Federated Service
|
||||
correspond with the 'LoadBalancer Ingress' addresses of all of the
|
||||
underlying Kubernetes services (once these have been allocated - this
|
||||
may take a few seconds). For inter-cluster and inter-cloud-provider
|
||||
networking between service shards to work correctly, your services
|
||||
need to have an externally visible IP address. [Service Type:
|
||||
LoadBalancer](/docs/user-guide/services/#type-loadbalancer)
|
||||
is typically used for this, although other options
|
||||
(e.g. [External IP's](/docs/user-guide/services/#external-ips)) exist.
|
||||
|
||||
Note also that we have not yet provisioned any backend Pods to receive
|
||||
the network traffic directed to these addresses (i.e. 'Service
|
||||
Endpoints'), so the Federated Service does not yet consider these to
|
||||
be healthy service shards, and has accordingly not yet added their
|
||||
addresses to the DNS records for this Federated Service (more on this
|
||||
aspect later).
|
||||
|
||||
## Adding backend pods
|
||||
|
||||
To render the underlying service shards healthy, we need to add
|
||||
backend Pods behind them. This is currently done directly against the
|
||||
API endpoints of the underlying clusters (although in future the
|
||||
Federation server will be able to do all this for you with a single
|
||||
command, to save you the trouble). For example, to create backend Pods
|
||||
in 13 underlying clusters:
|
||||
|
||||
``` shell
|
||||
for CLUSTER in asia-east1-c asia-east1-a asia-east1-b \
|
||||
europe-west1-d europe-west1-c europe-west1-b \
|
||||
us-central1-f us-central1-a us-central1-b us-central1-c \
|
||||
us-east1-d us-east1-c us-east1-b
|
||||
do
|
||||
kubectl --context=$CLUSTER run nginx --image=nginx:1.11.1-alpine --port=80
|
||||
done
|
||||
```
|
||||
|
||||
Note that `kubectl run` automatically adds the `run=nginx` labels required to associate the backend pods with their services.
|
||||
|
||||
## Verifying public DNS records
|
||||
|
||||
Once the above Pods have successfully started and have begun listening
|
||||
for connections, Kubernetes will report them as healthy endpoints of
|
||||
the service in that cluster (via automatic health checks). The Cluster
|
||||
Federation will in turn consider each of these
|
||||
service 'shards' to be healthy, and place them in service by
|
||||
automatically configuring corresponding public DNS records. You can
|
||||
use your preferred interface to your configured DNS provider to verify
|
||||
this. For example, if your Federation is configured to use Google
|
||||
Cloud DNS, and a managed DNS domain 'example.com':
|
||||
|
||||
``` shell
|
||||
$ gcloud dns managed-zones describe example-dot-com
|
||||
creationTime: '2016-06-26T18:18:39.229Z'
|
||||
description: Example domain for Kubernetes Cluster Federation
|
||||
dnsName: example.com.
|
||||
id: '3229332181334243121'
|
||||
kind: dns#managedZone
|
||||
name: example-dot-com
|
||||
nameServers:
|
||||
- ns-cloud-a1.googledomains.com.
|
||||
- ns-cloud-a2.googledomains.com.
|
||||
- ns-cloud-a3.googledomains.com.
|
||||
- ns-cloud-a4.googledomains.com.
|
||||
```
|
||||
|
||||
``` shell
|
||||
$ gcloud dns record-sets list --zone example-dot-com
|
||||
NAME TYPE TTL DATA
|
||||
example.com. NS 21600 ns-cloud-e1.googledomains.com., ns-cloud-e2.googledomains.com.
|
||||
example.com. SOA 21600 ns-cloud-e1.googledomains.com. cloud-dns-hostmaster.google.com. 1 21600 3600 1209600 300
|
||||
nginx.mynamespace.myfederation.svc.example.com. A 180 104.197.246.190, 130.211.57.243, 104.196.14.231, 104.199.136.89,...
|
||||
nginx.mynamespace.myfederation.svc.us-central1-a.example.com. A 180 104.197.247.191
|
||||
nginx.mynamespace.myfederation.svc.us-central1-b.example.com. A 180 104.197.244.180
|
||||
nginx.mynamespace.myfederation.svc.us-central1-c.example.com. A 180 104.197.245.170
|
||||
nginx.mynamespace.myfederation.svc.us-central1-f.example.com. CNAME 180 nginx.mynamespace.myfederation.svc.us-central1.example.com.
|
||||
nginx.mynamespace.myfederation.svc.us-central1.example.com. A 180 104.197.247.191, 104.197.244.180, 104.197.245.170
|
||||
nginx.mynamespace.myfederation.svc.asia-east1-a.example.com. A 180 130.211.57.243
|
||||
nginx.mynamespace.myfederation.svc.asia-east1-b.example.com. CNAME 180 nginx.mynamespace.myfederation.svc.asia-east1.example.com.
|
||||
nginx.mynamespace.myfederation.svc.asia-east1-c.example.com. A 180 130.211.56.221
|
||||
nginx.mynamespace.myfederation.svc.asia-east1.example.com. A 180 130.211.57.243, 130.211.56.221
|
||||
nginx.mynamespace.myfederation.svc.europe-west1.example.com. CNAME 180 nginx.mynamespace.myfederation.svc.example.com.
|
||||
nginx.mynamespace.myfederation.svc.europe-west1-d.example.com. CNAME 180 nginx.mynamespace.myfederation.svc.europe-west1.example.com.
|
||||
... etc.
|
||||
```
|
||||
|
||||
Note: If your Federation is configured to use AWS Route53, you can use one of the equivalent AWS tools, for example:
|
||||
|
||||
``` shell
|
||||
$ aws route53 list-hosted-zones
|
||||
```
|
||||
and
|
||||
``` shell
|
||||
$ aws route53 list-resource-record-sets --hosted-zone-id Z3ECL0L9QLOVBX
|
||||
```
|
||||
|
||||
Whatever DNS provider you use, any DNS query tool (for example 'dig'
|
||||
or 'nslookup') will of course also allow you to see the records
|
||||
created by the Federation for you. Note that you should either point
|
||||
these tools directly at your DNS provider (e.g. `dig
|
||||
@ns-cloud-e1.googledomains.com...`) or expect delays in the order of
|
||||
your configured TTL (180 seconds, by default) before seeing updates,
|
||||
due to caching by intermediate DNS servers.
|
||||
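For example, to query one of the provider's nameservers directly using the zone and records shown above:

``` shell
$ dig @ns-cloud-e1.googledomains.com nginx.mynamespace.myfederation.svc.example.com. A
```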
|
||||
### Some notes about the above example
|
||||
|
||||
1. Notice that there is a normal ('A') record for each service shard that has at least one healthy backend endpoint. For example, in us-central1-a, 104.197.247.191 is the external IP address of the service shard in that zone, and in asia-east1-a the address is 130.211.56.221.
|
||||
2. Similarly, there are regional 'A' records which include all healthy shards in that region. For example, 'us-central1'. These regional records are useful for clients which do not have a particular zone preference, and as a building block for the automated locality and failover mechanism described below.
|
||||
3. For zones where there are currently no healthy backend endpoints, a CNAME ('Canonical Name') record is used to alias (automatically redirect) those queries to the next closest healthy zone. In the example, the service shard in us-central1-f currently has no healthy backend endpoints (i.e. Pods), so a CNAME record has been created to automatically redirect queries to other shards in that region (us-central1 in this case).
|
||||
4. Similarly, if no healthy shards exist in the enclosing region, the search progresses further afield. In the europe-west1-d availability zone, there are no healthy backends, so queries are redirected to the broader europe-west1 region (which also has no healthy backends), and onward to the global set of healthy addresses ('nginx.mynamespace.myfederation.svc.example.com.').
|
||||
|
||||
The above set of DNS records is automatically kept in sync with the
|
||||
current state of health of all service shards globally by the
|
||||
Federated Service system. DNS resolver libraries (which are invoked by
|
||||
all clients) automatically traverse the hierarchy of 'CNAME' and 'A'
|
||||
records to return the correct set of healthy IP addresses. Clients can
|
||||
then select any one of the returned addresses to initiate a network
|
||||
connection (and fail over automatically to one of the other equivalent
|
||||
addresses if required).
|
||||
|
||||
## Discovering a federated service
|
||||
|
||||
### From pods inside your federated clusters
|
||||
|
||||
By default, Kubernetes clusters come pre-configured with a
|
||||
cluster-local DNS server ('KubeDNS'), as well as an intelligently
|
||||
constructed DNS search path which together ensure that DNS queries
|
||||
like "myservice", "myservice.mynamespace",
|
||||
"bobsservice.othernamespace" etc issued by your software running
|
||||
inside Pods are automatically expanded and resolved correctly to the
|
||||
appropriate service IP of services running in the local cluster.
|
||||
|
||||
With the introduction of Federated Services and Cross-Cluster Service
|
||||
Discovery, this concept is extended to cover Kubernetes services
|
||||
running in any other cluster across your Cluster Federation, globally.
|
||||
To take advantage of this extended range, you use a slightly different
|
||||
DNS name (of the form "<servicename>.<namespace>.<federationname>",
|
||||
e.g. myservice.mynamespace.myfederation) to resolve Federated
|
||||
Services. Using a different DNS name also avoids having your existing
|
||||
applications accidentally traversing cross-zone or cross-region
|
||||
networks and incurring possibly unwanted network charges or latency, without explicitly opting in to this behavior.
|
||||
|
||||
So, using our NGINX example service above, and the Federated Service
|
||||
DNS name form just described, let's consider an example: A Pod in a
|
||||
cluster in the `us-central1-f` availability zone needs to contact our
|
||||
NGINX service. Rather than use the service's traditional cluster-local
|
||||
DNS name (```"nginx.mynamespace"```, which is automatically expanded
|
||||
to ```"nginx.mynamespace.svc.cluster.local"```) it can now use the
|
||||
service's Federated DNS name, which is
|
||||
```"nginx.mynamespace.myfederation"```. This will be automatically
|
||||
expanded and resolved to the closest healthy shard of my NGINX
|
||||
service, wherever in the world that may be. If a healthy shard exists
|
||||
in the local cluster, that service's cluster-local (typically
|
||||
10.x.y.z) IP address will be returned (by the cluster-local KubeDNS).
|
||||
This is almost exactly equivalent to non-federated service resolution
|
||||
(almost because KubeDNS actually returns both a CNAME and an A record
|
||||
for local federated services, but applications will be oblivious
|
||||
to this minor technical difference).
|
||||
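For example, from a shell inside any Pod in a federated cluster (a sketch using the names above; the lookup tool available depends on your container image):

``` shell
# Resolves to the cluster-local IP of the local shard when a healthy local shard exists
$ nslookup nginx.mynamespace.myfederation
```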
|
||||
But if the service does not exist in the local cluster (or it exists
|
||||
but has no healthy backend pods), the DNS query is automatically
|
||||
expanded to
|
||||
```"nginx.mynamespace.myfederation.svc.us-central1-f.example.com"```
|
||||
(i.e. logically "find the external IP of one of the shards closest to
|
||||
my availability zone"). This expansion is performed automatically by
|
||||
KubeDNS, which returns the associated CNAME record. This results in
|
||||
automatic traversal of the hierarchy of DNS records in the above
|
||||
example, and ends up at one of the external IP's of the Federated
|
||||
Service in the local us-central1 region (i.e. 104.197.247.191,
|
||||
104.197.244.180 or 104.197.245.170).
|
||||
|
||||
It is of course possible to explicitly target service shards in
|
||||
availability zones and regions other than the ones local to a Pod by
|
||||
specifying the appropriate DNS names explicitly, and not relying on
|
||||
automatic DNS expansion. For example,
|
||||
"nginx.mynamespace.myfederation.svc.europe-west1.example.com" will
|
||||
resolve to all of the currently healthy service shards in Europe, even
|
||||
if the Pod issuing the lookup is located in the U.S., and irrespective
|
||||
of whether or not there are healthy shards of the service in the U.S.
|
||||
This is useful for remote monitoring and other similar applications.
|
||||
|
||||
### From other clients outside your federated clusters
|
||||
|
||||
Much of the above discussion applies equally to external clients,
|
||||
except that the automatic DNS expansion described is no longer
|
||||
possible. So external clients need to specify one of the fully
|
||||
qualified DNS names of the Federated Service, be that a zonal,
|
||||
regional or global name. For convenience reasons, it is often a good
|
||||
idea to manually configure additional static CNAME records for your service, for example:
|
||||
|
||||
``` shell
|
||||
eu.nginx.acme.com CNAME nginx.mynamespace.myfederation.svc.europe-west1.example.com.
|
||||
us.nginx.acme.com CNAME nginx.mynamespace.myfederation.svc.us-central1.example.com.
|
||||
nginx.acme.com CNAME nginx.mynamespace.myfederation.svc.example.com.
|
||||
```
|
||||
That way your clients can always use the short form on the left, and
|
||||
always be automatically routed to the closest healthy shard on their
|
||||
home continent. All of the required failover is handled for you
|
||||
automatically by Kubernetes Cluster Federation. Future releases will
|
||||
improve upon this even further.
|
||||
|
||||
## Handling failures of backend pods and whole clusters
|
||||
|
||||
Standard Kubernetes service cluster-IP's already ensure that
|
||||
non-responsive individual Pod endpoints are automatically taken out of
|
||||
service with low latency (a few seconds). In addition, as alluded to
|
||||
above, the Kubernetes Cluster Federation system automatically monitors
|
||||
the health of clusters and the endpoints behind all of the shards of
|
||||
your Federated Service, taking shards in and out of service as
|
||||
required (e.g. when all of the endpoints behind a service, or perhaps
|
||||
the entire cluster or availability zone go down, or conversely recover
|
||||
from an outage). Due to the latency inherent in DNS caching (the cache
|
||||
timeout, or TTL for Federated Service DNS records is configured to 3
|
||||
minutes, by default, but can be adjusted), it may take up to that long
|
||||
for all clients to completely fail over to an alternative cluster in
|
||||
the case of catastrophic failure. However, given the number of
|
||||
discrete IP addresses which can be returned for each regional service
|
||||
endpoint (see e.g. us-central1 above, which has three alternatives)
|
||||
many clients will fail over automatically to one of the alternative
|
||||
IPs in less time than that, given appropriate configuration.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
#### I cannot connect to my cluster federation API
|
||||
Check that your
|
||||
|
||||
1. Client (typically kubectl) is correctly configured (including API endpoints and login credentials), and
|
||||
2. Cluster Federation API server is running and network-reachable.
|
||||
|
||||
See the [federation admin guide](/docs/admin/federation/) to learn
|
||||
how to bring up a cluster federation correctly (or have your cluster administrator do this for you), and how to correctly configure your client.
|
||||
|
||||
#### I can create a federated service successfully against the cluster federation API, but no matching services are created in my underlying clusters
|
||||
Check that:
|
||||
|
||||
1. Your clusters are correctly registered in the Cluster Federation API (`kubectl describe clusters`)
|
||||
2. Your clusters are all 'Active'. This means that the cluster Federation system was able to connect and authenticate against the clusters' endpoints. If not, consult the logs of the federation-controller-manager pod to ascertain what the failure might be. (`kubectl --namespace=federation logs $(kubectl get pods --namespace=federation -l module=federation-controller-manager -oname)`)
|
||||
3. That the login credentials provided to the Cluster Federation API for the clusters have the correct authorization and quota to create services in the relevant namespace in the clusters. Again you should see associated error messages providing more detail in the above log file if this is not the case.
|
||||
4. Whether any other error is preventing the service creation operation from succeeding (look for `service-controller` errors in the output of `kubectl logs federation-controller-manager --namespace federation`).
|
||||
|
||||
#### I can create a federated service successfully, but no matching DNS records are created in my DNS provider.
|
||||
Check that:
|
||||
|
||||
1. Your federation name, DNS provider, DNS domain name are configured correctly. Consult the [federation admin guide](/docs/admin/federation/) or [tutorial](https://github.com/kelseyhightower/kubernetes-cluster-federation) to learn
|
||||
how to configure your Cluster Federation system's DNS provider (or have your cluster administrator do this for you).
|
||||
2. Confirm that the Cluster Federation's service-controller is successfully connecting to and authenticating against your selected DNS provider (look for `service-controller` errors or successes in the output of `kubectl logs federation-controller-manager --namespace federation`)
|
||||
3. Confirm that the Cluster Federation's service-controller is successfully creating DNS records in your DNS provider (or outputting errors in its logs explaining in more detail what's failing).
|
||||
|
||||
#### Matching DNS records are created in my DNS provider, but clients are unable to resolve against those names
|
||||
Check that:
|
||||
|
||||
1. The DNS registrar that manages your federation DNS domain has been correctly configured to point to your configured DNS provider's nameservers. See for example [Google Domains Documentation](https://support.google.com/domains/answer/3290309?hl=en&ref_topic=3251230) and [Google Cloud DNS Documentation](https://cloud.google.com/dns/update-name-servers), or equivalent guidance from your domain registrar and DNS provider.
|
||||
|
||||
#### This troubleshooting guide did not help me solve my problem
|
||||
|
||||
1. Please use one of our [support channels](http://kubernetes.io/docs/troubleshooting/) to seek assistance.
|
||||
|
||||
## For more information
|
||||
|
||||
* [Federation proposal](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/proposals/federation.md) details use cases that motivated this work.
|
|
@ -0,0 +1,137 @@
|
|||
---
|
||||
title: Federation
|
||||
---
|
||||
|
||||
This guide explains why and how to manage multiple Kubernetes clusters using
|
||||
federation.
|
||||
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
|
||||
## Why federation
|
||||
|
||||
Federation makes it easy to manage multiple clusters. It does so by providing 2
|
||||
major building blocks:
|
||||
|
||||
* Sync resources across clusters: Federation provides the ability to keep
|
||||
resources in multiple clusters in sync. This can be used, for example, to
|
||||
ensure that the same deployment exists in multiple clusters.
|
||||
* Cross cluster discovery: It provides the ability to auto-configure DNS
|
||||
servers and load balancers with backends from all clusters. This can be used,
|
||||
for example, to ensure that a global VIP or DNS record can be used to access
|
||||
backends from multiple clusters.
|
||||
|
||||
Some other use cases that federation enables are:
|
||||
|
||||
* High Availability: By spreading load across clusters and auto configuring DNS
|
||||
servers and load balancers, federation minimises the impact of cluster
|
||||
failure.
|
||||
* Avoiding provider lock-in: By making it easier to migrate applications across
|
||||
clusters, federation prevents cluster provider lock-in.
|
||||
|
||||
|
||||
Federation is not helpful unless you have multiple clusters. Some of the reasons
|
||||
why you might want multiple clusters are:
|
||||
|
||||
* Low latency: Having clusters in multiple regions minimises latency by serving
|
||||
users from the cluster that is closest to them.
|
||||
* Fault isolation: It might be better to have multiple small clusters rather
|
||||
than a single large cluster for fault isolation (for example: multiple
|
||||
clusters in different availability zones of a cloud provider).
|
||||
[Multi cluster guide](/docs/admin/multi-cluster) has more details on this.
|
||||
* Scalability: There are scalability limits to a single kubernetes cluster (this
|
||||
should not be the case for most users. For more details:
|
||||
[Kubernetes Scaling and Performance Goals](https://github.com/kubernetes/community/blob/master/sig-scalability/goals.md)).
|
||||
* Hybrid cloud: You can have multiple clusters on different cloud providers or
|
||||
on-premises data centers.
|
||||
|
||||
|
||||
### Caveats
|
||||
|
||||
While there are a lot of attractive use cases for federation, there are also
|
||||
some caveats.
|
||||
|
||||
* Increased network bandwidth and cost: The federation control plane watches all
|
||||
clusters to ensure that the current state is as expected. This can lead to
|
||||
significant network cost if the clusters are running in different regions on
|
||||
a cloud provider or on different cloud providers.
|
||||
* Reduced cross cluster isolation: A bug in the federation control plane can
|
||||
impact all clusters. This is mitigated by keeping the logic in federation
|
||||
control plane to a minimum. It mostly delegates to the control plane in
|
||||
kubernetes clusters whenever it can. The design and implementation also errs
|
||||
on the side of safety and avoiding multicluster outage.
|
||||
* Maturity: The federation project is relatively new and is not very mature.
|
||||
Not all resources are available and many are still alpha. [Issue
|
||||
38893](https://github.com/kubernetes/kubernetes/issues/38893) enumerates
|
||||
known issues with the system that the team is busy solving.
|
||||
|
||||
## Setup
|
||||
|
||||
To be able to federate multiple clusters, we first need to set up a federation
|
||||
control plane.
|
||||
Follow the [setup guide](/docs/admin/federation/) to set up the
|
||||
federation control plane.
|
||||
|
||||
## Hybrid cloud capabilities
|
||||
|
||||
Federations of Kubernetes Clusters can include clusters running in
|
||||
different cloud providers (e.g. Google Cloud, AWS), and on-premises
|
||||
(e.g. on OpenStack). Simply create all of the clusters that you
|
||||
require, in the appropriate cloud providers and/or locations, and
|
||||
register each cluster's API endpoint and credentials with your
|
||||
Federation API Server (See the
|
||||
[federation admin guide](/docs/admin/federation/) for details).
|
||||
|
||||
Thereafter, your API resources can span different clusters
|
||||
and cloud providers.
|
||||
|
||||
## API resources
|
||||
|
||||
Once we have the control plane set up, we can start creating federation API
|
||||
resources.
|
||||
The following guides explain some of the resources in detail:
|
||||
|
||||
* [ConfigMap](https://kubernetes.io/docs/user-guide/federation/configmap/)
|
||||
* [DaemonSets](https://kubernetes.io/docs/user-guide/federation/daemonsets/)
|
||||
* [Deployment](https://kubernetes.io/docs/user-guide/federation/deployment/)
|
||||
* [Events](https://kubernetes.io/docs/user-guide/federation/events/)
|
||||
* [Ingress](https://kubernetes.io/docs/user-guide/federation/federated-ingress/)
|
||||
* [Namespaces](https://kubernetes.io/docs/user-guide/federation/namespaces/)
|
||||
* [ReplicaSets](https://kubernetes.io/docs/user-guide/federation/replicasets/)
|
||||
* [Secrets](https://kubernetes.io/docs/user-guide/federation/secrets/)
|
||||
* [Services](https://kubernetes.io/docs/user-guide/federation/federated-services/)
|
||||
|
||||
[API reference docs](/docs/federation/api-reference/) lists all the
|
||||
resources supported by federation apiserver.
|
||||
|
||||
## Cascading deletion
|
||||
|
||||
Kubernetes version 1.5 includes support for cascading deletion of federated
|
||||
resources. With cascading deletion, when you delete a resource from the
|
||||
federation control plane, the corresponding resources in all underlying clusters
|
||||
are also deleted.
|
||||
|
||||
To enable cascading deletion, set the option
|
||||
`DeleteOptions.orphanDependents=false` when you delete a resource from the
|
||||
federation control plane.
|
||||
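For example, a hedged sketch that deletes a federated ReplicaSet through the raw API via `kubectl proxy` (the resource name `nginx` and the proxy port are placeholders):

```shell
# Terminal 1: proxy to the federation API server
$ kubectl --context=federation-cluster proxy --port=8001

# Terminal 2: delete the resource, instructing the server not to orphan the per-cluster resources
$ curl -X DELETE -H "Content-Type: application/json" \
    -d '{"kind":"DeleteOptions","apiVersion":"v1","orphanDependents":false}' \
    http://127.0.0.1:8001/apis/extensions/v1beta1/namespaces/default/replicasets/nginx
```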
|
||||
The following Federated resources are affected by cascading deletion:
|
||||
|
||||
* [Ingress](https://kubernetes.io/docs/user-guide/federation/federated-ingress/)
|
||||
* [Namespaces](https://kubernetes.io/docs/user-guide/federation/namespaces/)
|
||||
* [ReplicaSets](https://kubernetes.io/docs/user-guide/federation/replicasets/)
|
||||
* [Secrets](https://kubernetes.io/docs/user-guide/federation/secrets/)
|
||||
* [Deployment](https://kubernetes.io/docs/user-guide/federation/deployment/)
|
||||
* [DaemonSets](https://kubernetes.io/docs/user-guide/federation/daemonsets/)
|
||||
|
||||
Note: By default, deleting a resource from the federation control plane does not
|
||||
delete the corresponding resources from underlying clusters.
|
||||
|
||||
|
||||
## For more information
|
||||
|
||||
* [Federation
|
||||
proposal](https://github.com/kubernetes/community/blob/{{page.githubbranch}}/contributors/design-proposals/federation.md)
|
||||
* [Kubecon2016 talk on federation](https://www.youtube.com/watch?v=pq9lbkmxpS8)
|
|
@ -0,0 +1,57 @@
|
|||
---
|
||||
assignees:
|
||||
- davidopp
|
||||
- filipg
|
||||
- piosz
|
||||
title: Guaranteed Scheduling For Critical Add-On Pods
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
## Overview
|
||||
|
||||
In addition to Kubernetes core components like the api-server, scheduler, and controller-manager running on a master machine,
|
||||
there are a number of add-ons which, for various reasons, must run on a regular cluster node (rather than the Kubernetes master).
|
||||
Some of these add-ons are critical to a fully functional cluster, such as Heapster, DNS, and UI.
|
||||
A cluster may stop working properly if a critical add-on is evicted (either manually or as a side effect of another operation like upgrade)
|
||||
and becomes pending (for example when the cluster is highly utilized and either there are other pending pods that schedule into the space
|
||||
vacated by the evicted critical add-on pod or the amount of resources available on the node changed for some other reason).
|
||||
|
||||
## Rescheduler: guaranteed scheduling of critical add-ons
|
||||
|
||||
Rescheduler ensures that critical add-ons are always scheduled
|
||||
(assuming the cluster has enough resources to run the critical add-on pods in the absence of regular pods).
|
||||
If the scheduler determines that no node has enough free resources to run the critical add-on pod
|
||||
given the pods that are already running in the cluster
|
||||
(indicated by critical add-on pod's pod condition PodScheduled set to false, the reason set to Unschedulable)
|
||||
the rescheduler tries to free up space for the add-on by evicting some pods; then the scheduler will schedule the add-on pod.
|
||||
|
||||
To avoid a situation where another pod is scheduled into the space prepared for the critical add-on,
|
||||
the chosen node gets a temporary taint "CriticalAddonsOnly" before the eviction(s)
|
||||
(see [more details](https://github.com/kubernetes/kubernetes/blob/master/docs/design/taint-toleration-dedicated.md)).
|
||||
Each critical add-on has to tolerate it,
|
||||
while the other pods shouldn't tolerate the taint. The taint is removed once the add-on is successfully scheduled.
|
||||
|
||||
*Warning:* currently there is no guarantee which node is chosen and which pods are being killed
|
||||
in order to schedule critical pods, so if the rescheduler is enabled, your pods might occasionally be
|
||||
killed for this purpose.
|
||||
|
||||
## Config
|
||||
|
||||
Rescheduler doesn't have any user-facing configuration (component config) or API.
|
||||
It's enabled by default. It can be disabled:
|
||||
|
||||
* during cluster setup, by setting the `ENABLE_RESCHEDULER` flag to `false`
|
||||
* on a running cluster, by deleting its manifest from the master node
|
||||
(default path `/etc/kubernetes/manifests/rescheduler.manifest`)
|
||||
|
||||
### Marking add-on as critical
|
||||
|
||||
To be critical, an add-on has to run in the `kube-system` namespace (configurable via flag)
|
||||
and have the following annotations specified:
|
||||
|
||||
* `scheduler.alpha.kubernetes.io/critical-pod` set to empty string
|
||||
* `scheduler.alpha.kubernetes.io/tolerations` set to `[{"key":"CriticalAddonsOnly", "operator":"Exists"}]`
|
||||
|
||||
The first one marks a pod as critical. The second one is required by the Rescheduler algorithm.
|
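For illustration, a minimal sketch of a pod carrying both annotations (the pod name and image are placeholders):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: my-critical-addon            # placeholder name
  namespace: kube-system             # critical add-ons must run here (configurable via flag)
  annotations:
    scheduler.alpha.kubernetes.io/critical-pod: ''
    scheduler.alpha.kubernetes.io/tolerations: '[{"key":"CriticalAddonsOnly", "operator":"Exists"}]'
spec:
  containers:
  - name: addon
    image: gcr.io/google-containers/pause    # placeholder image
```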
|
@ -0,0 +1,226 @@
|
|||
---
|
||||
assignees:
|
||||
- crassirostris
|
||||
- piosz
|
||||
title: Logging and Monitoring Cluster Activity
|
||||
redirect_from:
|
||||
- "/docs/concepts/clusters/logging/"
|
||||
- "/docs/concepts/clusters/logging.html"
|
||||
---
|
||||
|
||||
Application and systems logs can help you understand what is happening inside your cluster. The logs are particularly useful for debugging problems and monitoring cluster activity. Most modern applications have some kind of logging mechanism; as such, most container engines are likewise designed to support some kind of logging. The easiest and most embraced logging method for containerized applications is to write to the standard output and standard error streams.
|
||||
|
||||
However, the native functionality provided by a container engine or runtime is usually not enough for a complete logging solution. For example, if a container crashes, a pod is evicted, or a node dies, you'll usually still want to access your application's logs. As such, logs should have a separate storage and lifecycle independent of nodes, pods, or containers. This concept is called _cluster-level-logging_. Cluster-level logging requires a separate backend to store, analyze, and query logs. Kubernetes provides no native storage solution for log data, but you can integrate many existing logging solutions into your Kubernetes cluster.
|
||||
|
||||
This document includes:
|
||||
|
||||
* A basic demonstration of logging in Kubernetes using the standard output stream
|
||||
* A detailed description of the node logging architecture in Kubernetes
|
||||
* Guidance for implementing cluster-level logging in Kubernetes
|
||||
|
||||
The guidance for cluster-level logging assumes that a logging backend is present inside or outside of your cluster. If you're not interested in having cluster-level logging, you might still find the description of how logs are stored and handled on the node to be useful.
|
||||
|
||||
## Basic logging in Kubernetes
|
||||
|
||||
In this section, you can see an example of basic logging in Kubernetes that
|
||||
outputs data to the standard output stream. This demonstration uses
|
||||
a [pod specification](/docs/concepts/cluster-administration/counter-pod.yaml) with
|
||||
a container that writes some text to standard output once per second.
|
||||
|
||||
{% include code.html language="yaml" file="counter-pod.yaml" ghlink="/docs/tasks/debug-application-cluster/counter-pod.yaml" %}
|
||||
|
||||
To run this pod, use the following command:
|
||||
|
||||
```shell
|
||||
$ kubectl create -f http://k8s.io/docs/tasks/debug-application-cluster/counter-pod.yaml
|
||||
pod "counter" created
|
||||
```
|
||||
|
||||
To fetch the logs, use the `kubectl logs` command, as follows:
|
||||
|
||||
```shell
|
||||
$ kubectl logs counter
|
||||
0: Mon Jan 1 00:00:00 UTC 2001
|
||||
1: Mon Jan 1 00:00:01 UTC 2001
|
||||
2: Mon Jan 1 00:00:02 UTC 2001
|
||||
...
|
||||
```
|
||||
|
||||
You can use `kubectl logs` to retrieve logs from a previous instantiation of a container with the `--previous` flag, in case the container has crashed. If your pod has multiple containers, you should specify which container's logs you want to access by appending the container name to the command. See the [`kubectl logs` documentation](/docs/user-guide/kubectl/kubectl_logs/) for more details.
|
||||
|
||||
## Logging at the node level
|
||||
|
||||
![Node level logging](/images/docs/user-guide/logging/logging-node-level.png)
|
||||
|
||||
Everything a containerized application writes to `stdout` and `stderr` is handled and redirected somewhere by a container engine. For example, the Docker container engine redirects those two streams to [a logging driver](https://docs.docker.com/engine/admin/logging/overview), which is configured in Kubernetes to write to a file in json format.
|
||||
|
||||
**Note:** The Docker json logging driver treats each line as a separate message. When using the Docker logging driver, there is no direct support for multi-line messages. You need to handle multi-line messages at the logging agent level or higher.
|
||||
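For example, a line written by the counter pod above ends up on the node looking roughly like this (a sketch of the json-file driver's output):

```json
{"log":"2: Mon Jan 1 00:00:02 UTC 2001\n","stream":"stdout","time":"2001-01-01T00:00:02.000000000Z"}
```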
|
||||
By default, if a container restarts, the kubelet keeps one terminated container with its logs. If a pod is evicted from the node, all corresponding containers are also evicted, along with their logs.
|
||||
|
||||
An important consideration in node-level logging is implementing log rotation, so that logs don't consume all available storage on the node. Kubernetes uses the [`logrotate`](http://www.linuxcommand.org/man_pages/logrotate8.html) tool to implement log rotation.
|
||||
|
||||
Kubernetes performs log rotation daily, or if the log file grows beyond 10MB in size. Each rotation belongs to a single container; if the container repeatedly fails or the pod is evicted, all previous rotations for the container are lost. By default, Kubernetes keeps up to five logging rotations per container.
|
||||
|
||||
The Kubernetes logging configuration differs depending on the node type. For example, you can find detailed information for GCI in the corresponding [configure helper](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/cluster/gce/gci/configure-helper.sh#L96).
|
||||
|
||||
When you run [`kubectl logs`](/docs/user-guide/kubectl/kubectl_logs), as in the basic logging example, the kubelet on the node handles the request and reads directly from the log file, returning the contents in the response. Note that `kubectl logs` **only returns the last rotation**; you must manually extract prior rotations, if desired and cluster-level logging is not enabled.
|
||||
|
||||
### System component logs
|
||||
|
||||
There are two types of system components: those that run in a container and those
|
||||
that do not run in a container. For example:
|
||||
|
||||
* The Kubernetes scheduler and kube-proxy run in a container.
|
||||
* The kubelet and container runtime, for example Docker, do not run in containers.
|
||||
|
||||
On machines with systemd, the kubelet and container runtime write to journald. If
|
||||
systemd is not present, they write to `.log` files in the `/var/log` directory.
|
||||
System components inside containers always write to the `/var/log` directory,
|
||||
bypassing the default logging mechanism. They use the [glog](https://godoc.org/github.com/golang/glog)
|
||||
logging library. You can find the conventions for logging severity for those
|
||||
components in the [development docs on logging](https://github.com/kubernetes/community/blob/master/contributors/devel/logging.md).
|
||||
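For example, on a systemd-based node you can read the kubelet's logs with:

```shell
$ journalctl -u kubelet
```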
|
||||
Similarly to the container logs, system component logs in the `/var/log`
|
||||
directory are rotated daily and based on the log size. However,
|
||||
system component logs have a higher size retention: by default,
|
||||
they can store up to 100MB.
|
||||
|
||||
## Cluster-level logging architectures
|
||||
|
||||
While Kubernetes does not provide a native solution for cluster-level logging, there are several common approaches you can consider. Here are some options:
|
||||
|
||||
* Use a node-level logging agent that runs on every node.
|
||||
* Include a dedicated sidecar container for logging in an application pod.
|
||||
* Push logs directly to a backend from within an application.
|
||||
|
||||
### Using a node logging agent
|
||||
|
||||
![Using a node level logging agent](/images/docs/user-guide/logging/logging-with-node-agent.png)
|
||||
|
||||
You can implement cluster-level logging by including a _node-level logging agent_ on each node. The logging agent is a dedicated tool that exposes logs or pushes logs to a backend. Commonly, the logging agent is a container that has access to a directory with log files from all of the application containers on that node.
|
||||
|
||||
Because the logging agent must run on every node, it's common to implement it as either a DaemonSet replica, a manifest pod, or a dedicated native process on the node. However, the latter two approaches are deprecated and highly discouraged.
|
||||
|
||||
Using a node-level logging agent is the most common and encouraged approach for a Kubernetes cluster, because it creates only one agent per node, and it doesn't require any changes to the applications running on the node. However, node-level logging _only works for applications' standard output and standard error_.
|
||||
|
||||
Kubernetes doesn't specify a logging agent, but two optional logging agents are packaged with the Kubernetes release: [Stackdriver Logging](/docs/user-guide/logging/stackdriver) for use with Google Cloud Platform, and [Elasticsearch](/docs/user-guide/logging/elasticsearch). You can find more information and instructions in the dedicated documents. Both use [fluentd](http://www.fluentd.org/) with custom configuration as an agent on the node.
|
||||
|
||||
### Using a sidecar container with the logging agent
|
||||
|
||||
You can use a sidecar container in one of the following ways:
|
||||
|
||||
* The sidecar container streams application logs to its own `stdout`.
|
||||
* The sidecar container runs a logging agent, which is configured to pick up logs from an application container.
|
||||
|
||||
#### Streaming sidecar container
|
||||
|
||||
![Sidecar container with a streaming container](/images/docs/user-guide/logging/logging-with-streaming-sidecar.png)
|
||||
|
||||
By having your sidecar containers stream to their own `stdout` and `stderr`
|
||||
streams, you can take advantage of the kubelet and the logging agent that
|
||||
already run on each node. The sidecar containers read logs from a file, a socket, or journald. Each sidecar container prints logs to its own `stdout` or `stderr` stream.
|
||||
|
||||
This approach allows you to separate several log streams from different
|
||||
parts of your application, some of which can lack support
|
||||
for writing to `stdout` or `stderr`. The logic behind redirecting logs
|
||||
is minimal, so it's hardly a significant overhead. Additionally, because
|
||||
`stdout` and `stderr` are handled by the kubelet, you can use built-in tools
|
||||
like `kubectl logs`.
|
||||
|
||||
Consider the following example. A pod runs a single container, and the container
|
||||
writes to two different log files, using two different formats. Here's a
|
||||
configuration file for the Pod:
|
||||
|
||||
{% include code.html language="yaml" file="two-files-counter-pod.yaml" ghlink="/docs/concepts/cluster-administration/two-files-counter-pod.yaml" %}
|
||||
|
||||
It would be a mess to have log entries of different formats in the same log
|
||||
stream, even if you managed to redirect both components to the `stdout` stream of
|
||||
the container. Instead, you could introduce two sidecar containers. Each sidecar
|
||||
container could tail a particular log file from a shared volume and then redirect
|
||||
the logs to its own `stdout` stream.
|
||||
|
||||
Here's a configuration file for a pod that has two sidecar containers:
|
||||
|
||||
{% include code.html language="yaml" file="two-files-counter-pod-streaming-sidecar.yaml" ghlink="/docs/concepts/cluster-administration/two-files-counter-pod-streaming-sidecar.yaml" %}
|
||||
|
||||
Now when you run this pod, you can access each log stream separately by
|
||||
running the following commands:
|
||||
|
||||
```shell
|
||||
$ kubectl logs counter count-log-1
|
||||
0: Mon Jan 1 00:00:00 UTC 2001
|
||||
1: Mon Jan 1 00:00:01 UTC 2001
|
||||
2: Mon Jan 1 00:00:02 UTC 2001
|
||||
...
|
||||
```
|
||||
|
||||
```shell
|
||||
$ kubectl logs counter count-log-2
|
||||
Mon Jan 1 00:00:00 UTC 2001 INFO 0
|
||||
Mon Jan 1 00:00:01 UTC 2001 INFO 1
|
||||
Mon Jan 1 00:00:02 UTC 2001 INFO 2
|
||||
...
|
||||
```
|
||||
|
||||
The node-level agent installed in your cluster picks up those log streams
|
||||
automatically without any further configuration. If you like, you can configure
|
||||
the agent to parse log lines depending on the source container.
|
||||
|
||||
Note that despite low CPU and memory usage (on the order of a couple of millicores for CPU and several megabytes for memory), writing logs to a file and then streaming them to `stdout` can double disk usage. If you have an application that writes to a single file, it's generally better to set `/dev/stdout` as the destination rather than implement the streaming sidecar container approach.
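A minimal sketch of that alternative, assuming a hypothetical application image whose process accepts its log destination as a flag:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: single-file-logger
spec:
  containers:
  - name: app
    image: my-app   # hypothetical image
    # Point the application's single log file at the container's stdout,
    # so the kubelet and the node-level agent pick the logs up directly.
    args: ["--log-file=/dev/stdout"]
```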
|
||||
|
||||
Sidecar containers can also be used to rotate log files that cannot be
|
||||
rotated by the application itself. [An example](https://github.com/samsung-cnct/logrotate)
|
||||
of this approach is a small container running logrotate periodically.
|
||||
However, it's recommended to use `stdout` and `stderr` directly and leave rotation
|
||||
and retention policies to the kubelet.
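A rough sketch of such a sidecar, assuming a hypothetical image that runs logrotate on a schedule and a shared `emptyDir` volume holding the application's log files:

```yaml
  # Fragment of a pod spec (sketch only); both containers share the log directory.
  containers:
  - name: app
    image: my-app          # hypothetical application image writing to /var/log/app
    volumeMounts:
    - name: logs
      mountPath: /var/log/app
  - name: rotate
    image: my-logrotate    # hypothetical image running logrotate periodically
    volumeMounts:
    - name: logs
      mountPath: /var/log/app
  volumes:
  - name: logs
    emptyDir: {}
```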
|
||||
|
||||
#### Sidecar container with a logging agent
|
||||
|
||||
![Sidecar container with a logging agent](/images/docs/user-guide/logging/logging-with-sidecar-agent.png)
|
||||
|
||||
If the node-level logging agent is not flexible enough for your situation, you
|
||||
can create a sidecar container with a separate logging agent that you have
|
||||
configured specifically to run with your application.
|
||||
|
||||
**Note**: Using a logging agent in a sidecar container can lead
|
||||
to significant resource consumption. Moreover, you won't be able to access
|
||||
those logs using the `kubectl logs` command, because they are not controlled by the kubelet.
|
||||
|
||||
As an example, you could use [Stackdriver](/docs/user-guide/logging/stackdriver/),
|
||||
which uses fluentd as a logging agent. Here are two configuration files that
|
||||
you can use to implement this approach. The first file contains
|
||||
a [ConfigMap](/docs/user-guide/configmap/) to configure fluentd.
|
||||
|
||||
{% include code.html language="yaml" file="fluentd-sidecar-config.yaml" ghlink="/docs/concepts/cluster-administration/fluentd-sidecar-config.yaml" %}
|
||||
|
||||
**Note**: The configuration of fluentd is beyond the scope of this article. For
|
||||
information about configuring fluentd, see the
|
||||
[official fluentd documentation](http://docs.fluentd.org/).
|
||||
|
||||
The second file describes a pod that has a sidecar container running fluentd.
|
||||
The pod mounts a volume where fluentd can pick up its configuration data.
|
||||
|
||||
{% include code.html language="yaml" file="two-files-counter-pod-agent-sidecar.yaml" ghlink="/docs/concepts/cluster-administration/two-files-counter-pod-agent-sidecar.yaml" %}
|
||||
|
||||
After some time you can find log messages in the Stackdriver interface.
|
||||
|
||||
Remember that this is just an example; you can replace fluentd with any logging agent that reads from any source inside an application container.
|
||||
|
||||
### Exposing logs directly from the application
|
||||
|
||||
![Exposing logs directly from the application](/images/docs/user-guide/logging/logging-from-application.png)
|
||||
|
||||
You can implement cluster-level logging by exposing or pushing logs directly from
|
||||
every application; however, the implementation for such a logging mechanism
|
||||
is outside the scope of Kubernetes.
|
|
---
|
||||
assignees:
|
||||
- bgrant0607
|
||||
- janetkuo
|
||||
- mikedanese
|
||||
title: Managing Resources
|
||||
---
|
||||
|
||||
You've deployed your application and exposed it via a service. Now what? Kubernetes provides a number of tools to help you manage your application deployment, including scaling and updating. Among the features we'll discuss in more depth are [configuration files](/docs/user-guide/configuring-containers/#configuration-in-kubernetes) and [labels](/docs/user-guide/deploying-applications/#labels).
|
||||
|
||||
You can find all the files for this example [in our docs
|
||||
repo here](https://github.com/kubernetes/kubernetes.github.io/tree/{{page.docsbranch}}/docs/user-guide/).
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
## Organizing resource configurations
|
||||
|
||||
Many applications require multiple resources to be created, such as a Deployment and a Service. Management of multiple resources can be simplified by grouping them together in the same file (separated by `---` in YAML). For example:
|
||||
|
||||
{% include code.html language="yaml" file="nginx-app.yaml" ghlink="/docs/user-guide/nginx-app.yaml" %}
|
||||
|
||||
Multiple resources can be created the same way as a single resource:
|
||||
|
||||
```shell
|
||||
$ kubectl create -f docs/user-guide/nginx-app.yaml
|
||||
service "my-nginx-svc" created
|
||||
deployment "my-nginx" created
|
||||
```
|
||||
|
||||
The resources will be created in the order they appear in the file. Therefore, it's best to specify the service first, since that will ensure the scheduler can spread the pods associated with the service as they are created by the controller(s), such as a Deployment.
|
||||
|
||||
`kubectl create` also accepts multiple `-f` arguments:
|
||||
|
||||
```shell
|
||||
$ kubectl create -f docs/user-guide/nginx/nginx-svc.yaml -f docs/user-guide/nginx/nginx-deployment.yaml
|
||||
```
|
||||
|
||||
And a directory can be specified rather than or in addition to individual files:
|
||||
|
||||
```shell
|
||||
$ kubectl create -f docs/user-guide/nginx/
|
||||
```
|
||||
|
||||
`kubectl` will read any files with suffixes `.yaml`, `.yml`, or `.json`.
|
||||
|
||||
It is a recommended practice to put resources related to the same microservice or application tier into the same file, and to group all of the files associated with your application in the same directory. If the tiers of your application bind to each other using DNS, you can then simply deploy all of the components of your stack en masse.
|
||||
|
||||
A URL can also be specified as a configuration source, which is handy for deploying directly from configuration files checked into GitHub:
|
||||
|
||||
```shell
|
||||
$ kubectl create -f https://raw.githubusercontent.com/kubernetes/kubernetes/master/docs/user-guide/nginx-deployment.yaml
|
||||
deployment "nginx-deployment" created
|
||||
```
|
||||
|
||||
## Bulk operations in kubectl
|
||||
|
||||
Resource creation isn't the only operation that `kubectl` can perform in bulk. It can also extract resource names from configuration files in order to perform other operations, in particular to delete the same resources you created:
|
||||
|
||||
```shell
|
||||
$ kubectl delete -f docs/user-guide/nginx/
|
||||
deployment "my-nginx" deleted
|
||||
service "my-nginx-svc" deleted
|
||||
```
|
||||
|
||||
In the case of just two resources, it's also easy to specify both on the command line using the resource/name syntax:
|
||||
|
||||
```shell
|
||||
$ kubectl delete deployments/my-nginx services/my-nginx-svc
|
||||
```
|
||||
|
||||
For larger numbers of resources, you'll find it easier to specify a selector (label query) using `-l` or `--selector` to filter resources by their labels:
|
||||
|
||||
```shell
|
||||
$ kubectl delete deployment,services -l app=nginx
|
||||
deployment "my-nginx" deleted
|
||||
service "my-nginx-svc" deleted
|
||||
```
|
||||
|
||||
Because `kubectl` outputs resource names in the same syntax it accepts, it's easy to chain operations using `$()` or `xargs`:
|
||||
|
||||
```shell
|
||||
$ kubectl get $(kubectl create -f docs/user-guide/nginx/ -o name | grep service)
|
||||
NAME CLUSTER-IP EXTERNAL-IP PORT(S) AGE
|
||||
my-nginx-svc 10.0.0.208 80/TCP 0s
|
||||
```
|
||||
|
||||
With the above commands, we first create resources under `docs/user-guide/nginx/` and print the created resources using the `-o name` output format
(which prints each resource as resource/name). Then we `grep` for only the "service", and print it with `kubectl get`.
|
||||
|
||||
If you happen to organize your resources across several subdirectories within a particular directory, you can recursively perform the operations on the subdirectories also, by specifying `--recursive` or `-R` alongside the `--filename,-f` flag.
|
||||
|
||||
For instance, assume there is a directory `project/k8s/development` that holds all of the manifests needed for the development environment, organized by resource type:
|
||||
|
||||
```
|
||||
project/k8s/development
|
||||
├── configmap
|
||||
│ └── my-configmap.yaml
|
||||
├── deployment
|
||||
│ └── my-deployment.yaml
|
||||
└── pvc
|
||||
└── my-pvc.yaml
|
||||
```
|
||||
|
||||
By default, performing a bulk operation on `project/k8s/development` will stop at the first level of the directory, not processing any subdirectories. If we tried to create the resources in this directory using the following command, we'd encounter an error:
|
||||
|
||||
```shell
|
||||
$ kubectl create -f project/k8s/development
|
||||
error: you must provide one or more resources by argument or filename (.json|.yaml|.yml|stdin)
|
||||
```
|
||||
|
||||
Instead, specify the `--recursive` or `-R` flag with the `--filename,-f` flag as such:
|
||||
|
||||
```shell
|
||||
$ kubectl create -f project/k8s/development --recursive
|
||||
configmap "my-config" created
|
||||
deployment "my-deployment" created
|
||||
persistentvolumeclaim "my-pvc" created
|
||||
```
|
||||
|
||||
The `--recursive` flag works with any operation that accepts the `--filename,-f` flag, such as `kubectl {create,get,delete,describe,rollout}`.
|
||||
|
||||
The `--recursive` flag also works when multiple `-f` arguments are provided:
|
||||
|
||||
```shell
|
||||
$ kubectl create -f project/k8s/namespaces -f project/k8s/development --recursive
|
||||
namespace "development" created
|
||||
namespace "staging" created
|
||||
configmap "my-config" created
|
||||
deployment "my-deployment" created
|
||||
persistentvolumeclaim "my-pvc" created
|
||||
```
|
||||
|
||||
If you're interested in learning more about `kubectl`, go ahead and read [kubectl Overview](/docs/user-guide/kubectl-overview).
|
||||
|
||||
## Using labels effectively
|
||||
|
||||
The examples we've used so far apply at most a single label to any resource. There are many scenarios where multiple labels should be used to distinguish sets from one another.
|
||||
|
||||
For instance, different applications would use different values for the `app` label, but a multi-tier application, such as the [guestbook example](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/examples/guestbook/), would additionally need to distinguish each tier. The frontend could carry the following labels:
|
||||
|
||||
```yaml
|
||||
labels:
|
||||
app: guestbook
|
||||
tier: frontend
|
||||
```
|
||||
|
||||
while the Redis master and slave would have different `tier` labels, and perhaps even an additional `role` label:
|
||||
|
||||
```yaml
|
||||
labels:
|
||||
app: guestbook
|
||||
tier: backend
|
||||
role: master
|
||||
```
|
||||
|
||||
and
|
||||
|
||||
```yaml
|
||||
labels:
|
||||
app: guestbook
|
||||
tier: backend
|
||||
role: slave
|
||||
```
|
||||
|
||||
The labels allow us to slice and dice our resources along any dimension specified by a label:
|
||||
|
||||
```shell
|
||||
$ kubectl create -f examples/guestbook/all-in-one/guestbook-all-in-one.yaml
|
||||
$ kubectl get pods -Lapp -Ltier -Lrole
|
||||
NAME READY STATUS RESTARTS AGE APP TIER ROLE
|
||||
guestbook-fe-4nlpb 1/1 Running 0 1m guestbook frontend <none>
|
||||
guestbook-fe-ght6d 1/1 Running 0 1m guestbook frontend <none>
|
||||
guestbook-fe-jpy62 1/1 Running 0 1m guestbook frontend <none>
|
||||
guestbook-redis-master-5pg3b 1/1 Running 0 1m guestbook backend master
|
||||
guestbook-redis-slave-2q2yf 1/1 Running 0 1m guestbook backend slave
|
||||
guestbook-redis-slave-qgazl 1/1 Running 0 1m guestbook backend slave
|
||||
my-nginx-divi2 1/1 Running 0 29m nginx <none> <none>
|
||||
my-nginx-o0ef1 1/1 Running 0 29m nginx <none> <none>
|
||||
$ kubectl get pods -lapp=guestbook,role=slave
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
guestbook-redis-slave-2q2yf 1/1 Running 0 3m
|
||||
guestbook-redis-slave-qgazl 1/1 Running 0 3m
|
||||
```
|
||||
|
||||
## Canary deployments
|
||||
|
||||
Another scenario where multiple labels are needed is to distinguish deployments of different releases or configurations of the same component. It is common practice to deploy a *canary* of a new application release (specified via image tag in the pod template) side by side with the previous release so that the new release can receive live production traffic before fully rolling it out.
|
||||
|
||||
For instance, you can use a `track` label to differentiate different releases.
|
||||
|
||||
The primary, stable release would have a `track` label with value as `stable`:
|
||||
|
||||
```yaml
|
||||
name: frontend
|
||||
replicas: 3
|
||||
...
|
||||
labels:
|
||||
app: guestbook
|
||||
tier: frontend
|
||||
track: stable
|
||||
...
|
||||
image: gb-frontend:v3
|
||||
```
|
||||
|
||||
and then you can create a new release of the guestbook frontend that carries the `track` label with a different value (that is, `canary`), so that the two sets of pods do not overlap:
|
||||
|
||||
```yaml
|
||||
name: frontend-canary
|
||||
replicas: 1
|
||||
...
|
||||
labels:
|
||||
app: guestbook
|
||||
tier: frontend
|
||||
track: canary
|
||||
...
|
||||
image: gb-frontend:v4
|
||||
```
|
||||
|
||||
|
||||
The frontend service would span both sets of replicas by selecting the common subset of their labels (that is, omitting the `track` label), so that traffic is directed to both applications:
|
||||
|
||||
```yaml
|
||||
selector:
|
||||
app: guestbook
|
||||
tier: frontend
|
||||
```
|
||||
|
||||
You can tweak the number of replicas of the stable and canary releases to determine the ratio of each release that will receive live production traffic (in this case, 3:1).
|
||||
Once you're confident, you can update the stable track to the new application release and remove the canary one.
|
||||
|
||||
For a more concrete example, check the [tutorial of deploying Ghost](https://github.com/kelseyhightower/talks/tree/master/kubecon-eu-2016/demo#deploy-a-canary).
|
||||
|
||||
## Updating labels
|
||||
|
||||
Sometimes existing pods and other resources need to be relabeled before creating new resources. This can be done with `kubectl label`.
|
||||
For example, if you want to label all your nginx pods as frontend tier, simply run:
|
||||
|
||||
```shell
|
||||
$ kubectl label pods -l app=nginx tier=fe
|
||||
pod "my-nginx-2035384211-j5fhi" labeled
|
||||
pod "my-nginx-2035384211-u2c7e" labeled
|
||||
pod "my-nginx-2035384211-u3t6x" labeled
|
||||
```
|
||||
|
||||
This first filters all pods with the label "app=nginx", and then labels them with "tier=fe".
|
||||
To see the pods you just labeled, run:
|
||||
|
||||
```shell
|
||||
$ kubectl get pods -l app=nginx -L tier
|
||||
NAME READY STATUS RESTARTS AGE TIER
|
||||
my-nginx-2035384211-j5fhi 1/1 Running 0 23m fe
|
||||
my-nginx-2035384211-u2c7e 1/1 Running 0 23m fe
|
||||
my-nginx-2035384211-u3t6x 1/1 Running 0 23m fe
|
||||
```
|
||||
|
||||
This outputs all "app=nginx" pods, with an additional column showing each pod's tier (specified with `-L` or `--label-columns`).

For more information, please see the [labels](/docs/user-guide/labels/) and [kubectl label](/docs/user-guide/kubectl/kubectl_label/) documents.
|
||||
|
||||
## Updating annotations
|
||||
|
||||
Sometimes you want to attach annotations to resources. Annotations are arbitrary non-identifying metadata for retrieval by API clients such as tools and libraries. This can be done with `kubectl annotate`. For example:
|
||||
|
||||
```shell
|
||||
$ kubectl annotate pods my-nginx-v4-9gw19 description='my frontend running nginx'
|
||||
$ kubectl get pods my-nginx-v4-9gw19 -o yaml
|
||||
apiVersion: v1
kind: Pod
|
||||
metadata:
|
||||
annotations:
|
||||
description: my frontend running nginx
|
||||
...
|
||||
```
|
||||
|
||||
For more information, please see the [annotations](/docs/user-guide/annotations/) and [kubectl annotate](/docs/user-guide/kubectl/kubectl_annotate/) documents.
|
||||
|
||||
## Scaling your application
|
||||
|
||||
When load on your application grows or shrinks, it's easy to scale with `kubectl`. For instance, to decrease the number of nginx replicas from 3 to 1, do:
|
||||
|
||||
```shell
|
||||
$ kubectl scale deployment/my-nginx --replicas=1
|
||||
deployment "my-nginx" scaled
|
||||
```
|
||||
|
||||
Now you only have one pod managed by the deployment.
|
||||
|
||||
```shell
|
||||
$ kubectl get pods -l app=nginx
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
my-nginx-2035384211-j5fhi 1/1 Running 0 30m
|
||||
```
|
||||
|
||||
To have the system automatically choose the number of nginx replicas as needed, ranging from 1 to 3, do:
|
||||
|
||||
```shell
|
||||
$ kubectl autoscale deployment/my-nginx --min=1 --max=3
|
||||
deployment "my-nginx" autoscaled
|
||||
```
|
||||
|
||||
Now your nginx replicas will be scaled up and down as needed, automatically.
|
||||
|
||||
For more information, please see the [kubectl scale](/docs/user-guide/kubectl/kubectl_scale/), [kubectl autoscale](/docs/user-guide/kubectl/kubectl_autoscale/) and [horizontal pod autoscaler](/docs/user-guide/horizontal-pod-autoscaler/) documents.
|
||||
|
||||
|
||||
## In-place updates of resources
|
||||
|
||||
Sometimes it's necessary to make narrow, non-disruptive updates to resources you've created.
|
||||
|
||||
### kubectl apply
|
||||
|
||||
It is suggested to maintain a set of configuration files in source control (see [configuration as code](http://martinfowler.com/bliki/InfrastructureAsCode.html)),
|
||||
so that they can be maintained and versioned along with the code for the resources they configure.
|
||||
Then, you can use [`kubectl apply`](/docs/user-guide/kubectl/kubectl_apply/) to push your configuration changes to the cluster.
|
||||
|
||||
This command will compare the version of the configuration that you're pushing with the previous version and apply the changes you've made, without overwriting any automated changes to properties you haven't specified.
|
||||
|
||||
```shell
|
||||
$ kubectl apply -f docs/user-guide/nginx/nginx-deployment.yaml
|
||||
deployment "my-nginx" configured
|
||||
```
|
||||
|
||||
Note that `kubectl apply` attaches an annotation to the resource in order to determine the changes to the configuration since the previous invocation. When it's invoked, `kubectl apply` does a three-way diff between the previous configuration, the provided input and the current configuration of the resource, in order to determine how to modify the resource.
|
||||
|
||||
Currently, resources are created without this annotation, so the first invocation of `kubectl apply` will fall back to a two-way diff between the provided input and the current configuration of the resource. During this first invocation, it cannot detect the deletion of properties set when the resource was created. For this reason, it will not remove them.
|
||||
|
||||
All subsequent calls to `kubectl apply`, and other commands that modify the configuration, such as `kubectl replace` and `kubectl edit`, will update the annotation, allowing subsequent calls to `kubectl apply` to detect and perform deletions using a three-way diff.
|
||||
|
||||
**Note:** To use apply, always create the resource initially with either `kubectl apply` or `kubectl create --save-config`.
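For reference, the annotation is `kubectl.kubernetes.io/last-applied-configuration`; a trimmed sketch of how it appears on a resource:

```yaml
metadata:
  annotations:
    kubectl.kubernetes.io/last-applied-configuration: |
      {"apiVersion":"extensions/v1beta1","kind":"Deployment","metadata":{"name":"my-nginx"},"spec":{...}}
```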
|
||||
|
||||
### kubectl edit
|
||||
|
||||
Alternatively, you may also update resources with `kubectl edit`:
|
||||
|
||||
```shell
|
||||
$ kubectl edit deployment/my-nginx
|
||||
```
|
||||
|
||||
This is equivalent to first using `get` to fetch the resource, editing it in a text editor, and then using `apply` to push the updated version:
|
||||
|
||||
```shell
|
||||
$ kubectl get deployment my-nginx -o yaml > /tmp/nginx.yaml
|
||||
$ vi /tmp/nginx.yaml
|
||||
# do some edit, and then save the file
|
||||
$ kubectl apply -f /tmp/nginx.yaml
|
||||
deployment "my-nginx" configured
|
||||
$ rm /tmp/nginx.yaml
|
||||
```
|
||||
|
||||
This allows you to do more significant changes more easily. Note that you can specify the editor with your `EDITOR` or `KUBE_EDITOR` environment variables.
|
||||
|
||||
For more information, please see [kubectl edit](/docs/user-guide/kubectl/kubectl_edit/) document.
|
||||
|
||||
### kubectl patch
|
||||
|
||||
Suppose you want to fix a typo in a Deployment's container image. One way to do that is with `kubectl patch`:
|
||||
|
||||
```shell
|
||||
# Suppose you have a Deployment with a container named "nginx" and its image "nignx" (typo),
|
||||
# use container name "nginx" as a key to update the image from "nignx" (typo) to "nginx"
|
||||
$ kubectl get deployment my-nginx -o yaml
|
||||
```
|
||||
|
||||
```yaml
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: Deployment
|
||||
...
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- image: nignx
|
||||
name: nginx
|
||||
...
|
||||
```
|
||||
|
||||
```shell
|
||||
$ kubectl patch deployment my-nginx -p'{"spec":{"template":{"spec":{"containers":[{"name":"nginx","image":"nginx"}]}}}}'
|
||||
"my-nginx" patched
|
||||
$ kubectl get pod my-nginx-1jgkf -o yaml
|
||||
```
|
||||
|
||||
```yaml
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: Deployment
|
||||
...
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- image: nginx
|
||||
name: nginx
|
||||
...
|
||||
```
|
||||
|
||||
The patch is specified using JSON.
|
||||
|
||||
The system ensures that you don't clobber changes made by other users or components by confirming that the `resourceVersion` doesn't differ from the version you edited. If you want to update regardless of other changes, remove the `resourceVersion` field when you edit the resource. However, if you do this, don't use your original configuration file as the source since additional fields most likely were set in the live state.
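The field lives under `metadata`; a trimmed sketch:

```yaml
metadata:
  name: my-nginx
  resourceVersion: "1234"   # remove this line to update regardless of concurrent changes
```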
|
||||
|
||||
For more information, please see [kubectl patch](/docs/user-guide/kubectl/kubectl_patch/) document.
|
||||
|
||||
## Disruptive updates
|
||||
|
||||
In some cases, you may need to update resource fields that cannot be updated once initialized, or you may just want to make a recursive change immediately, such as to fix broken pods created by a Deployment. To change such fields, use `replace --force`, which deletes and re-creates the resource. In this case, you can simply modify your original configuration file:
|
||||
|
||||
```shell
|
||||
$ kubectl replace -f docs/user-guide/nginx/nginx-deployment.yaml --force
|
||||
deployment "my-nginx" deleted
|
||||
deployment "my-nginx" replaced
|
||||
```
|
||||
|
||||
## Updating your application without a service outage
|
||||
|
||||
At some point, you'll eventually need to update your deployed application, typically by specifying a new image or image tag, as in the canary deployment scenario above. `kubectl` supports several update operations, each of which is applicable to different scenarios.
|
||||
|
||||
We'll guide you through how to create and update applications with Deployments. If your deployed application is managed by Replication Controllers,
|
||||
you should read [how to use `kubectl rolling-update`](/docs/tasks/run-application/rolling-update-replication-controller/) instead.
|
||||
|
||||
Let's say you were running version 1.7.9 of nginx:
|
||||
|
||||
```shell
|
||||
$ kubectl run my-nginx --image=nginx:1.7.9 --replicas=3
|
||||
deployment "my-nginx" created
|
||||
```
|
||||
|
||||
To update to version 1.9.1, simply change `.spec.template.spec.containers[0].image` from `nginx:1.7.9` to `nginx:1.9.1`, with the kubectl commands we learned above.
|
||||
|
||||
```shell
|
||||
$ kubectl edit deployment/my-nginx
|
||||
```
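The relevant fragment of the edited Deployment spec would then read (sketch):

```yaml
spec:
  template:
    spec:
      containers:
      - name: my-nginx
        image: nginx:1.9.1   # changed from nginx:1.7.9
```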
|
||||
|
||||
That's it! The Deployment will declaratively update the deployed nginx application progressively behind the scenes. It ensures that only a certain number of old replicas may be down while they are being updated, and only a certain number of new replicas may be created above the desired number of pods. To learn more about it, visit the [Deployment page](/docs/user-guide/deployments/).
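Those bounds come from the Deployment's update strategy; a sketch of the relevant fields with commonly used values:

```yaml
spec:
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1   # how many old replicas may be down during the update
      maxSurge: 1         # how many extra new replicas may be created above the desired count
```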
|
||||
|
||||
## What's next?
|
||||
|
||||
- [Learn about how to use `kubectl` for application introspection and debugging.](/docs/user-guide/introspection-and-debugging/)
|
||||
- [Configuration Best Practices and Tips](/docs/concepts/configuration/overview/)
|
|
---
|
||||
assignees:
|
||||
- davidopp
|
||||
title: Using Multiple Clusters
|
||||
---
|
||||
|
||||
You may want to set up multiple Kubernetes clusters, both to
|
||||
have clusters in different regions to be nearer to your users, and to tolerate failures and/or invasive maintenance.
|
||||
This document describes some of the issues to consider when making a decision about doing so.
|
||||
|
||||
If you decide to have multiple clusters, Kubernetes provides a way to [federate them](/docs/admin/federation/).
|
||||
|
||||
## Scope of a single cluster
|
||||
|
||||
On IaaS providers such as Google Compute Engine or Amazon Web Services, a VM exists in a
|
||||
[zone](https://cloud.google.com/compute/docs/zones) or [availability
|
||||
zone](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html).
|
||||
We suggest that all the VMs in a Kubernetes cluster should be in the same availability zone, because:
|
||||
|
||||
- compared to having a single global Kubernetes cluster, there are fewer single-points of failure
|
||||
- compared to a cluster that spans availability zones, it is easier to reason about the availability properties of a
|
||||
single-zone cluster.
|
||||
- when the Kubernetes developers are designing the system (e.g. making assumptions about latency, bandwidth, or
|
||||
correlated failures) they are assuming all the machines are in a single data center, or otherwise closely connected.
|
||||
|
||||
It is okay to have multiple clusters per availability zone, though on balance we think fewer is better.
|
||||
Reasons to prefer fewer clusters are:
|
||||
|
||||
- improved bin packing of Pods in some cases with more nodes in one cluster (less resource fragmentation)
|
||||
- reduced operational overhead (though the advantage is diminished as ops tooling and processes mature)
|
||||
- reduced costs for per-cluster fixed resource costs, e.g. apiserver VMs (but small as a percentage
|
||||
of overall cluster cost for medium to large clusters).
|
||||
|
||||
Reasons to have multiple clusters include:
|
||||
|
||||
- strict security policies requiring isolation of one class of work from another (but, see Partitioning Clusters
|
||||
below).
|
||||
- test clusters to canary new Kubernetes releases or other cluster software.
|
||||
|
||||
## Selecting the right number of clusters
|
||||
|
||||
The selection of the number of Kubernetes clusters may be a relatively static choice, only revisited occasionally.
|
||||
By contrast, the number of nodes in a cluster and the number of pods in a service may change frequently according to
|
||||
load and growth.
|
||||
|
||||
To pick the number of clusters, first decide which regions you need to be in to have adequate latency to all your end users for services that will run on Kubernetes (if you use a Content Distribution Network, the latency requirements for CDN-hosted content need not be considered). Legal issues might influence this as well. For example, a company with a global customer base might decide to have clusters in US, EU, AP, and SA regions.
|
||||
Call the number of regions to be in `R`.
|
||||
|
||||
Second, decide how many clusters should be able to be unavailable at the same time, while still being available. Call
|
||||
the number that can be unavailable `U`. If you are not sure, then 1 is a fine choice.
|
||||
|
||||
If it is allowable for load-balancing to direct traffic to any region in the event of a cluster failure, then
|
||||
you need at least the larger of `R` or `U + 1` clusters. If it is not (e.g. you want to ensure low latency for all
|
||||
users in the event of a cluster failure), then you need to have `R * (U + 1)` clusters
|
||||
(`U + 1` in each of `R` regions). In any case, try to put each cluster in a different zone.
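For example, with `R = 3` regions and `U = 1` cluster allowed to be unavailable, you need `max(3, 1 + 1) = 3` clusters if traffic may fail over to any region, but `3 * (1 + 1) = 6` clusters (two per region) if each region must remain served locally during a cluster failure.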
|
||||
|
||||
Finally, if any of your clusters would need more than the maximum recommended number of nodes for a Kubernetes cluster, then
|
||||
you may need even more clusters. Kubernetes v1.3 supports clusters up to 1000 nodes in size.
|
||||
|
||||
## Working with multiple clusters
|
||||
|
||||
When you have multiple clusters, you would typically create services with the same config in each cluster and put each of those
|
||||
service instances behind a load balancer (AWS Elastic Load Balancer, GCE Forwarding Rule or HTTP Load Balancer) spanning all of them, so that
|
||||
failures of a single cluster are not visible to end users.
|
|
---
|
||||
assignees:
|
||||
- dcbw
|
||||
- freehan
|
||||
- thockin
|
||||
title: Network Plugins
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
__Disclaimer__: Network plugins are in alpha. This content will change rapidly.
|
||||
|
||||
Network plugins in Kubernetes come in a few flavors:
|
||||
|
||||
* CNI plugins: adhere to the appc/CNI specification, designed for interoperability.
|
||||
* Kubenet plugin: implements basic `cbr0` using the `bridge` and `host-local` CNI plugins
|
||||
|
||||
## Installation
|
||||
|
||||
The kubelet has a single default network plugin, and a default network common to the entire cluster. It probes for plugins when it starts up, remembers what it found, and executes the selected plugin at appropriate times in the pod lifecycle (this is only true for docker, as rkt manages its own CNI plugins). There are two Kubelet command line parameters to keep in mind when using plugins:
|
||||
|
||||
* `network-plugin-dir`: Kubelet probes this directory for plugins on startup
|
||||
* `network-plugin`: The network plugin to use from `network-plugin-dir`. It must match the name reported by a plugin probed from the plugin directory. For CNI plugins, this is simply "cni".
|
||||
|
||||
## Network Plugin Requirements
|
||||
|
||||
Besides providing the [`NetworkPlugin` interface](https://github.com/kubernetes/kubernetes/tree/{{page.version}}/pkg/kubelet/network/plugins.go) to configure and clean up pod networking, the plugin may also need specific support for kube-proxy. The iptables proxy obviously depends on iptables, and the plugin may need to ensure that container traffic is made available to iptables. For example, if the plugin connects containers to a Linux bridge, the plugin must set the `net/bridge/bridge-nf-call-iptables` sysctl to `1` to ensure that the iptables proxy functions correctly. If the plugin does not use a Linux bridge (but instead something like Open vSwitch or some other mechanism) it should ensure container traffic is appropriately routed for the proxy.
|
||||
|
||||
By default if no kubelet network plugin is specified, the `noop` plugin is used, which sets `net/bridge/bridge-nf-call-iptables=1` to ensure simple configurations (like docker with a bridge) work correctly with the iptables proxy.
|
||||
|
||||
### CNI
|
||||
|
||||
The CNI plugin is selected by passing Kubelet the `--network-plugin=cni` command-line option. Kubelet reads a file from `--cni-conf-dir` (default `/etc/cni/net.d`) and uses the CNI configuration from that file to set up each pod's network. The CNI configuration file must match the [CNI specification](https://github.com/containernetworking/cni/blob/master/SPEC.md#network-configuration), and any required CNI plugins referenced by the configuration must be present in `--cni-bin-dir` (default `/opt/cni/bin`).
|
||||
|
||||
If there are multiple CNI configuration files in the directory, the first one in lexicographic order of file name is used.
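As an illustration (not a configuration shipped with Kubernetes), a minimal CNI configuration file using the standard `bridge` and `host-local` plugins might look like this:

```json
{
  "cniVersion": "0.2.0",
  "name": "mynet",
  "type": "bridge",
  "bridge": "cbr0",
  "isGateway": true,
  "ipMasq": true,
  "ipam": {
    "type": "host-local",
    "subnet": "10.22.0.0/16",
    "routes": [
      { "dst": "0.0.0.0/0" }
    ]
  }
}
```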
|
||||
|
||||
In addition to the CNI plugin specified by the configuration file, Kubernetes requires the standard CNI [`lo`](https://github.com/containernetworking/cni/blob/master/plugins/main/loopback/loopback.go) plugin, at minimum version 0.2.0.
|
||||
|
||||
Limitation: Due to [#31307](https://github.com/kubernetes/kubernetes/issues/31307), `HostPort` won't work with the CNI networking plugin at the moment. This means that any `hostPort` attributes in a pod are simply ignored.
|
||||
|
||||
### kubenet
|
||||
|
||||
Kubenet is a very basic, simple network plugin, on Linux only. It does not, of itself, implement more advanced features like cross-node networking or network policy. It is typically used together with a cloud provider that sets up routing rules for communication between nodes, or in single-node environments.
|
||||
|
||||
Kubenet creates a Linux bridge named `cbr0` and creates a veth pair for each pod with the host end of each pair connected to `cbr0`. The pod end of the pair is assigned an IP address allocated from a range assigned to the node either through configuration or by the controller-manager. `cbr0` is assigned an MTU matching the smallest MTU of an enabled normal interface on the host.
|
||||
|
||||
The plugin requires a few things:
|
||||
|
||||
* The standard CNI `bridge`, `lo` and `host-local` plugins are required, at minimum version 0.2.0. Kubenet will first search for them in `/opt/cni/bin`. Specify `network-plugin-dir` to supply an additional search path. The first match found will take effect.
|
||||
* Kubelet must be run with the `--network-plugin=kubenet` argument to enable the plugin
|
||||
* Kubelet should also be run with the `--non-masquerade-cidr=<clusterCidr>` argument to ensure traffic to IPs outside this range will use IP masquerade.
|
||||
* The node must be assigned an IP subnet through either the `--pod-cidr` kubelet command-line option or the `--allocate-node-cidrs=true --cluster-cidr=<cidr>` controller-manager command-line options.
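Putting those requirements together, a sketch of the relevant kubelet flags (the CIDR values are placeholders):

```shell
kubelet --network-plugin=kubenet \
  --non-masquerade-cidr=10.0.0.0/8 \
  --pod-cidr=10.123.45.0/24
```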
|
||||
|
||||
### Customizing the MTU (with kubenet)
|
||||
|
||||
The MTU should always be configured correctly to get the best networking performance. Network plugins will usually try
|
||||
to infer a sensible MTU, but sometimes the logic will not result in an optimal MTU. For example, if the
|
||||
Docker bridge or another interface has a small MTU, kubenet will currently select that MTU. Or if you are
|
||||
using IPSEC encapsulation, the MTU must be reduced, and this calculation is out-of-scope for
|
||||
most network plugins.
|
||||
|
||||
Where needed, you can specify the MTU explicitly with the `network-plugin-mtu` kubelet option. For example,
|
||||
on AWS the `eth0` MTU is typically 9001, so you might specify `--network-plugin-mtu=9001`. If you're using IPSEC you
|
||||
might reduce it to allow for encapsulation overhead e.g. `--network-plugin-mtu=8873`.
|
||||
|
||||
This option is provided to the network-plugin; currently **only kubenet supports `network-plugin-mtu`**.
|
||||
|
||||
## Usage Summary
|
||||
|
||||
* `--network-plugin=cni` specifies that we use the `cni` network plugin with actual CNI plugin binaries located in `--cni-bin-dir` (default `/opt/cni/bin`) and CNI plugin configuration located in `--cni-conf-dir` (default `/etc/cni/net.d`).
|
||||
* `--network-plugin=kubenet` specifies that we use the `kubenet` network plugin with CNI `bridge` and `host-local` plugins placed in `/opt/cni/bin` or `network-plugin-dir`.
|
||||
* `--network-plugin-mtu=9001` specifies the MTU to use, currently only used by the `kubenet` network plugin.
|
|
---
|
||||
assignees:
|
||||
- thockin
|
||||
title: Cluster Networking
|
||||
---
|
||||
|
||||
Kubernetes approaches networking somewhat differently than Docker does by
|
||||
default. There are 4 distinct networking problems to solve:
|
||||
|
||||
1. Highly-coupled container-to-container communications: this is solved by
|
||||
[pods](/docs/user-guide/pods/) and `localhost` communications.
|
||||
2. Pod-to-Pod communications: this is the primary focus of this document.
|
||||
3. Pod-to-Service communications: this is covered by [services](/docs/user-guide/services/).
|
||||
4. External-to-Service communications: this is covered by [services](/docs/user-guide/services/).
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
|
||||
## Summary
|
||||
|
||||
Kubernetes assumes that pods can communicate with other pods, regardless of
|
||||
which host they land on. We give every pod its own IP address so you do not
|
||||
need to explicitly create links between pods and you almost never need to deal
|
||||
with mapping container ports to host ports. This creates a clean,
|
||||
backwards-compatible model where pods can be treated much like VMs or physical
|
||||
hosts from the perspectives of port allocation, naming, service discovery, load
|
||||
balancing, application configuration, and migration.
|
||||
|
||||
To achieve this we must impose some requirements on how you set up your cluster
|
||||
networking.
|
||||
|
||||
## Docker model
|
||||
|
||||
Before discussing the Kubernetes approach to networking, it is worthwhile to
|
||||
review the "normal" way that networking works with Docker. By default, Docker
|
||||
uses host-private networking. It creates a virtual bridge, called `docker0` by
|
||||
default, and allocates a subnet from one of the private address blocks defined
|
||||
in [RFC1918](https://tools.ietf.org/html/rfc1918) for that bridge. For each
|
||||
container that Docker creates, it allocates a virtual ethernet device (called
|
||||
`veth`) which is attached to the bridge. The veth is mapped to appear as `eth0`
|
||||
in the container, using Linux namespaces. The in-container `eth0` interface is
|
||||
given an IP address from the bridge's address range.
|
||||
|
||||
The result is that Docker containers can talk to other containers only if they
|
||||
are on the same machine (and thus the same virtual bridge). Containers on
|
||||
different machines can not reach each other - in fact they may end up with the
|
||||
exact same network ranges and IP addresses.
|
||||
|
||||
In order for Docker containers to communicate across nodes, they must be
|
||||
allocated ports on the machine's own IP address, which are then forwarded or
|
||||
proxied to the containers. This obviously means that containers must either
|
||||
coordinate which ports they use very carefully or else be allocated ports
|
||||
dynamically.
|
||||
|
||||
## Kubernetes model
|
||||
|
||||
Coordinating ports across multiple developers is very difficult to do at
|
||||
scale and exposes users to cluster-level issues outside of their control.
|
||||
Dynamic port allocation brings a lot of complications to the system - every
|
||||
application has to take ports as flags, the API servers have to know how to
|
||||
insert dynamic port numbers into configuration blocks, services have to know
|
||||
how to find each other, etc. Rather than deal with this, Kubernetes takes a
|
||||
different approach.
|
||||
|
||||
Kubernetes imposes the following fundamental requirements on any networking
|
||||
implementation (barring any intentional network segmentation policies):
|
||||
|
||||
* all containers can communicate with all other containers without NAT
|
||||
* all nodes can communicate with all containers (and vice-versa) without NAT
|
||||
* the IP that a container sees itself as is the same IP that others see it as
|
||||
|
||||
What this means in practice is that you can not just take two computers
|
||||
running Docker and expect Kubernetes to work. You must ensure that the
|
||||
fundamental requirements are met.
|
||||
|
||||
This model is not only less complex overall, but it is principally compatible
|
||||
with the desire for Kubernetes to enable low-friction porting of apps from VMs
|
||||
to containers. If your job previously ran in a VM, your VM had an IP and could
|
||||
talk to other VMs in your project. This is the same basic model.
|
||||
|
||||
Until now this document has talked about containers. In reality, Kubernetes
|
||||
applies IP addresses at the `Pod` scope - containers within a `Pod` share their
|
||||
network namespaces - including their IP address. This means that containers
|
||||
within a `Pod` can all reach each other's ports on `localhost`. This does imply
|
||||
that containers within a `Pod` must coordinate port usage, but this is no
|
||||
different than processes in a VM. We call this the "IP-per-pod" model. This
|
||||
is implemented in Docker as a "pod container" which holds the network namespace
|
||||
open while "app containers" (the things the user specified) join that namespace
|
||||
with Docker's `--net=container:<id>` function.
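Roughly, that looks like the following (a sketch with placeholder names, not the exact commands the kubelet issues):

```shell
# Start an infrastructure ("pause") container that holds the network namespace,
# then join an application container to that namespace.
docker run -d --name pod-infra gcr.io/google_containers/pause
docker run -d --net=container:pod-infra my-app-image   # my-app-image is a placeholder
```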
|
||||
|
||||
As with Docker, it is possible to request host ports, but this is reduced to a
|
||||
very niche operation. In this case a port will be allocated on the host `Node`
|
||||
and traffic will be forwarded to the `Pod`. The `Pod` itself is blind to the
|
||||
existence or non-existence of host ports.
|
||||
|
||||
## How to achieve this
|
||||
|
||||
There are a number of ways that this network model can be implemented. This
|
||||
document is not an exhaustive study of the various methods, but hopefully serves
|
||||
as an introduction to various technologies and serves as a jumping-off point.
|
||||
|
||||
The following networking options are sorted alphabetically - the order does not
|
||||
imply any preferential status.
|
||||
|
||||
### Contiv
|
||||
|
||||
[Contiv](https://github.com/contiv/netplugin) provides configurable networking (native l3 using BGP, overlay using vxlan, classic l2, or Cisco-SDN/ACI) for various use cases. [Contiv](http://contiv.io) is all open sourced.
|
||||
|
||||
### Flannel
|
||||
|
||||
[Flannel](https://github.com/coreos/flannel#flannel) is a very simple overlay
|
||||
network that satisfies the Kubernetes requirements. Many
|
||||
people have reported success with Flannel and Kubernetes.
|
||||
|
||||
### Google Compute Engine (GCE)
|
||||
|
||||
For the Google Compute Engine cluster configuration scripts, we use [advanced
|
||||
routing](https://cloud.google.com/compute/docs/networking#routing) to
|
||||
assign each VM a subnet (default is `/24` - 254 IPs). Any traffic bound for that
|
||||
subnet will be routed directly to the VM by the GCE network fabric. This is in
|
||||
addition to the "main" IP address assigned to the VM, which is NAT'ed for
|
||||
outbound internet access. A linux bridge (called `cbr0`) is configured to exist
|
||||
on that subnet, and is passed to docker's `--bridge` flag.
|
||||
|
||||
We start Docker with:
|
||||
|
||||
```shell
|
||||
DOCKER_OPTS="--bridge=cbr0 --iptables=false --ip-masq=false"
|
||||
```
|
||||
|
||||
This bridge is created by Kubelet (controlled by the `--network-plugin=kubenet`
|
||||
flag) according to the `Node`'s `spec.podCIDR`.
|
||||
|
||||
Docker will now allocate IPs from the `cbr-cidr` block. Containers can reach
|
||||
each other and `Nodes` over the `cbr0` bridge. Those IPs are all routable
|
||||
within the GCE project network.
|
||||
|
||||
GCE itself does not know anything about these IPs, though, so it will not NAT
|
||||
them for outbound internet traffic. To achieve that we use an iptables rule to
|
||||
masquerade (aka SNAT - to make it seem as if packets came from the `Node`
|
||||
itself) traffic that is bound for IPs outside the GCE project network
|
||||
(10.0.0.0/8).
|
||||
|
||||
```shell
|
||||
iptables -t nat -A POSTROUTING ! -d 10.0.0.0/8 -o eth0 -j MASQUERADE
|
||||
```
|
||||
|
||||
Lastly we enable IP forwarding in the kernel (so the kernel will process
|
||||
packets for bridged containers):
|
||||
|
||||
```shell
|
||||
sysctl net.ipv4.ip_forward=1
|
||||
```
|
||||
|
||||
The result of all this is that all `Pods` can reach each other and can egress
|
||||
traffic to the internet.
|
||||
|
||||
### L2 networks and linux bridging
|
||||
|
||||
If you have a "dumb" L2 network, such as a simple switch in a "bare-metal"
|
||||
environment, you should be able to do something similar to the above GCE setup.
|
||||
Note that these instructions have only been tried very casually - it seems to
|
||||
work, but has not been thoroughly tested. If you use this technique and
|
||||
perfect the process, please let us know.
|
||||
|
||||
Follow the "With Linux Bridge devices" section of [this very nice
|
||||
tutorial](http://blog.oddbit.com/2014/08/11/four-ways-to-connect-a-docker/) from
|
||||
Lars Kellogg-Stedman.
|
||||
|
||||
### Nuage Networks VCS (Virtualized Cloud Services)
|
||||
|
||||
[Nuage](http://www.nuagenetworks.net) provides a highly scalable policy-based Software-Defined Networking (SDN) platform. Nuage uses the open source Open vSwitch for the data plane along with a feature rich SDN Controller built on open standards.
|
||||
|
||||
The Nuage platform uses overlays to provide seamless policy-based networking between Kubernetes Pods and non-Kubernetes environments (VMs and bare metal servers). Nuage's policy abstraction model is designed with applications in mind and makes it easy to declare fine-grained policies for applications. The platform's real-time analytics engine enables visibility and security monitoring for Kubernetes applications.
|
||||
|
||||
### OpenVSwitch
|
||||
|
||||
[OpenVSwitch](/docs/admin/ovs-networking) is a somewhat more mature but also
|
||||
complicated way to build an overlay network. This is endorsed by several of the
|
||||
"Big Shops" for networking.
|
||||
|
||||
### OVN (Open Virtual Networking)
|
||||
|
||||
OVN is an open source network virtualization solution developed by the
Open vSwitch community. It lets you create logical switches, logical routers,
stateful ACLs, load balancers, and so on, to build different virtual networking
|
||||
topologies. The project has a specific Kubernetes plugin and documentation
|
||||
at [ovn-kubernetes](https://github.com/openvswitch/ovn-kubernetes).
|
||||
|
||||
### Project Calico
|
||||
|
||||
[Project Calico](http://docs.projectcalico.org/) is an open source container networking provider and network policy engine.
|
||||
|
||||
Calico provides a highly scalable networking and network policy solution for connecting Kubernetes pods based on the same IP networking principles as the internet. Calico can be deployed without encapsulation or overlays to provide high-performance, high-scale data center networking. Calico also provides fine-grained, intent based network security policy for Kubernetes pods via its distributed firewall.
|
||||
|
||||
Calico can also be run in policy enforcement mode in conjunction with other networking solutions such as Flannel, aka [canal](https://github.com/tigera/canal), or native GCE networking.
|
||||
|
||||
### Romana
|
||||
|
||||
[Romana](http://romana.io) is an open source network and security automation solution that lets you deploy Kubernetes without an overlay network. Romana supports Kubernetes [Network Policy](/docs/user-guide/networkpolicies/) to provide isolation across network namespaces.
|
||||
|
||||
### Weave Net from Weaveworks
|
||||
|
||||
[Weave Net](https://www.weave.works/products/weave-net/) is a
|
||||
resilient and simple to use network for Kubernetes and its hosted applications.
|
||||
Weave Net runs as a [CNI plug-in](https://www.weave.works/docs/net/latest/cni-plugin/)
|
||||
or stand-alone. In either version, it doesn't require any configuration or extra code
|
||||
to run, and in both cases, the network provides one IP address per pod - as is standard for Kubernetes.
|
||||
|
||||
## Other reading
|
||||
|
||||
The early design of the networking model and its rationale, and some future
|
||||
plans are described in more detail in the [networking design
|
||||
document](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/networking.md).
|
|
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: my-nginx-svc
|
||||
labels:
|
||||
app: nginx
|
||||
spec:
|
||||
type: LoadBalancer
|
||||
ports:
|
||||
- port: 80
|
||||
selector:
|
||||
app: nginx
|
||||
---
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: my-nginx
|
||||
spec:
|
||||
replicas: 3
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: nginx
|
||||
spec:
|
||||
containers:
|
||||
- name: nginx
|
||||
image: nginx:1.7.9
|
||||
ports:
|
||||
- containerPort: 80
|
|
---
|
||||
assignees:
|
||||
- derekwaynecarr
|
||||
- vishh
|
||||
- timstclair
|
||||
title: Configuring Out Of Resource Handling
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
The `kubelet` needs to preserve node stability when available compute resources are low.
|
||||
|
||||
This is especially important when dealing with incompressible resources such as memory or disk.
|
||||
|
||||
If either resource is exhausted, the node becomes unstable.
|
||||
|
||||
## Eviction Policy
|
||||
|
||||
The `kubelet` can proactively monitor for and prevent total starvation of a compute resource. In those cases, the `kubelet` can proactively fail one or more pods in order to reclaim the starved resource. When the `kubelet` fails a pod, it terminates all containers in the pod, and the `PodPhase` is transitioned to `Failed`.
|
||||
|
||||
### Eviction Signals
|
||||
|
||||
The `kubelet` can support the ability to trigger eviction decisions on the signals described in the
|
||||
table below. The value of each signal is described in the description column based on the `kubelet`
|
||||
summary API.
|
||||
|
||||
| Eviction Signal | Description |
|
||||
|----------------------------|-----------------------------------------------------------------------|
|
||||
| `memory.available` | `memory.available` := `node.status.capacity[memory]` - `node.stats.memory.workingSet` |
|
||||
| `nodefs.available` | `nodefs.available` := `node.stats.fs.available` |
|
||||
| `nodefs.inodesFree` | `nodefs.inodesFree` := `node.stats.fs.inodesFree` |
|
||||
| `imagefs.available` | `imagefs.available` := `node.stats.runtime.imagefs.available` |
|
||||
| `imagefs.inodesFree` | `imagefs.inodesFree` := `node.stats.runtime.imagefs.inodesFree` |
|
||||
|
||||
Each of the above signals supports either a literal or percentage based value. The percentage based value
|
||||
is calculated relative to the total capacity associated with each signal.
|
||||
|
||||
`kubelet` supports only two filesystem partitions.
|
||||
|
||||
1. The `nodefs` filesystem that kubelet uses for volumes, daemon logs, etc.
|
||||
1. The `imagefs` filesystem that container runtimes use for storing images and container writable layers.
|
||||
|
||||
`imagefs` is optional. `kubelet` auto-discovers these filesystems using cAdvisor. `kubelet` does not care about any
|
||||
other filesystems. Any other types of configurations are not currently supported by the kubelet. For example, it is
|
||||
*not OK* to store volumes and logs in a dedicated `filesystem`.
|
||||
|
||||
In future releases, the `kubelet` will deprecate the existing [garbage collection](/docs/admin/garbage-collection/)
|
||||
support in favor of eviction in response to disk pressure.
|
||||
|
||||
### Eviction Thresholds
|
||||
|
||||
The `kubelet` supports the ability to specify eviction thresholds that trigger the `kubelet` to reclaim resources.
|
||||
|
||||
Each threshold is of the following form:
|
||||
|
||||
`<eviction-signal><operator><quantity>`
|
||||
|
||||
* valid `eviction-signal` tokens are as defined above.
* valid `operator` tokens are `<`.
* valid `quantity` tokens must match the quantity representation used by Kubernetes.
* an eviction threshold can be expressed as a percentage if it ends with the `%` token.
|
||||
|
||||
For example, if a node has `10Gi` of memory, and the desire is to induce eviction
|
||||
if available memory falls below `1Gi`, an eviction threshold can be specified as either
|
||||
of the following (but not both).
|
||||
|
||||
* `memory.available<10%`
|
||||
* `memory.available<1Gi`
|
||||
|
||||
#### Soft Eviction Thresholds
|
||||
|
||||
A soft eviction threshold pairs an eviction threshold with a required
|
||||
administrator specified grace period. No action is taken by the `kubelet`
|
||||
to reclaim resources associated with the eviction signal until that grace
|
||||
period has been exceeded. If no grace period is provided, the `kubelet` will
|
||||
error on startup.
|
||||
|
||||
In addition, if a soft eviction threshold has been met, an operator can
|
||||
specify a maximum allowed pod termination grace period to use when evicting
|
||||
pods from the node. If specified, the `kubelet` will use the lesser value among
|
||||
the `pod.Spec.TerminationGracePeriodSeconds` and the max allowed grace period.
|
||||
If not specified, the `kubelet` will kill pods immediately with no graceful
|
||||
termination.
|
||||
|
||||
To configure soft eviction thresholds, the following flags are supported:
|
||||
|
||||
* `eviction-soft` describes a set of eviction thresholds (e.g. `memory.available<1.5Gi`) that if met over a
|
||||
corresponding grace period would trigger a pod eviction.
|
||||
* `eviction-soft-grace-period` describes a set of eviction grace periods (e.g. `memory.available=1m30s`) that
|
||||
correspond to how long a soft eviction threshold must hold before triggering a pod eviction.
|
||||
* `eviction-max-pod-grace-period` describes the maximum allowed grace period (in seconds) to use when terminating
|
||||
pods in response to a soft eviction threshold being met.
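Combining the three flags, a sketch of a soft eviction configuration (the values are examples only):

```shell
kubelet --eviction-soft="memory.available<1.5Gi" \
  --eviction-soft-grace-period="memory.available=1m30s" \
  --eviction-max-pod-grace-period=30
```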
|
||||
|
||||
#### Hard Eviction Thresholds
|
||||
|
||||
A hard eviction threshold has no grace period, and if observed, the `kubelet`
|
||||
will take immediate action to reclaim the associated starved resource. If a
|
||||
hard eviction threshold is met, the `kubelet` will kill the pod immediately
|
||||
with no graceful termination.
|
||||
|
||||
To configure hard eviction thresholds, the following flag is supported:
|
||||
|
||||
* `eviction-hard` describes a set of eviction thresholds (e.g. `memory.available<1Gi`) that if met
|
||||
would trigger a pod eviction.
|
||||
|
||||
The `kubelet` has the following default hard eviction thresholds:
|
||||
|
||||
* `--eviction-hard=memory.available<100Mi`
|
||||
|
||||
### Eviction Monitoring Interval
|
||||
|
||||
The `kubelet` evaluates eviction thresholds per its configured housekeeping interval.
|
||||
|
||||
* `housekeeping-interval` is the interval between container housekeepings.
|
||||
|
||||
### Node Conditions
|
||||
|
||||
The `kubelet` will map one or more eviction signals to a corresponding node condition.
|
||||
|
||||
If a hard eviction threshold has been met, or a soft eviction threshold has been met
|
||||
independent of its associated grace period, the `kubelet` will report a condition that
|
||||
reflects the node is under pressure.
|
||||
|
||||
The following node conditions are defined that correspond to the specified eviction signal.
|
||||
|
||||
| Node Condition | Eviction Signal | Description |
|
||||
|-------------------------|-------------------------------|--------------------------------------------|
|
||||
| `MemoryPressure` | `memory.available` | Available memory on the node has satisfied an eviction threshold |
|
||||
| `DiskPressure` | `nodefs.available`, `nodefs.inodesFree`, `imagefs.available`, or `imagefs.inodesFree` | Available disk space and inodes on either the node's root filesytem or image filesystem has satisfied an eviction threshold |
|
||||
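You can inspect these conditions with `kubectl describe node`; for example (the node name is hypothetical and the output is abridged):

```shell
# Show the Conditions section, which includes MemoryPressure and DiskPressure.
kubectl describe node my-node1 | grep -A 10 "Conditions:"
```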
|
||||
The `kubelet` will continue to report node status updates at the frequency specified by
|
||||
`--node-status-update-frequency` which defaults to `10s`.
|
||||
|
||||
### Oscillation of node conditions
|
||||
|
||||
If a node is oscillating above and below a soft eviction threshold, but not exceeding
|
||||
its associated grace period, it would cause the corresponding node condition to
|
||||
constantly oscillate between true and false, and could cause poor scheduling decisions
|
||||
as a consequence.
|
||||
|
||||
To protect against this oscillation, the following flag is defined to control how
|
||||
long the `kubelet` must wait before transitioning out of a pressure condition.
|
||||
|
||||
* `eviction-pressure-transition-period` is the duration for which the `kubelet` has
|
||||
to wait before transitioning out of an eviction pressure condition.
|
||||
|
||||
The `kubelet` would ensure that it has not observed an eviction threshold being met
|
||||
for the specified pressure condition for the period specified before toggling the
|
||||
condition back to `false`.
|
||||
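For example, the following setting would require the `kubelet` to observe the signal staying below its threshold for five minutes before clearing the pressure condition (an illustrative value; other kubelet flags are omitted):

```shell
kubelet --eviction-pressure-transition-period=5m
```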
|
||||
### Reclaiming node level resources
|
||||
|
||||
If an eviction threshold has been met and the grace period has passed,
|
||||
the `kubelet` will initiate the process of reclaiming the pressured resource
|
||||
until it has observed the signal has gone below its defined threshold.
|
||||
|
||||
The `kubelet` attempts to reclaim node level resources prior to evicting end-user pods. If
|
||||
disk pressure is observed, the `kubelet` reclaims node level resources differently if the
|
||||
machine has a dedicated `imagefs` configured for the container runtime.
|
||||
|
||||
#### With Imagefs
|
||||
|
||||
If `nodefs` filesystem has met eviction thresholds, `kubelet` will free up disk space in the following order:
|
||||
|
||||
1. Delete dead pods/containers
|
||||
|
||||
If `imagefs` filesystem has met eviction thresholds, `kubelet` will free up disk space in the following order:
|
||||
|
||||
1. Delete all unused images
|
||||
|
||||
#### Without Imagefs
|
||||
|
||||
If `nodefs` filesystem has met eviction thresholds, `kubelet` will free up disk space in the following order:
|
||||
|
||||
1. Delete dead pods/containers
|
||||
1. Delete all unused images
|
||||
|
||||
### Evicting end-user pods
|
||||
|
||||
If the `kubelet` is unable to reclaim sufficient resource on the node,
|
||||
it will begin evicting pods.
|
||||
|
||||
The `kubelet` ranks pods for eviction as follows:
|
||||
|
||||
* by their quality of service
|
||||
* by the consumption of the starved compute resource relative to the pod's scheduling request.
|
||||
|
||||
As a result, pod eviction occurs in the following order:
|
||||
|
||||
* `BestEffort` pods that consume the most of the starved resource are failed
|
||||
first.
|
||||
* `Burstable` pods that consume the greatest amount of the starved resource
|
||||
relative to their request for that resource are killed first. If no pod
|
||||
has exceeded its request, the strategy targets the largest consumer of the
|
||||
starved resource.
|
||||
* `Guaranteed` pods that consume the greatest amount of the starved resource
|
||||
relative to their request are killed first. If no pod has exceeded its request,
|
||||
the strategy targets the largest consumer of the starved resource.
|
||||
|
||||
A `Guaranteed` pod is guaranteed to never be evicted because of another pod's
|
||||
resource consumption. If a system daemon (e.g. `kubelet`, `docker`, `journald`)
|
||||
is consuming more resources than were reserved via `system-reserved` or `kube-reserved` allocations,
|
||||
and the node only has `Guaranteed` pod(s) remaining, then the node must choose to evict a
|
||||
`Guaranteed` pod in order to preserve node stability, and to limit the impact
|
||||
of the unexpected consumption to other `Guaranteed` pod(s).
|
||||
|
||||
Local disk is a `BestEffort` resource. If necessary, `kubelet` will evict pods one at a time to reclaim
|
||||
disk when `DiskPressure` is encountered. The `kubelet` will rank pods by quality of service. If the `kubelet`
|
||||
is responding to `inode` starvation, it will reclaim `inodes` by evicting pods with the lowest quality of service
|
||||
first. If the `kubelet` is responding to lack of available disk, it will rank pods within a quality of service
|
||||
by the amount of disk they consume and kill the largest consumers first.
|
||||
|
||||
#### With Imagefs
|
||||
|
||||
If `nodefs` is triggering evictions, `kubelet` will sort pods based on the usage on `nodefs`
|
||||
- local volumes + logs of all its containers.
|
||||
|
||||
If `imagefs` is triggering evictions, `kubelet` will sort pods based on the writable layer usage of all its containers.
|
||||
|
||||
#### Without Imagefs
|
||||
|
||||
If `nodefs` is triggering evictions, `kubelet` will sort pods based on their total disk usage
|
||||
- local volumes + logs & writable layer of all its containers.
|
||||
|
||||
### Minimum eviction reclaim
|
||||
|
||||
In certain scenarios, eviction of pods could result in reclamation of only a small amount of resources. This can result in
|
||||
the `kubelet` hitting eviction thresholds in repeated succession. In addition, reclaiming some resources, such as `disk`,
|
||||
is time consuming.
|
||||
|
||||
To mitigate these issues, `kubelet` can have a per-resource `minimum-reclaim`. Whenever `kubelet` observes
|
||||
resource pressure, `kubelet` will attempt to reclaim at least `minimum-reclaim` amount of resource below
|
||||
the configured eviction threshold.
|
||||
|
||||
For example, with the following configuration:
|
||||
|
||||
```
|
||||
--eviction-hard=memory.available<500Mi,nodefs.available<1Gi,imagefs.available<100Gi
|
||||
--eviction-minimum-reclaim="memory.available=0Mi,nodefs.available=500Mi,imagefs.available=2Gi"
|
||||
```
|
||||
|
||||
If an eviction threshold is triggered for `memory.available`, the `kubelet` will work to ensure
|
||||
that `memory.available` is at least `500Mi`. For `nodefs.available`, the `kubelet` will work
|
||||
to ensure that `nodefs.available` is at least `1.5Gi`, and for `imagefs.available` it will
|
||||
work to ensure that `imagefs.available` is at least `102Gi` before no longer reporting pressure
|
||||
on their associated resources.
|
||||
|
||||
The default `eviction-minimum-reclaim` is `0` for all resources.
|
||||
|
||||
### Scheduler
|
||||
|
||||
The node will report a condition when a compute resource is under pressure. The
|
||||
scheduler views that condition as a signal to dissuade placing additional
|
||||
pods on the node.
|
||||
|
||||
| Node Condition | Scheduler Behavior |
|
||||
| ---------------- | ------------------------------------------------ |
|
||||
| `MemoryPressure` | No new `BestEffort` pods are scheduled to the node. |
|
||||
| `DiskPressure` | No new pods are scheduled to the node. |
|
||||
|
||||
## Node OOM Behavior
|
||||
|
||||
If the node experiences a system OOM (out of memory) event before the `kubelet` is able to reclaim memory,
|
||||
the node depends on the [oom_killer](https://lwn.net/Articles/391222/) to respond.
|
||||
|
||||
The `kubelet` sets an `oom_score_adj` value for each container based on the quality of service for the pod.
|
||||
|
||||
| Quality of Service | oom_score_adj |
|
||||
|----------------------------|-----------------------------------------------------------------------|
|
||||
| `Guaranteed` | -998 |
|
||||
| `BestEffort` | 1000 |
|
||||
| `Burstable` | min(max(2, 1000 - (1000 * memoryRequestBytes) / machineMemoryCapacityBytes), 999) |
|
||||
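As a hedged, worked example of the `Burstable` formula, the small script below computes the value for a container requesting 4Gi of memory on a node with 32Gi of capacity (both numbers are hypothetical):

```shell
# oom_score_adj = min(max(2, 1000 - (1000 * memoryRequestBytes) / machineMemoryCapacityBytes), 999)
request_bytes=$((4 * 1024 * 1024 * 1024))    # 4Gi memory request
capacity_bytes=$((32 * 1024 * 1024 * 1024))  # 32Gi node memory capacity
adj=$((1000 - (1000 * request_bytes) / capacity_bytes))
[ "$adj" -lt 2 ] && adj=2      # max(2, ...)
[ "$adj" -gt 999 ] && adj=999  # min(..., 999)
echo "$adj"                    # prints 875
```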
|
||||
If the `kubelet` is unable to reclaim memory prior to a node experiencing system OOM, the `oom_killer` will calculate
|
||||
an `oom_score` based on the percentage of memory it's using on the node, and then add the `oom_score_adj` to get an
|
||||
effective `oom_score` for the container, and then kill the container with the highest score.
|
||||
|
||||
The intended behavior is that containers with the lowest quality of service that
|
||||
are consuming the largest amount of memory relative to the scheduling request should be killed first in order
|
||||
to reclaim memory.
|
||||
|
||||
Unlike pod eviction, if a pod container is OOM killed, it may be restarted by the `kubelet` based on its `RestartPolicy`.
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Schedulable resources and eviction policies
|
||||
|
||||
Let's imagine the following scenario:
|
||||
|
||||
* Node memory capacity: `10Gi`
|
||||
* Operator wants to reserve 10% of memory capacity for system daemons (kernel, `kubelet`, etc.)
|
||||
* Operator wants to evict pods at 95% memory utilization to reduce thrashing and incidence of system OOM.
|
||||
|
||||
To facilitate this scenario, the `kubelet` would be launched as follows:
|
||||
|
||||
```
|
||||
--eviction-hard=memory.available<500Mi
|
||||
--system-reserved=memory=1.5Gi
|
||||
```
|
||||
|
||||
Implicit in this configuration is the understanding that "System reserved" should include the amount of memory
|
||||
covered by the eviction threshold.
|
||||
|
||||
To reach that capacity, either some pod is using more than its request, or the system is using more than `1Gi`.
|
||||
|
||||
This configuration ensures that the scheduler does not place pods on a node that would immediately induce memory pressure
|
||||
and trigger eviction, assuming those pods use less than their configured request.
|
||||
|
||||
### DaemonSet
|
||||
|
||||
It is never desired for a `kubelet` to evict a pod that was derived from
|
||||
a `DaemonSet` since the pod will immediately be recreated and rescheduled
|
||||
back to the same node.
|
||||
|
||||
At the moment, the `kubelet` has no ability to distinguish a pod created
|
||||
from `DaemonSet` versus any other object. If/when that information is
|
||||
available, the `kubelet` could pro-actively filter those pods from the
|
||||
candidate set of pods provided to the eviction strategy.
|
||||
|
||||
In general, it is strongly recommended that `DaemonSet` not
|
||||
create `BestEffort` pods to avoid being identified as a candidate pod
|
||||
for eviction. Instead, a `DaemonSet` should ideally launch `Guaranteed` pods.
|
||||
|
||||
## Deprecation of existing feature flags to reclaim disk
|
||||
|
||||
`kubelet` has been freeing up disk space on demand to keep the node stable.
|
||||
|
||||
As disk based eviction matures, the following `kubelet` flags will be marked for deprecation
|
||||
in favor of the simpler configuration supported around eviction.
|
||||
|
||||
| Existing Flag | New Flag |
|
||||
| ------------- | -------- |
|
||||
| `--image-gc-high-threshold` | `--eviction-hard` or `--eviction-soft` |
|
||||
| `--image-gc-low-threshold` | `--eviction-minimum-reclaim` |
|
||||
| `--maximum-dead-containers` | deprecated |
|
||||
| `--maximum-dead-containers-per-container` | deprecated |
|
||||
| `--minimum-container-ttl-duration` | deprecated |
|
||||
| `--low-diskspace-threshold-mb` | `--eviction-hard` or `--eviction-soft` |
|
||||
| `--outofdisk-transition-frequency` | `--eviction-pressure-transition-period` |
|
||||
|
||||
## Known issues
|
||||
|
||||
### kubelet may not observe memory pressure right away
|
||||
|
||||
The `kubelet` currently polls `cAdvisor` to collect memory usage stats at a regular interval. If memory usage
|
||||
increases rapidly within that window, the `kubelet` may not observe `MemoryPressure` fast enough, and the `OOMKiller`
|
||||
will still be invoked. We intend to integrate with the `memcg` notification API in a future release to reduce this
|
||||
latency, and instead have the kernel tell us when a threshold has been crossed immediately.
|
||||
|
||||
If you are not trying to achieve extreme utilization, but a sensible measure of overcommit, a viable workaround for
|
||||
this issue is to set eviction thresholds at approximately 75% capacity. This increases the ability of this feature
|
||||
to prevent system OOMs, and promote eviction of workloads so cluster state can rebalance.
|
||||
|
||||
### kubelet may evict more pods than needed
|
||||
|
||||
Pod eviction may evict more pods than needed due to a stats collection timing gap. This can be mitigated by adding
|
||||
the ability to get root container stats on an on-demand basis (https://github.com/google/cadvisor/issues/1247) in the future.
|
||||
|
||||
### How kubelet ranks pods for eviction in response to inode exhaustion
|
||||
|
||||
At this time, it is not possible to know how many inodes were consumed by a particular container. If the `kubelet` observes
|
||||
inode exhaustion, it will evict pods by ranking them by quality of service. The following issue has been opened in cadvisor
|
||||
to track per container inode consumption (https://github.com/google/cadvisor/issues/1422) which would allow us to rank pods
|
||||
by inode consumption. For example, this would let us identify a container that created large numbers of 0 byte files, and evict
|
||||
that pod over others.
|
|
@ -0,0 +1,128 @@
|
|||
---
|
||||
assignees:
|
||||
- jsafrane
|
||||
title: Static Pods
|
||||
---
|
||||
|
||||
**If you are running clustered Kubernetes and are using static pods to run a pod on every node, you should probably be using a [DaemonSet](/docs/admin/daemons/)!**
|
||||
|
||||
*Static pods* are managed directly by the kubelet daemon on a specific node, without the API server observing them. They do not have an associated replication controller; the kubelet daemon itself watches them and restarts them when they crash. There is no health check though. Static pods are always bound to one kubelet daemon and always run on the same node with it.
|
||||
|
||||
The kubelet automatically creates a so-called *mirror pod* on the Kubernetes API server for each static pod, so the pods are visible there, but they cannot be controlled from the API server.
|
||||
|
||||
## Static pod creation
|
||||
|
||||
Static pods can be created in two ways: either by using configuration files or via HTTP.
|
||||
|
||||
### Configuration files
|
||||
|
||||
The configuration files are just standard pod definitions in JSON or YAML format in a specific directory. Use `kubelet --pod-manifest-path=<the directory>` to start the kubelet daemon, which periodically scans the directory and creates/deletes static pods as YAML/JSON files appear/disappear there.
|
||||
|
||||
For example, this is how to start a simple web server as a static pod:
|
||||
|
||||
1. Choose a node where we want to run the static pod. In this example, it's `my-node1`.
|
||||
|
||||
```
|
||||
[joe@host ~] $ ssh my-node1
|
||||
```
|
||||
|
||||
2. Choose a directory, say `/etc/kubelet.d` and place a web server pod definition there, e.g. `/etc/kubelet.d/static-web.yaml`:
|
||||
|
||||
```
|
||||
[root@my-node1 ~] $ mkdir /etc/kubelet.d/
|
||||
[root@my-node1 ~] $ cat <<EOF >/etc/kubelet.d/static-web.yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: static-web
|
||||
labels:
|
||||
role: myrole
|
||||
spec:
|
||||
containers:
|
||||
- name: web
|
||||
image: nginx
|
||||
ports:
|
||||
- name: web
|
||||
containerPort: 80
|
||||
protocol: TCP
|
||||
EOF
|
||||
```
|
||||
|
||||
3. Configure your kubelet daemon on the node to use this directory by running it with the `--pod-manifest-path=/etc/kubelet.d/` argument.
|
||||
On Fedora edit `/etc/kubernetes/kubelet` to include this line:
|
||||
|
||||
```
|
||||
KUBELET_ARGS="--cluster-dns=10.254.0.10 --cluster-domain=kube.local --pod-manifest-path=/etc/kubelet.d/"
|
||||
```
|
||||
|
||||
Instructions for other distributions or Kubernetes installations may vary.
|
||||
|
||||
4. Restart kubelet. On Fedora, this is:
|
||||
|
||||
```
|
||||
[root@my-node1 ~] $ systemctl restart kubelet
|
||||
```
|
||||
|
||||
## Pods created via HTTP
|
||||
|
||||
The kubelet periodically downloads a file specified by the `--manifest-url=<URL>` argument and interprets it as a JSON/YAML file with a pod definition. It works the same as `--pod-manifest-path=<directory>`, i.e. it's reloaded every now and then and changes are applied to running static pods (see below).
|
||||
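For example, a kubelet could be started against a (hypothetical) manifest URL like this, typically combined with the rest of your kubelet flags:

```shell
kubelet --manifest-url=http://my-config-server.example.com/static-web.yaml
```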
|
||||
## Behavior of static pods
|
||||
|
||||
When the kubelet starts, it automatically starts all pods defined in the directory specified by the `--pod-manifest-path=` or `--manifest-url=` arguments, i.e. our static-web. (It may take some time to pull the nginx image, be patient…):
|
||||
|
||||
```shell
|
||||
[joe@my-node1 ~] $ docker ps
|
||||
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
|
||||
f6d05272b57e nginx:latest "nginx" 8 minutes ago Up 8 minutes k8s_web.6f802af4_static-web-fk-node1_default_67e24ed9466ba55986d120c867395f3c_378e5f3c
|
||||
```
|
||||
|
||||
If we look at our Kubernetes API server (running on host `my-master`), we see that a new mirror-pod was created there too:
|
||||
|
||||
```shell
|
||||
[joe@host ~] $ ssh my-master
|
||||
[joe@my-master ~] $ kubectl get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
static-web-my-node1 1/1 Running 0 2m
|
||||
|
||||
```
|
||||
|
||||
Labels from the static pod are propagated into the mirror-pod and can be used as usual for filtering.
|
||||
|
||||
Notice that we cannot delete the pod via the API server (e.g. using the [`kubectl`](/docs/user-guide/kubectl/) command); the kubelet simply won't remove it.
|
||||
|
||||
```shell
|
||||
[joe@my-master ~] $ kubectl delete pod static-web-my-node1
|
||||
pods/static-web-my-node1
|
||||
[joe@my-master ~] $ kubectl get pods
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
static-web-my-node1 1/1 Running 0 12s
|
||||
|
||||
```
|
||||
|
||||
Back on our `my-node1` host, we can try to stop the container manually and see that the kubelet automatically restarts it after a while:
|
||||
|
||||
```shell
|
||||
[joe@host ~] $ ssh my-node1
|
||||
[joe@my-node1 ~] $ docker stop f6d05272b57e
|
||||
[joe@my-node1 ~] $ sleep 20
|
||||
[joe@my-node1 ~] $ docker ps
|
||||
CONTAINER ID IMAGE COMMAND CREATED ...
|
||||
5b920cbaf8b1 nginx:latest "nginx -g 'daemon of 2 seconds ago ...
|
||||
```
|
||||
|
||||
## Dynamic addition and removal of static pods
|
||||
|
||||
The running kubelet periodically scans the configured directory (`/etc/kubelet.d` in our example) for changes and adds/removes pods as files appear/disappear in this directory.
|
||||
|
||||
```shell
|
||||
[joe@my-node1 ~] $ mv /etc/kubelet.d/static-web.yaml /tmp
|
||||
[joe@my-node1 ~] $ sleep 20
|
||||
[joe@my-node1 ~] $ docker ps
|
||||
// no nginx container is running
|
||||
[joe@my-node1 ~] $ mv /tmp/static-web.yaml /etc/kubelet.d/
|
||||
[joe@my-node1 ~] $ sleep 20
|
||||
[joe@my-node1 ~] $ docker ps
|
||||
CONTAINER ID IMAGE COMMAND CREATED ...
|
||||
e7a62e3427f1 nginx:latest "nginx -g 'daemon of 27 seconds ago
|
||||
```
|
|
@ -0,0 +1,122 @@
|
|||
---
|
||||
assignees:
|
||||
- sttts
|
||||
title: Using Sysctls in a Kubernetes Cluster
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
This document describes how sysctls are used within a Kubernetes cluster.
|
||||
|
||||
## What is a Sysctl?
|
||||
|
||||
In Linux, the sysctl interface allows an administrator to modify kernel
|
||||
parameters at runtime. Parameters are available via the `/proc/sys/` virtual
|
||||
process file system. The parameters cover various subsystems such as:
|
||||
|
||||
- kernel (common prefix: `kernel.`)
|
||||
- networking (common prefix: `net.`)
|
||||
- virtual memory (common prefix: `vm.`)
|
||||
- MDADM (common prefix: `dev.`)
|
||||
- More subsystems are described in [Kernel docs](https://www.kernel.org/doc/Documentation/sysctl/README).
|
||||
|
||||
To get a list of all parameters, you can run
|
||||
|
||||
```
|
||||
$ sudo sysctl -a
|
||||
```
|
||||
|
||||
## Namespaced vs. Node-Level Sysctls
|
||||
|
||||
A number of sysctls are _namespaced_ in today's Linux kernels. This means that
|
||||
they can be set independently for each pod on a node. Being namespaced is a
|
||||
requirement for sysctls to be accessible in a pod context within Kubernetes.
|
||||
|
||||
The following sysctls are known to be _namespaced_:
|
||||
|
||||
- `kernel.shm*`,
|
||||
- `kernel.msg*`,
|
||||
- `kernel.sem`,
|
||||
- `fs.mqueue.*`,
|
||||
- `net.*`.
|
||||
|
||||
Sysctls which are not namespaced are called _node-level_ and must be set
|
||||
manually by the cluster admin, either by means of the underlying Linux
|
||||
distribution of the nodes (e.g. via `/etc/sysctl.conf`) or using a DaemonSet
|
||||
with privileged containers.
|
||||
|
||||
**Note**: it is good practice to consider nodes with special sysctl settings as
|
||||
_tainted_ within a cluster, and only schedule pods onto them which need those
|
||||
sysctl settings. It is suggested to use the Kubernetes [_taints and toleration_
|
||||
feature](/docs/user-guide/kubectl/kubectl_taint.md) to implement this.
|
||||
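A minimal sketch of that approach, assuming a hypothetical node name and taint key (pods that need the special sysctls would then carry a matching toleration):

```shell
kubectl taint nodes sysctl-node-1 dedicated=sysctl:NoSchedule
```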
|
||||
## Safe vs. Unsafe Sysctls
|
||||
|
||||
Sysctls are grouped into _safe_ and _unsafe_ sysctls. In addition to proper
|
||||
namespacing a _safe_ sysctl must be properly _isolated_ between pods on the same
|
||||
node. This means that setting a _safe_ sysctl for one pod
|
||||
|
||||
- must not have any influence on any other pod on the node
|
||||
- must not allow harming the node's health
|
||||
- must not allow gaining CPU or memory resources outside of the resource limits
|
||||
of a pod.
|
||||
|
||||
By far, most of the _namespaced_ sysctls are not necessarily considered _safe_.
|
||||
|
||||
For Kubernetes 1.4, the following sysctls are supported in the _safe_ set:
|
||||
|
||||
- `kernel.shm_rmid_forced`,
|
||||
- `net.ipv4.ip_local_port_range`,
|
||||
- `net.ipv4.tcp_syncookies`.
|
||||
|
||||
This list will be extended in future Kubernetes versions when the kubelet
|
||||
supports better isolation mechanisms.
|
||||
|
||||
All _safe_ sysctls are enabled by default.
|
||||
|
||||
All _unsafe_ sysctls are disabled by default and must be allowed manually by the
|
||||
cluster admin on a per-node basis. Pods with disabled unsafe sysctls will be
|
||||
scheduled, but will fail to launch.
|
||||
|
||||
**Warning**: Due to their nature of being _unsafe_, the use of _unsafe_ sysctls
|
||||
is at-your-own-risk and can lead to severe problems like wrong behavior of
|
||||
containers, resource shortage or complete breakage of a node.
|
||||
|
||||
## Enabling Unsafe Sysctls
|
||||
|
||||
With the warning above in mind, the cluster admin can allow certain _unsafe_
|
||||
sysctls for very special situations such as high-performance or real-time
|
||||
application tuning. _Unsafe_ sysctls are enabled on a node-by-node basis with a
|
||||
flag of the kubelet, e.g.:
|
||||
|
||||
```shell
|
||||
$ kubelet --experimental-allowed-unsafe-sysctls 'kernel.msg*,net.ipv4.route.min_pmtu' ...
|
||||
```
|
||||
|
||||
Only _namespaced_ sysctls can be enabled this way.
|
||||
|
||||
## Setting Sysctls for a Pod
|
||||
|
||||
The sysctl feature is an alpha API in Kubernetes 1.4. Therefore, sysctls are set
|
||||
using annotations on pods. They apply to all containers in the same pod.
|
||||
|
||||
Here is an example, with different annotations for _safe_ and _unsafe_ sysctls:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: sysctl-example
|
||||
annotations:
|
||||
security.alpha.kubernetes.io/sysctls: kernel.shm_rmid_forced=1
|
||||
security.alpha.kubernetes.io/unsafe-sysctls: net.ipv4.route.min_pmtu=1000,kernel.msgmax=1 2 3
|
||||
spec:
|
||||
...
|
||||
```
|
||||
|
||||
**Note**: a pod with the _unsafe_ sysctls specified above will fail to launch on
|
||||
any node which has not enabled those two _unsafe_ sysctls explicitly. As with
|
||||
_node-level_ sysctls it is recommended to use [_taints and toleration_
|
||||
feature](/docs/user-guide/kubectl/kubectl_taint.md) or [labels on nodes](/docs/user-guide/labels.md) to schedule those pods onto the right nodes.
|
|
@ -1,5 +1,8 @@
|
|||
---
|
||||
title: Container Command and Arguments
|
||||
redirect_from:
|
||||
- "/docs/user-guide/containers/"
|
||||
- "/docs/user-guide/containers.html"
|
||||
---
|
||||
|
||||
{% capture overview %}
|
||||
|
@ -13,7 +16,7 @@ fields to override the default Entrypoint and Cmd of the the Container's image.
|
|||
|
||||
## Container entry points and arguments
|
||||
|
||||
The configuration file for a Container has an `image` field that specifies
|
||||
the Docker image to be run in the Container. A Docker image has metadata that includes
|
||||
a default Entrypoint and a default Cmd.
|
||||
|
||||
|
@ -66,6 +69,7 @@ Here are some examples:
|
|||
| `[/ep-1]` | `[foo bar]` | <not set> | <not set> | `[ep-1 foo bar]` |
|
||||
| `[/ep-1]` | `[foo bar]` | `[/ep-2]` | <not set> | `[ep-2]` |
|
||||
| `[/ep-1]` | `[foo bar]` | <not set> | `[zoo boo]` | `[ep-1 zoo boo]` |
|
||||
| `[/ep-1]` | `[foo bar]` | `[/ep-2]` | `[zoo boo]` | `[ep-2 zoo boo]` |
|
||||
|
||||
{% endcapture %}
|
||||
|
||||
|
|
|
@ -0,0 +1,430 @@
|
|||
---
|
||||
title: Managing Compute Resources for Containers
|
||||
---
|
||||
|
||||
{% capture overview %}
|
||||
|
||||
When you specify a [Pod](/docs/user-guide/pods), you can optionally specify how
|
||||
much CPU and memory (RAM) each Container needs. When Containers have resource
|
||||
requests specified, the scheduler can make better decisions about which nodes to
|
||||
place Pods on. And when Containers have their limits specified, contention for
|
||||
resources on a node can be handled in a specified manner. For more details about
|
||||
the difference between requests and limits, see
|
||||
[Resource QoS](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/resource-qos.md).
|
||||
|
||||
{% endcapture %}
|
||||
|
||||
|
||||
{% capture body %}
|
||||
|
||||
## Resource types
|
||||
|
||||
*CPU* and *memory* are each a *resource type*. A resource type has a base unit.
|
||||
CPU is specified in units of cores, and memory is specified in units of bytes.
|
||||
|
||||
CPU and memory are collectively referred to as *compute resources*, or just
|
||||
*resources*. Compute
|
||||
resources are measurable quantities that can be requested, allocated, and
|
||||
consumed. They are distinct from
|
||||
[API resources](/docs/api/). API resources, such as Pods and
|
||||
[Services](/docs/user-guide/services) are objects that can be read and modified
|
||||
through the Kubernetes API server.
|
||||
|
||||
## Resource requests and limits of Pod and Container
|
||||
|
||||
Each Container of a Pod can specify one or more of the following:
|
||||
|
||||
* `spec.containers[].resources.limits.cpu`
|
||||
* `spec.containers[].resources.limits.memory`
|
||||
* `spec.containers[].resources.requests.cpu`
|
||||
* `spec.containers[].resources.requests.memory`
|
||||
|
||||
Although requests and limits can only be specified on individual Containers, it
|
||||
is convenient to talk about Pod resource requests and limits. A
|
||||
*Pod resource request/limit* for a particular resource type is the sum of the
|
||||
resource requests/limits of that type for each Container in the Pod.
|
||||
|
||||
## Meaning of CPU
|
||||
|
||||
Limits and requests for CPU resources are measured in *cpu* units.
|
||||
One cpu, in Kubernetes, is equivalent to:
|
||||
|
||||
- 1 AWS vCPU
|
||||
- 1 GCP Core
|
||||
- 1 Azure vCore
|
||||
- 1 *Hyperthread* on a bare-metal Intel processor with Hyperthreading
|
||||
|
||||
Fractional requests are allowed. A Container with
|
||||
`spec.containers[].resources.requests.cpu` of `0.5` is guaranteed half as much
|
||||
CPU as one that asks for 1 CPU. The expression `0.1` is equivalent to the
|
||||
expression `100m`, which can be read as "one hundred millicpu". Some people say
|
||||
"one hundred millicores", and this is understood to mean the same thing. A
|
||||
request with a decimal point, like `0.1`, is converted to `100m` by the API, and
|
||||
precision finer than `1m` is not allowed. For this reason, the form `100m` might
|
||||
be preferred.
|
||||
|
||||
CPU is always requested as an absolute quantity, never as a relative quantity;
|
||||
0.1 is the same amount of CPU on a single-core, dual-core, or 48-core machine.
|
||||
|
||||
## Meaning of memory
|
||||
|
||||
Limits and requests for `memory` are measured in bytes. You can express memory as
|
||||
a plain integer or as a fixed-point integer using one of these SI suffixes:
|
||||
E, P, T, G, M, K. You can also use the power-of-two equivalents: Ei, Pi, Ti, Gi,
|
||||
Mi, Ki. For example, the following represent roughly the same value:
|
||||
|
||||
```shell
|
||||
128974848, 129e6, 129M, 123Mi
|
||||
```
|
||||
|
||||
Here's an example.
|
||||
The following Pod has two Containers. Each Container has a request of 0.25 cpu
|
||||
and 64MiB (2<sup>26</sup> bytes) of memory. Each Container has a limit of 0.5
|
||||
cpu and 128MiB of memory. You can say the Pod has a request of 0.5 cpu and 128
|
||||
MiB of memory, and a limit of 1 core and 256MiB of memory.
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: frontend
|
||||
spec:
|
||||
containers:
|
||||
- name: db
|
||||
image: mysql
|
||||
resources:
|
||||
requests:
|
||||
memory: "64Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "128Mi"
|
||||
cpu: "500m"
|
||||
- name: wp
|
||||
image: wordpress
|
||||
resources:
|
||||
requests:
|
||||
memory: "64Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "128Mi"
|
||||
cpu: "500m"
|
||||
```
|
||||
|
||||
## How Pods with resource requests are scheduled
|
||||
|
||||
When you create a Pod, the Kubernetes scheduler selects a node for the Pod to
|
||||
run on. Each node has a maximum capacity for each of the resource types: the
|
||||
amount of CPU and memory it can provide for Pods. The scheduler ensures that,
|
||||
for each resource type, the sum of the resource requests of the scheduled
|
||||
Containers is less than the capacity of the node. Note that although actual memory
|
||||
or CPU resource usage on nodes is very low, the scheduler still refuses to place
|
||||
a Pod on a node if the capacity check fails. This protects against a resource
|
||||
shortage on a node when resource usage later increases, for example, during a
|
||||
daily peak in request rate.
|
||||
|
||||
## How Pods with resource limits are run
|
||||
|
||||
When the kubelet starts a Container of a Pod, it passes the CPU and memory limits
|
||||
to the container runtime.
|
||||
|
||||
When using Docker:
|
||||
|
||||
- The `spec.containers[].resources.requests.cpu` is converted to its core value,
|
||||
which is potentially fractional, and multiplied by 1024. This number is used
|
||||
as the value of the
|
||||
[`--cpu-shares`](https://docs.docker.com/engine/reference/run/#/cpu-share-constraint)
|
||||
flag in the `docker run` command.
|
||||
|
||||
- The `spec.containers[].resources.limits.cpu` is converted to its millicore value,
|
||||
multiplied by 100000, and then divided by 1000. This number is used as the value
|
||||
of the [`--cpu-quota`](https://docs.docker.com/engine/reference/run/#/cpu-quota-constraint)
|
||||
flag in the `docker run` command. The [`--cpu-period`] flag is set to 100000,
|
||||
which represents the default 100ms period for measuring quota usage. The
|
||||
kubelet enforces cpu limits if it is started with the
|
||||
[`--cpu-cfs-quota`] flag set to true. As of Kubernetes version 1.2, this flag
|
||||
defaults to true.
|
||||
|
||||
- The `spec.containers[].resources.limits.memory` is converted to an integer, and
|
||||
used as the value of the
|
||||
[`--memory`](https://docs.docker.com/engine/reference/run/#/user-memory-constraints)
|
||||
flag in the `docker run` command (a worked example of these conversions follows this list).
|
||||
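As a rough, illustrative sketch (not the exact invocation the kubelet performs), the `db` container from the earlier example (request `cpu: 250m`, limits `cpu: 500m` and `memory: 128Mi`) would map onto `docker run` flags roughly like this:

```shell
# --cpu-shares: 0.25 core * 1024               = 256
# --cpu-quota:  500 millicores * 100000 / 1000 = 50000 (per 100000us period)
# --memory:     128Mi                          = 134217728 bytes
docker run --cpu-shares=256 --cpu-quota=50000 --cpu-period=100000 \
  --memory=134217728 mysql
```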
|
||||
If a Container exceeds its memory limit, it might be terminated. If it is
|
||||
restartable, the kubelet will restart it, as with any other type of runtime
|
||||
failure.
|
||||
|
||||
If a Container exceeds its memory request, it is likely that its Pod will
|
||||
be evicted whenever the node runs out of memory.
|
||||
|
||||
A Container might or might not be allowed to exceed its CPU limit for extended
|
||||
periods of time. However, it will not be killed for excessive CPU usage.
|
||||
|
||||
To determine whether a Container cannot be scheduled or is being killed due to
|
||||
resource limits, see the
|
||||
[Troubleshooting](#troubleshooting) section.
|
||||
|
||||
## Monitoring compute resource usage
|
||||
|
||||
The resource usage of a Pod is reported as part of the Pod status.
|
||||
|
||||
If [optional monitoring](http://releases.k8s.io/{{page.githubbranch}}/cluster/addons/cluster-monitoring/README.md)
|
||||
is configured for your cluster, then Pod resource usage can be retrieved from
|
||||
the monitoring system.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### My Pods are pending with event message failedScheduling
|
||||
|
||||
If the scheduler cannot find any node where a Pod can fit, the Pod remains
|
||||
unscheduled until a place can be found. An event is produced each time the
|
||||
scheduler fails to find a place for the Pod, like this:
|
||||
|
||||
```shell
|
||||
$ kubectl describe pod frontend | grep -A 3 Events
|
||||
Events:
|
||||
FirstSeen LastSeen Count From Subobject PathReason Message
|
||||
36s 5s 6 {scheduler } FailedScheduling Failed for reason PodExceedsFreeCPU and possibly others
|
||||
```
|
||||
|
||||
In the preceding example, the Pod named "frontend" fails to be scheduled due to
|
||||
insufficient CPU resource on the node. Similar error messages can also suggest
|
||||
failure due to insufficient memory (PodExceedsFreeMemory). In general, if a Pod
|
||||
is pending with a message of this type, there are several things to try:
|
||||
|
||||
- Add more nodes to the cluster.
|
||||
- Terminate unneeded Pods to make room for pending Pods.
|
||||
- Check that the Pod is not larger than all the nodes. For example, if all the
|
||||
nodes have a capacity of `cpu: 1`, then a Pod with a limit of `cpu: 1.1` will
|
||||
never be scheduled.
|
||||
|
||||
You can check node capacities and amounts allocated with the
|
||||
`kubectl describe nodes` command. For example:
|
||||
|
||||
```shell
|
||||
$ kubectl.sh describe nodes e2e-test-minion-group-4lw4
|
||||
Name: e2e-test-minion-group-4lw4
|
||||
[ ... lines removed for clarity ...]
|
||||
Capacity:
|
||||
alpha.kubernetes.io/nvidia-gpu: 0
|
||||
cpu: 2
|
||||
memory: 7679792Ki
|
||||
pods: 110
|
||||
Allocatable:
|
||||
alpha.kubernetes.io/nvidia-gpu: 0
|
||||
cpu: 1800m
|
||||
memory: 7474992Ki
|
||||
pods: 110
|
||||
[ ... lines removed for clarity ...]
|
||||
Non-terminated Pods: (5 in total)
|
||||
Namespace Name CPU Requests CPU Limits Memory Requests Memory Limits
|
||||
--------- ---- ------------ ---------- --------------- -------------
|
||||
kube-system fluentd-gcp-v1.38-28bv1 100m (5%) 0 (0%) 200Mi (2%) 200Mi (2%)
|
||||
kube-system kube-dns-3297075139-61lj3 260m (13%) 0 (0%) 100Mi (1%) 170Mi (2%)
|
||||
kube-system kube-proxy-e2e-test-... 100m (5%) 0 (0%) 0 (0%) 0 (0%)
|
||||
kube-system monitoring-influxdb-grafana-v4-z1m12 200m (10%) 200m (10%) 600Mi (8%) 600Mi (8%)
|
||||
kube-system node-problem-detector-v0.1-fj7m3 20m (1%) 200m (10%) 20Mi (0%) 100Mi (1%)
|
||||
Allocated resources:
|
||||
(Total limits may be over 100 percent, i.e., overcommitted.)
|
||||
CPU Requests CPU Limits Memory Requests Memory Limits
|
||||
------------ ---------- --------------- -------------
|
||||
680m (34%) 400m (20%) 920Mi (12%) 1070Mi (14%)
|
||||
```
|
||||
|
||||
In the preceding output, you can see that if a Pod requests more than 1120m
|
||||
CPUs or 6.23Gi of memory, it will not fit on the node.
|
||||
|
||||
By looking at the `Pods` section, you can see which Pods are taking up space on
|
||||
the node.
|
||||
|
||||
The amount of resources available to Pods is less than the node capacity, because
|
||||
system daemons use a portion of the available resources. The `allocatable` field of
|
||||
[NodeStatus](/docs/resources-reference/v1.5/#nodestatus-v1)
|
||||
gives the amount of resources that are available to Pods. For more information, see
|
||||
[Node Allocatable Resources](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md).
|
||||
|
||||
The [resource quota](/docs/admin/resourcequota/) feature can be configured
|
||||
to limit the total amount of resources that can be consumed. If used in conjunction
|
||||
with namespaces, it can prevent one team from hogging all the resources.
|
||||
|
||||
### My Container is terminated
|
||||
|
||||
Your Container might get terminated because it is resource-starved. To check
|
||||
whether a Container is being killed because it is hitting a resource limit, call
|
||||
`kubectl describe pod` on the Pod of interest:
|
||||
|
||||
```shell
|
||||
[12:54:41] $ ./cluster/kubectl.sh describe pod simmemleak-hra99
|
||||
Name: simmemleak-hra99
|
||||
Namespace: default
|
||||
Image(s): saadali/simmemleak
|
||||
Node: kubernetes-node-tf0f/10.240.216.66
|
||||
Labels: name=simmemleak
|
||||
Status: Running
|
||||
Reason:
|
||||
Message:
|
||||
IP: 10.244.2.75
|
||||
Replication Controllers: simmemleak (1/1 replicas created)
|
||||
Containers:
|
||||
simmemleak:
|
||||
Image: saadali/simmemleak
|
||||
Limits:
|
||||
cpu: 100m
|
||||
memory: 50Mi
|
||||
State: Running
|
||||
Started: Tue, 07 Jul 2015 12:54:41 -0700
|
||||
Last Termination State: Terminated
|
||||
Exit Code: 1
|
||||
Started: Fri, 07 Jul 2015 12:54:30 -0700
|
||||
Finished: Fri, 07 Jul 2015 12:54:33 -0700
|
||||
Ready: False
|
||||
Restart Count: 5
|
||||
Conditions:
|
||||
Type Status
|
||||
Ready False
|
||||
Events:
|
||||
FirstSeen LastSeen Count From SubobjectPath Reason Message
|
||||
Tue, 07 Jul 2015 12:53:51 -0700 Tue, 07 Jul 2015 12:53:51 -0700 1 {scheduler } scheduled Successfully assigned simmemleak-hra99 to kubernetes-node-tf0f
|
||||
Tue, 07 Jul 2015 12:53:51 -0700 Tue, 07 Jul 2015 12:53:51 -0700 1 {kubelet kubernetes-node-tf0f} implicitly required container POD pulled Pod container image "gcr.io/google_containers/pause:0.8.0" already present on machine
|
||||
Tue, 07 Jul 2015 12:53:51 -0700 Tue, 07 Jul 2015 12:53:51 -0700 1 {kubelet kubernetes-node-tf0f} implicitly required container POD created Created with docker id 6a41280f516d
|
||||
Tue, 07 Jul 2015 12:53:51 -0700 Tue, 07 Jul 2015 12:53:51 -0700 1 {kubelet kubernetes-node-tf0f} implicitly required container POD started Started with docker id 6a41280f516d
|
||||
Tue, 07 Jul 2015 12:53:51 -0700 Tue, 07 Jul 2015 12:53:51 -0700 1 {kubelet kubernetes-node-tf0f} spec.containers{simmemleak} created Created with docker id 87348f12526a
|
||||
```
|
||||
|
||||
In the preceding example, the `Restart Count: 5` indicates that the `simmemleak`
|
||||
Container in the Pod was terminated and restarted five times.
|
||||
|
||||
You can call `get pod` with the `-o go-template=...` option to fetch the status
|
||||
of previously terminated Containers:
|
||||
|
||||
```shell{% raw %}
|
||||
[13:59:01] $ ./cluster/kubectl.sh get pod -o go-template='{{range.status.containerStatuses}}{{"Container Name: "}}{{.name}}{{"\r\nLastState: "}}{{.lastState}}{{end}}' simmemleak-60xbc
|
||||
Container Name: simmemleak
|
||||
LastState: map[terminated:map[exitCode:137 reason:OOM Killed startedAt:2015-07-07T20:58:43Z finishedAt:2015-07-07T20:58:43Z containerID:docker://0e4095bba1feccdfe7ef9fb6ebffe972b4b14285d5acdec6f0d3ae8a22fad8b2]]{% endraw %}
|
||||
```
|
||||
|
||||
You can see that the Container was terminated because of `reason:OOM Killed`,
|
||||
where `OOM` stands for Out Of Memory.
|
||||
|
||||
## Opaque integer resources (Alpha feature)
|
||||
|
||||
Kubernetes version 1.5 introduces Opaque integer resources. Opaque
|
||||
integer resources allow cluster operators to advertise new node-level
|
||||
resources that would be otherwise unknown to the system.
|
||||
|
||||
Users can consume these resources in Pod specs just like CPU and memory.
|
||||
The scheduler takes care of the resource accounting so that no more than the
|
||||
available amount is simultaneously allocated to Pods.
|
||||
|
||||
**Note:** Opaque integer resources are Alpha in Kubernetes version 1.5.
|
||||
Only resource accounting is implemented; node-level isolation is still
|
||||
under active development.
|
||||
|
||||
Opaque integer resources are resources that begin with the prefix
|
||||
`pod.alpha.kubernetes.io/opaque-int-resource-`. The API server
|
||||
restricts quantities of these resources to whole numbers. Examples of
|
||||
_valid_ quantities are `3`, `3000m` and `3Ki`. Examples of _invalid_
|
||||
quantities are `0.5` and `1500m`.
|
||||
|
||||
There are two steps required to use opaque integer resources. First, the
|
||||
cluster operator must advertise a per-node opaque resource on one or more
|
||||
nodes. Second, users must request the opaque resource in Pods.
|
||||
|
||||
To advertise a new opaque integer resource, the cluster operator should
|
||||
submit a `PATCH` HTTP request to the API server to specify the available
|
||||
quantity in the `status.capacity` for a node in the cluster. After this
|
||||
operation, the node's `status.capacity` will include a new resource. The
|
||||
`status.allocatable` field is updated automatically with the new resource
|
||||
asynchronously by the kubelet. Note that because the scheduler uses the
|
||||
node `status.allocatable` value when evaluating Pod fitness, there may
|
||||
be a short delay between patching the node capacity with a new resource and the
|
||||
first pod that requests the resource being scheduled on that node.
|
||||
|
||||
**Example:**
|
||||
|
||||
Here is an HTTP request that advertises five "foo" resources on node `k8s-node-1`.
|
||||
|
||||
```http
|
||||
PATCH /api/v1/nodes/k8s-node-1/status HTTP/1.1
|
||||
Accept: application/json
|
||||
Content-Type: application/json-patch+json
|
||||
Host: k8s-master:8080
|
||||
|
||||
[
|
||||
{
|
||||
"op": "add",
|
||||
"path": "/status/capacity/pod.alpha.kubernetes.io~1opaque-int-resource-foo",
|
||||
"value": "5"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
**Note**: In the preceding request, `~1` is the encoding for the character `/`
|
||||
in the patch path. The operation path value in JSON-Patch is interpreted as a
|
||||
JSON-Pointer. For more details, see
|
||||
[IETF RFC 6901, section 3](https://tools.ietf.org/html/rfc6901#section-3).
|
||||
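One way to send that request from the command line is with `curl`, assuming an API server reachable at `k8s-master:8080` without authentication (adjust the endpoint and credentials for your cluster):

```shell
curl --header "Content-Type: application/json-patch+json" \
  --request PATCH \
  --data '[{"op": "add", "path": "/status/capacity/pod.alpha.kubernetes.io~1opaque-int-resource-foo", "value": "5"}]' \
  http://k8s-master:8080/api/v1/nodes/k8s-node-1/status
```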
|
||||
To consume an opaque resource in a Pod, include the name of the opaque
|
||||
resource as a key in the `spec.containers[].resources.requests` map.
|
||||
|
||||
The Pod is scheduled only if all of the resource requests are
|
||||
satisfied, including cpu, memory and any opaque resources. The Pod will
|
||||
remain in the `PENDING` state as long as the resource request cannot be met by
|
||||
any node.
|
||||
|
||||
**Example:**
|
||||
|
||||
The Pod below requests 2 cpus and 1 "foo" (an opaque resource).
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: my-pod
|
||||
spec:
|
||||
containers:
|
||||
- name: my-container
|
||||
image: myimage
|
||||
resources:
|
||||
requests:
|
||||
cpu: 2
|
||||
pod.alpha.kubernetes.io/opaque-int-resource-foo: 1
|
||||
```
|
||||
|
||||
## Planned Improvements
|
||||
|
||||
Kubernetes version 1.5 only allows resource quantities to be specified on a
|
||||
Container. It is planned to improve accounting for resources that are shared by
|
||||
all Containers in a Pod, such as
|
||||
[emptyDir volumes](/docs/user-guide/volumes/#emptydir).
|
||||
|
||||
Kubernetes version 1.5 only supports Container requests and limits for CPU and
|
||||
memory. It is planned to add new resource types, including a node disk space
|
||||
resource, and a framework for adding custom
|
||||
[resource types](https://github.com/kubernetes/community/blob/{{page.githubbranch}}/contributors/design-proposals/resources.md).
|
||||
|
||||
Kubernetes supports overcommitment of resources by supporting multiple levels of
|
||||
[Quality of Service](http://issue.k8s.io/168).
|
||||
|
||||
In Kubernetes version 1.5, one unit of CPU means different things on different
|
||||
cloud providers, and on different machine types within the same cloud providers.
|
||||
For example, on AWS, the capacity of a node is reported in
|
||||
[ECUs](http://aws.amazon.com/ec2/faqs/), while in GCE it is reported in logical
|
||||
cores. We plan to revise the definition of the cpu resource to allow for more
|
||||
consistency across providers and platforms.
|
||||
|
||||
{% endcapture %}
|
||||
|
||||
|
||||
{% capture whatsnext %}
|
||||
|
||||
* Get hands-on experience
|
||||
[assigning CPU and RAM resources to a container](/docs/tasks/configure-pod-container/assign-cpu-ram-container/).
|
||||
|
||||
* [Container](/docs/api-reference/v1/definitions/#_v1_container)
|
||||
|
||||
* [ResourceRequirements](/docs/resources-reference/v1.5/#resourcerequirements-v1)
|
||||
|
||||
{% endcapture %}
|
||||
|
||||
{% include templates/concept.md %}
|
||||
|
|
@ -0,0 +1,119 @@
|
|||
---
|
||||
assignees:
|
||||
- mikedanese
|
||||
title: Configuration Best Practices
|
||||
---
|
||||
|
||||
This document is meant to highlight and consolidate in one place configuration best practices that are introduced throughout the user-guide and getting-started documentation and examples. This is a living document so if you think of something that is not on this list but might be useful to others, please don't hesitate to file an issue or submit a PR.
|
||||
|
||||
## General Config Tips
|
||||
|
||||
- When defining configurations, specify the latest stable API version (currently v1).
|
||||
|
||||
- Configuration files should be stored in version control before being pushed to the cluster. This allows a configuration to be quickly rolled back if needed, and will aid with cluster re-creation and restoration if necessary.
|
||||
|
||||
- Write your configuration files using YAML rather than JSON. They can be used interchangeably in almost all scenarios, but YAML tends to be more user-friendly for config.
|
||||
|
||||
- Group related objects together in a single file where this makes sense. This format is often easier to manage than separate files. See the [guestbook-all-in-one.yaml](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/examples/guestbook/all-in-one/guestbook-all-in-one.yaml) file as an example of this syntax.
|
||||
(Note also that many `kubectl` commands can be called on a directory, and so you can also call
|
||||
`kubectl create` on a directory of config files— see below for more detail).
|
||||
|
||||
- Don't specify default values unnecessarily, in order to simplify and minimize configs, and to
|
||||
reduce error. For example, omit the selector and labels in a `ReplicationController` if you want
|
||||
them to be the same as the labels in its `podTemplate`, since those fields are populated from the
|
||||
`podTemplate` labels by default. See the [guestbook app's](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/examples/guestbook/) .yaml files for some [examples](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/examples/guestbook/frontend-deployment.yaml) of this.
|
||||
|
||||
- Put an object description in an annotation to allow better introspection.
|
||||
|
||||
|
||||
## "Naked" Pods vs Replication Controllers and Jobs
|
||||
|
||||
- If there is a viable alternative to naked pods (i.e., pods not bound to a [replication controller
|
||||
](/docs/user-guide/replication-controller)), go with the alternative. Naked pods will not be rescheduled in the
|
||||
event of node failure.
|
||||
|
||||
Replication controllers are almost always preferable to creating pods, except for some explicit
|
||||
[`restartPolicy: Never`](/docs/user-guide/pod-states/#restartpolicy) scenarios. A
|
||||
[Job](/docs/concepts/jobs/run-to-completion-finite-workloads/) object (currently in Beta), may also be appropriate.
|
||||
|
||||
|
||||
## Services
|
||||
|
||||
- It's typically best to create a [service](/docs/user-guide/services/) before corresponding [replication
|
||||
controllers](/docs/user-guide/replication-controller/), so that the scheduler can spread the pods comprising the
|
||||
service. You can also create a replication controller without specifying replicas (this will set
|
||||
replicas=1), create a service, then scale up the replication controller. This can be useful in
|
||||
ensuring that one replica works before creating lots of them.
|
||||
|
||||
- Don't use `hostPort` (which specifies the port number to expose on the host) unless absolutely
|
||||
necessary, e.g., for a node daemon. When you bind a Pod to a `hostPort`, there are a limited
|
||||
number of places that pod can be scheduled, due to port conflicts— you can only schedule as many
|
||||
such Pods as there are nodes in your Kubernetes cluster.
|
||||
|
||||
If you only need access to the port for debugging purposes, you can use the [kubectl proxy and apiserver proxy](/docs/user-guide/connecting-to-applications-proxy/) or [kubectl port-forward](/docs/user-guide/connecting-to-applications-port-forward/).
|
||||
You can use a [Service](/docs/user-guide/services/) object for external service access.
|
||||
If you do need to expose a pod's port on the host machine, consider using a [NodePort](/docs/user-guide/services/#type-nodeport) service before resorting to `hostPort`.
|
||||
|
||||
- Avoid using `hostNetwork`, for the same reasons as `hostPort`.
|
||||
|
||||
- Use _headless services_ for easy service discovery when you don't need kube-proxy load balancing.
|
||||
See [headless services](/docs/user-guide/services/#headless-services).
|
||||
|
||||
## Using Labels
|
||||
|
||||
- Define and use [labels](/docs/user-guide/labels/) that identify __semantic attributes__ of your application or
|
||||
deployment. For example, instead of attaching a label to a set of pods to explicitly represent
|
||||
some service (e.g., `service: myservice`), or explicitly representing the replication
|
||||
controller managing the pods (e.g., `controller: mycontroller`), attach labels that identify
|
||||
semantic attributes, such as `{ app: myapp, tier: frontend, phase: test, deployment: v3 }`. This
|
||||
will let you select the object groups appropriate to the context— e.g., a service for all "tier:
|
||||
frontend" pods, or all "test" phase components of app "myapp". See the
|
||||
[guestbook](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/examples/guestbook/) app for an example of this approach.
|
||||
|
||||
A service can be made to span multiple deployments, such as is done across [rolling updates](/docs/user-guide/kubectl/kubectl_rolling-update/), by simply omitting release-specific labels from its selector, rather than updating a service's selector to match the replication controller's selector fully.
|
||||
|
||||
- To facilitate rolling updates, include version info in replication controller names, e.g. as a
|
||||
suffix to the name. It is useful to set a 'version' label as well. The rolling update creates a
|
||||
new controller as opposed to modifying the existing controller. So, there will be issues with
|
||||
version-agnostic controller names. See the [documentation](/docs/user-guide/kubectl/kubectl_rolling-update/) on
|
||||
the rolling-update command for more detail.
|
||||
|
||||
Note that the [Deployment](/docs/user-guide/deployments/) object obviates the need to manage replication
|
||||
controller 'version names'. A desired state of an object is described by a Deployment, and if
|
||||
changes to that spec are _applied_, the deployment controller changes the actual state to the
|
||||
desired state at a controlled rate. (Deployment objects are currently part of the [`extensions`
|
||||
API Group](/docs/api/#api-groups).)
|
||||
|
||||
- You can manipulate labels for debugging. Because Kubernetes replication controllers and services
|
||||
match to pods using labels, this allows you to remove a pod from being considered by a
|
||||
controller, or served traffic by a service, by removing the relevant selector labels. If you
|
||||
remove the labels of an existing pod, its controller will create a new pod to take its place.
|
||||
This is a useful way to debug a previously "live" pod in a quarantine environment. See the
|
||||
[`kubectl label`](/docs/user-guide/kubectl/kubectl_label/) command and the sketch below.
|
||||
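A minimal sketch of this technique, using a hypothetical pod name and label key:

```shell
kubectl label pods my-pod-abc123 app-        # remove the "app" label to quarantine the pod
kubectl label pods my-pod-abc123 app=myapp   # restore it when you are done debugging
```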
|
||||
## Container Images
|
||||
|
||||
- The [default container image pull policy](/docs/user-guide/images/) is `IfNotPresent`, which causes the
|
||||
[Kubelet](/docs/admin/kubelet/) to not pull an image if it already exists. If you would like to
|
||||
always force a pull, you must specify a pull image policy of `Always` in your .yaml file
|
||||
(`imagePullPolicy: Always`) or specify a `:latest` tag on your image.
|
||||
|
||||
That is, if you're specifying an image with other than the `:latest` tag, e.g. `myimage:v1`, and
|
||||
there is an image update to that same tag, the Kubelet won't pull the updated image. You can
|
||||
address this by ensuring that any updates to an image bump the image tag as well (e.g.
|
||||
`myimage:v2`), and ensuring that your configs point to the correct version.
|
||||
|
||||
**Note:** you should avoid using `:latest` tag when deploying containers in production, because this makes it hard
|
||||
to track which version of the image is running and hard to roll back.
|
||||
|
||||
## Using kubectl
|
||||
|
||||
- Use `kubectl create -f <directory>` where possible. This looks for config objects in all `.yaml`, `.yml`, and `.json` files in `<directory>` and passes them to `create`.
|
||||
|
||||
- Use `kubectl delete` rather than `stop`. `Delete` has a superset of the functionality of `stop`, and `stop` is deprecated.
|
||||
|
||||
- Use kubectl bulk operations (via files and/or labels) for get and delete. See [label selectors](/docs/user-guide/labels/#label-selectors) and [using labels effectively](/docs/concepts/cluster-administration/manage-deployment/#using-labels-effectively).
|
||||
|
||||
- Use `kubectl run` and `expose` to quickly create and expose single container Deployments. See the [quick start guide](/docs/user-guide/quick-start/) for an example, or the minimal sketch below.
|
||||
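A minimal sketch (the image and names are just examples):

```shell
kubectl run my-nginx --image=nginx --port=80   # creates a Deployment named my-nginx
kubectl expose deployment my-nginx --port=80   # creates a Service in front of it
```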
|
||||
|
|
@ -8,7 +8,7 @@ The Concepts section helps you learn about the parts of the Kubernetes system an
|
|||
|
||||
To work with Kubernetes, you use *Kubernetes API objects* to describe your cluster's *desired state*: what applications or other workloads you want to run, what container images they use, the number of replicas, what network and disk resources you want to make available, and more. You set your desired state by creating objects using the Kubernetes API, typically via the command-line interface, `kubectl`. You can also use the Kubernetes API directly to interact with the cluster and set or modify your desired state.
|
||||
|
||||
Once you've set your desired state, the *Kubernetes Control Plane* works to make the cluster's current state match the desired state. To do so, Kubernetes performs a variety of tasks automatically--such as starting or restarting containers, scaling the number of replicas of a given application, and more. The Kubernetes Control Plane consists of a collection of processes running on your cluster:
|
||||
|
||||
* The **Kubernetes Master** is a collection of four processes that run on a single node in your cluster, which is designated as the master node.
|
||||
* Each individual non-master node in your cluster runs two processes:
|
||||
|
@ -17,7 +17,7 @@ Once you've set your desired state, the *Kubernetes Control Plane* works to make
|
|||
|
||||
## Kubernetes Objects
|
||||
|
||||
Kubernetes contains a number of abstractions that represent your the state of your system: deployed containerized applications and workloads, their associated network and disk resources, and other information about what your cluster is doing. These abstractions are represented by objects in the Kubernetes API; see the [Kubernetes Objects overview](/docs/concepts/abstractions/overview/) for more details.
|
||||
Kubernetes contains a number of abstractions that represent the state of your system: deployed containerized applications and workloads, their associated network and disk resources, and other information about what your cluster is doing. These abstractions are represented by objects in the Kubernetes API; see the [Kubernetes Objects overview](/docs/concepts/abstractions/overview/) for more details.
|
||||
|
||||
The basic Kubernetes objects include:
|
||||
|
||||
|
@ -44,7 +44,7 @@ For example, when you use the Kubernetes API to create a Deployment object, you
|
|||
|
||||
The Kubernetes master is responsible for maintaining the desired state for your cluster. When you interact with Kubernetes, such as by using the `kubectl` command-line interface, you're communicating with your cluster's Kubernetes master.
|
||||
|
||||
> The "master" refers to a collection of processes managing the cluster state. Typically these processes are all run on a single node in the cluster, and this node is also referred to as the master. The master can also be replicated for availability and redundnacy.
|
||||
> The "master" refers to a collection of processes managing the cluster state. Typically these processes are all run on a single node in the cluster, and this node is also referred to as the master. The master can also be replicated for availability and redundancy.
|
||||
|
||||
### Kubernetes Nodes
|
||||
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: pi
|
||||
spec:
|
||||
template:
|
||||
metadata:
|
||||
name: pi
|
||||
spec:
|
||||
containers:
|
||||
- name: pi
|
||||
image: perl
|
||||
command: ["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"]
|
||||
restartPolicy: Never
|
||||
|
|
@ -0,0 +1,385 @@
|
|||
---
|
||||
assignees:
|
||||
- erictune
|
||||
- soltysh
|
||||
title: Run to Completion Finite Workloads
|
||||
---
|
||||
|
||||
* TOC
|
||||
{:toc}
|
||||
|
||||
## What is a Job?
|
||||
|
||||
A _job_ creates one or more pods and ensures that a specified number of them successfully terminate.
|
||||
As pods successfully complete, the _job_ tracks the successful completions. When a specified number
|
||||
of successful completions is reached, the job itself is complete. Deleting a Job will clean up the
|
||||
pods it created.
|
||||
|
||||
A simple case is to create one Job object in order to reliably run one Pod to completion.
|
||||
The Job object will start a new Pod if the first pod fails or is deleted (for example
|
||||
due to a node hardware failure or a node reboot).
|
||||
|
||||
A Job can also be used to run multiple pods in parallel.
|
||||
|
||||
### extensions/v1beta1.Job is deprecated
|
||||
|
||||
Starting from version 1.5, `extensions/v1beta1.Job` is being deprecated, with a plan to be removed in
|
||||
version 1.6 of Kubernetes (see this [issue](https://github.com/kubernetes/kubernetes/issues/32763)).
|
||||
Please use `batch/v1.Job` instead.
|
||||
|
||||
## Running an example Job
|
||||
|
||||
Here is an example Job config. It computes π to 2000 places and prints it out.
|
||||
It takes around 10s to complete.
|
||||
|
||||
{% include code.html language="yaml" file="job.yaml" ghlink="/docs/user-guide/job.yaml" %}
|
||||
|
||||
Run the example job by downloading the example file and then running this command:
|
||||
|
||||
```shell
|
||||
$ kubectl create -f ./job.yaml
|
||||
job "pi" created
|
||||
```
|
||||
|
||||
Check on the status of the job using this command:
|
||||
|
||||
```shell
|
||||
$ kubectl describe jobs/pi
|
||||
Name: pi
|
||||
Namespace: default
|
||||
Image(s): perl
|
||||
Selector: controller-uid=b1db589a-2c8d-11e6-b324-0209dc45a495
|
||||
Parallelism: 1
|
||||
Completions: 1
|
||||
Start Time: Tue, 07 Jun 2016 10:56:16 +0200
|
||||
Labels: controller-uid=b1db589a-2c8d-11e6-b324-0209dc45a495,job-name=pi
|
||||
Pods Statuses: 0 Running / 1 Succeeded / 0 Failed
|
||||
No volumes.
|
||||
Events:
|
||||
FirstSeen LastSeen Count From SubobjectPath Type Reason Message
|
||||
--------- -------- ----- ---- ------------- -------- ------ -------
|
||||
1m 1m 1 {job-controller } Normal SuccessfulCreate Created pod: pi-dtn4q
|
||||
```
|
||||
|
||||
To view completed pods of a job, use `kubectl get pods --show-all`. The `--show-all` flag includes pods that have already completed.
|
||||
|
||||
To list all the pods that belong to a job in a machine readable form, you can use a command like this:
|
||||
|
||||
```shell
|
||||
$ pods=$(kubectl get pods --show-all --selector=job-name=pi --output=jsonpath={.items..metadata.name})
|
||||
echo $pods
|
||||
pi-aiw0a
|
||||
```
|
||||
|
||||
Here, the selector is the same as the selector for the job. The `--output=jsonpath` option specifies an expression
|
||||
that just gets the name from each pod in the returned list.
|
||||
|
||||
View the standard output of one of the pods:
|
||||
|
||||
```shell
|
||||
$ kubectl logs $pods
|
||||
3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679821480865132823066470938446095505822317253594081284811174502841027019385211055596446229489549303819644288109756659334461284756482337867831652712019091456485669234603486104543266482133936072602491412737245870066063155881748815209209628292540917153643678925903600113305305488204665213841469519415116094330572703657595919530921861173819326117931051185480744623799627495673518857527248912279381830119491298336733624406566430860213949463952247371907021798609437027705392171762931767523846748184676694051320005681271452635608277857713427577896091736371787214684409012249534301465495853710507922796892589235420199561121290219608640344181598136297747713099605187072113499999983729780499510597317328160963185950244594553469083026425223082533446850352619311881710100031378387528865875332083814206171776691473035982534904287554687311595628638823537875937519577818577805321712268066130019278766111959092164201989380952572010654858632788659361533818279682303019520353018529689957736225994138912497217752834791315155748572424541506959508295331168617278558890750983817546374649393192550604009277016711390098488240128583616035637076601047101819429555961989467678374494482553797747268471040475346462080466842590694912933136770289891521047521620569660240580381501935112533824300355876402474964732639141992726042699227967823547816360093417216412199245863150302861829745557067498385054945885869269956909272107975093029553211653449872027559602364806654991198818347977535663698074265425278625518184175746728909777727938000816470600161452491921732172147723501414419735685481613611573525521334757418494684385233239073941433345477624168625189835694855620992192221842725502542568876717904946016534668049886272327917860857843838279679766814541009538837863609506800642251252051173929848960841284886269456042419652850222106611863067442786220391949450471237137869609563643719172874677646575739624138908658326459958133904780275901
|
||||
```
|
||||
|
||||
## Writing a Job Spec
|
||||
|
||||
As with all other Kubernetes config, a Job needs `apiVersion`, `kind`, and `metadata` fields. For
|
||||
general information about working with config files, see [here](/docs/user-guide/simple-yaml),
|
||||
[here](/docs/user-guide/configuring-containers), and [here](/docs/user-guide/working-with-resources).
|
||||
|
||||
A Job also needs a [`.spec` section](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status).
|
||||
|
||||
### Pod Template
|
||||
|
||||
The `.spec.template` is the only required field of the `.spec`.
|
||||
|
||||
The `.spec.template` is a [pod template](/docs/user-guide/replication-controller/#pod-template). It has exactly
|
||||
the same schema as a [pod](/docs/user-guide/pods), except it is nested and does not have an `apiVersion` or
|
||||
`kind`.
|
||||
|
||||
In addition to required fields for a Pod, a pod template in a job must specify appropriate
|
||||
labels (see [pod selector](#pod-selector)) and an appropriate restart policy.
|
||||
|
||||
Only a [`RestartPolicy`](/docs/user-guide/pod-states/#restartpolicy) equal to `Never` or `OnFailure` is allowed.
|
||||
|
||||
### Pod Selector
|
||||
|
||||
The `.spec.selector` field is optional. In almost all cases you should not specify it.
|
||||
See section [specifying your own pod selector](#specifying-your-own-pod-selector).
|
||||
|
||||
|
||||
### Parallel Jobs
|
||||
|
||||
There are three main types of jobs:
|
||||
|
||||
1. Non-parallel Jobs
|
||||
- normally only one pod is started, unless the pod fails.
|
||||
- job is complete as soon as Pod terminates successfully.
|
||||
1. Parallel Jobs with a *fixed completion count*:
|
||||
- specify a non-zero positive value for `.spec.completions`
|
||||
- the job is complete when there is one successful pod for each value in the range 1 to `.spec.completions`.
|
||||
- **not implemented yet:** each pod passed a different index in the range 1 to `.spec.completions`.
|
||||
1. Parallel Jobs with a *work queue*:
|
||||
- do not specify `.spec.completions`; it defaults to `.spec.parallelism`
|
||||
- the pods must coordinate amongst themselves or with an external service to determine what each should work on
|
||||
- each pod is independently capable of determining whether or not all its peers are done, and thus whether the entire Job is done.
|
||||
- when _any_ pod terminates with success, no new pods are created.
|
||||
- once at least one pod has terminated with success and all pods are terminated, then the job is completed with success.
|
||||
- once any pod has exited with success, no other pod should still be doing any work or writing any output. They should all be
|
||||
in the process of exiting.
|
||||
|
||||
For a Non-parallel job, you can leave both `.spec.completions` and `.spec.parallelism` unset. When both are
|
||||
unset, both are defaulted to 1.
|
||||
|
||||
For a Fixed Completion Count job, you should set `.spec.completions` to the number of completions needed.
|
||||
You can set `.spec.parallelism`, or leave it unset and it will default to 1.
|
||||
|
||||
For a Work Queue Job, you must leave `.spec.completions` unset, and set `.spec.parallelism` to
|
||||
a non-negative integer.
|
||||
|
||||
For more information about how to make use of the different types of job, see the [job patterns](#job-patterns) section.
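For example, a fixed-completion-count Job based on the `pi` example above might look like the following sketch (the specific numbers are illustrative):

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: pi-parallel
spec:
  completions: 5      # five pods must terminate successfully in total
  parallelism: 2      # run at most two pods at any one time
  template:
    metadata:
      name: pi
    spec:
      containers:
      - name: pi
        image: perl
        command: ["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"]
      restartPolicy: Never
```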
|
||||
|
||||
|
||||
#### Controlling Parallelism
|
||||
|
||||
The requested parallelism (`.spec.parallelism`) can be set to any non-negative value.
|
||||
If it is unspecified, it defaults to 1.
|
||||
If it is specified as 0, then the Job is effectively paused until it is increased.
|
||||
|
||||
A job can be scaled up using the `kubectl scale` command. For example, the following
|
||||
command sets `.spec.parallelism` of a job called `myjob` to 10:
|
||||
|
||||
```shell
|
||||
$ kubectl scale --replicas=10 jobs/myjob
|
||||
job "myjob" scaled
|
||||
```
|
||||
|
||||
You can also use the `scale` subresource of the Job resource.
|
||||
|
||||
Actual parallelism (number of pods running at any instant) may be more or less than requested
|
||||
parallelism, for a variety of reasons:
|
||||
|
||||
- For Fixed Completion Count jobs, the actual number of pods running in parallel will not exceed the number of
|
||||
remaining completions. Higher values of `.spec.parallelism` are effectively ignored.
|
||||
- For work queue jobs, no new pods are started after any pod has succeeded -- remaining pods are allowed to complete, however.
|
||||
- If the controller has not had time to react.
|
||||
- If the controller failed to create pods for any reason (lack of ResourceQuota, lack of permission, etc.),
|
||||
then there may be fewer pods than requested.
|
||||
- The controller may throttle new pod creation due to excessive previous pod failures in the same Job.
|
||||
- When a pod is gracefully shut down, it takes time to stop.
|
||||
|
||||
## Handling Pod and Container Failures
|
||||
|
||||
A Container in a Pod may fail for a number of reasons, such as because the process in it exited with
|
||||
a non-zero exit code, or the Container was killed for exceeding a memory limit, etc. If this
|
||||
happens, and the `.spec.template.spec.restartPolicy = "OnFailure"`, then the Pod stays
|
||||
on the node, but the Container is re-run. Therefore, your program needs to handle the case when it is
|
||||
restarted locally, or else specify `.spec.template.spec.restartPolicy = "Never"`.
|
||||
See [pod states](/docs/user-guide/pod-states) for more information on `restartPolicy`.
|
||||
|
||||
An entire Pod can also fail, for a number of reasons, such as when the pod is kicked off the node
|
||||
(node is upgraded, rebooted, deleted, etc.), or if a container of the Pod fails and the
|
||||
`.spec.template.spec.restartPolicy = "Never"`. When a Pod fails, then the Job controller
|
||||
starts a new Pod. Therefore, your program needs to handle the case when it is restarted in a new
|
||||
pod. In particular, it needs to handle temporary files, locks, incomplete output and the like
|
||||
caused by previous runs.
|
||||
|
||||
Note that even if you specify `.spec.parallelism = 1` and `.spec.completions = 1` and
|
||||
`.spec.template.spec.restartPolicy = "Never"`, the same program may
|
||||
sometimes be started twice.
|
||||
|
||||
If you do specify `.spec.parallelism` and `.spec.completions` both greater than 1, then there may be
|
||||
multiple pods running at once. Therefore, your pods must also be tolerant of concurrency.
|
||||
|
||||
## Job Termination and Cleanup
|
||||
|
||||
When a Job completes, no more Pods are created, but the Pods are not deleted either. Since they are terminated,
|
||||
they don't show up with `kubectl get pods`, but they will show up with `kubectl get pods -a`. Keeping them around
|
||||
allows you to still view the logs of completed pods to check for errors, warnings, or other diagnostic output.
|
||||
The job object also remains after it is completed so that you can view its status. It is up to the user to delete
|
||||
old jobs after noting their status. Delete the job with `kubectl` (e.g. `kubectl delete jobs/pi` or `kubectl delete -f ./job.yaml`). When you delete the job using `kubectl`, all the pods it created are deleted too.
|
||||
|
||||
If a Job's pods are failing repeatedly, the Job will keep creating new pods forever, by default.
|
||||
Retrying forever can be a useful pattern. If an external dependency of the Job's
|
||||
pods is missing (for example an input file on a networked storage volume is not present), then the
|
||||
Job will keep trying Pods, and when you later resolve the external dependency (for example, creating
|
||||
the missing file) the Job will then complete without any further action.
|
||||
|
||||
However, if you prefer not to retry forever, you can set a deadline on the job. Do this by setting the
|
||||
`spec.activeDeadlineSeconds` field of the job to a number of seconds. The job will then have a status with
|
||||
`reason: DeadlineExceeded`. No more pods will be created, and existing pods will be deleted.
|
||||
|
||||
```yaml
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: pi-with-timeout
|
||||
spec:
|
||||
activeDeadlineSeconds: 100
|
||||
template:
|
||||
metadata:
|
||||
name: pi
|
||||
spec:
|
||||
containers:
|
||||
- name: pi
|
||||
image: perl
|
||||
command: ["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"]
|
||||
restartPolicy: Never
|
||||
```
|
||||
|
||||
Note that both the Job spec and the Pod template spec within the Job have an `activeDeadlineSeconds` field.
|
||||
Set the one on the Job spec, as in the example above.
|
||||
|
||||
## Job Patterns
|
||||
|
||||
The Job object can be used to support reliable parallel execution of Pods. The Job object is not
|
||||
designed to support closely-communicating parallel processes, as commonly found in scientific
|
||||
computing. It does support parallel processing of a set of independent but related *work items*.
|
||||
These might be emails to be sent, frames to be rendered, files to be transcoded, ranges of keys in a
|
||||
NoSQL database to scan, and so on.
|
||||
|
||||
In a complex system, there may be multiple different sets of work items. Here we are just
|
||||
considering one set of work items that the user wants to manage together — a *batch job*.
|
||||
|
||||
There are several different patterns for parallel computation, each with strengths and weaknesses.
|
||||
The tradeoffs are:
|
||||
|
||||
- One Job object for each work item, vs. a single Job object for all work items. The latter is
|
||||
better for large numbers of work items. The former creates some overhead for the user and for the
|
||||
system to manage large numbers of Job objects. Also, with the latter, the resource usage of the job
|
||||
(number of concurrently running pods) can be easily adjusted using the `kubectl scale` command.
|
||||
- Number of pods created equals number of work items, vs. each pod can process multiple work items.
|
||||
The former typically requires less modification to existing code and containers. The latter
|
||||
is better for large numbers of work items, for similar reasons to the previous bullet.
|
||||
- Several approaches use a work queue. This requires running a queue service,
|
||||
and modifications to the existing program or container to make it use the work queue.
|
||||
Other approaches are easier to adapt to an existing containerized application.
|
||||
|
||||
|
||||
The tradeoffs are summarized here, with columns 2 to 4 corresponding to the above tradeoffs.
|
||||
The pattern names are also links to examples and more detailed description.
|
||||
|
||||
| Pattern | Single Job object | Fewer pods than work items? | Use app unmodified? | Works in Kube 1.1? |
|
||||
| -------------------------------------------------------------------- |:-----------------:|:---------------------------:|:-------------------:|:-------------------:|
|
||||
| [Job Template Expansion](/docs/user-guide/jobs/expansions) | | | ✓ | ✓ |
|
||||
| [Queue with Pod Per Work Item](/docs/tasks/job/coarse-parallel-processing-work-queue/) | ✓ | | sometimes | ✓ |
|
||||
| [Queue with Variable Pod Count](/docs/tasks/job/fine-parallel-processing-work-queue/) | ✓ | ✓ | | ✓ |
|
||||
| Single Job with Static Work Assignment | ✓ | | ✓ | |
|
||||
|
||||
When you specify completions with `.spec.completions`, each Pod created by the Job controller
|
||||
has an identical [`spec`](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md#spec-and-status). This means that
|
||||
all pods will have the same command line and the same
|
||||
image, the same volumes, and (almost) the same environment variables. These patterns
|
||||
are different ways to arrange for pods to work on different things.
|
||||
|
||||
This table shows the required settings for `.spec.parallelism` and `.spec.completions` for each of the patterns.
|
||||
Here, `W` is the number of work items.
|
||||
|
||||
| Pattern | `.spec.completions` | `.spec.parallelism` |
|
||||
| -------------------------------------------------------------------- |:-------------------:|:--------------------:|
|
||||
| [Job Template Expansion](/docs/tasks/job/parallel-processing-expansion/) | 1 | should be 1 |
|
||||
| [Queue with Pod Per Work Item](/docs/tasks/job/coarse-parallel-processing-work-queue/) | W | any |
|
||||
| [Queue with Variable Pod Count](/docs/tasks/job/fine-parallel-processing-work-queue/) | 1 | any |
|
||||
| Single Job with Static Work Assignment | W | any |
|
||||
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Specifying your own pod selector
|
||||
|
||||
Normally, when you create a job object, you do not specify `spec.selector`.
|
||||
The system defaulting logic adds this field when the job is created.
|
||||
It picks a selector value that will not overlap with any other jobs.
|
||||
|
||||
However, in some cases, you might need to override this automatically set selector.
|
||||
To do this, you can specify the `spec.selector` of the job.
|
||||
|
||||
Be very careful when doing this. If you specify a label selector which is not
|
||||
unique to the pods of that job, and which matches unrelated pods, then pods of the unrelated
|
||||
job may be deleted, or this job may count other pods as completing it, or one or both
|
||||
of the jobs may refuse to create pods or run to completion. If a non-unique selector is
|
||||
chosen, then other controllers (e.g. ReplicationController) and their pods may behave
|
||||
in unpredictable ways too. Kubernetes will not stop you from making a mistake when
|
||||
specifying `spec.selector`.
|
||||
|
||||
Here is an example of a case when you might want to use this feature.
|
||||
|
||||
Say job `old` is already running. You want existing pods
|
||||
to keep running, but you want the rest of the pods it creates
|
||||
to use a different pod template and for the job to have a new name.
|
||||
You cannot update the job because these fields are not updatable.
|
||||
Therefore, you delete job `old` but leave its pods
|
||||
running, using `kubectl delete jobs/old --cascade=false`.
|
||||
Before deleting it, you make a note of what selector it uses:
|
||||
|
||||
```
|
||||
kind: Job
|
||||
metadata:
|
||||
name: old
|
||||
...
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
job-uid: a8f3d00d-c6d2-11e5-9f87-42010af00002
|
||||
...
|
||||
```
|
||||
|
||||
Then you create a new job with name `new` and you explicitly specify the same selector.
|
||||
Since the existing pods have label `job-uid=a8f3d00d-c6d2-11e5-9f87-42010af00002`,
|
||||
they are controlled by job `new` as well.
|
||||
|
||||
You need to specify `manualSelector: true` in the new job since you are not using
|
||||
the selector that the system normally generates for you automatically.
|
||||
|
||||
```
|
||||
kind: Job
|
||||
metadata:
|
||||
name: new
|
||||
...
|
||||
spec:
|
||||
manualSelector: true
|
||||
selector:
|
||||
matchLabels:
|
||||
job-uid: a8f3d00d-c6d2-11e5-9f87-42010af00002
|
||||
...
|
||||
```
|
||||
|
||||
The new Job itself will have a different uid from `a8f3d00d-c6d2-11e5-9f87-42010af00002`. Setting
|
||||
`manualSelector: true` tells the system that you know what you are doing and to allow this
|
||||
mismatch.
|
||||
|
||||
## Alternatives
|
||||
|
||||
### Bare Pods
|
||||
|
||||
When the node that a pod is running on reboots or fails, the pod is terminated
|
||||
and will not be restarted. However, a Job will create new pods to replace terminated ones.
|
||||
For this reason, we recommend that you use a job rather than a bare pod, even if your application
|
||||
requires only a single pod.
|
||||
|
||||
### Replication Controller
|
||||
|
||||
Jobs are complementary to [Replication Controllers](/docs/user-guide/replication-controller).
|
||||
A Replication Controller manages pods which are not expected to terminate (e.g. web servers), and a Job
|
||||
manages pods that are expected to terminate (e.g. batch jobs).
|
||||
|
||||
As discussed in [life of a pod](/docs/user-guide/pod-states), `Job` is *only* appropriate for pods with
|
||||
`RestartPolicy` equal to `OnFailure` or `Never`. (Note: If `RestartPolicy` is not set, the default
|
||||
value is `Always`.)
|
||||
|
||||
### Single Job starts Controller Pod
|
||||
|
||||
Another pattern is for a single Job to create a pod which then creates other pods, acting as a sort
|
||||
of custom controller for those pods. This allows the most flexibility, but may be somewhat
|
||||
complicated to get started with and offers less integration with Kubernetes.
|
||||
|
||||
One example of this pattern would be a Job which starts a Pod which runs a script that in turn
|
||||
starts a Spark master controller (see [spark example](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/examples/spark/README.md)), runs a spark
|
||||
driver, and then cleans up.
|
||||
|
||||
An advantage of this approach is that the overall process gets the completion guarantee of a Job
|
||||
object, but complete control over what pods are created and how work is assigned to them.
|
||||
|
||||
## Cron Jobs
|
||||
|
||||
Support for creating Jobs at specified times/dates (i.e. cron) is available in Kubernetes [1.4](https://github.com/kubernetes/kubernetes/pull/11980). More information is available in the [cron job documentation](http://kubernetes.io/docs/user-guide/cron-jobs/).
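For reference, a scheduled job of that era looks roughly like the sketch below, assuming the `batch/v2alpha1` API group is enabled on the apiserver; see the cron job documentation linked above for the authoritative format:

```yaml
apiVersion: batch/v2alpha1   # assumes the v2alpha1 group is enabled
kind: CronJob
metadata:
  name: hello
spec:
  schedule: "*/1 * * * *"    # standard cron syntax: once per minute
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: hello
            image: busybox
            args: ["/bin/sh", "-c", "date; echo Hello from the Kubernetes cluster"]
          restartPolicy: OnFailure
```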
|
|
@ -0,0 +1,136 @@
|
|||
---
|
||||
assignees:
|
||||
- lavalamp
|
||||
title: Kubernetes Components
|
||||
---
|
||||
|
||||
This document outlines the various binary components that need to run to
|
||||
deliver a functioning Kubernetes cluster.
|
||||
|
||||
## Master Components
|
||||
|
||||
Master components are those that provide the cluster's control plane. For
|
||||
example, master components are responsible for making global decisions about the
|
||||
cluster (e.g., scheduling), and detecting and responding to cluster events
|
||||
(e.g., starting up a new pod when a replication controller's 'replicas' field is
|
||||
unsatisfied).
|
||||
|
||||
In theory, Master components can be run on any node in the cluster. However,
|
||||
for simplicity, current set-up scripts typically start all master components on
|
||||
the same VM, and do not run user containers on this VM. See
|
||||
[high-availability.md](/docs/admin/high-availability) for an example multi-master-VM setup.
|
||||
|
||||
Even in the future, when Kubernetes is fully self-hosting, it will probably be
|
||||
wise to only allow master components to schedule on a subset of nodes, to limit
|
||||
co-running with user-run pods, reducing the possible scope of a
|
||||
node-compromising security exploit.
|
||||
|
||||
### kube-apiserver
|
||||
|
||||
[kube-apiserver](/docs/admin/kube-apiserver) exposes the Kubernetes API; it is the front-end for the
|
||||
Kubernetes control plane. It is designed to scale horizontally (i.e., one scales
|
||||
it by running more of them-- [high-availability.md](/docs/admin/high-availability)).
|
||||
|
||||
### etcd
|
||||
|
||||
[etcd](/docs/admin/etcd) is used as Kubernetes' backing store. All cluster data is stored here.
|
||||
Proper administration of a Kubernetes cluster includes a backup plan for etcd's
|
||||
data.
|
||||
|
||||
### kube-controller-manager
|
||||
|
||||
[kube-controller-manager](/docs/admin/kube-controller-manager) is a binary that runs controllers, which are the
|
||||
background threads that handle routine tasks in the cluster. Logically, each
|
||||
controller is a separate process, but to reduce the number of moving pieces in
|
||||
the system, they are all compiled into a single binary and run in a single
|
||||
process.
|
||||
|
||||
These controllers include:
|
||||
|
||||
* Node Controller: Responsible for noticing & responding when nodes go down.
|
||||
* Replication Controller: Responsible for maintaining the correct number of pods for every replication
|
||||
controller object in the system.
|
||||
* Endpoints Controller: Populates the Endpoints object (i.e., join Services & Pods).
|
||||
* Service Account & Token Controllers: Create default accounts and API access tokens for new namespaces.
|
||||
* ... and others.
|
||||
|
||||
### kube-scheduler
|
||||
|
||||
[kube-scheduler](/docs/admin/kube-scheduler) watches newly created pods that have no node assigned, and
|
||||
selects a node for them to run on.
|
||||
|
||||
### addons
|
||||
|
||||
Addons are pods and services that implement cluster features. The pods may be managed
|
||||
by Deployments, ReplicationControllers, etc. Namespaced addon objects are created in
|
||||
the "kube-system" namespace.
|
||||
|
||||
Addon manager takes the responsibility for creating and maintaining addon resources.
|
||||
See [here](http://releases.k8s.io/HEAD/cluster/addons) for more details.
|
||||
|
||||
#### DNS
|
||||
|
||||
While the other addons are not strictly required, all Kubernetes
|
||||
clusters should have [cluster DNS](/docs/admin/dns/), as many examples rely on it.
|
||||
|
||||
Cluster DNS is a DNS server, in addition to the other DNS server(s) in your
|
||||
environment, which serves DNS records for Kubernetes services.
|
||||
|
||||
Containers started by Kubernetes automatically include this DNS server
|
||||
in their DNS searches.
|
||||
|
||||
#### User interface
|
||||
|
||||
The kube-ui provides a read-only overview of the cluster state. Access
|
||||
[the UI using kubectl proxy](/docs/user-guide/connecting-to-applications-proxy/#connecting-to-the-kube-ui-service-from-your-local-workstation).
|
||||
|
||||
#### Container Resource Monitoring
|
||||
|
||||
[Container Resource Monitoring](/docs/user-guide/monitoring) records generic time-series metrics
|
||||
about containers in a central database, and provides a UI for browsing that data.
|
||||
|
||||
#### Cluster-level Logging
|
||||
|
||||
A [Cluster-level logging](/docs/user-guide/logging/overview) mechanism is responsible for
|
||||
saving container logs to a central log store with search/browsing interface.
|
||||
|
||||
## Node components
|
||||
|
||||
Node components run on every node, maintaining running pods and providing them
|
||||
the Kubernetes runtime environment.
|
||||
|
||||
### kubelet
|
||||
|
||||
[kubelet](/docs/admin/kubelet) is the primary node agent. It:
|
||||
|
||||
* Watches for pods that have been assigned to its node (either by apiserver
|
||||
or via local configuration file) and:
|
||||
* Mounts the pod's required volumes
|
||||
* Downloads the pod's secrets
|
||||
* Runs the pod's containers via docker (or, experimentally, rkt).
|
||||
* Periodically executes any requested container liveness probes.
|
||||
* Reports the status of the pod back to the rest of the system, by creating a
|
||||
"mirror pod" if necessary.
|
||||
* Reports the status of the node back to the rest of the system.
|
||||
|
||||
### kube-proxy
|
||||
|
||||
[kube-proxy](/docs/admin/kube-proxy) enables the Kubernetes service abstraction by maintaining
|
||||
network rules on the host and performing connection forwarding.
|
||||
|
||||
### docker
|
||||
|
||||
`docker` is of course used for actually running containers.
|
||||
|
||||
### rkt
|
||||
|
||||
`rkt` is supported experimentally as an alternative to docker.
|
||||
|
||||
### supervisord
|
||||
|
||||
`supervisord` is a lightweight process babysitting system for keeping kubelet and docker
|
||||
running.
|
||||
|
||||
### fluentd
|
||||
|
||||
`fluentd` is a daemon which helps provide [cluster-level logging](#cluster-level-logging).
|
|
@ -0,0 +1,109 @@
|
|||
---
|
||||
assignees:
|
||||
- bgrant0607
|
||||
- erictune
|
||||
- lavalamp
|
||||
title: The Kubernetes API
|
||||
---
|
||||
|
||||
Primary system and API concepts are documented in the [User guide](/docs/user-guide/).
|
||||
|
||||
Overall API conventions are described in the [API conventions doc](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api-conventions.md).
|
||||
|
||||
Remote access to the API is discussed in the [access doc](/docs/admin/accessing-the-api).
|
||||
|
||||
The Kubernetes API also serves as the foundation for the declarative configuration schema for the system. The [Kubectl](/docs/user-guide/kubectl) command-line tool can be used to create, update, delete, and get API objects.
|
||||
|
||||
Kubernetes also stores its serialized state (currently in [etcd](https://coreos.com/docs/distributed-configuration/getting-started-with-etcd/)) in terms of the API resources.
|
||||
|
||||
Kubernetes itself is decomposed into multiple components, which interact through its API.
|
||||
|
||||
## API changes
|
||||
|
||||
In our experience, any system that is successful needs to grow and change as new use cases emerge or existing ones change. Therefore, we expect the Kubernetes API to continuously change and grow. However, we intend to not break compatibility with existing clients, for an extended period of time. In general, new API resources and new resource fields can be expected to be added frequently. Elimination of resources or fields will require following a deprecation process. The precise deprecation policy for eliminating features is TBD, but once we reach our 1.0 milestone, there will be a specific policy.
|
||||
|
||||
What constitutes a compatible change and how to change the API are detailed by the [API change document](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api_changes.md).
|
||||
|
||||
## OpenAPI and Swagger definitions
|
||||
|
||||
Complete API details are documented using [Swagger v1.2](http://swagger.io/) and [OpenAPI](https://www.openapis.org/). The Kubernetes apiserver (aka "master") exposes an API that can be used to retrieve the Swagger v1.2 Kubernetes API spec located at `/swaggerapi`. You can also enable a UI to browse the API documentation at `/swagger-ui` by passing the `--enable-swagger-ui=true` flag to apiserver.
|
||||
|
||||
We also host a version of the [latest v1.2 API documentation UI](http://kubernetes.io/kubernetes/third_party/swagger-ui/). This is updated with the latest release, so if you are using a different version of Kubernetes you will want to use the spec from your apiserver.
|
||||
|
||||
Starting with Kubernetes 1.4, the OpenAPI spec is also available at `/swagger.json`. While we are transitioning from Swagger v1.2 to OpenAPI (aka Swagger v2.0), some tools such as kubectl and swagger-ui still use the v1.2 spec. The OpenAPI spec is in Beta as of Kubernetes 1.5.
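One way to fetch these specs from a running cluster is through `kubectl proxy` (a sketch; 8001 is simply the proxy's default local port):

```shell
# Proxy the apiserver to localhost:8001
$ kubectl proxy --port=8001 &

# Swagger v1.2 API listing
$ curl http://127.0.0.1:8001/swaggerapi

# OpenAPI (Swagger v2.0) spec, available starting with Kubernetes 1.4
$ curl http://127.0.0.1:8001/swagger.json
```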
|
||||
|
||||
Kubernetes implements an alternative Protobuf based serialization format for the API that is primarily intended for intra-cluster communication, documented in the [design proposal](https://github.com/kubernetes/kubernetes/blob/{{ page.githubbranch }}/docs/proposals/protobuf.md) and the IDL files for each schema are located in the Go packages that define the API objects.
|
||||
|
||||
## API versioning
|
||||
|
||||
To make it easier to eliminate fields or restructure resource representations, Kubernetes supports
|
||||
multiple API versions, each at a different API path, such as `/api/v1` or
|
||||
`/apis/extensions/v1beta1`.
|
||||
|
||||
We chose to version at the API level rather than at the resource or field level to ensure that the API presents a clear, consistent view of system resources and behavior, and to enable controlling access to end-of-lifed and/or experimental APIs. The JSON and Protobuf serialization schemas follow the same guidelines for schema changes - all descriptions below cover both formats.
|
||||
|
||||
Note that API versioning and Software versioning are only indirectly related. The [API and release
|
||||
versioning proposal](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/versioning.md) describes the relationship between API versioning and
|
||||
software versioning.
|
||||
|
||||
|
||||
Different API versions imply different levels of stability and support. The criteria for each level are described
|
||||
in more detail in the [API Changes documentation](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/api_changes.md#alpha-beta-and-stable-versions). They are summarized here:
|
||||
|
||||
- Alpha level:
|
||||
- The version names contain `alpha` (e.g. `v1alpha1`).
|
||||
- May be buggy. Enabling the feature may expose bugs. Disabled by default.
|
||||
- Support for feature may be dropped at any time without notice.
|
||||
- The API may change in incompatible ways in a later software release without notice.
|
||||
- Recommended for use only in short-lived testing clusters, due to increased risk of bugs and lack of long-term support.
|
||||
- Beta level:
|
||||
- The version names contain `beta` (e.g. `v2beta3`).
|
||||
- Code is well tested. Enabling the feature is considered safe. Enabled by default.
|
||||
- Support for the overall feature will not be dropped, though details may change.
|
||||
- The schema and/or semantics of objects may change in incompatible ways in a subsequent beta or stable release. When this happens,
|
||||
we will provide instructions for migrating to the next version. This may require deleting, editing, and re-creating
|
||||
API objects. The editing process may require some thought. This may require downtime for applications that rely on the feature.
|
||||
- Recommended for only non-business-critical uses because of potential for incompatible changes in subsequent releases. If you have
|
||||
multiple clusters which can be upgraded independently, you may be able to relax this restriction.
|
||||
- **Please do try our beta features and give feedback on them! Once they exit beta, it may not be practical for us to make more changes.**
|
||||
- Stable level:
|
||||
- The version name is `vX` where `X` is an integer.
|
||||
- Stable versions of features will appear in released software for many subsequent versions.
|
||||
|
||||
## API groups
|
||||
|
||||
To make it easier to extend the Kubernetes API, we implemented [*API groups*](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/api-group.md).
|
||||
The API group is specified in a REST path and in the `apiVersion` field of a serialized object.
|
||||
|
||||
Currently there are several API groups in use:
|
||||
|
||||
1. the "core" (oftentimes called "legacy", due to not having explicit group name) group, which is at
|
||||
REST path `/api/v1` and is not specified as part of the `apiVersion` field, e.g. `apiVersion: v1`.
|
||||
1. the named groups are at REST path `/apis/$GROUP_NAME/$VERSION`, and use `apiVersion: $GROUP_NAME/$VERSION`
|
||||
(e.g. `apiVersion: batch/v1`). Full list of supported API groups can be seen in [Kubernetes API reference](/docs/reference/).
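For instance, the two forms look like this in object manifests (illustrative fragments only):

```yaml
# Core ("legacy") group object, served at /api/v1
apiVersion: v1
kind: Pod
---
# Named group object, served at /apis/batch/v1
apiVersion: batch/v1
kind: Job
```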
|
||||
|
||||
|
||||
There are two supported paths to extending the API.
|
||||
1. [Third Party Resources](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/extending-api.md)
|
||||
are for users with very basic CRUD needs.
|
||||
1. Coming soon: users needing the full set of Kubernetes API semantics can implement their own apiserver
|
||||
and use the [aggregator](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/aggregated-api-servers.md)
|
||||
to make it seamless for clients.
|
||||
|
||||
|
||||
## Enabling API groups
|
||||
|
||||
Certain resources and API groups are enabled by default. They can be enabled or disabled by setting `--runtime-config`
|
||||
on the apiserver. `--runtime-config` accepts comma-separated values. For example, to disable batch/v1, set
|
||||
`--runtime-config=batch/v1=false`; to enable batch/v2alpha1, set `--runtime-config=batch/v2alpha1`.
|
||||
The flag accepts a comma-separated set of key=value pairs describing the runtime configuration of the apiserver.
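As a sketch of how these values combine on the apiserver command line (only the flag discussed here is shown; all other apiserver flags are omitted):

```shell
# Disable batch/v1 and enable batch/v2alpha1 with a single comma-separated flag
kube-apiserver --runtime-config=batch/v1=false,batch/v2alpha1
```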
|
||||
|
||||
IMPORTANT: Enabling or disabling groups or resources requires restarting apiserver and controller-manager
|
||||
to pick up the `--runtime-config` changes.
|
||||
|
||||
## Enabling resources in the groups
|
||||
|
||||
DaemonSets, Deployments, HorizontalPodAutoscalers, Ingress, Jobs and ReplicaSets are enabled by default.
|
||||
Other extensions resources can be enabled by setting `--runtime-config` on
|
||||
the apiserver. `--runtime-config` accepts comma-separated values. For example, to disable deployments and jobs, set
|
||||
`--runtime-config=extensions/v1beta1/deployments=false,extensions/v1beta1/jobs=false`
|
|
@ -0,0 +1,117 @@
|
|||
---
|
||||
assignees:
|
||||
- bgrant0607
|
||||
- mikedanese
|
||||
title: What is Kubernetes?
|
||||
---
|
||||
|
||||
Kubernetes is an [open-source platform for automating deployment, scaling, and operations of application containers](http://www.slideshare.net/BrianGrant11/wso2con-us-2015-kubernetes-a-platform-for-automating-deployment-scaling-and-operations) across clusters of hosts, providing container-centric infrastructure.
|
||||
|
||||
With Kubernetes, you are able to quickly and efficiently respond to customer demand:
|
||||
|
||||
- Deploy your applications quickly and predictably.
|
||||
- Scale your applications on the fly.
|
||||
- Seamlessly roll out new features.
|
||||
- Optimize use of your hardware by using only the resources you need.
|
||||
|
||||
Our goal is to foster an ecosystem of components and tools that relieve the burden of running applications in public and private clouds.
|
||||
|
||||
#### Kubernetes is:
|
||||
|
||||
* **portable**: public, private, hybrid, multi-cloud
|
||||
* **extensible**: modular, pluggable, hookable, composable
|
||||
* **self-healing**: auto-placement, auto-restart, auto-replication, auto-scaling
|
||||
|
||||
The Kubernetes project was started by Google in 2014. Kubernetes builds upon a [decade and a half of experience that Google has with running production workloads at scale](https://research.google.com/pubs/pub43438.html), combined with best-of-breed ideas and practices from the community.
|
||||
|
||||
##### Ready to [Get Started](/docs/getting-started-guides/)?
|
||||
|
||||
## Why containers?
|
||||
|
||||
Looking for reasons why you should be using [containers](http://aucouranton.com/2014/06/13/linux-containers-parallels-lxc-openvz-docker-and-more/)?
|
||||
|
||||
![Why Containers?](/images/docs/why_containers.svg)
|
||||
|
||||
The *Old Way* to deploy applications was to install the applications on a host using the operating system package manager. This had the disadvantage of entangling the applications' executables, configuration, libraries, and lifecycles with each other and with the host OS. One could build immutable virtual-machine images in order to achieve predictable rollouts and rollbacks, but VMs are heavyweight and non-portable.
|
||||
|
||||
The *New Way* is to deploy containers based on operating-system-level virtualization rather than hardware virtualization. These containers are isolated from each other and from the host: they have their own filesystems, they can't see each others' processes, and their computational resource usage can be bounded. They are easier to build than VMs, and because they are decoupled from the underlying infrastructure and from the host filesystem, they are portable across clouds and OS distributions.
|
||||
|
||||
Because containers are small and fast, one application can be packed in each container image. This one-to-one application-to-image relationship unlocks the full benefits of containers. With containers, immutable container images can be created at build/release time rather than deployment time, since each application doesn't need to be composed with the rest of the application stack, nor married to the production infrastructure environment. Generating container images at build/release time enables a consistent environment to be carried from development into production.
|
||||
Similarly, containers are vastly more transparent than VMs, which facilitates monitoring and management. This is especially true when the containers' process lifecycles are managed by the infrastructure rather than hidden by a process supervisor inside the container. Finally, with a single application per container, managing the containers becomes tantamount to managing deployment of the application.
|
||||
|
||||
Summary of container benefits:
|
||||
|
||||
* **Agile application creation and deployment**:
|
||||
Increased ease and efficiency of container image creation compared to VM image use.
|
||||
* **Continuous development, integration, and deployment**:
|
||||
Provides for reliable and frequent container image build and deployment with quick and easy rollbacks (due to image immutability).
|
||||
* **Dev and Ops separation of concerns**:
|
||||
Create application container images at build/release time rather than deployment time, thereby decoupling applications from infrastructure.
|
||||
* **Environmental consistency across development, testing, and production**:
|
||||
Runs the same on a laptop as it does in the cloud.
|
||||
* **Cloud and OS distribution portability**:
|
||||
Runs on Ubuntu, RHEL, CoreOS, on-prem, Google Container Engine, and anywhere else.
|
||||
* **Application-centric management**:
|
||||
Raises the level of abstraction from running an OS on virtual hardware to running an application on an OS using logical resources.
|
||||
* **Loosely coupled, distributed, elastic, liberated [micro-services](http://martinfowler.com/articles/microservices.html)**:
|
||||
Applications are broken into smaller, independent pieces and can be deployed and managed dynamically -- not a fat monolithic stack running on one big single-purpose machine.
|
||||
* **Resource isolation**:
|
||||
Predictable application performance.
|
||||
* **Resource utilization**:
|
||||
High efficiency and density.
|
||||
|
||||
#### Why do I need Kubernetes and what can it do?
|
||||
|
||||
At a minimum, Kubernetes can schedule and run application containers on clusters of physical or virtual machines. However, Kubernetes also allows developers to 'cut the cord' to physical and virtual machines, moving from a **host-centric** infrastructure to a **container-centric** infrastructure, which provides the full advantages and benefits inherent to containers. Kubernetes provides the infrastructure to build a truly **container-centric** development environment.
|
||||
|
||||
Kubernetes satisfies a number of common needs of applications running in production, such as:
|
||||
|
||||
* [co-locating helper processes](/docs/user-guide/pods/), facilitating composite applications and preserving the one-application-per-container model,
|
||||
* [mounting storage systems](/docs/user-guide/volumes/),
|
||||
* [distributing secrets](/docs/user-guide/secrets/),
|
||||
* [application health checking](/docs/user-guide/production-pods/#liveness-and-readiness-probes-aka-health-checks),
|
||||
* [replicating application instances](/docs/user-guide/replication-controller/),
|
||||
* [horizontal auto-scaling](/docs/user-guide/horizontal-pod-autoscaling/),
|
||||
* [naming and discovery](/docs/user-guide/connecting-applications/),
|
||||
* [load balancing](/docs/user-guide/services/),
|
||||
* [rolling updates](/docs/tasks/run-application/rolling-update-replication-controller/),
|
||||
* [resource monitoring](/docs/user-guide/monitoring/),
|
||||
* [log access and ingestion](/docs/user-guide/logging/overview/),
|
||||
* [support for introspection and debugging](/docs/user-guide/introspection-and-debugging/), and
|
||||
* [identity and authorization](/docs/admin/authorization/).
|
||||
|
||||
This provides the simplicity of Platform as a Service (PaaS) with the flexibility of Infrastructure as a Service (IaaS), and facilitates portability across infrastructure providers.
|
||||
|
||||
For more details, see the [user guide](/docs/user-guide/).
|
||||
|
||||
#### Why and how is Kubernetes a platform?
|
||||
|
||||
Even though Kubernetes provides a lot of functionality, there are always new scenarios that would benefit from new features. Application-specific workflows can be streamlined to accelerate developer velocity. Ad hoc orchestration that is acceptable initially often requires robust automation at scale. This is why Kubernetes was also designed to serve as a platform for building an ecosystem of components and tools to make it easier to deploy, scale, and manage applications.
|
||||
|
||||
[Labels](/docs/user-guide/labels/) empower users to organize their resources however they please. [Annotations](/docs/user-guide/annotations/) enable users to decorate resources with custom information to facilitate their workflows and provide an easy way for management tools to checkpoint state.
|
||||
|
||||
Additionally, the [Kubernetes control plane](/docs/admin/cluster-components) is built upon the same [APIs](/docs/api/) that are available to developers and users. Users can write their own controllers, [schedulers](https://github.com/kubernetes/kubernetes/tree/{{page.githubbranch}}/docs/devel/scheduler.md), etc., if they choose, with [their own APIs](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/extending-api.md) that can be targeted by a general-purpose [command-line tool](/docs/user-guide/kubectl-overview/).
|
||||
|
||||
This [design](https://github.com/kubernetes/kubernetes/blob/{{page.githubbranch}}/docs/design/principles.md) has enabled a number of other systems to build atop Kubernetes.
|
||||
|
||||
#### Kubernetes is not:
|
||||
|
||||
Kubernetes is not a traditional, all-inclusive PaaS (Platform as a Service) system. We preserve user choice where it is important.
|
||||
|
||||
* Kubernetes does not limit the types of applications supported. It does not dictate application frameworks (e.g., [Wildfly](http://wildfly.org/)), restrict the set of supported language runtimes (e.g., Java, Python, Ruby), cater to only [12-factor applications](http://12factor.net/), nor distinguish "apps" from "services". Kubernetes aims to support an extremely diverse variety of workloads, including stateless, stateful, and data-processing workloads. If an application can run in a container, it should run great on Kubernetes.
|
||||
* Kubernetes does not provide middleware (e.g., message buses), data-processing frameworks (e.g., Spark), databases (e.g., mysql), nor cluster storage systems (e.g., Ceph) as built-in services. Such applications run on Kubernetes.
|
||||
* Kubernetes does not have a click-to-deploy service marketplace.
|
||||
* Kubernetes is unopinionated in the source-to-image space. It does not deploy source code and does not build your application. Continuous Integration (CI) workflow is an area where different users and projects have their own requirements and preferences, so we support layering CI workflows on Kubernetes but don't dictate how it should work.
|
||||
* Kubernetes allows users to choose the logging, monitoring, and alerting systems of their choice. (Though we do provide some integrations as proof of concept.)
|
||||
* Kubernetes does not provide nor mandate a comprehensive application configuration language/system (e.g., [jsonnet](https://github.com/google/jsonnet)).
|
||||
* Kubernetes does not provide nor adopt any comprehensive machine configuration, maintenance, management, or self-healing systems.
|
||||
|
||||
On the other hand, a number of PaaS systems run *on* Kubernetes, such as [Openshift](https://github.com/openshift/origin), [Deis](http://deis.io/), and [Eldarion](http://eldarion.cloud/). You could also roll your own custom PaaS, integrate with a CI system of your choice, or get along just fine with just Kubernetes: bring your container images and deploy them on Kubernetes.
|
||||
|
||||
Since Kubernetes operates at the application level rather than at just the hardware level, it provides some generally applicable features common to PaaS offerings, such as deployment, scaling, load balancing, logging, monitoring, etc. However, Kubernetes is not monolithic, and these default solutions are optional and pluggable.
|
||||
|
||||
Additionally, Kubernetes is not a mere "orchestration system"; it eliminates the need for orchestration. The technical definition of "orchestration" is execution of a defined workflow: do A, then B, then C. In contrast, Kubernetes is composed of a set of independent, composable control processes that continuously drive current state towards the provided desired state. It shouldn't matter how you get from A to C: make it so. Centralized control is also not required; the approach is more akin to "choreography". This results in a system that is easier to use and more powerful, robust, resilient, and extensible.
|
||||
|
||||
#### What does *Kubernetes* mean? K8s?
|
||||
|
||||
The name **Kubernetes** originates from Greek, meaning "helmsman" or "pilot", and is the root of "governor" and ["cybernetic"](http://www.etymonline.com/index.php?term=cybernetics). **K8s** is an abbreviation derived by replacing the 8 letters "ubernete" with 8.
|
|
@ -1,5 +1,9 @@
|
|||
---
|
||||
title: Kubernetes Objects
|
||||
title: Understanding Kubernetes Objects
|
||||
|
||||
redirect_from:
|
||||
- "/docs/concepts/abstractions/overview/"
|
||||
- "/docs/concepts/abstractions/overview.html"
|
||||
---
|
||||
|
||||
{% capture overview %}
|
||||
|
@ -9,7 +13,7 @@ This page explains how Kubernetes objects are represented in the Kubernetes API,
|
|||
{% capture body %}
|
||||
## Understanding Kubernetes Objects
|
||||
|
||||
*Kubernetes Objects* are persistent entities in the Kubernetes system. Kubenetes uses these entities to represent the state of your cluster. Specifically, they can describe:
|
||||
*Kubernetes Objects* are persistent entities in the Kubernetes system. Kubernetes uses these entities to represent the state of your cluster. Specifically, they can describe:
|
||||
|
||||
* What containerized applications are running (and on which nodes)
|
||||
* The resources available to those applications
|
||||
|
@ -17,15 +21,16 @@ This page explains how Kubernetes objects are represented in the Kubernetes API,
|
|||
|
||||
A Kubernetes object is a "record of intent"--once you create the object, the Kubernetes system will constantly work to ensure that that object exists. By creating an object, you're effectively telling the Kubernetes system what you want your cluster's workload to look like; this is your cluster's **desired state**.
|
||||
|
||||
To work with Kubernetes objects--whether to create, modify, or delete them--you'll need to use the [Kubernetes API](https://github.com/kubernetes/kubernetes/blob/master/docs/devel/api-conventions.md). When you use the `kubectl` comamnd-line interface, for example, the CLI makes the necessary Kubernetes API calls for you; you can also use the Kubernetes API directly in your own programs. Kubernetes currently provides a `golang` [client library](https://github.com/kubernetes/client-go) for this purpose, and other language libraries (such as [Python](https://github.com/kubernetes-incubator/client-python)) are being developed.
|
||||
To work with Kubernetes objects--whether to create, modify, or delete them--you'll need to use the [Kubernetes API](https://github.com/kubernetes/kubernetes/blob/master/docs/devel/api-conventions.md). When you use the `kubectl` command-line interface, for example, the CLI makes the necessary Kubernetes API calls for you; you can also use the Kubernetes API directly in your own programs. Kubernetes currently provides a `golang` [client library](https://github.com/kubernetes/client-go) for this purpose, and other language libraries (such as [Python](https://github.com/kubernetes-incubator/client-python)) are being developed.
|
||||
|
||||
### Object Spec and Status
|
||||
|
||||
Every Kubernetes object includes two nested object fields that govern the object's configuration: the object *spec* and the object *status*. The *spec*, which you must provide, describes your *desired state* for the object--the characteristics that you want the object to have. The *status* describes the *actual state* for the object, and is supplied and updated by the Kubernetes system. At any given time, the Kubernetes Control Plane actively manages an object's actual state to match the desired state you supplied.
|
||||
|
||||
|
||||
For example, a Kubernetes Deployment is an object that can represent an application running on your cluster. When you create the Deployment, you might set the Deployment spec to specify that you want three replicas of the application to be running. The Kubernetes system reads the Deployment spec and starts three instances of your desired application--updating the status to match your spec. If any of those instances should fail (a status change), the Kubernetes system responds to the difference between spec and status by making a correction--in this case, starting a replacement instance.
|
||||
|
||||
For more information on the object spec, status, and metadata, see the [Kubernetes API Conventions](https://github.com/kubernetes/kubernetes/blob/master/docs/devel/api-conventions.md#spec-and-status).
|
||||
For more information on the object spec, status, and metadata, see the [Kubernetes API Conventions](https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md).
|
||||
|
||||
### Describing a Kubernetes Object
|
||||
|
||||
|
@ -33,7 +38,7 @@ When you create an object in Kubernetes, you must provide the object spec that d
|
|||
|
||||
Here's an example `.yaml` file that shows the required fields and object spec for a Kubernetes Deployment:
|
||||
|
||||
{% include code.html language="yaml" file="nginx-deployment.yaml" ghlink="/docs/concepts/abstractions/nginx-deployment.yaml" %}
|
||||
{% include code.html language="yaml" file="nginx-deployment.yaml" ghlink="/docs/concepts/overview/working-with-objects/nginx-deployment.yaml" %}
|
||||
|
||||
One way to create a Deployment using a `.yaml` file like the one above is to use the `kubectl create` command in the `kubectl` command-line interface, passing the `.yaml` file as an argument. Here's an example:
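A sketch of such an invocation, assuming the manifest above has been saved locally as `nginx-deployment.yaml`:

```shell
$ kubectl create -f ./nginx-deployment.yaml
deployment "nginx-deployment" created
```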
|
||||
|
|
@ -2,6 +2,9 @@
|
|||
assignees:
|
||||
- mikedanese
|
||||
title: Labels and Selectors
|
||||
redirect_from:
|
||||
- "/docs/user-guide/labels/"
|
||||
- "/docs/user-guide/labels.html"
|
||||
---
|
||||
|
||||
_Labels_ are key/value pairs that are attached to objects, such as pods.
|
||||
|
@ -154,7 +157,7 @@ this selector (respectively in `json` or `yaml` format) is equivalent to `compon
|
|||
|
||||
#### Resources that support set-based requirements
|
||||
|
||||
Newer resources, such as [`Job`](/docs/user-guide/jobs), [`Deployment`](/docs/user-guide/deployments/), [`Replica Set`](/docs/user-guide/replicasets/), and [`Daemon Set`](/docs/admin/daemons/), support _set-based_ requirements as well.
|
||||
Newer resources, such as [`Job`](/docs/concepts/jobs/run-to-completion-finite-workloads/), [`Deployment`](/docs/user-guide/deployments/), [`Replica Set`](/docs/user-guide/replicasets/), and [`Daemon Set`](/docs/admin/daemons/), support _set-based_ requirements as well.
|
||||
|
||||
```yaml
|
||||
selector:
|
|
@ -0,0 +1,16 @@
|
|||
apiVersion: extensions/v1beta1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: nginx-deployment
|
||||
spec:
|
||||
replicas: 3
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: nginx
|
||||
spec:
|
||||
containers:
|
||||
- name: nginx
|
||||
image: nginx:1.7.9
|
||||
ports:
|
||||
- containerPort: 80
|