[GPU] CentOS 7 NVIDIA driver, CUDA, Kubernetes GPU device plugin 설치
nvidia Driver 415.18 install
# yum install kernel-devel kernel-headers gcc make
# lsmod | grep nouveau
# vi /etc/modprobe.d/blacklist-nouveau.conf
blacklist nouveau
options nouveau modeset=0
# dracut --force
# reboot
# lspci | grep VGA
# wget http://download.nvidia.com/XFree86/Linux-x86_64/415.18/NVIDIA-Linux-x86_64-415.18.run
# bash NVIDIA-Linux-x86_64-415.18.run
cuda 10.0 install
# rpm -i cuda-repo-rhel7-10.0.130-1.x86_64.rpm
# yum clean all
# yum install cuda
nvidia-docker2 install
# yum install nvidia-docker2
# docker run --runtime=nvidia --rm nvidia/cuda nvidia-smi
# vi /etc/docker/daemon.json
{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "/usr/bin/nvidia-container-runtime",
            "runtimeArgs": []
        }
    }
}
# systemctl restart docker
# vi /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
[Service]
...
Environment="KUBELET_EXTRA_ARGS=--feature-gates=DevicePlugins=true"
# systemctl daemon-reload
# systemctl restart kubelet
DevicePlugin enable
# kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml
Running GPU Jobs
# vi testpod.yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
spec:
  containers:
    - name: cuda-container
      image: nvidia/cuda:9.0-devel
      resources:
        limits:
          nvidia.com/gpu: 2 # requesting 2 GPUs
    - name: digits-container
      image: nvidia/digits:6.0
      resources:
        limits:
          nvidia.com/gpu: 2 # requesting 2 GPUs
# kubectl apply -f testpod.yaml
# kubectl logs <id>
# pod check
kubectl get pods --all-namespaces
# gpu check
kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu"
참고 :
https://github.com/NVIDIA/k8s-device-plugin
https://www.server-world.info/en/note?os=CentOS_7&p=nvidia