Adding Nvidia gpu-operator installation

Merged Jaromír Hradil requested to merge nvidia_gpu_install into master
@@ -559,3 +559,70 @@
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      changed_when: true
      when: true
 
# TODO: The following play can be removed, as gpu-operator
# installs drivers in containers itself, but that requires
# the latest kernels; otherwise a workaround is needed -
# either provide an outdated kernel repo for the operator to access
# drivers from, or install the drivers into the host kernel yourself
- name: Nvidia GPU driver deployment
  hosts: gpu
  become: true
  tasks:
    - name: Install ubuntu-drivers package
      apt:
        name: ubuntu-drivers-common
        update_cache: true

    - name: Install Nvidia driver packages
      shell: |-
        ubuntu-drivers install
      changed_when: true

    - name: Reboot GPU node
      reboot:
        reboot_timeout: 600
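    # Not part of the MR - a minimal sketch of an optional sanity check one could
    # add after the reboot, assuming the host driver install exposes nvidia-smi:
    # - name: Verify host Nvidia driver is loaded
    #   shell: |-
    #     nvidia-smi -L
    #   changed_when: false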
 
- name: Install Nvidia GPU operator and enable MIGs
  hosts: master[0]
  become: true
  tasks:
    - block:
        - name: Configure Nvidia gpu-operator Helm repo
          shell: |-
            helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
            helm repo update

        - name: Deploy/upgrade Nvidia gpu-operator instance
          vars:
            version: "24.9.2"
            driver_enabled: false # if set to true the operator will install containerized drivers
            mig_strategy: single
          shell: |-
            helm status --namespace gpu-operator gpu-operator
            if [ $? -ne 0 ]; then
              helm install --wait --create-namespace --namespace gpu-operator \
                --version {{ version }} gpu-operator nvidia/gpu-operator \
                --set driver.enabled={{ driver_enabled }} --set mig.strategy={{ mig_strategy }}
            else
              helm upgrade --wait --namespace gpu-operator \
                --version {{ version }} gpu-operator nvidia/gpu-operator \
                --set driver.enabled={{ driver_enabled }} --set mig.strategy={{ mig_strategy }}
            fi
          changed_when: true
          when: true
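        # Not part of the MR - the status/install/upgrade branching above keeps the
        # task idempotent; a sketch of an equivalent single command, assuming the
        # same chart and values, would be:
        #   helm upgrade --install --wait --create-namespace --namespace gpu-operator \
        #     --version {{ version }} gpu-operator nvidia/gpu-operator \
        #     --set driver.enabled={{ driver_enabled }} --set mig.strategy={{ mig_strategy }}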
 
        - name: Get GPU node hostnames
          shell: |-
            kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name | grep gpu
          register: gpu_nodes
          changed_when: true
          when: true

        - name: Print GPU node hostnames
          debug:
            var: gpu_nodes

        - name: Add required label to GPU nodes to create mig profiles
          vars:
            mig_profile: all-1g.12gb
          shell: |-
            kubectl label node {{ item }} nvidia.com/mig.config={{ mig_profile }} --overwrite
          loop: "{{ gpu_nodes.stdout_lines }}"
          changed_when: true
          when: true
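        # Not part of the MR - a sketch of how one might wait for the MIG profile to
        # take effect, assuming the operator's mig-manager reports progress via the
        # usual nvidia.com/mig.config.state node label:
        # - name: Wait for MIG configuration to be applied
        #   shell: |-
        #     kubectl get node {{ item }} -o jsonpath='{.metadata.labels.nvidia\.com/mig\.config\.state}'
        #   register: mig_state
        #   until: mig_state.stdout == "success"
        #   retries: 30
        #   delay: 10
        #   loop: "{{ gpu_nodes.stdout_lines }}"
        #   changed_when: false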
 
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
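
Not part of the diff: once the playbook has run, the result can be checked from the master node with the kubeconfig above, e.g.

    kubectl get pods -n gpu-operator
    kubectl describe node <gpu-node> | grep nvidia.com/gpu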