Skip to content
Snippets Groups Projects
Verified Commit 65c4f996 authored by Jaromír Hradil's avatar Jaromír Hradil
Browse files

Adding Nvidia gpu-operator installation

parent 67760ce3
No related branches found
No related tags found
1 merge request!41Adding Nvidia gpu-operator installation
......@@ -559,3 +559,71 @@
PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
changed_when: true
when: true
# TODO: The following play can be removed, because gpu-operator can install
# the drivers in containers itself. That, however, requires the latest
# kernels or a workaround: either provide an outdated kernel repo from which
# the operator can fetch the drivers, or install the drivers into the host
# kernel yourself.
# Runs `ubuntu-drivers install` on every host in the `gpu` group and reboots
# the node afterwards, but only when the installer reported doing some work.
- name: Nvidia GPU driver deployment
  hosts: gpu
  become: true
  tasks:
    - name: Install ubuntu-drivers package
      apt:
        name: ubuntu-drivers-common
        state: present
        update_cache: true

    - name: Install Nvidia driver packages
      shell: |-
        ubuntu-drivers install
      register: nvidia_driver_install
      # ubuntu-drivers is expected to print "... already installed" when there
      # is nothing left to install (TODO: confirm for the ubuntu-drivers-common
      # version in use). Any other output is treated as a change, so in doubt
      # the reboot below still happens, exactly as before.
      changed_when: "'already installed' not in nvidia_driver_install.stdout"

    # Rebooting on every playbook run takes the GPU node down needlessly;
    # only reboot when the previous task reported a change.
    - name: Reboot GPU node
      reboot:
        reboot_timeout: 600
      when: nvidia_driver_install is changed
# Deploys (or upgrades) the Nvidia gpu-operator Helm release from the first
# host of the `master` group and labels every node whose name contains "gpu"
# with the requested MIG profile.
- name: Install Nvidia GPU operator and enable MIGs
  hosts: master[0]
  become: true
  tasks:
    - name: Deploy gpu-operator and label GPU nodes
      # Shared by every task in the block: helm and kubectl both read
      # KUBECONFIG, and PATH includes /usr/local/bin.
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      block:
        - name: Configure Nvidia gpu-operator Helm repo
          shell: |-
            helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
            helm repo update
          changed_when: true
          # `helm_repos` is a local fact that is not defined in this play
          # (presumably provided elsewhere - verify). A missing fact is
          # treated as "no repos configured" instead of aborting the play.
          when: >-
            'nvidia' not in
            (ansible_local.helm_repos | default([])
            | map(attribute='name') | list)

        - name: Deploy/upgrade Nvidia gpu-operator instance
          vars:
            # if set to true the operator will install containerized drivers
            driver_enabled: false
            mig_strategy: single
          # `helm upgrade --install` installs the release when it is absent
          # and upgrades it otherwise, so no separate `helm status` probe or
          # duplicated flag list is needed. The boolean is lower-cased because
          # Jinja would otherwise render it as `False`.
          shell: |-
            helm upgrade --install --wait --create-namespace \
              --namespace gpu-operator \
              gpu-operator nvidia/gpu-operator \
              --set driver.enabled={{ driver_enabled | string | lower }} \
              --set mig.strategy={{ mig_strategy }}
          changed_when: true
          when: true

        # Read-only query, hence never "changed". grep exits non-zero when no
        # node name contains "gpu", which fails the task and stops the play
        # before the labelling step.
        - name: Get GPU node hostnames
          shell: |-
            kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name | grep gpu
          register: gpu_nodes
          changed_when: false
          when: true

        - name: Print GPU node hostnames
          debug:
            var: gpu_nodes.stdout_lines

        - name: Add required label to GPU nodes to create mig profiles
          vars:
            mig_profile: all-1g.12gb
          shell: |-
            kubectl label node {{ item }} nvidia.com/mig.config={{ mig_profile }} --overwrite
          loop: "{{ gpu_nodes.stdout_lines }}"
          register: mig_label
          # kubectl is expected to print "<node> not labeled" when the label
          # already has this value (TODO: confirm for the kubectl version in
          # use); anything else is reported as a change.
          changed_when: "'not labeled' not in mig_label.stdout"
          when: true
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment