diff --git a/common/playbooks/k8s.yaml b/common/playbooks/k8s.yaml
index 84d59612e82b7babd571477abec8c6104e86b4ea..f8c835d15b676a829547b2f6a6b5df700c045143 100644
--- a/common/playbooks/k8s.yaml
+++ b/common/playbooks/k8s.yaml
@@ -559,3 +559,71 @@
         PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
     changed_when: true
     when: true
+# TODO: This play can be removed, as gpu-operator can install the
+# drivers in containers itself; however, that requires a recent
+# kernel. Otherwise a workaround is needed: either provide a repo
+# carrying the outdated kernel so the operator can access matching
+# drivers, or install the drivers into the host kernel yourself.
+- name: Nvidia GPU driver deployment
+  hosts: gpu
+  become: true
+  tasks:
+    - name: Install ubuntu-drivers package
+      apt:
+        name: ubuntu-drivers-common
+        update_cache: true
+    - name: Install Nvidia driver packages
+      shell: |-
+        ubuntu-drivers install
+      changed_when: true
+    - name: Reboot GPU node
+      reboot:
+        reboot_timeout: 600
+- name: Install Nvidia GPU operator and enable MIGs
+  hosts: master[0]
+  become: true
+  tasks:
+    - block:
+        - name: Configure Nvidia gpu-operator Helm repo
+          shell: |-
+            helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+            helm repo update
+          changed_when: true
+          when: "'nvidia' not in ansible_local.helm_repos | map(attribute='name') | list"
+        - name: Deploy/upgrade Nvidia gpu-operator instance
+          vars:
+            driver_enabled: false  # if set to true, the operator installs containerized drivers
+            mig_strategy: single
+          shell: |-
+            helm status --namespace gpu-operator gpu-operator
+            if [ $? -ne 0 ]; then
+              helm install --wait --create-namespace --namespace gpu-operator \
+                gpu-operator nvidia/gpu-operator --set driver.enabled={{ driver_enabled }} \
+                --set mig.strategy={{ mig_strategy }}
+            else
+              helm upgrade --wait --namespace gpu-operator \
+                gpu-operator nvidia/gpu-operator --set driver.enabled={{ driver_enabled }} \
+                --set mig.strategy={{ mig_strategy }}
+            fi
+          changed_when: true
+          when: true
+        - name: Get GPU node hostnames
+          shell: |-
+            kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name | grep gpu
+          register: gpu_nodes
+          changed_when: true
+          when: true
+        - name: Print GPU node hostnames
+          debug:
+            var: gpu_nodes
+        - name: Add required label to GPU nodes to create MIG profiles
+          vars:
+            mig_profile: all-1g.12gb
+          shell: |-
+            kubectl label node {{ item }} nvidia.com/mig.config={{ mig_profile }} --overwrite
+          loop: "{{ gpu_nodes.stdout_lines }}"
+          changed_when: true
+          when: true
+      environment:
+        KUBECONFIG: /etc/kubernetes/admin.conf
+        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
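
To sanity-check the rollout afterwards, a sketch assuming kubectl access on
master[0]; `<gpu-node>` is a placeholder for one of the labeled hostnames.
The operator's mig-manager reports per-node progress through the
nvidia.com/mig.config.state label (pending -> rebooting -> success, or
failed), and with mig.strategy=single every MIG slice is advertised as a
plain nvidia.com/gpu resource:

    # operator pods should settle into Running/Completed
    kubectl get pods --namespace gpu-operator

    # per-node MIG reconfiguration status, reported by mig-manager
    kubectl get nodes -L nvidia.com/mig.config -L nvidia.com/mig.config.state

    # with mig.strategy=single, the 1g.12gb slices show up as nvidia.com/gpu
    kubectl describe node <gpu-node> | grep nvidia.com/gpu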
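
A minimal consumer to confirm scheduling onto a slice, assuming the single
strategy configured above (pod name and CUDA image tag are illustrative):
under mig.strategy=single, a 1g.12gb slice is requested exactly like a full
GPU, via the nvidia.com/gpu resource:

    apiVersion: v1
    kind: Pod
    metadata:
      name: mig-smoke-test                # illustrative name
    spec:
      restartPolicy: Never
      containers:
        - name: cuda
          image: nvidia/cuda:12.2.0-base-ubuntu22.04  # any CUDA base image
          command: ["nvidia-smi", "-L"]   # should list exactly one MIG device
          resources:
            limits:
              nvidia.com/gpu: 1           # one 1g.12gb slice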