From 65c4f99692b7046a673686c0f6b5490ecca2bb87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jarom=C3=ADr=20Hradil?= <jaromir.hradil@cesnet.cz>
Date: Thu, 30 Jan 2025 19:41:21 +0100
Subject: [PATCH] Adding Nvidia gpu-operator installation

---
Review notes (this section is ignored by `git am`):
 * Line structure and indentation were reconstructed. The three context
   lines are assumed to sit at task level; verify them against the target
   file or apply with `git apply --ignore-space-change`.
 * The commit id and blob ids in the headers still refer to the originally
   submitted revision.
 * Helm release handling uses `helm upgrade --install`; the node query is a
   read-only task and no longer fails when no node name contains "gpu".

 common/playbooks/k8s.yaml | 83 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/common/playbooks/k8s.yaml b/common/playbooks/k8s.yaml
index 84d5961..f8c835d 100644
--- a/common/playbooks/k8s.yaml
+++ b/common/playbooks/k8s.yaml
@@ -559,3 +559,86 @@
         PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
       changed_when: true
       when: true
+# TODO: The following play can be removed once the gpu-operator installs the
+# drivers itself (in containers). That requires the latest kernels or a
+# workaround: provide an outdated kernel repo from which the operator can
+# fetch drivers, or install the drivers into the host kernel yourself.
+- name: Nvidia GPU driver deployment
+  hosts: gpu
+  become: true
+  tasks:
+    - name: Install ubuntu-drivers package
+      apt:
+        name: ubuntu-drivers-common
+        state: present
+        update_cache: true
+    # No shell features are needed, so the command module is sufficient.
+    - name: Install Nvidia driver packages
+      command: ubuntu-drivers install
+      changed_when: true
+    # NOTE(review): this reboots every GPU node on every run, even when no
+    # driver was installed - consider gating it on the install result.
+    - name: Reboot GPU node
+      reboot:
+        reboot_timeout: 600
+- name: Install Nvidia GPU operator and enable MIGs
+  hosts: master[0]
+  become: true
+  tasks:
+    # Every helm/kubectl call in the block shares this kubeconfig and PATH.
+    - name: Deploy Nvidia gpu-operator and label GPU nodes
+      environment:
+        KUBECONFIG: /etc/kubernetes/admin.conf
+        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
+      block:
+        # NOTE(review): ansible_local.helm_repos is not set in this play;
+        # presumably a custom local fact provided elsewhere - confirm.
+        - name: Configure Nvidia gpu-operator Helm repo
+          # set -e stops at a failed "repo add" instead of letting the exit
+          # status of "repo update" hide it.
+          shell: |-
+            set -e
+            helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
+            helm repo update
+          changed_when: true
+          when: >-
+            'nvidia' not in
+            (ansible_local.helm_repos | map(attribute='name') | list)
+        - name: Deploy/upgrade Nvidia gpu-operator instance
+          vars:
+            # If set to true the operator will install containerized drivers.
+            driver_enabled: false
+            mig_strategy: single
+          # "upgrade --install" installs the release when it is missing and
+          # upgrades it otherwise, which keeps this task re-runnable.
+          command: >-
+            helm upgrade --install --wait --create-namespace
+            --namespace gpu-operator gpu-operator nvidia/gpu-operator
+            --set driver.enabled={{ driver_enabled | lower }}
+            --set mig.strategy={{ mig_strategy }}
+          changed_when: true
+        # Read-only query, so it never reports a change.
+        - name: Get cluster node hostnames
+          command: >-
+            kubectl get nodes --no-headers
+            -o custom-columns=NAME:.metadata.name
+          register: cluster_nodes
+          changed_when: false
+        # GPU nodes are those whose name contains "gpu"; when none match the
+        # list is empty and the labelling loop below simply does nothing.
+        - name: Select GPU node hostnames
+          set_fact:
+            gpu_nodes: >-
+              {{ cluster_nodes.stdout_lines | select('search', 'gpu') | list }}
+        - name: Print GPU node hostnames
+          debug:
+            var: gpu_nodes
+        - name: Add required label to GPU nodes to create mig profiles
+          vars:
+            # NOTE(review): presumably has to match the GPU model - confirm.
+            mig_profile: all-1g.12gb
+          command: >-
+            kubectl label node {{ item }}
+            nvidia.com/mig.config={{ mig_profile }} --overwrite
+          loop: "{{ gpu_nodes }}"
+          changed_when: true
-- 
GitLab