Commit 2cb53341 authored by František Dvořák
GPU support in notebooks

* Automatic GPU node creation through Terraform
* Update the deploy script
* Update the playbooks
* Enable NVIDIA GPUs in the k8s cluster
parent 658c1937
---
- name: CVMFS deployment
-hosts: ingress, nfs, worker
+hosts: ingress, nfs, worker, gpu
vars:
# EGI repositories: gridpp.egi.eu eosc.egi.eu pheno.egi.eu mice.egi.eu ghost.egi.eu wenmr.egi.eu neugrid.egi.eu auger.egi.eu dirac.egi.eu galdyn.egi.eu seadatanet.egi.eu ligo.egi.eu supernemo.egi.eu pravda.egi.eu chipster.egi.eu hyperk.egi.eu snoplus.egi.eu km3net.egi.eu t2k.egi.eu na62.egi.eu biomed.egi.eu eiscat.egi.eu comet.egi.eu notebooks.egi.eu
cvmfs_repositories:
......
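Adding the gpu group to this play means the GPU nodes mount the same CVMFS repositories as the rest of the cluster. A quick check after the play has run (a sketch, assuming autofs-backed CVMFS mounts and one of the repositories listed above):

ansible -m command -a 'ls /cvmfs/notebooks.egi.eu' gpu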
@@ -120,6 +120,9 @@
vars:
# do not downgrade docker
kube_docker_version: latest
+# kube_nvidia_device_plugin_version: "v0.12.2"
+# kube_nvidia_driver_version: "515" # "525"
+kube_nvidia_support: true
kube_version: 1.28.8
kube_network: 'none' # custom network installation
kube_install_helm: true
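This play enables NVIDIA support unconditionally; the two commented variables are left as a handle for pinning the device plugin and driver versions should the grycap.kubernetes role defaults ever need overriding. Whether the plugin is running can be checked afterwards (a sketch, assuming the role deploys it as a DaemonSet in kube-system):

kubectl -n kube-system get daemonset | grep -i nvidia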
...
@@ -169,13 +172,17 @@
mode: 0755
- name: K8s nodes deployment
-hosts: nfs, ingress, worker
+hosts: nfs, ingress, worker, gpu
become: true
roles:
- role: 'grycap.kubernetes'
vars:
# do not downgrade docker
kube_docker_version: latest
+# kube_nvidia_device_plugin_version: "v0.12.2"
+# kube_nvidia_driver_version: "515" # "525"
+# support only on worker nodes with GPU hardware
+kube_nvidia_support: "{{ inventory_hostname in groups['gpu']}}"
# must be IPv4 address or hostname
kube_server: "{{ hostvars[groups['master'][0]].kube_server | default(groups['master'][0]) }}"
kube_type_of_node: wn
......
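In contrast to the play above, kube_nvidia_support is computed per host here: the Jinja expression is true only for members of the gpu inventory group, so the NVIDIA drivers end up only on nodes that actually have GPU hardware. Once such a node has joined the cluster, the GPU should appear as an allocatable resource (a sketch; the node name follows the k8s-<site>-gpu-<n> pattern from the Terraform configuration below):

kubectl describe node k8s-<site>-gpu-0 | grep nvidia.com/gpu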
-{% for host in groups['ingress'] + groups['nfs'] + groups['worker'] -%}
+{% for host in groups['ingress'] + groups['nfs'] + groups['worker'] + groups['gpu'] -%}
acl allcluster src {{ hostvars[host].ansible_default_ipv6.address }}
{% endfor -%}
-{% for host in groups['ingress'] + groups['nfs'] + groups['worker'] -%}
+{% for host in groups['ingress'] + groups['nfs'] + groups['worker'] + groups['gpu'] -%}
acl allcluster src {{ hostvars[host].ansible_default_ipv4.address }}
{% endfor -%}
http_access allow allcluster
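Both template loops, one per address family, now include the GPU nodes in Squid's allcluster ACL. With a single GPU node the rendered fragment would look roughly like this (a sketch with documentation addresses):

acl allcluster src 2001:db8::20
acl allcluster src 192.0.2.20
http_access allow allcluster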
...
@@ -39,10 +39,10 @@ ansible -m copy -a 'src=terraform/nfs-volume.sh dest=/root/ mode=preserve' nfs
ansible -m command -a '/root/nfs-volume.sh' nfs
ansible -m copy -a 'src=terraform/squid-volume.sh dest=/root/ mode=preserve' 'ingress[0]'
ansible -m command -a '/root/squid-volume.sh' 'ingress[0]'
-ansible -m copy -a 'src=terraform/docker-volume.sh dest=/root/ mode=preserve' 'ingress nfs worker'
-ansible -m command -a '/root/docker-volume.sh' 'ingress nfs worker'
-ansible -m copy -a 'src=terraform/scratch-volume.sh dest=/root/ mode=preserve' 'ingress nfs worker'
-ansible -m command -a '/root/scratch-volume.sh' 'ingress nfs worker'
+ansible -m copy -a 'src=terraform/docker-volume.sh dest=/root/ mode=preserve' 'ingress nfs worker gpu'
+ansible -m command -a '/root/docker-volume.sh' 'ingress nfs worker gpu'
+ansible -m copy -a 'src=terraform/scratch-volume.sh dest=/root/ mode=preserve' 'ingress nfs worker gpu'
+ansible -m command -a '/root/scratch-volume.sh' 'ingress nfs worker gpu'
# k8s + notebooks
ansible-playbook playbooks/k8s.yaml
......
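The docker and scratch volumes are now prepared on the GPU nodes as well; the quoted string is an Ansible host pattern covering all four groups. The result can be verified the same way (a sketch, assuming docker-volume.sh mounts its volume under /var/lib/docker):

ansible -m command -a 'df -h /var/lib/docker' gpu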
@@ -5,6 +5,7 @@ allnodes:
ingress:
nfs:
worker:
+gpu:
all:
vars:
......
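The static part of the inventory only declares an empty gpu group, presumably as a sibling of worker under allnodes, so that group-wide plays such as the CVMFS one keep covering every machine; the actual members come from the Terraform-generated inventory shown further below.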
@@ -9,10 +9,14 @@ master_cpus = 2 # 2 CPUs to match existing flavours
master_ram = 4096
worker_cpus = 4
worker_ram = 8192
+gpu_flavor_name = "a3.32core-240ram-1t4"
# Number of extra workers
extra_workers = 1
+# Number of GPU workers
+gpu_workers = 1
# volumes for docker
docker_volumes_size = 384
......
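gpu_flavor_name picks the OpenStack flavor for the GPU nodes; the name a3.32core-240ram-1t4 suggests 32 cores, 240 GB of RAM and a single NVIDIA T4, though the exact specification is site-defined and worth confirming (a sketch using the standard OpenStack client):

openstack flavor show a3.32core-240ram-1t4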
@@ -18,6 +18,11 @@ variable "site_name" {
description = "Site identifier for internal host names"
}
variable "gpu_flavor_name" {
type = string
description = "Name of the GPU flavor"
}
variable "master_cpus" {
type = number
description = "Number of CPUs for the master"
...
@@ -43,6 +48,11 @@ variable "extra_workers" {
description = "Number of extra workers to create"
}
variable "gpu_workers" {
type = number
description = "Number of GPU workers to create"
}
variable "docker_volumes_size" {
type = number
description = "Size of volumes for docker (GB)"
......
@@ -5,11 +5,12 @@ locals {
nodes = concat([
openstack_compute_instance_v2.ingress,
openstack_compute_instance_v2.nfs,
-], openstack_compute_instance_v2.worker[*])
+], openstack_compute_instance_v2.worker[*], openstack_compute_instance_v2.gpu[*])
master_ip = replace(openstack_compute_instance_v2.master.network[1].fixed_ip_v6, "/\\[(.*)\\]/", "$1")
ingress_ip = replace(openstack_compute_instance_v2.ingress.network[1].fixed_ip_v6, "/\\[(.*)\\]/", "$1")
nfs_ip = replace(openstack_compute_instance_v2.nfs.network[1].fixed_ip_v6, "/\\[(.*)\\]/", "$1")
worker_ips = [for s in openstack_compute_instance_v2.worker[*].network[1].fixed_ip_v6 : replace(s, "/\\[(.*)\\]/", "$1")]
+gpu_ips = [for s in openstack_compute_instance_v2.gpu[*].network[1].fixed_ip_v6 : replace(s, "/\\[(.*)\\]/", "$1")]
}
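The replace() calls strip the square brackets that OpenStack wraps around IPv6 addresses: the /.../-delimited pattern makes Terraform treat it as a regular expression, and $1 substitutes the captured address. For example, in terraform console:

> replace("[2001:db8::10]", "/\\[(.*)\\]/", "$1")
"2001:db8::10"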
# Security groups
...
@@ -99,6 +100,10 @@ data "openstack_compute_flavor_v2" "worker-flavor" {
ram = var.worker_ram
}
data "openstack_compute_flavor_v2" "gpu-flavor" {
name = var.gpu_flavor_name
}
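Unlike the worker flavor above, which is looked up by CPU and RAM counts, the GPU flavor has to be matched by its exact name, since GPU capability is not expressible through those two attributes.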
resource "openstack_compute_instance_v2" "master" {
name = "k8s-${var.site_name}-master"
image_id = data.openstack_images_image_v2.ubuntu.id
...
@@ -160,6 +165,22 @@ resource "openstack_compute_instance_v2" "worker" {
}
}
resource "openstack_compute_instance_v2" "gpu" {
count = var.gpu_workers
name = "k8s-${var.site_name}-gpu-${count.index}"
image_id = data.openstack_images_image_v2.ubuntu.id
flavor_id = data.openstack_compute_flavor_v2.gpu-flavor.id
security_groups = ["default", openstack_compute_secgroup_v2.ping.name, openstack_compute_secgroup_v2.ssh.name]
user_data = file("cloud-init.yaml")
tags = ["worker"]
network {
name = var.net_name
}
network {
name = var.net6_name
}
}
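The GPU instances reuse the worker cloud-init and carry the same worker tag, differing only in flavor and in the k8s-${var.site_name}-gpu-<index> naming; with gpu_workers = 1 from the variables above this creates a single instance named k8s-<site>-gpu-0.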
resource "openstack_compute_floatingip_associate_v2" "fip" {
floating_ip = openstack_networking_floatingip_v2.public_ip.address
instance_id = openstack_compute_instance_v2.ingress.id
...
@@ -193,13 +214,13 @@ EOT
}
resource "openstack_blockstorage_volume_v3" "docker-volume" {
-count = var.extra_workers + 2
+count = var.extra_workers + var.gpu_workers + 2
name = format("docker-%s", local.nodes[count.index].name)
size = var.docker_volumes_size
}
resource "openstack_compute_volume_attach_v2" "docker-volume-attach" {
-count = var.extra_workers + 2
+count = var.extra_workers + var.gpu_workers + 2
instance_id = local.nodes[count.index].id
volume_id = openstack_blockstorage_volume_v3.docker-volume[count.index].id
}
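The constant 2 in these counts accounts for the ingress and NFS nodes, and local.nodes is ordered to match: ingress, nfs, then the workers and GPU nodes. With extra_workers = 1 and gpu_workers = 1 that makes 2 + 1 + 1 = 4 docker volumes, each named after the node it attaches to; the scratch volumes below follow the same arithmetic.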
...
@@ -227,13 +248,13 @@ EOT
}
resource "openstack_blockstorage_volume_v3" "scratch-volume" {
-count = var.extra_workers + 2
+count = var.extra_workers + var.gpu_workers + 2
name = format("scratch-%s", local.nodes[count.index].name)
size = var.scratch_volumes_size
}
resource "openstack_compute_volume_attach_v2" "scratch-volume-attach" {
-count = var.extra_workers + 2
+count = var.extra_workers + var.gpu_workers + 2
instance_id = local.nodes[count.index].id
volume_id = openstack_blockstorage_volume_v3.scratch-volume[count.index].id
}
...
@@ -305,7 +326,11 @@ nfs:
worker:
hosts:
${join(":\n ", local.worker_ips)}:
${join("\n ", [for s in local.worker_ips: "${s}:"])}
gpu:
hosts:
${join("\n ", [for s in local.gpu_ips : "${s}:"])}
# using public IP of kube_server for ansible delegate_to
kube_server:
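The generated inventory now emits a gpu group next to worker, and the worker list is rewritten to the same for-expression style for consistency (both forms render one "address:" line per host). With one worker and one GPU node the output would look roughly like this (a sketch with made-up addresses):

worker:
  hosts:
    2001:db8::15:
gpu:
  hosts:
    2001:db8::20: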
...
@@ -330,6 +355,6 @@ resource "local_file" "hosts" {
${local.master_ip}
${local.ingress_ip}
${local.nfs_ip}
${join("\n", local.worker_ips)}
${join("\n", concat(local.worker_ips, local.gpu_ips))}
EOT
}