diff --git a/cesnet-central/playbooks/cvmfs.yaml b/cesnet-central/playbooks/cvmfs.yaml
index 74d53ee0971e71ab97f88bb63e5771aaa955642d..fceb710905f41b66c529794124dd072fd705b393 100644
--- a/cesnet-central/playbooks/cvmfs.yaml
+++ b/cesnet-central/playbooks/cvmfs.yaml
@@ -1,6 +1,6 @@
 ---
 - name: CVMFS deployment
-  hosts: ingress, nfs, worker
+  hosts: ingress, nfs, worker, gpu
   vars:
     # EGI repositories: gridpp.egi.eu eosc.egi.eu pheno.egi.eu mice.egi.eu ghost.egi.eu wenmr.egi.eu neugrid.egi.eu auger.egi.eu dirac.egi.eu galdyn.egi.eu seadatanet.egi.eu ligo.egi.eu supernemo.egi.eu pravda.egi.eu chipster.egi.eu hyperk.egi.eu snoplus.egi.eu km3net.egi.eu t2k.egi.eu na62.egi.eu biomed.egi.eu eiscat.egi.eu comet.egi.eu notebooks.egi.eu
     cvmfs_repositories:
diff --git a/cesnet-central/playbooks/k8s.yaml b/cesnet-central/playbooks/k8s.yaml
index 7d1b5c41d25b08c820f855d368d38528dd5127cf..4a541870a64ea717a3bfdeb2ef4cc9aaac24b368 100644
--- a/cesnet-central/playbooks/k8s.yaml
+++ b/cesnet-central/playbooks/k8s.yaml
@@ -120,6 +120,9 @@
       vars:
         # do not downgrade docker
         kube_docker_version: latest
+        # kube_nvidia_device_plugin_version: "v0.12.2"
+        # kube_nvidia_driver_version: "515" # "525"
+        kube_nvidia_support: true
         kube_version: 1.28.8
         kube_network: 'none' # custom network installation
         kube_install_helm: true
@@ -169,13 +172,17 @@
        mode: 0755
 
 - name: K8s nodes deployment
-  hosts: nfs, ingress, worker
+  hosts: nfs, ingress, worker, gpu
   become: true
   roles:
     - role: 'grycap.kubernetes'
       vars:
         # do not downgrade docker
         kube_docker_version: latest
+        # kube_nvidia_device_plugin_version: "v0.12.2"
+        # kube_nvidia_driver_version: "515" # "525"
+        # support only on worker nodes with GPU hardware
+        kube_nvidia_support: "{{ inventory_hostname in groups['gpu']}}"
         # must be IPv4 address or hostname
         kube_server: "{{ hostvars[groups['master'][0]].kube_server | default(groups['master'][0]) }}"
         kube_type_of_node: wn
diff --git a/cesnet-central/playbooks/templates/etc/squid/conf.d/allcluster.conf b/cesnet-central/playbooks/templates/etc/squid/conf.d/allcluster.conf
index 1b525205427754a671717d14c2abfe4602a06522..1449303a1455865631097618e15b39ead561090f 100644
--- a/cesnet-central/playbooks/templates/etc/squid/conf.d/allcluster.conf
+++ b/cesnet-central/playbooks/templates/etc/squid/conf.d/allcluster.conf
@@ -1,7 +1,7 @@
-{% for host in groups['ingress'] + groups['nfs'] + groups['worker'] -%}
+{% for host in groups['ingress'] + groups['nfs'] + groups['worker'] + groups['gpu'] -%}
 acl allcluster src {{ hostvars[host].ansible_default_ipv6.address }}
 {% endfor -%}
-{% for host in groups['ingress'] + groups['nfs'] + groups['worker'] -%}
+{% for host in groups['ingress'] + groups['nfs'] + groups['worker'] + groups['gpu'] -%}
 acl allcluster src {{ hostvars[host].ansible_default_ipv4.address }}
 {% endfor -%}
 http_access allow allcluster
diff --git a/demo/deploy.sh b/demo/deploy.sh
index b05f2f5cc64314d2bab7413e38281d1e733bdf65..d1f60bd52ec23e6a4754eb01183f74c42685e17e 100755
--- a/demo/deploy.sh
+++ b/demo/deploy.sh
@@ -39,10 +39,10 @@ ansible -m copy -a 'src=terraform/nfs-volume.sh dest=/root/ mode=preserve' nfs
 ansible -m command -a '/root/nfs-volume.sh' nfs
 ansible -m copy -a 'src=terraform/squid-volume.sh dest=/root/ mode=preserve' 'ingress[0]'
 ansible -m command -a '/root/squid-volume.sh' 'ingress[0]'
-ansible -m copy -a 'src=terraform/docker-volume.sh dest=/root/ mode=preserve' 'ingress nfs worker'
-ansible -m command -a '/root/docker-volume.sh' 'ingress nfs worker'
-ansible -m copy -a 'src=terraform/scratch-volume.sh dest=/root/ mode=preserve' 'ingress nfs worker'
-ansible -m command -a '/root/scratch-volume.sh' 'ingress nfs worker'
+ansible -m copy -a 'src=terraform/docker-volume.sh dest=/root/ mode=preserve' 'ingress nfs worker gpu'
+ansible -m command -a '/root/docker-volume.sh' 'ingress nfs worker gpu'
+ansible -m copy -a 'src=terraform/scratch-volume.sh dest=/root/ mode=preserve' 'ingress nfs worker gpu'
+ansible -m command -a '/root/scratch-volume.sh' 'ingress nfs worker gpu'
 
 # k8s + notebooks
 ansible-playbook playbooks/k8s.yaml
diff --git a/demo/inventory/99-all.yaml b/demo/inventory/99-all.yaml
index 3f8a0013dcf7b057c7a76f6579be83eb0e679554..b35f043a4afdb1841a930598822070e1ed26e829 100644
--- a/demo/inventory/99-all.yaml
+++ b/demo/inventory/99-all.yaml
@@ -5,6 +5,7 @@ allnodes:
     ingress:
     nfs:
     worker:
+    gpu:
 
 all:
   vars:
diff --git a/demo/terraform/terraform.tfvars b/demo/terraform/terraform.tfvars
index 473e95b6264b6669d7b2b719d2c8e8576850f27a..c2231f2131056b94d70aba43df2e155227c66db0 100644
--- a/demo/terraform/terraform.tfvars
+++ b/demo/terraform/terraform.tfvars
@@ -9,10 +9,14 @@
 master_cpus = 2 # 2 CPUs to match existing flavours
 master_ram = 4096
 worker_cpus = 4
 worker_ram = 8192
+gpu_flavor_name = "a3.32core-240ram-1t4"
 
 # Number of extra workers
 extra_workers = 1
 
+# Number of GPU workers
+gpu_workers = 1
+
 # volumes for docker
 docker_volumes_size = 384
diff --git a/demo/terraform/vars.tf b/demo/terraform/vars.tf
index 559190cd85c5f95973209ac373e784090c0a62e1..4b29665f3018fcd4d1a2bc51248204b88b4834e0 100644
--- a/demo/terraform/vars.tf
+++ b/demo/terraform/vars.tf
@@ -18,6 +18,11 @@ variable "site_name" {
   description = "Site identifier for internal host names"
 }
 
+variable "gpu_flavor_name" {
+  type = string
+  description = "Name of the GPU flavor"
+}
+
 variable "master_cpus" {
   type = number
   description = "Number of CPUs for the master"
@@ -43,6 +48,11 @@ variable "extra_workers" {
   description = "Number of extra workers to create"
 }
 
+variable "gpu_workers" {
+  type = number
+  description = "Number of GPU workers to create"
+}
+
 variable "docker_volumes_size" {
   type = number
   description = "Size of volumes for docker (GB)"
diff --git a/demo/terraform/vms.tf b/demo/terraform/vms.tf
index 32a250f389352380749b0e0089ddb4f2a94fa444..4a32e2fc8c3b097965a7d689951fe3c1672c762f 100644
--- a/demo/terraform/vms.tf
+++ b/demo/terraform/vms.tf
@@ -5,11 +5,12 @@ locals {
   nodes = concat([
     openstack_compute_instance_v2.ingress,
     openstack_compute_instance_v2.nfs,
-  ], openstack_compute_instance_v2.worker[*])
+  ], openstack_compute_instance_v2.worker[*], openstack_compute_instance_v2.gpu[*])
   master_ip = replace(openstack_compute_instance_v2.master.network[1].fixed_ip_v6, "/\\[(.*)\\]/", "$1")
   ingress_ip = replace(openstack_compute_instance_v2.ingress.network[1].fixed_ip_v6, "/\\[(.*)\\]/", "$1")
   nfs_ip = replace(openstack_compute_instance_v2.nfs.network[1].fixed_ip_v6, "/\\[(.*)\\]/", "$1")
   worker_ips = [for s in openstack_compute_instance_v2.worker[*].network[1].fixed_ip_v6 : replace(s, "/\\[(.*)\\]/", "$1")]
+  gpu_ips = [for s in openstack_compute_instance_v2.gpu[*].network[1].fixed_ip_v6 : replace(s, "/\\[(.*)\\]/", "$1")]
 }
 
 # Security groups
@@ -99,6 +100,10 @@ data "openstack_compute_flavor_v2" "worker-flavor" {
   ram = var.worker_ram
 }
 
+data "openstack_compute_flavor_v2" "gpu-flavor" {
+  name = var.gpu_flavor_name
+}
+
 resource "openstack_compute_instance_v2" "master" {
   name = "k8s-${var.site_name}-master"
   image_id = data.openstack_images_image_v2.ubuntu.id
@@ -160,6 +165,22 @@ resource "openstack_compute_instance_v2" "worker" {
"worker" { } } +resource "openstack_compute_instance_v2" "gpu" { + count = var.gpu_workers + name = "k8s-${var.site_name}-gpu-${count.index}" + image_id = data.openstack_images_image_v2.ubuntu.id + flavor_id = data.openstack_compute_flavor_v2.gpu-flavor.id + security_groups = ["default", openstack_compute_secgroup_v2.ping.name, openstack_compute_secgroup_v2.ssh.name] + user_data = file("cloud-init.yaml") + tags = ["worker"] + network { + name = var.net_name + } + network { + name = var.net6_name + } +} + resource "openstack_compute_floatingip_associate_v2" "fip" { floating_ip = openstack_networking_floatingip_v2.public_ip.address instance_id = openstack_compute_instance_v2.ingress.id @@ -193,13 +214,13 @@ EOT } resource "openstack_blockstorage_volume_v3" "docker-volume" { - count = var.extra_workers + 2 + count = var.extra_workers + var.gpu_workers + 2 name = format("docker-%s", local.nodes[count.index].name) size = var.docker_volumes_size } resource "openstack_compute_volume_attach_v2" "docker-volume-attach" { - count = var.extra_workers + 2 + count = var.extra_workers + var.gpu_workers + 2 instance_id = local.nodes[count.index].id volume_id = openstack_blockstorage_volume_v3.docker-volume[count.index].id } @@ -227,13 +248,13 @@ EOT } resource "openstack_blockstorage_volume_v3" "scratch-volume" { - count = var.extra_workers + 2 + count = var.extra_workers + var.gpu_workers + 2 name = format("scratch-%s", local.nodes[count.index].name) size = var.scratch_volumes_size } resource "openstack_compute_volume_attach_v2" "scratch-volume-attach" { - count = var.extra_workers + 2 + count = var.extra_workers + var.gpu_workers + 2 instance_id = local.nodes[count.index].id volume_id = openstack_blockstorage_volume_v3.scratch-volume[count.index].id } @@ -305,7 +326,11 @@ nfs: worker: hosts: - ${join(":\n ", local.worker_ips)}: + ${join("\n ", [for s in local.worker_ips: "${s}:"])} + +gpu: + hosts: + ${join("\n ", [for s in local.gpu_ips : "${s}:"])} # using public IP of kube_server for ansible delegate_to kube_server: @@ -330,6 +355,6 @@ resource "local_file" "hosts" { ${local.master_ip} ${local.ingress_ip} ${local.nfs_ip} -${join("\n", local.worker_ips)} +${join("\n", concat(local.worker_ips, local.gpu_ips))} EOT }