---
- name: Basic setup and NFS common
  hosts: allnodes
  become: true
  tasks:
    - name: Add SSH keys
      authorized_key:
        user: egi
        state: present
        key: '{{ item }}'
      with_file:
        - public_keys/andrea-manzi
        - public_keys/enolfc
        - public_keys/jhradil
        - public_keys/pospisilp
        - public_keys/sustr
        - public_keys/valtri
    - name: Install nfs-common
      apt:
        name: nfs-common
        update_cache: true
    - name: Site install packages
      package:
        name:
          - atop
          - cron-apt
          - fail2ban
          - git
          - mc
          - vim
          - postfix
    - name: Site remove packages
      package:
        name:
          - unattended-upgrades
        state: absent
    - name: Site cron-apt config
      copy:
        dest: /etc/cron-apt/config
        content: |
          MAILTO=valtri@civ.zcu.cz
          MAILON=upgrade
          RUNSLEEP=600
        mode: 0644
    - name: Site cron-apt action
      copy:
        dest: /etc/cron-apt/action.d/9-upgrade
        content: -q -q dist-upgrade
        mode: 0644
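    # Outgoing mail setup: the hostname is resolved from the PTR record of the first 'fip' group member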
    - name: Mails settings
      set_fact:
        fip_hostname: "{{ lookup('dig', (groups['fip'][0], 'PTR') | join('/')) | regex_replace('\\.$', '') }}"
    - name: Global postfix settings
      set_fact:
        main:
          # disable everything except TLSv1.2 and TLSv1.3
          smtpd_tls_mandatory_protocols: "!SSLv2, !SSLv3, !TLSv1, !TLSv1.1"
          smtpd_tls_protocols: "!SSLv2, !SSLv3, !TLSv1, !TLSv1.1"
          smtp_tls_mandatory_protocols: "!SSLv2, !SSLv3, !TLSv1, !TLSv1.1"
          smtp_tls_protocols: "!SSLv2, !SSLv3, !TLSv1, !TLSv1.1"
    - name: Site-specific postfix settings (CESNET)
      vars:
        main_cesnet:
          myhostname: "{{ fip_hostname }}"
          relayhost: relay.muni.cz
          inet_protocols: ipv4
      set_fact:
        main: '{{ main | combine(main_cesnet) }}'
      when: site_name == "cesnet-testing" or site_name == "cesnet-mcc"
    - name: Site-specific postfix settings - mail_fromdomain
      set_fact:
        main: '{{ main | combine({ "myhostname": mail_fromdomain }) }}'
      when: mail_fromdomain is defined
    - name: Site-specific postfix settings - default_transport
      set_fact:
        main: '{{ main | combine({ "default_transport": "error: This server sends mail only locally." }) }}'
      when: mail_local | default(false) | bool
    - name: Setup postfix
      lineinfile:
        regexp: '^{{ item.key }}\s*=\s*.*'
        line: "{{ item.key }} = {{ item.value }}"
        path: /etc/postfix/main.cf
      loop: "{{ main | dict2items }}"
      notify: Reload postfix
    - name: Setup mailutils
      vars:
        fromdomain: "{{ mail_fromdomain | default(fip_hostname) }}"
      template:
        src: templates/etc/mailutils.conf
        dest: /etc/mailutils.conf
        mode: 0644
      when: (site_name == "cesnet-testing" or site_name == "cesnet-mcc" or mail_fromdomain is defined) and not (mail_local | default(false))
    - name: Site touch
      file:
        path: "/EOSC-{{ site_name | upper }}"
        state: touch
        mode: 0644
  handlers:
    - name: Reload postfix
      service:
        name: postfix
        state: reloaded
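# Standalone NFS server; /exports is consumed by the NFS provisioner deployed later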
- name: NFS server
  hosts: nfs
  become: true
  tasks:
    - name: Install nfs-server
      apt:
        name: nfs-kernel-server
        state: present
        update_cache: true
    - name: Create user for NFS
      user:
        name: volumes
        create_home: false
        uid: 5005
    - name: Create /exports dir
      file:
        path: /exports
        state: directory
        mode: 0755
        owner: volumes
    - name: Create exports
      template:
        src: templates/etc/exports
        dest: /etc/exports
        mode: 0644
      notify: Reload exports
    - name: Quota script
      copy:
        dest: /usr/local/bin/xfs-quotas.sh
        src: files/usr/local/bin/xfs-quotas.sh
        mode: 0755
        owner: root
        group: root
    - name: Start NFS service
      service:
        name: nfs-server
        state: started
  handlers:
    - name: Reload exports
      command: exportfs -ra
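# Kubernetes control plane, deployed with the grycap.kubernetes role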
- name: K8s master deployment
  hosts: master
  become: true
  roles:
    - role: 'grycap.kubernetes'
      vars:
        # do not downgrade docker
        kube_docker_version: latest
        # kube_nvidia_device_plugin_version: "v0.12.2"
        # kube_nvidia_driver_version: "515" # "525"
        kube_nvidia_support: true
        kube_network: 'none' # custom network installation
        kube_install_helm: true
        kube_install_helm_version: 'v3.15.4'
        kube_install_metrics: true
  tasks:
    - name: Create kubectl config dir
      file:
        path: "~{{ ansible_user }}/.kube"
        mode: 0750
        owner: "{{ ansible_user }}"
        state: directory
    - name: Copy kubectl config to regular user
      copy:
        remote_src: true
        src: /etc/kubernetes/admin.conf
        dest: "~{{ ansible_user }}/.kube/config"
        mode: 0600
        owner: "{{ ansible_user }}"
    - name: Site k8s cheat sheets
      copy:
        dest: /etc/profile.d/k8s-cheats.sh
        src: files/etc/profile.d/k8s-cheats.sh
        mode: preserve
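# Calico CNI: kube_network is 'none' above, so the network plugin is applied here by hand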
- name: K8s network deployment
  hosts: master
  become: true
  tasks:
    - name: Calico network configuration
      copy:
        # https://raw.githubusercontent.com/projectcalico/calico/v3.28.1/manifests/calico.yaml
        src: files/calico.yaml
        dest: /tmp/calico-net.yaml
        mode: 0644
    - name: Calico installation
      command:
        cmd: kubectl apply -f /tmp/calico-net.yaml
        creates: /var/etcd/calico-data
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
    - name: Download calicoctl
      get_url:
        url: https://github.com/projectcalico/calico/releases/download/v{{ calicoctl_version }}/calicoctl-linux-amd64
        dest: /usr/local/sbin/calicoctl
        mode: 0755
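# Worker nodes join the existing control plane through the same grycap.kubernetes role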
- name: K8s nodes deployment
  become: true
  roles:
    - role: 'grycap.kubernetes'
      vars:
        # do not downgrade docker
        kube_docker_version: latest
        # kube_nvidia_device_plugin_version: "v0.12.2"
        # kube_nvidia_driver_version: "515" # "525"
        # support only on worker nodes with GPU hardware
        kube_nvidia_support: "{{ inventory_hostname in groups['gpu'] }}"
        # must be IPv4 address or hostname
        kube_server: "{{ hostvars[groups['master'][0]].kube_server | default(groups['master'][0]) }}"
  tasks:
    - name: Overlay2 mountpoint workaround to docker.service unit
      lineinfile:
        path: /lib/systemd/system/docker.service
        firstmatch: true
        insertafter: '\[Service\]'
        line: 'ExecStopPost=mount /var/lib/docker/overlay2'
        regexp: '^\s*ExecStopPost\s*='
    - name: Local docker.service unit
      copy:
        src: /lib/systemd/system/docker.service
        dest: /etc/systemd/system/docker.service
        mode: 0644
        remote_src: true
      notify:
        - Reload systemd daemon
        - Restart docker
  handlers:
    - name: Reload systemd daemon
      command:
        cmd: systemctl daemon-reload
      ignore_errors: true
    - name: Restart docker
      service:
        name: docker
        state: restarted
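# Cluster services installed from the master with helm: storage provisioners, ingress, cert-manager, monitoring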
- name: K8s cluster services
  hosts: master
  become: true
  tasks:
    - name: Wait for helm
      command: helm version
      register: result
      until: result.rc == 0
      retries: 20
      delay: 10
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
      when: true
    - name: Create custom fact directory
      file:
        path: "/etc/ansible/facts.d"
        mode: 0755
        recurse: true
        state: "directory"
    - name: Create helm repos custom fact
      copy:
        src: files/etc/ansible/facts.d/helm_repos.fact
        dest: /etc/ansible/facts.d/helm_repos.fact
        mode: 0755
    - name: Reload custom facts
      setup:
        filter: ansible_local
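    # helm_repos.fact reports the repos already added, keeping the "helm repo add" tasks idempotent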
    - name: Helm repo add stable
      shell: |-
        helm repo add stable https://charts.helm.sh/stable/
        helm repo update
      when: "'stable' not in ansible_local.helm_repos | map(attribute='name') | list"
    - name: Helm repo add nfs-subdir-external-provisioner
      shell: |-
        helm repo add nfs-subdir-external-provisioner https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner
        helm repo update
      when: "'nfs-subdir-external-provisioner' not in ansible_local.helm_repos | map(attribute='name') | list"
    - name: NFS provisioner
      vars:
        config: >-
          --set nfs.server={{ groups['nfs'][0] }}
          --set storageClass.defaultClass=true
          --set nfs.path=/exports
      shell: |-
        helm status --namespace kube-system nfs-provisioner
        if [ $? -ne 0 ]; then
          helm install --namespace kube-system {{ config }} nfs-provisioner nfs-subdir-external-provisioner/nfs-subdir-external-provisioner
        else
          helm upgrade --namespace kube-system {{ config }} nfs-provisioner nfs-subdir-external-provisioner/nfs-subdir-external-provisioner
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true
    - name: Git clone local-path-provisioner
      git:
        repo: https://github.com/rancher/local-path-provisioner.git
        dest: "/root/git-local-path-provisioner"
        clone: yes
        update: no
        version: v0.0.26
    - name: Local path provisioner configuration
      copy:
        dest: /tmp/local-path-provisioner.yaml
        mode: 0644
        content: |
          storageClass:
            defaultClass: false
            defaultVolumeType: hostPath
            name: local-path
          nodePathMap:
            - node: DEFAULT_PATH_FOR_NON_LISTED_NODES
              paths:
                - /scratch
    - name: Local path provisioner deployment
      vars:
        config: >-
          --namespace local-path-storage
          -f /tmp/local-path-provisioner.yaml
          local-path-storage
          /root/git-local-path-provisioner/deploy/chart/local-path-provisioner/
      shell: |-
        helm status --namespace local-path-storage local-path-storage
        if [ $? -ne 0 ]; then
          kubectl create namespace local-path-storage || :
          helm install {{ config }}
        else
          helm upgrade {{ config }}
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true
    - name: Helm repo add ingress-nginx
      shell: |-
        helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
        helm repo update
      when: "'ingress-nginx' not in ansible_local.helm_repos | map(attribute='name') | list"
    - name: Ingress
      vars:
        config: >-
          --set controller.service.type=NodePort
          --set controller.service.externalIPs={{ '{' + hostvars[groups['ingress'][0]].ansible_default_ipv4.address + '}' }}
          --set controller.config.proxy-body-size=0
          --set controller.allowSnippetAnnotations=false
      shell: |-
        helm status --namespace kube-system cluster-ingress
        if [ $? -ne 0 ]; then
          helm install cluster-ingress --namespace kube-system {{ config }} ingress-nginx/ingress-nginx
        else
          helm upgrade --namespace kube-system {{ config }} cluster-ingress ingress-nginx/ingress-nginx
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true
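    # TLS certificates via cert-manager and the letsencrypt-prod ClusterIssuer defined below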
    - name: Cert-manager
      vars:
        config: >-
          --version={{ version }}
          --set ingressShim.defaultIssuerName=letsencrypt-prod
          --set ingressShim.defaultIssuerKind=ClusterIssuer
          --set ingressShim.defaultIssuerGroup=cert-manager.io
      shell: |-
        helm status --namespace cert-manager certs-man
        if [ $? -ne 0 ]; then
          kubectl create namespace cert-manager
          kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v{{ version }}/cert-manager.crds.yaml
          helm repo add jetstack https://charts.jetstack.io
          helm repo update
          helm install --namespace cert-manager {{ config }} certs-man jetstack/cert-manager
        else
          helm upgrade --namespace cert-manager {{ config }} certs-man jetstack/cert-manager
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true
    - name: Cluster issuer file
      copy:
        dest: /tmp/clusterissuer.yaml
        mode: 0644
        content: |
          apiVersion: cert-manager.io/v1
          kind: ClusterIssuer
          metadata:
            name: letsencrypt-prod
          spec:
            acme:
              email: valtri@civ.zcu.cz
              server: https://acme-v02.api.letsencrypt.org/directory
              privateKeySecretRef:
                name: cluster-issuer-account-key
              # Add a single challenge solver, HTTP01 using nginx
              solvers:
                - http01:
                    ingress:
                      class: nginx
    - name: Cluster issuer
      command: kubectl apply -f /tmp/clusterissuer.yaml
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true
    # Accounting / monitoring needs
    - name: Helm repo add prometheus-community
      shell: |-
        helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
        helm repo update
      when: "'prometheus-community' not in ansible_local.helm_repos | map(attribute='name') | list"
    - name: Prometheus configuration
      vars:
        smtp_from: "noreply@{{ groups['ingress'][0] }}"
        limit_memory_warn: 80
        limit_cpu_warn: 80
        limit_disk_warn: 80
      copy:
        dest: /tmp/prometheus.yaml
        mode: 0600
        content: |
          alertmanagerFiles:
            alertmanager.yml:
              global:
                smtp_from: "{{ smtp_from }}"
              receivers:
                - name: default-receiver
                  email_configs:
                    - send_resolved: true
                      to: valtri@civ.zcu.cz
                - name: 'null'
              route:
                group_by: ['job']
          kube-state-metrics:
            metricAnnotationsAllowList:
              - pods=[hub.jupyter.org/username,egi.eu/primary_group,egi.eu/flavor]
          serverFiles:
            alerting_rules.yml:
              groups:
                - name: limits
                  rules:
                    - alert: HighCpuLoad
                      expr: 100 * (1 - avg by(kubernetes_node) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > {{ limit_cpu_warn }}
                      for: 15m
                      labels:
                        job: "eosc-{{ site_name }}"
                      annotations:
                        summary: "Host high CPU load ({{ '{{ $labels.kubernetes_node }}' }})"
                        description: "CPU load {{ '{{ $value | printf \"%.2f\" }}' }}% (limit {{ limit_cpu_warn }}%)"
                    - alert: OutOfMemory
                      expr: 100 * (1 - avg by(kubernetes_node) (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > {{ limit_memory_warn }}
                      for: 20m
                      labels:
                        job: "eosc-{{ site_name }}"
                      annotations:
                        summary: "Host out of memory ({{ '{{ $labels.kubernetes_node }}' }})"
                        description: "Node memory {{ '{{ $value | printf \"%.2f\" }}' }}% (limit {{ limit_memory_warn }}%)"
                    - alert: OutOfDiskSpace
                      expr: >-
                        100 * (1 - avg by (kubernetes_node, mountpoint) (node_filesystem_avail_bytes{device=~"/dev/.*"} / node_filesystem_size_bytes))
                        > {{ limit_disk_warn }}
                      for: 20m
                      labels:
                        job: "eosc-{{ site_name }}"
                      annotations:
                        summary: "Host out of disk space ({{ '{{ $labels.kubernetes_node }}' }})"
                        description: "Disk is almost full {{ '{{ $value | printf \"%.2f\" }}' }}% (limit {{ limit_disk_warn }}%)"
    - name: Prometheus
      vars:
        config: >-
          -f /tmp/prometheus.yaml
      shell: |-
        helm status --namespace prometheus prometheus
        if [ $? -ne 0 ]; then
          kubectl create ns prometheus >/dev/null 2>&1 || true
          helm install --namespace prometheus {{ config }} prometheus prometheus-community/prometheus
        else
          helm upgrade --namespace prometheus {{ config }} prometheus prometheus-community/prometheus
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true
    - name: Grafana configuration
      copy:
        dest: /tmp/grafana.yaml
        mode: 0640
        content: |
          ingress:
            enabled: true
            annotations:
              kubernetes.io/ingress.class: "nginx"
              kubernetes.io/tls-acme: "true"
            hosts:
              - "{{ grafana_hostname }}"
            tls:
              - hosts:
                  - "{{ grafana_hostname }}"
                secretName: acme-tls-grafana
          datasources:
            datasources.yaml:
              apiVersion: 1
              datasources:
                - name: Prometheus
                  type: prometheus
                  access: Server
                  orgId: 1
                  url: http://prometheus-server.prometheus.svc.cluster.local
                  isDefault: true
                  version: 1
                  editable: false
          sidecar:
            dashboards:
              enabled: true
    - name: Grafana
      vars:
        version: 8.5.8 # app 11.2.2-security-01
        config: >-
          --version={{ version }}
          -f /tmp/grafana.yaml
      shell: |-
        helm status --namespace grafana grafana
        if [ $? -ne 0 ]; then
          kubectl create ns grafana
          helm repo add grafana https://grafana.github.io/helm-charts
          helm repo update
          helm install --namespace grafana {{ config }} grafana grafana/grafana
        else
          helm upgrade --namespace grafana {{ config }} grafana grafana/grafana
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true