---
# Cluster provisioning playbook: base OS setup + mail, NFS server,
# Kubernetes master/worker deployment (grycap.kubernetes role), Calico
# networking, and helm-based add-ons (provisioners, ingress, cert-manager,
# Prometheus, Grafana).
- name: Basic setup and NFS common
  hosts: allnodes
  become: true
  tasks:
    - name: Add SSH keys
      authorized_key:
        user: egi
        state: present
        key: '{{ item }}'
      with_file:
        - public_keys/andrea-manzi
        - public_keys/enolfc
        - public_keys/jhradil
        - public_keys/pailozian
        - public_keys/pospisilp
        - public_keys/sustr
        - public_keys/valtri

    - name: Install nfs-common
      apt:
        name: nfs-common
        update_cache: true

    - name: Site install packages
      package:
        name:
          - atop
          - cron-apt
          - fail2ban
          - git
          - mc
          - vim
          - postfix

    - name: Site remove packages
      package:
        name:
          - unattended-upgrades
        state: absent

    - name: Site cron-apt config
      copy:
        dest: /etc/cron-apt/config
        content: |
          MAILTO=valtri@civ.zcu.cz
          MAILON=upgrade
          RUNSLEEP=600
        mode: "0644"

    - name: Site cron-apt action
      copy:
        dest: /etc/cron-apt/action.d/9-upgrade
        content: -q -q dist-upgrade
        mode: "0644"

    - name: Mails settings
      vars:
        # Reverse-resolve the first 'fip' host and strip the trailing dot
        fip_hostname: "{{ lookup('dig', (groups['fip'][0], 'PTR') | join('/')) | regex_replace('\\.$', '') }}"
      block:
        - name: Global postfix settings
          set_fact:
            main:
              # disable everything except TLSv1.2
              smtpd_tls_mandatory_protocols: "!SSLv2, !SSLv3, !TLSv1, !TLSv1.1"
              smtpd_tls_protocols: "!SSLv2, !SSLv3, !TLSv1, !TLSv1.1"
              smtp_tls_mandatory_protocols: "!SSLv2, !SSLv3, !TLSv1, !TLSv1.1"
              smtp_tls_protocols: "!SSLv2, !SSLv3, !TLSv1, !TLSv1.1"

        - name: Site-specific postfix settings (CESNET)
          vars:
            main_cesnet:
              myhostname: "{{ fip_hostname }}"
              relayhost: relay.muni.cz
              inet_protocols: ipv4
          set_fact:
            main: '{{ main | combine(main_cesnet) }}'
          when: site_name == "cesnet-testing" or site_name == "cesnet-mcc"

        - name: Site-specific postfix settings - mail_fromdomain
          set_fact:
            main: '{{ main | combine({ "myhostname": mail_fromdomain }) }}'
          when: mail_fromdomain is defined

        - name: Site-specific postfix settings - default_transport
          set_fact:
            main: '{{ main | combine({
              "default_transport": "error: This server sends mail only locally."
              }) }}'
          when: mail_local | default(false) | bool

        - name: Setup postfix
          lineinfile:
            regexp: '^{{ item.key }}\s*=\s*.*'
            line: "{{ item.key }} = {{ item.value }}"
            path: /etc/postfix/main.cf
          loop: "{{ main | dict2items }}"
          notify: Reload postfix

        - name: Setup mailutils
          vars:
            fromdomain: "{{ mail_fromdomain | default(fip_hostname) }}"
          template:
            src: templates/etc/mailutils.conf
            dest: /etc/mailutils.conf
            mode: "0644"
          when: (site_name == "cesnet-testing" or site_name == "cesnet-mcc" or mail_fromdomain is defined) and not (mail_local | default(false))

    # Marker file identifying the site on every node
    - name: Site touch
      file:
        path: "/EOSC-{{ site_name | upper }}"
        state: touch
        mode: "0644"

  handlers:
    - name: Reload postfix
      service:
        name: postfix
        state: reloaded

- name: NFS server
  hosts: nfs
  become: true
  tasks:
    - name: Install nfs-server
      apt:
        name: nfs-kernel-server
        state: present
        update_cache: true

    - name: Create user for NFS
      user:
        name: volumes
        create_home: false
        uid: 5005

    - name: Create /exports dir
      file:
        path: /exports
        state: directory
        mode: "0755"
        owner: volumes

    - name: Create exports
      template:
        src: templates/etc/exports
        dest: /etc/exports
        mode: "0644"
      notify: Reload exports

    - name: Quota script
      copy:
        dest: /usr/local/bin/xfs-quotas.sh
        src: files/usr/local/bin/xfs-quotas.sh
        mode: "0755"
        owner: root
        group: root

    - name: Start NFS service
      service:
        name: nfs-server
        state: started

  handlers:
    - name: Reload exports
      command: exportfs -ra

- name: K8s master deployment
  hosts: master
  become: true
  roles:
    - role: 'grycap.kubernetes'
      vars:
        # do not downgrade docker
        kube_docker_version: latest
        # kube_nvidia_device_plugin_version: "v0.12.2"
        # kube_nvidia_driver_version: "515"  # "525"
        kube_nvidia_support: true
        kube_version: "1.31.0"
        kube_network: 'none'  # custom network installation
        kube_install_helm: true
        kube_install_helm_version: 'v3.15.4'
        kube_install_metrics: true
  tasks:
    - name: Create kubectl config dir
      file:
        path: "~{{ ansible_user }}/.kube"
        mode: "0750"
        owner: "{{ ansible_user }}"
        state: directory

    - name: Copy kubectl config to regular user
      copy:
        remote_src: true
        src: /etc/kubernetes/admin.conf
        dest: "~{{ ansible_user }}/.kube/config"
        mode: "0600"
        owner: "{{ ansible_user }}"

    - name: Site k8s cheat sheets
      copy:
        dest: /etc/profile.d/k8s-cheats.sh
        src: files/etc/profile.d/k8s-cheats.sh
        mode: preserve

# NOTE(review): this play has no 'become: true' yet writes /usr/local/sbin
# and uses the admin kubeconfig — presumably the remote user is root;
# confirm, or add 'become: true' for consistency with the other plays.
- name: K8s network deployment
  hosts: master
  vars:
    calicoctl_version: "3.28.1"
  tasks:
    - name: Calico config
      copy:
        # https://raw.githubusercontent.com/projectcalico/calico/v3.28.1/manifests/calico.yaml
        src: files/calico.yaml
        dest: /tmp/calico-net.yaml
        mode: "0644"

    - name: Calico installation
      command:
        cmd: kubectl apply -f /tmp/calico-net.yaml
        creates: /var/etcd/calico-data
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf

    - name: Download calicoctl
      get_url:
        url: https://github.com/projectcalico/calico/releases/download/v{{ calicoctl_version }}/calicoctl-linux-amd64
        dest: /usr/local/sbin/calicoctl
        mode: "0755"

- name: K8s nodes deployment
  hosts: nfs, ingress, worker, gpu
  become: true
  roles:
    - role: 'grycap.kubernetes'
      vars:
        # do not downgrade docker
        kube_docker_version: latest
        # kube_nvidia_device_plugin_version: "v0.12.2"
        # kube_nvidia_driver_version: "515"  # "525"
        # support only on worker nodes with GPU hardware
        kube_nvidia_support: "{{ inventory_hostname in groups['gpu'] }}"
        # must be IPv4 address or hostname
        kube_server: "{{ hostvars[groups['master'][0]].kube_server | default(groups['master'][0]) }}"
        kube_type_of_node: wn
        kube_version: "1.31.0"
        kubelet_extra_args: '--volume-stats-agg-period 0'
  tasks:
    - name: Overlay2 mountpoint workaround to docker.service unit
      lineinfile:
        path: /lib/systemd/system/docker.service
        firstmatch: true
        insertafter: '\[Service\]'
        line: 'ExecStopPost=mount /var/lib/docker/overlay2'
        regexp: '^\s*ExecStopPost\s*='

    - name: Local docker.service unit
      copy:
        src: /lib/systemd/system/docker.service
        dest: /etc/systemd/system/docker.service
        mode: "0644"
        remote_src: true
      notify:
        - Reload systemd daemon
        - Restart docker

  handlers:
    # best-effort: a failed daemon-reload must not abort the play
    - name: Reload systemd daemon
      command:
        cmd: systemctl daemon-reload
      ignore_errors: true

    - name: Restart docker
      service:
        name: docker
        state: restarted

- name: K8s customization
  hosts: master
  become: true
  tasks:
    - name: Wait for helm
      command: helm version
      register: result
      until: result.rc == 0
      retries: 20
      delay: 10
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
      when: true

    - name: Create custom fact directory
      file:
        path: "/etc/ansible/facts.d"
        mode: "0755"
        recurse: true
        state: "directory"

    - name: Create helm repos custom fact
      copy:
        src: files/etc/ansible/facts.d/helm_repos.fact
        dest: /etc/ansible/facts.d/helm_repos.fact
        mode: "0755"

    - name: Reload custom facts
      setup:
        filter: ansible_local

    - name: Helm repo add stable
      shell: |-
        helm repo add stable https://charts.helm.sh/stable/
        helm repo update
      when: "'stable' not in ansible_local.helm_repos | map(attribute='name') | list"

    - name: Helm repo add nfs-subdir-external-provisioner
      shell: |-
        helm repo add nfs-subdir-external-provisioner https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner
        helm repo update
      when: "'nfs-subdir-external-provisioner' not in ansible_local.helm_repos | map(attribute='name') | list"

    - name: NFS provisioner
      vars:
        config: >-
          --set nfs.server={{ groups['nfs'][0] }}
          --set storageClass.defaultClass=true
          --set nfs.path=/exports
      shell: |-
        helm status --namespace kube-system nfs-provisioner
        if [ $? -ne 0 ]; then
          helm install --namespace kube-system {{ config }} nfs-provisioner nfs-subdir-external-provisioner/nfs-subdir-external-provisioner
        else
          helm upgrade --namespace kube-system {{ config }} nfs-provisioner nfs-subdir-external-provisioner/nfs-subdir-external-provisioner
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true

    - name: Git clone local-path-provisioner
      git:
        repo: https://github.com/rancher/local-path-provisioner.git
        dest: "/root/git-local-path-provisioner"
        clone: true
        update: false
        version: v0.0.26

    - name: Local path provisioner configuration
      copy:
        dest: /tmp/local-path-provisioner.yaml
        mode: "0644"
        content: |
          storageClass:
            defaultClass: false
            defaultVolumeType: hostPath
            name: local-path
          nodePathMap:
            - node: DEFAULT_PATH_FOR_NON_LISTED_NODES
              paths:
                - /scratch

    - name: Local path provisioner deployment
      vars:
        config: >-
          --namespace local-path-storage
          -f /tmp/local-path-provisioner.yaml
          local-path-storage
          /root/git-local-path-provisioner/deploy/chart/local-path-provisioner/
      shell: |-
        helm status --namespace local-path-storage local-path-storage
        if [ $? -ne 0 ]; then
          kubectl create namespace local-path-storage || :
          helm install {{ config }}
        else
          helm upgrade {{ config }}
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true

    - name: Helm repo add ingress-nginx
      shell: |-
        helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
        helm repo update
      when: "'ingress-nginx' not in ansible_local.helm_repos | map(attribute='name') | list"

    - name: Ingress
      vars:
        version: "4.11.2"  # app 1.11.2
        config: >-
          --set controller.service.type=NodePort
          --set controller.service.externalIPs={{ '{' + hostvars[groups['ingress'][0]].ansible_default_ipv4.address + '}' }}
          --set controller.config.proxy-body-size=0
          --set controller.allowSnippetAnnotations=false
          --version={{ version }}
      shell: |-
        helm status --namespace kube-system cluster-ingress
        if [ $? -ne 0 ]; then
          helm install cluster-ingress --namespace kube-system {{ config }} ingress-nginx/ingress-nginx
        else
          helm upgrade --namespace kube-system {{ config }} cluster-ingress ingress-nginx/ingress-nginx
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true

    - name: Cert-manager
      vars:
        version: "1.15.3"
        config: >-
          --version={{ version }}
          --set ingressShim.defaultIssuerName=letsencrypt-prod
          --set ingressShim.defaultIssuerKind=ClusterIssuer
          --set ingressShim.defaultIssuerGroup=cert-manager.io
      shell: |-
        helm status --namespace cert-manager certs-man
        if [ $? -ne 0 ]; then
          kubectl create namespace cert-manager
          kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v{{ version }}/cert-manager.crds.yaml
          helm repo add jetstack https://charts.jetstack.io
          helm repo update
          helm install --namespace cert-manager {{ config }} certs-man jetstack/cert-manager
        else
          helm upgrade --namespace cert-manager {{ config }} certs-man jetstack/cert-manager
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true

    - name: Cluster issuer file
      copy:
        dest: /tmp/clusterissuer.yaml
        mode: "0644"
        content: |
          apiVersion: cert-manager.io/v1
          kind: ClusterIssuer
          metadata:
            name: letsencrypt-prod
          spec:
            acme:
              email: valtri@civ.zcu.cz
              server: https://acme-v02.api.letsencrypt.org/directory
              privateKeySecretRef:
                name: cluster-issuer-account-key
              # Add a single challenge solver, HTTP01 using nginx
              solvers:
                - http01:
                    ingress:
                      class: nginx

    - name: Cluster issuer
      command: kubectl apply -f /tmp/clusterissuer.yaml
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true

    # Accounting / monitoring needs
    - name: Helm repo add prometheus-community
      shell: |-
        helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
        helm repo update
      when: "'prometheus-community' not in ansible_local.helm_repos | map(attribute='name') | list"

    - name: Prometheus configuration
      vars:
        smtp_from: "noreply@{{ groups['ingress'][0] }}"
        limit_memory_warn: 80
        limit_cpu_warn: 80
        limit_disk_warn: 80
      copy:
        dest: /tmp/prometheus.yaml
        mode: "0600"
        content: |
          alertmanagerFiles:
            alertmanager.yml:
              global:
                smtp_from: "{{ smtp_from }}"
              receivers:
                - name: default-receiver
                  email_configs:
                    - send_resolved: true
                      to: valtri@civ.zcu.cz
                - name: 'null'
              route:
                group_by: ['job']
          kube-state-metrics:
            metricAnnotationsAllowList:
              - pods=[hub.jupyter.org/username,egi.eu/primary_group,egi.eu/flavor]
          serverFiles:
            alerting_rules.yml:
              groups:
                - name: limits
                  rules:
                    - alert: HighCpuLoad
                      expr: 100 * (1 - avg by(kubernetes_node) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > {{ limit_cpu_warn }}
                      for: 15m
                      labels:
                        job: "eosc-{{ site_name }}"
                      annotations:
                        summary: "Host high CPU load ({{ '{{ $labels.kubernetes_node }}' }})"
                        description: "CPU load {{ '{{ $value | printf \"%.2f\" }}' }}% (limit {{ limit_cpu_warn }}%)"
                    - alert: OutOfMemory
                      expr: 100 * (1 - avg by(kubernetes_node) (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > {{ limit_memory_warn }}
                      for: 20m
                      labels:
                        job: "eosc-{{ site_name }}"
                      annotations:
                        summary: "Host out of memory ({{ '{{ $labels.kubernetes_node }}' }})"
                        description: "Node memory {{ '{{ $value | printf \"%.2f\" }}' }}% (limit {{ limit_memory_warn }}%)"
                    - alert: OutOfDiskSpace
                      expr: 100 * (1 - avg by (kubernetes_node, mountpoint) (node_filesystem_avail_bytes{device=~"/dev/.*"} / node_filesystem_size_bytes)) > {{ limit_disk_warn }}
                      for: 20m
                      labels:
                        job: "eosc-{{ site_name }}"
                      annotations:
                        summary: "Host out of disk space ({{ '{{ $labels.kubernetes_node }}' }})"
                        description: "Disk is almost full {{ '{{ $value | printf \"%.2f\" }}' }}% (limit {{ limit_disk_warn }}%)"

    - name: Prometheus
      vars:
        version: "25.27.0"  # app v2.54.1
        config: >-
          --version={{ version }}
          -f /tmp/prometheus.yaml
      shell: |-
        helm status --namespace prometheus prometheus
        if [ $? -ne 0 ]; then
          kubectl create ns prometheus >/dev/null 2>&1 || true
          helm install --namespace prometheus {{ config }} prometheus prometheus-community/prometheus
        else
          helm upgrade --namespace prometheus {{ config }} prometheus prometheus-community/prometheus
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true

    - name: Grafana configuration
      copy:
        dest: /tmp/grafana.yaml
        mode: "0640"
        content: |
          ingress:
            enabled: true
            annotations:
              kubernetes.io/ingress.class: "nginx"
              kubernetes.io/tls-acme: "true"
            hosts:
              - "{{ grafana_hostname }}"
            tls:
              - hosts:
                  - "{{ grafana_hostname }}"
                secretName: acme-tls-grafana
          datasources:
            datasources.yaml:
              apiVersion: 1
              datasources:
                - name: Prometheus
                  type: prometheus
                  access: Server
                  orgId: 1
                  url: http://prometheus-server.prometheus.svc.cluster.local
                  isDefault: true
                  version: 1
                  editable: false
          sidecar:
            dashboards:
              enabled: true

    - name: Grafana
      vars:
        version: "8.5.8"  # app 11.2.2-security-01
        config: >-
          --version={{ version }}
          -f /tmp/grafana.yaml
      shell: |-
        helm status --namespace grafana grafana
        if [ $? -ne 0 ]; then
          kubectl create ns grafana
          helm repo add grafana https://grafana.github.io/helm-charts
          helm repo update
          helm install --namespace grafana {{ config }} grafana grafana/grafana
        else
          helm upgrade --namespace grafana {{ config }} grafana grafana/grafana
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true