# k8s.yaml
---
# Baseline for every node: admin SSH keys, NFS client, site packages,
# cron-apt upgrades, postfix/mailutils configuration and a site marker file.
- name: Basic setup and NFS common
  hosts: allnodes
  become: true
  tasks:
    - name: Add SSH keys
      authorized_key:
        user: egi
        state: present
        key: '{{ item }}'
      # the content of each listed file is installed as an authorized key
      with_file:
        - public_keys/andrea-manzi
        - public_keys/enolfc
        - public_keys/jhradil
        - public_keys/pailozian
        - public_keys/pospisilp
        - public_keys/sustr
        - public_keys/valtri
    - name: Install nfs-common
      apt:
        name: nfs-common
        update_cache: true
    - name: Site install packages
      package:
        name:
          - atop
          - cron-apt
          - fail2ban
          - mc
          - vim
          - postfix
    - name: Site remove packages
      package:
        name:
          - unattended-upgrades
        state: absent
    - name: Site cron-apt config
      copy:
        dest: /etc/cron-apt/config
        content: |
          MAILTO=valtri@civ.zcu.cz
          MAILON=upgrade
          RUNSLEEP=600
        mode: "0644"
    - name: Site cron-apt action
      copy:
        dest: /etc/cron-apt/action.d/9-upgrade
        content: -q -q dist-upgrade
        mode: "0644"
    # NOTE(review): this section arrived structurally truncated. The wrapper
    # task (its name, `vars:` and `block:`), the `set_fact:` key of the first
    # inner task and the `main_cesnet:` key were reconstructed from the
    # indentation and from the `combine(main_cesnet)` reference below.
    # Confirm against the upstream file, in particular the wrapper's name.
    - name: Site postfix
      vars:
        # PTR lookup for the first host of the `fip` group, trailing dot removed
        fip_hostname: "{{ lookup('dig', (groups['fip'][0], 'PTR') | join('/')) | regex_replace('\\.$', '') }}"
      block:
        # `main` accumulates main.cf settings; later tasks merge into it
        - name: Global postfix settings
          set_fact:
            main:
              # disable everything except TLSv1.2
              smtpd_tls_mandatory_protocols: "!SSLv2, !SSLv3, !TLSv1, !TLSv1.1"
              smtpd_tls_protocols: "!SSLv2, !SSLv3, !TLSv1, !TLSv1.1"
              smtp_tls_mandatory_protocols: "!SSLv2, !SSLv3, !TLSv1, !TLSv1.1"
              smtp_tls_protocols: "!SSLv2, !SSLv3, !TLSv1, !TLSv1.1"
        - name: Site-specific postfix settings (CESNET)
          vars:
            main_cesnet:
              myhostname: "{{ fip_hostname }}"
              relayhost: relay.muni.cz
              inet_protocols: ipv4
          set_fact:
            main: '{{ main | combine(main_cesnet) }}'
          when: site_name == "cesnet-testing" or site_name == "cesnet-mcc"
        - name: Site-specific postfix settings - mail_fromdomain
          set_fact:
            main: '{{ main | combine({ "myhostname": mail_fromdomain }) }}'
          when: mail_fromdomain is defined
        - name: Site-specific postfix settings - default_transport
          set_fact:
            main: '{{ main | combine({ "default_transport": "error: This server sends mail only locally." }) }}'
          when: mail_local | default(false) | bool
        # one `key = value` line in main.cf per entry collected in `main`
        - name: Setup postfix
          lineinfile:
            regexp: '^{{ item.key }}\s*=\s*.*'
            line: "{{ item.key }} = {{ item.value }}"
            path: /etc/postfix/main.cf
          loop: "{{ main | dict2items }}"
          notify: Reload postfix
        - name: Setup mailutils
          vars:
            fromdomain: "{{ mail_fromdomain | default(fip_hostname) }}"
          template:
            src: templates/etc/mailutils.conf
            dest: /etc/mailutils.conf
            mode: "0644"
          # `| bool` keeps string values such as "false" consistent with the
          # default_transport task above
          when: >-
            (site_name == "cesnet-testing" or site_name == "cesnet-mcc" or mail_fromdomain is defined)
            and not (mail_local | default(false) | bool)
    # marker file named after the site, re-touched on every run
    - name: Site touch
      file:
        path: "/EOSC-{{ site_name | upper }}"
        state: touch
        mode: "0644"
  handlers:
    - name: Reload postfix
      service:
        name: postfix
        state: reloaded

# NFS server host(s): kernel NFS server, the `volumes` owner of /exports,
# the exports table and the XFS quota helper script.
- name: NFS server
  hosts: nfs
  become: true
  tasks:
    - name: Install nfs-server
      apt:
        name: nfs-kernel-server
        state: present
        update_cache: true
    # fixed uid so ownership of /exports is the same wherever it is mounted
    # NOTE(review): presumably matched by consumers of the volumes - confirm
    - name: Create user for NFS
      user:
        name: volumes
        create_home: false
        uid: 5005
    - name: Create /exports dir
      file:
        path: /exports
        state: directory
        mode: "0755"
        owner: volumes
    - name: Create exports
      template:
        src: templates/etc/exports
        dest: /etc/exports
        mode: "0644"
      notify: Reload exports
    - name: Quota script
      copy:
        dest: /usr/local/bin/xfs-quotas.sh
        src: files/usr/local/bin/xfs-quotas.sh
        mode: "0755"
        owner: root
        group: root
    - name: Start NFS service
      service:
        name: nfs-server
        state: started
  handlers:
    # re-export everything listed in /etc/exports after the template changed
    - name: Reload exports
      command: exportfs -ra

# Control-plane node: Kubernetes through the grycap.kubernetes role, then a
# kubectl configuration for the login user and shell cheat sheets.
- name: K8s master deployment
  hosts: master
  become: true
  roles:
    - role: 'grycap.kubernetes'
      vars:
        # do not downgrade docker
        kube_docker_version: latest
        # kube_nvidia_device_plugin_version: "v0.12.2"
        # kube_nvidia_driver_version: "515" # "525"
        kube_nvidia_support: true
        kube_version: 1.31.0
        kube_network: 'none'  # custom network installation
        kube_install_helm: true
        kube_install_helm_version: 'v3.15.4'
        kube_install_metrics: true
  tasks:
    - name: Create kubectl config dir
      file:
        path: "~{{ ansible_user }}/.kube"
        mode: "0750"
        owner: "{{ ansible_user }}"
        state: directory
    # admin.conf already lives on the target host, hence remote_src
    - name: Copy kubectl config to regular user
      copy:
        remote_src: true
        src: /etc/kubernetes/admin.conf
        dest: "~{{ ansible_user }}/.kube/config"
        mode: "0600"
        owner: "{{ ansible_user }}"
    - name: Site k8s cheat sheets
      copy:
        dest: /etc/profile.d/k8s-cheats.sh
        src: files/etc/profile.d/k8s-cheats.sh
        mode: preserve

# Cluster networking: apply the bundled Calico manifest on the master and
# install the matching calicoctl binary.
- name: K8s network deployment
  hosts: master
  # The tasks below read /etc/kubernetes/admin.conf and write into
  # /usr/local/sbin, so run them privileged like every other play here.
  become: true
  vars:
    calicoctl_version: 3.28.1
  tasks:
    - name: Calico config
      copy:
        # https://raw.githubusercontent.com/projectcalico/calico/v3.28.1/manifests/calico.yaml
        src: files/calico.yaml
        dest: /tmp/calico-net.yaml
        mode: "0644"
    # skipped once /var/etcd/calico-data exists (`creates` guard)
    - name: Calico installation
      command:
        cmd: kubectl apply -f /tmp/calico-net.yaml
        creates: /var/etcd/calico-data
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
    - name: Download calicoctl
      get_url:
        url: "https://github.com/projectcalico/calico/releases/download/v{{ calicoctl_version }}/calicoctl-linux-amd64"
        dest: /usr/local/sbin/calicoctl
        mode: "0755"

# Non-master nodes: join the cluster as worker nodes through the
# grycap.kubernetes role and patch the docker systemd unit.
- name: K8s nodes deployment
  hosts: nfs, ingress, worker, gpu
  become: true
  roles:
    - role: 'grycap.kubernetes'
      vars:
        # do not downgrade docker
        kube_docker_version: latest
        # kube_nvidia_device_plugin_version: "v0.12.2"
        # kube_nvidia_driver_version: "515" # "525"
        # support only on worker nodes with GPU hardware
        # (default([]) keeps inventories without a `gpu` group working)
        kube_nvidia_support: "{{ inventory_hostname in (groups['gpu'] | default([])) }}"
        # must be IPv4 address or hostname
        kube_server: "{{ hostvars[groups['master'][0]].kube_server | default(groups['master'][0]) }}"
        kube_type_of_node: wn
        kube_version: 1.31.0
        kubelet_extra_args: '--volume-stats-agg-period 0'
  tasks:
    # sets (or replaces) a single ExecStopPost line right after [Service]
    - name: Overlay2 mountpoint workaround to docker.service unit
      lineinfile:
        path: /lib/systemd/system/docker.service
        firstmatch: true
        insertafter: '\[Service\]'
        line: 'ExecStopPost=mount /var/lib/docker/overlay2'
        regexp: '^\s*ExecStopPost\s*='
    # copy the patched unit on the target host itself into /etc/systemd/system
    - name: Local docker.service unit
      copy:
        src: /lib/systemd/system/docker.service
        dest: /etc/systemd/system/docker.service
        mode: "0644"
        remote_src: true
      notify:
        - Reload systemd daemon
        - Restart docker
  handlers:
    # module equivalent of `systemctl daemon-reload`; still best-effort
    - name: Reload systemd daemon
      systemd:
        daemon_reload: true
      ignore_errors: true
    - name: Restart docker
      service:
        name: docker
        state: restarted

# Cluster add-ons installed with helm/kubectl using the admin kubeconfig.
- name: K8s customization
  # NOTE(review): the `hosts:` key was missing here and a play cannot run
  # without one. `master` is assumed because every task relies on
  # /etc/kubernetes/admin.conf and helm - confirm against the upstream file.
  hosts: master
  become: true
  tasks:
    # Poll `helm version` up to 20 times, 10 s apart, until it exits 0.
    - name: Wait for helm
      command: helm version
      register: result
      until: result.rc == 0
      retries: 20
      delay: 10
      # read-only probe: never report it as a change
      changed_when: false
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
      when: true
    # Local facts: install the helm_repos fact and re-gather `ansible_local`
    # so the `ansible_local.helm_repos` conditions of later tasks can use it.
    - name: Create custom fact directory
      file:
        path: "/etc/ansible/facts.d"
        mode: "0755"
        recurse: true
        state: "directory"
    - name: Create helm repos custom fact
      copy:
        src: files/etc/ansible/facts.d/helm_repos.fact
        dest: /etc/ansible/facts.d/helm_repos.fact
        mode: "0755"
    - name: Reload custom facts
      setup:
        filter: ansible_local
    # Register helm repositories that the helm_repos local fact does not
    # list yet. `set -e` makes a failed `helm repo add` fail the task
    # instead of being hidden by the exit code of `helm repo update`.
    - name: Helm repo add stable
      shell: |-
        set -e
        helm repo add stable https://charts.helm.sh/stable/
        helm repo update
      when: "'stable' not in ansible_local.helm_repos | map(attribute='name') | list"
    - name: Helm repo add nfs-subdir-external-provisioner
      shell: |-
        set -e
        helm repo add nfs-subdir-external-provisioner https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner
        helm repo update
      when: "'nfs-subdir-external-provisioner' not in ansible_local.helm_repos | map(attribute='name') | list"
    # Install or upgrade the `nfs-provisioner` release in kube-system.
    # It points at /exports on the first host of the `nfs` group and marks
    # its storage class as the default one.
    - name: NFS provisioner
      vars:
        config: >-
          --set nfs.server={{ groups['nfs'][0] }}
          --set storageClass.defaultClass=true
          --set nfs.path=/exports
      # `upgrade --install` installs when the release is absent and upgrades
      # otherwise, replacing the former `helm status` + `$?` branching
      shell: |-
        helm upgrade --install --namespace kube-system {{ config }} nfs-provisioner nfs-subdir-external-provisioner/nfs-subdir-external-provisioner
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true
    # Check out the pinned tag once; an existing checkout is left untouched
    # (update: false).
    - name: Git clone local-path-provisioner
      git:
        repo: https://github.com/rancher/local-path-provisioner.git
        dest: "/root/git-local-path-provisioner"
        clone: true
        update: false
        version: v0.0.26
    # Helm values file for the local-path provisioner: non-default
    # `local-path` storage class backed by hostPath volumes under /scratch.
    - name: Local path provisioner configuration
      copy:
        dest: /tmp/local-path-provisioner.yaml
        mode: "0644"
        content: |
          storageClass:
            defaultClass: false
            defaultVolumeType: hostPath
            name: local-path
          nodePathMap:
            - node: DEFAULT_PATH_FOR_NON_LISTED_NODES
              paths:
                - /scratch
    # Install or upgrade the `local-path-storage` release from the chart in
    # the local git checkout, using the values file written to /tmp.
    - name: Local path provisioner deployment
      vars:
        config: >-
          --namespace local-path-storage
          -f /tmp/local-path-provisioner.yaml
          local-path-storage
          /root/git-local-path-provisioner/deploy/chart/local-path-provisioner/
      # --create-namespace replaces the manual `kubectl create namespace`;
      # --install covers the first deployment
      shell: |-
        helm upgrade --install --create-namespace {{ config }}
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true
    # Register the ingress-nginx repository unless the helm_repos local fact
    # already lists it. `set -e` keeps a failed `helm repo add` from being
    # hidden by the exit code of `helm repo update`.
    - name: Helm repo add ingress-nginx
      shell: |-
        set -e
        helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
        helm repo update
      when: "'ingress-nginx' not in ansible_local.helm_repos | map(attribute='name') | list"
    # Install or upgrade the `cluster-ingress` release (ingress-nginx chart)
    # in kube-system as a NodePort service; its external IP is the default
    # IPv4 address of the first host in the `ingress` group.
    - name: Ingress
      vars:
        version: 4.11.2  # app 1.11.2
        config: >-
          --set controller.service.type=NodePort
          --set controller.service.externalIPs={{ '{' + hostvars[groups['ingress'][0]].ansible_default_ipv4.address + '}' }}
          --set controller.config.proxy-body-size=0
          --set controller.allowSnippetAnnotations=false
          --version={{version}}
      # `upgrade --install` installs when the release is absent and upgrades
      # otherwise, replacing the former `helm status` + `$?` branching
      shell: |-
        helm upgrade --install --namespace kube-system {{ config }} cluster-ingress ingress-nginx/ingress-nginx
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true
    # Install or upgrade the `certs-man` release (jetstack/cert-manager) with
    # the `letsencrypt-prod` ClusterIssuer as the ingress-shim default.
    - name: Cert-manager
      vars:
        version: 1.15.3
        config: >-
          --version={{ version }}
          --set ingressShim.defaultIssuerName=letsencrypt-prod
          --set ingressShim.defaultIssuerKind=ClusterIssuer
          --set ingressShim.defaultIssuerGroup=cert-manager.io
      # The CRDs are managed with kubectl rather than by the chart, so they
      # are applied on every run; otherwise a bump of `version` would upgrade
      # the chart while leaving the old CRDs in place.
      # Namespace and repository are only set up when the release is absent.
      shell: |-
        kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v{{ version }}/cert-manager.crds.yaml
        if ! helm status --namespace cert-manager certs-man; then
            kubectl create namespace cert-manager
            helm repo add jetstack https://charts.jetstack.io
            helm repo update
        fi
        helm upgrade --install --namespace cert-manager {{ config }} certs-man jetstack/cert-manager
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true
    # Manifest of the `letsencrypt-prod` ClusterIssuer: ACME against the
    # Let's Encrypt v2 directory with one HTTP01 solver on the nginx ingress.
    - name: Cluster issuer file
      copy:
        dest: /tmp/clusterissuer.yaml
        mode: "0644"
        content: |
          apiVersion: cert-manager.io/v1
          kind: ClusterIssuer
          metadata:
            name: letsencrypt-prod
          spec:
            acme:
              email: valtri@civ.zcu.cz
              server: https://acme-v02.api.letsencrypt.org/directory
              privateKeySecretRef:
                name: cluster-issuer-account-key
              # Add a single challenge solver, HTTP01 using nginx
              solvers:
              - http01:
                  ingress:
                    class: nginx
    # Apply the ClusterIssuer manifest from /tmp with the admin kubeconfig.
    - name: Cluster issuer
      environment:
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
        KUBECONFIG: /etc/kubernetes/admin.conf
      command: kubectl apply -f /tmp/clusterissuer.yaml
      when: true
    # Accounting / monitoring needs
    # `set -e` keeps a failed `helm repo add` from being hidden by the exit
    # code of `helm repo update`.
    - name: Helm repo add prometheus-community
      shell: |-
        set -e
        helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
        helm repo update
      when: "'prometheus-community' not in ansible_local.helm_repos | map(attribute='name') | list"
    # Helm values file for the prometheus chart: alertmanager mail settings,
    # pod annotations exported by kube-state-metrics, and three alerting
    # rules (CPU, memory, disk) whose thresholds come from the vars below.
    # Expressions written as {{ '{{ ... }}' }} are emitted literally so that
    # Prometheus, not Ansible, expands them.
    - name: Prometheus configuration
      vars:
        smtp_from: "noreply@{{ groups['ingress'][0] }}"
        # warning thresholds, in percent
        limit_memory_warn: 80
        limit_cpu_warn: 80
        limit_disk_warn: 80
      copy:
        dest: /tmp/prometheus.yaml
        mode: "0600"
        content: |
          alertmanagerFiles:
            alertmanager.yml:
              global:
                smtp_from: "{{ smtp_from }}"
              receivers:
                - name: default-receiver
                  email_configs:
                    - send_resolved: true
                      to: valtri@civ.zcu.cz
                - name: 'null'
              route:
                group_by: ['job']
          kube-state-metrics:
            metricAnnotationsAllowList:
              - pods=[hub.jupyter.org/username,egi.eu/primary_group,egi.eu/flavor]
          serverFiles:
            alerting_rules.yml:
              groups:
                - name: limits
                  rules:
                    - alert: HighCpuLoad
                      expr: 100 * (1 - avg by(kubernetes_node) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > {{ limit_cpu_warn }}
                      for: 15m
                      labels:
                        job: "eosc-{{ site_name }}"
                      annotations:
                        summary: "Host high CPU load ({{ '{{ $labels.kubernetes_node }}' }})"
                        description: "CPU load {{ '{{ $value | printf \"%.2f\" }}' }}% (limit {{ limit_cpu_warn }}%)"
                    - alert: OutOfMemory
                      expr: 100 * (1 - avg by(kubernetes_node) (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > {{ limit_memory_warn }}
                      for: 20m
                      labels:
                        job: "eosc-{{ site_name }}"
                      annotations:
                        summary: "Host out of memory ({{ '{{ $labels.kubernetes_node }}' }})"
                        description: "Node memory {{ '{{ $value | printf \"%.2f\" }}' }}% (limit {{ limit_memory_warn }}%)"
                    - alert: OutOfDiskSpace
                      expr: 100 * (1 - avg by (kubernetes_node, mountpoint) (node_filesystem_avail_bytes{device=~"/dev/.*"} / node_filesystem_size_bytes))
                        > {{ limit_disk_warn }}
                      for: 20m
                      labels:
                        job: "eosc-{{ site_name }}"
                      annotations:
                        summary: "Host out of disk space ({{ '{{ $labels.kubernetes_node }}' }})"
                        description: "Disk is almost full {{ '{{ $value | printf \"%.2f\" }}' }}% (limit {{ limit_disk_warn }}%)"
    # Install or upgrade the `prometheus` release in the `prometheus`
    # namespace using the values file written to /tmp/prometheus.yaml.
    - name: Prometheus
      vars:
        version: 25.27.0  # app v2.54.1
        config: >-
          --version={{ version }}
          -f /tmp/prometheus.yaml
      # --create-namespace replaces the manual `kubectl create ns`;
      # --install covers the first deployment
      shell: |-
        helm upgrade --install --create-namespace --namespace prometheus {{ config }} prometheus prometheus-community/prometheus
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true
    # Helm values file for Grafana: nginx ingress with ACME TLS on
    # `grafana_hostname`, a default non-editable Prometheus datasource, and
    # the dashboards sidecar enabled.
    - name: Grafana configuration
      copy:
        dest: /tmp/grafana.yaml
        mode: "0640"
        content: |
          ingress:
            enabled: true
            annotations:
              kubernetes.io/ingress.class: "nginx"
              kubernetes.io/tls-acme: "true"
            hosts:
            - "{{ grafana_hostname }}"
            tls:
            - hosts:
              - "{{ grafana_hostname }}"
              secretName: acme-tls-grafana
          datasources:
           datasources.yaml:
             apiVersion: 1
             datasources:
              - name: Prometheus
                type: prometheus
                access: Server
                orgId: 1
                url: http://prometheus-server.prometheus.svc.cluster.local
                isDefault: true
                version: 1
                editable: false
          sidecar:
            dashboards:
              enabled: true
    # Install or upgrade the `grafana` release in the `grafana` namespace
    # using the values file written to /tmp/grafana.yaml.
    - name: Grafana
      vars:
        version: 8.5.8  # app 11.2.2-security-01
        config: >-
          --version={{ version }}
          -f /tmp/grafana.yaml
      # Namespace and chart repository are only set up while the release is
      # absent; `upgrade --install` then covers both install and upgrade.
      shell: |-
        if ! helm status --namespace grafana grafana; then
            kubectl create ns grafana
            helm repo add grafana https://grafana.github.io/helm-charts
            helm repo update
        fi
        helm upgrade --install --namespace grafana {{ config }} grafana grafana/grafana
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin
      when: true