# NOTE(review): removed 414 lines of web-scrape pagination residue
# ("Newer"/"Older" links and a 1-412 line-number gutter) that preceded the
# playbook; as plain scalars before the "---" marker they formed a junk YAML
# document and broke ansible-playbook parsing of this file.
---
# Baseline configuration applied to every machine in the cluster:
# admin SSH keys, NFS client support, convenience packages, cron-apt
# based upgrades (replacing unattended-upgrades), and a site marker file.
- name: Basic setup and NFS common
  hosts: allnodes
  become: true
  tasks:
    - name: Add SSH keys
      authorized_key:
        user: egi
        state: present
        key: '{{ item }}'
      with_file:
        - public_keys/andrea-manzi
        - public_keys/enolfc
        - public_keys/jhradil
        - public_keys/pospisilp
        - public_keys/sustr
        - public_keys/valtri

    - name: Install nfs-common
      apt:
        name: nfs-common
        state: present
        update_cache: true

    - name: Site install packages
      package:
        name:
          - atop
          - cron-apt
          - fail2ban
          - mc
          - vim
          - postfix
        state: present

    # unattended-upgrades conflicts with the cron-apt driven upgrade policy below
    - name: Site remove packages
      package:
        name:
          - unattended-upgrades
        state: absent

    - name: Site cron-apt config
      copy:
        dest: /etc/cron-apt/config
        content: |
          MAILTO=valtri@civ.zcu.cz
          MAILON=upgrade
          RUNSLEEP=600
        # quoted so YAML keeps the octal string instead of int 420
        mode: "0644"

    - name: Site cron-apt action
      copy:
        dest: /etc/cron-apt/action.d/9-upgrade
        content: -q -q dist-upgrade
        mode: "0644"

    # marker file identifying the site on the filesystem root
    - name: Site touch
      file:
        path: "/EOSC-{{ site_name | upper }}"
        state: touch
        mode: "0644"
# Configure the NFS server host: install the kernel server, create a
# dedicated unprivileged owner for exported volumes, publish /etc/exports
# from a template and make sure the service is running (and survives reboot).
- name: NFS server
  hosts: nfs
  become: true
  tasks:
    - name: Install nfs-server
      apt:
        name: nfs-kernel-server
        state: present
        update_cache: true

    # fixed UID so ownership of exported data is stable across reinstalls
    - name: Create user for NFS
      user:
        name: volumes
        create_home: false
        uid: 5005

    - name: Create /exports dir
      file:
        path: /exports
        state: directory
        mode: "0755"
        owner: volumes

    - name: Create exports
      template:
        src: templates/etc/exports
        dest: /etc/exports
        mode: "0644"
      notify: Reload exports

    - name: Start NFS service
      service:
        name: nfs-server
        state: started
        # also enable at boot so exports come back after a reboot
        enabled: true

  handlers:
    # re-export all directories after /etc/exports changes
    - name: Reload exports
      command: exportfs -ra
# Install the Kubernetes control plane on the master via the
# grycap.kubernetes role (network add-on installed separately below),
# then expose kubectl config to the regular login user.
- name: K8s master deployment
  hosts: master
  become: true
  roles:
    - role: 'grycap.kubernetes'
      vars:
        # do not downgrade docker
        kube_docker_version: latest
        kube_version: 1.28.2
        kube_network: 'none'  # custom network installation
        kube_install_helm: true
        kube_install_helm_version: 'v3.13.0'
        kube_install_metrics: true
  tasks:
    - name: Create kubectl config dir
      file:
        path: "~{{ ansible_user }}/.kube"
        # quoted so YAML keeps the octal string instead of an integer
        mode: "0750"
        owner: "{{ ansible_user }}"
        state: directory

    - name: Copy kubectl config to regular user
      copy:
        remote_src: true
        src: /etc/kubernetes/admin.conf
        dest: "~{{ ansible_user }}/.kube/config"
        # admin credentials: keep readable by the owner only
        mode: "0600"
        owner: "{{ ansible_user }}"

    - name: Site k8s cheat sheets
      copy:
        dest: /etc/profile.d/k8s-cheats.sh
        src: files/k8s-cheats.sh
        mode: preserve
# Deploy the Calico CNI (kube_network was 'none' in the master play)
# and install the calicoctl admin utility.
- name: K8s network deployment
  hosts: master
  # required: kubectl reads the root-only /etc/kubernetes/admin.conf and
  # calicoctl is installed into /usr/local/sbin
  become: true
  vars:
    calicoctl_version: 3.27.0
  tasks:
    - name: Calico config
      copy:
        # https://raw.githubusercontent.com/projectcalico/calico/v3.27.0/manifests/calico.yaml
        src: files/calico.yaml
        dest: /tmp/calico-net.yaml
        mode: "0644"

    - name: Calico installation
      command:
        cmd: kubectl apply -f /tmp/calico-net.yaml
        # skip once calico's etcd data dir exists (rough idempotency marker)
        creates: /var/etcd/calico-data
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf

    - name: Download calicoctl
      get_url:
        url: https://github.com/projectcalico/calico/releases/download/v{{ calicoctl_version }}/calicoctl-linux-amd64
        dest: /usr/local/sbin/calicoctl
        mode: "0755"
# Join all remaining machines (NFS, ingress and worker hosts) to the
# cluster as Kubernetes worker nodes ("wn"), pointing them at the first
# master as the API server.
- name: K8s nodes deployment
  hosts: nfs, ingress, worker
  become: true
  roles:
    - role: 'grycap.kubernetes'
      vars:
        # do not downgrade docker
        kube_docker_version: latest
        # first master host is the control plane to join
        kube_server: "{{ groups['master'][0] }}"
        kube_type_of_node: wn
        kube_version: 1.28.2
        # NOTE(review): passes --volume-stats-agg-period 0 to kubelet —
        # presumably to disable volume stats collection; confirm intent
        kubelet_extra_args: '--volume-stats-agg-period 0'
# Post-install cluster services, all driven through helm on the master:
# NFS dynamic storage provisioner, nginx ingress, cert-manager with a
# Let's Encrypt ClusterIssuer, Prometheus (with alerting) and Grafana.
# Helm repo state is tracked via a local custom fact (helm_repos.fact).
- name: K8s customization
  hosts: master
  become: true
  tasks:
    # helm is installed by the role on the master; poll until the binary works
    - name: Wait for helm
      command: helm version
      register: result
      until: result.rc == 0
      retries: 20
      delay: 10
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf

    - name: Create custom fact directory
      file:
        path: "/etc/ansible/facts.d"
        mode: "0755"
        recurse: true
        state: "directory"

    - name: Create helm repos custom fact
      copy:
        src: files/helm_repos.fact
        dest: /etc/ansible/facts.d/helm_repos.fact
        # executable: setup runs it to report currently configured helm repos
        mode: "0755"

    # refresh ansible_local so the helm_repos fact below is current
    - name: Reload custom facts
      setup:
        filter: ansible_local

    - name: Helm repo add stable
      shell: |-
        helm repo add stable https://charts.helm.sh/stable/
        helm repo update
      when: "'stable' not in ansible_local.helm_repos | map(attribute='name') | list"

    - name: Helm repo add nfs-subdir-external-provisioner
      shell: |-
        helm repo add nfs-subdir-external-provisioner https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner
        helm repo update
      when: "'nfs-subdir-external-provisioner' not in ansible_local.helm_repos | map(attribute='name') | list"

    # default storage class backed by the NFS server's /exports
    - name: NFS provisioner
      vars:
        config: >-
          --set nfs.server={{ groups['nfs'][0] }}
          --set storageClass.defaultClass=true
          --set nfs.path=/exports
      shell: |-
        helm status --namespace kube-system nfs-provisioner
        if [ $? -ne 0 ]; then
          helm install --namespace kube-system {{ config }} nfs-provisioner nfs-subdir-external-provisioner/nfs-subdir-external-provisioner
        else
          helm upgrade --namespace kube-system {{ config }} nfs-provisioner nfs-subdir-external-provisioner/nfs-subdir-external-provisioner
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin

    - name: Helm repo add ingress-nginx
      shell: |-
        helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
        helm repo update
      when: "'ingress-nginx' not in ansible_local.helm_repos | map(attribute='name') | list"

    # NodePort ingress pinned to the ingress host's primary IP
    - name: Ingress
      vars:
        config: >-
          --set controller.service.type=NodePort
          --set controller.service.externalIPs={{ '{' + hostvars[groups['ingress'][0]].ansible_default_ipv4.address + '}' }}
          --set controller.config.proxy-body-size=0
          --set controller.allowSnippetAnnotations=false
      shell: |-
        helm status --namespace kube-system cluster-ingress
        if [ $? -ne 0 ]; then
          helm install cluster-ingress --namespace kube-system {{ config }} ingress-nginx/ingress-nginx
        else
          helm upgrade --namespace kube-system {{ config }} cluster-ingress ingress-nginx/ingress-nginx
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin

    - name: Cert-manager
      vars:
        version: 1.13.3
        config: >-
          --version={{ version }}
          --set ingressShim.defaultIssuerName=letsencrypt-prod
          --set ingressShim.defaultIssuerKind=ClusterIssuer
          --set ingressShim.defaultIssuerGroup=cert-manager.io
      shell: |-
        helm status --namespace cert-manager certs-man
        if [ $? -ne 0 ]; then
          kubectl create namespace cert-manager >/dev/null 2>&1 || true
          kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v{{ version }}/cert-manager.crds.yaml
          helm repo add jetstack https://charts.jetstack.io
          helm repo update
          helm install --namespace cert-manager {{ config }} certs-man jetstack/cert-manager
        else
          helm upgrade --namespace cert-manager {{ config }} certs-man jetstack/cert-manager
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin

    - name: Cluster issuer file
      copy:
        dest: /tmp/clusterissuer.yaml
        mode: "0644"
        content: |
          apiVersion: cert-manager.io/v1
          kind: ClusterIssuer
          metadata:
            name: letsencrypt-prod
          spec:
            acme:
              email: valtri@civ.zcu.cz
              server: https://acme-v02.api.letsencrypt.org/directory
              privateKeySecretRef:
                name: cluster-issuer-account-key
              # Add a single challenge solver, HTTP01 using nginx
              solvers:
                - http01:
                    ingress:
                      class: nginx

    - name: Cluster issuer
      command: kubectl apply -f /tmp/clusterissuer.yaml
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin

    # Accounting / monitoring needs
    - name: Helm repo add prometheus-community
      shell: |-
        helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
        helm repo update
      when: "'prometheus-community' not in ansible_local.helm_repos | map(attribute='name') | list"

    - name: Prometheus configuration
      vars:
        smtp_from: "noreply@{{ groups['ingress'][0] }}"
        limit_memory_warn: 80
        limit_cpu_warn: 80
        limit_disk_warn: 80
      copy:
        dest: /tmp/prometheus.yaml
        mode: "0600"
        content: |
          alertmanagerFiles:
            alertmanager.yml:
              global:
                smtp_from: "{{ smtp_from }}"
              receivers:
                - name: default-receiver
                  email_configs:
                    - send_resolved: true
                      to: valtri@civ.zcu.cz
                - name: 'null'
              route:
                group_by: ['job']
          kube-state-metrics:
            metricAnnotationsAllowList:
              - pods=[hub.jupyter.org/username,egi.eu/primary_group]
          serverFiles:
            alerting_rules.yml:
              groups:
                - name: limits
                  rules:
                    - alert: HighCpuLoad
                      expr: 100 * (1 - avg by(kubernetes_node) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > {{ limit_cpu_warn }}
                      for: 15m
                      labels:
                        job: "eosc-{{ site_name }}"
                      annotations:
                        summary: "Host high CPU load ({{ '{{ $labels.kubernetes_node }}' }})"
                        description: "CPU load {{ '{{ $value | printf \"%.2f\" }}' }}% (limit {{ limit_cpu_warn }}%)"
                    - alert: OutOfMemory
                      expr: 100 * (1 - avg by(kubernetes_node) (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > {{ limit_memory_warn }}
                      for: 20m
                      labels:
                        job: "eosc-{{ site_name }}"
                      annotations:
                        summary: "Host out of memory ({{ '{{ $labels.kubernetes_node }}' }})"
                        description: "Node memory {{ '{{ $value | printf \"%.2f\" }}' }}% (limit {{ limit_memory_warn }}%)"
                    - alert: OutOfDiskSpace
                      expr: 100 * (1 - avg by (kubernetes_node, mountpoint) (node_filesystem_avail_bytes{device=~"/dev/.*"} / node_filesystem_size_bytes))
                        > {{ limit_disk_warn }}
                      for: 20m
                      labels:
                        job: "eosc-{{ site_name }}"
                      annotations:
                        summary: "Host out of disk space ({{ '{{ $labels.kubernetes_node }}' }})"
                        description: "Disk is almost full {{ '{{ $value | printf \"%.2f\" }}' }}% (limit {{ limit_disk_warn }}%)"

    - name: Prometheus
      vars:
        config: >-
          --version=25.8.2
          -f /tmp/prometheus.yaml
      shell: |-
        helm status --namespace prometheus prometheus
        if [ $? -ne 0 ]; then
          kubectl create ns prometheus >/dev/null 2>&1 || true
          helm install --namespace prometheus {{ config }} prometheus prometheus-community/prometheus
        else
          helm upgrade --namespace prometheus {{ config }} prometheus prometheus-community/prometheus
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin

    - name: Grafana configuration
      copy:
        dest: /tmp/grafana.yaml
        mode: "0640"
        content: |
          ingress:
            enabled: true
            annotations:
              kubernetes.io/ingress.class: "nginx"
              kubernetes.io/tls-acme: "true"
            hosts:
              - "{{ grafana_hostname }}"
            tls:
              - hosts:
                  - "{{ grafana_hostname }}"
                secretName: acme-tls-grafana
          datasources:
            datasources.yaml:
              apiVersion: 1
              datasources:
                - name: Prometheus
                  type: prometheus
                  access: Server
                  orgId: 1
                  url: http://prometheus-server.prometheus.svc.cluster.local
                  isDefault: true
                  version: 1
                  editable: false
          sidecar:
            dashboards:
              enabled: true

    - name: Grafana
      vars:
        config: >-
          --version=7.0.3
          -f /tmp/grafana.yaml
      shell: |-
        helm status --namespace grafana grafana
        if [ $? -ne 0 ]; then
          kubectl create ns grafana >/dev/null 2>&1 || true
          helm repo add grafana https://grafana.github.io/helm-charts
          helm repo update
          helm install --namespace grafana {{ config }} grafana grafana/grafana
        else
          helm upgrade --namespace grafana {{ config }} grafana grafana/grafana
        fi
      environment:
        KUBECONFIG: /etc/kubernetes/admin.conf
        PATH: /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin