antennine
/
infra_public


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
							---
prometheus_version: 2.37.0 # LTS
prometheus_binary_local_dir: '' # default /usr/local/bin
prometheus_skip_install: false

prometheus_config_dir: /etc/prometheus
prometheus_db_dir: /var/lib/prometheus
prometheus_read_only_dirs: []

prometheus_web_listen_address: "0.0.0.0:9090"
prometheus_web_external_url: ''
# See https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md

prometheus_storage_retention: "30d"
# Available since Prometheus 2.7.0
# [EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units
# supported: KB, MB, GB, TB, PB.
prometheus_storage_retention_size: "0"

# Alternative config file name, searched in ansible templates path.
prometheus_config_file: 'prometheus.yml.j2'

prometheus_targets: "{{ all_targets }}"

prometheus_alertmanager_config:
  - static_configs:
    - targets:
       - localhost:9093

prometheus_scrape_configs:
  - job_name: "prometheus"
    metrics_path: "{{ prometheus_metrics_path }}"
    static_configs:
      - targets:
          - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090"
  - job_name: "node"
    file_sd_configs:
      - files:
          - "{{ prometheus_config_dir }}/file_sd/node.yml"

  - job_name: 'blackbox-external-targets'
    metrics_path: /probe
    params:
      module: [http_2xx_head]
    static_configs:
      - targets:
        - https://www.google.com
        - https://www.ripe.net
    relabel_configs: "{{ blackbox_relabel_configs }}"

  - job_name: 'blackbox-server_head'
    metrics_path: /probe
    params:
      module: [http_2xx_head]
    static_configs:
      - targets:
        - https://ada
    relabel_configs: "{{ blackbox_relabel_configs }}"

  - job_name: 'blackbox-server_get'
    metrics_path: /probe
    params:
      module: [http_2xx_get]
    static_configs:
      - targets:
         - https://torrent.ada/
    relabel_configs: "{{ blackbox_relabel_configs }}"

  - job_name: 'blackbox-ping-external'
    metrics_path: /probe
    params:
      module: [icmp]
    static_configs:
      - targets:
        - 1.1.1.1
        - 8.8.8.8
        - 4.2.2.2
    relabel_configs: "{{ blackbox_relabel_configs }}"

  - job_name: 'blackbox-ping-internal'
    file_sd_configs:
      - files:
          - "{{ prometheus_config_dir }}/file_sd/blackbox_ping_internal.yml"
    metrics_path: /probe
    params:
      module: [icmp]
    relabel_configs: "{{ blackbox_relabel_configs }}"

prometheus_alert_rules:
  - alert: Watchdog
    expr: vector(1)
    for: 10m
    labels:
      severity: warning
    annotations:
      description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty."
      summary: 'Ensure entire alerting pipeline is functional'
  - alert: NodeDown
    expr: "up{job=\"node\", alert=\"yes\"} == 0"
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}'
      summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}'
  - alert: ToolDown
    expr: "probe_success{job=\"blackbox-ping-internal\"} == 0"
    for: 5m
    labels:
      severity: critical
    annotations:
      description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}'
      summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}'
  - alert: RebootRequired
    expr: 'node_reboot_required > 0'
    labels:
      severity: warning
    annotations:
      description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}'
      summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}'
  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}'
      summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
    expr: "(\n  node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n  predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}'
      summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
    expr: "(\n  node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n  predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: critical
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
      summary: 'Filesystem has less than 5% space left.'
    expr: "(\n  node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemAlmostOutOfSpace
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
      summary: 'Filesystem has less than 3% space left.'
    expr: "(\n  node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: critical
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}'
      summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
    expr: "(\n  node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n  predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemFilesFillingUp
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}'
      summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
    expr: "(\n  node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n  predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: critical
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
      summary: 'Filesystem has less than 5% inodes left.'
    expr: "(\n  node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeFilesystemAlmostOutOfFiles
    annotations:
      description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
      summary: 'Filesystem has less than 3% inodes left.'
    expr: "(\n  node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n  node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
    for: 1h
    labels:
      severity: critical
  - alert: NodeNetworkReceiveErrs
    annotations:
      description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}'
      summary: 'Network interface is reporting many receive errors.'
    expr: "increase(node_network_receive_errs_total[2m]) > 10\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeNetworkTransmitErrs
    annotations:
      description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}'
      summary: 'Network interface is reporting many transmit errors.'
    expr: "increase(node_network_transmit_errs_total[2m]) > 10\n"
    for: 1h
    labels:
      severity: warning
  - alert: NodeHighNumberConntrackEntriesUsed
    annotations:
      description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}'
      summary: 'Number of conntrack are getting close to the limit'
    expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n"
    labels:
      severity: warning
  - alert: NodeClockSkewDetected
    annotations:
      message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.{% endraw %}'
      summary: 'Clock skew detected.'
    expr: "(\n  node_timex_offset_seconds > 0.05\nand\n  deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n  node_timex_offset_seconds < -0.05\nand\n  deriv(node_timex_offset_seconds[5m]) <= 0\n)\n"
    for: 10m
    labels:
      severity: warning
  - alert: NodeClockNotSynchronising
    annotations:
      message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}'
      summary: 'Clock not synchronising.'
    expr: "min_over_time(node_timex_sync_status[5m]) == 0\n"
    for: 10m
    labels:
      severity: warning