diff --git a/cluster_metrics/kapacitor_files/delete_all.sh b/cluster_metrics/kapacitor_files/delete_all.sh
new file mode 100755
index 00000000..9adaaf53
--- /dev/null
+++ b/cluster_metrics/kapacitor_files/delete_all.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Copyright 2016, Rackspace US, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Delete every task currently defined in kapacitor.
+#
+# `kapacitor list tasks` prints a header row followed by one row per task,
+# with the task ID in the first column. Only that column is kept and the
+# header row is skipped so that it is never passed on as a task ID.
+
+# Stop right away when the task list cannot be fetched.
+task_list=$(kapacitor list tasks) || exit 1
+
+# One array element per task ID; empty rows are ignored.
+mapfile -t tasks < <(awk 'NR > 1 && NF {print $1}' <<< "${task_list}")
+
+for task in "${tasks[@]}"; do
+  kapacitor delete tasks "${task}"
+done
diff --git a/cluster_metrics/kapacitor_files/run_all.sh b/cluster_metrics/kapacitor_files/run_all.sh
new file mode 100755
index 00000000..cdedd4a0
--- /dev/null
+++ b/cluster_metrics/kapacitor_files/run_all.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Copyright 2016, Rackspace US, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Define and enable one kapacitor task per TICKscript found under TICK_DIR.
+#
+# The task name is the script's file name without its ".tick" suffix. A
+# script whose path contains "batch" is defined as a batch task, every other
+# script as a stream task. A script that cannot be defined is reported and
+# skipped, and the run then finishes with a non-zero exit status.
+
+readonly TICK_DIR=/opt/kapacitor/kapacitor_files/tickscripts
+readonly DBRP=telegraf.autogen
+
+failed=0
+
+# The file list is NUL-delimited and read on fd 3: paths containing
+# whitespace stay intact and kapacitor cannot swallow the list via stdin.
+while IFS= read -r -d '' -u 3 tick_file; do
+  printf '%s\n' "${tick_file}"
+
+  # Strip the directory part, then the extension.
+  task_name=${tick_file##*/}
+  task_name=${task_name%.tick}
+
+  if [[ ${tick_file} == *batch* ]]; then
+    task_type=batch
+  else
+    task_type=stream
+  fi
+
+  if kapacitor define "${task_name}" -type "${task_type}" \
+    -tick "${tick_file}" -dbrp "${DBRP}"; then
+    kapacitor enable "${task_name}" || failed=1
+  else
+    printf 'failed to define task %s from %s\n' "${task_name}" "${tick_file}" >&2
+    failed=1
+  fi
+done 3< <(find "${TICK_DIR}" -type f -name '*.tick' -print0)
+
+exit "${failed}"
diff --git a/cluster_metrics/kapacitor_files/tickscripts/cinder_vg_alert_stream.tick b/cluster_metrics/kapacitor_files/tickscripts/cinder_vg_alert_stream.tick
new file mode 100644
index 00000000..208c8835
--- /dev/null
+++ b/cluster_metrics/kapacitor_files/tickscripts/cinder_vg_alert_stream.tick
@@ -0,0 +1,46 @@
+// cinder_vg_alert_stream: per-host alert on the windowed mean of cinder_used_percentage
+
+// metric: cinder_used_percentage
+// available_fields: "cinder_free_capacity_gb","cinder_total_capacity_gb","cinder_used_percentage"
+
+// TELEGRAF CONFIGURATION
+// [[inputs.exec]]
+//   commands = ['python /opt/telegraf/cinder_data.py']
+//   timeout = "15s"
+//   data_format = "influx"
+
+// DEFINE: kapacitor define cinder_vg_alert_stream -type stream -tick cinder_vg_alert_stream.tick -dbrp telegraf.autogen
+// ENABLE: kapacitor enable cinder_vg_alert_stream
+
+// Parameters: alert levels compared against "stat", plus the window length and emit interval
+var info = 69
+var warn = 79
+var crit = 89
+var period = 10s
+var every = 10s
+
+// Dataframe: mean of cinder_used_percentage per host over each window, exposed as "stat"
+var data = stream
+    |from()
+        .database('telegraf')
+        .retentionPolicy('autogen')
+        .measurement('cinder')
+        .groupBy('host')
+    |window()
+        .period(period)
+        .every(every)
+    |mean('cinder_used_percentage')
+        .as('stat')
+
+// Thresholds: a level triggers when "stat" is greater than its parameter
+var alert = data
+    |alert()
+        .id('{{ index .Tags "host"}}/cinder_used')
+        .message('{{ .ID }}:{{ index .Fields "stat" }}')
+        .info(lambda: "stat" > info)
+        .warn(lambda: "stat" > warn)
+        .crit(lambda: "stat" > crit)
+
+// Alert: write alert events to the given log file
+alert
+    .log('/tmp/cinder_alert_log.txt')
diff --git a/cluster_metrics/kapacitor_files/tickscripts/conntrack_alert_batch.tick b/cluster_metrics/kapacitor_files/tickscripts/conntrack_alert_batch.tick
new file mode 100644
index 00000000..162f04a0
--- /dev/null
+++ 
b/cluster_metrics/kapacitor_files/tickscripts/conntrack_alert_batch.tick
@@ -0,0 +1,39 @@
+// conntrack_alert_batch: per-host alert on conntrack usage, queried as mean(ip_conntrack_count)/mean(ip_conntrack_max)*100
+
+// metric: {ip_conntrack_count,ip_conntrack_max} - NOTE(review): the nf_conntrack_* fields collected below are not queried; confirm hosts exposing only those are covered
+
+// TELEGRAF CONFIGURATION
+// [[inputs.conntrack]]
+//   files = ["ip_conntrack_count","ip_conntrack_max",
+//            "nf_conntrack_count","nf_conntrack_max"]
+//   dirs = ["/proc/sys/net/ipv4/netfilter","/proc/sys/net/netfilter"]
+
+// DEFINE: kapacitor define conntrack_alert_batch -type batch -tick conntrack_alert_batch.tick -dbrp telegraf.autogen
+// ENABLE: kapacitor enable conntrack_alert_batch
+
+// Parameters: alert levels compared against "stat", plus the query period and how often it runs
+var info = 69
+var warn = 79
+var crit = 89
+var period = 10s
+var every = 10s
+
+// Dataframe: batch query grouped by host; the count/max ratio times 100 is returned as "stat"
+var data = batch
+    |query('''select (mean(ip_conntrack_count)/mean(ip_conntrack_max))*100 as stat from "telegraf"."autogen".conntrack''')
+        .period(period)
+        .every(every)
+        .groupBy('host')
+
+// Thresholds: a level triggers when "stat" is greater than its parameter
+var alert = data
+    |alert()
+        .id('{{ index .Tags "host"}}/connection_used')
+        .message('{{ .ID }}:{{ index .Fields "stat" }}')
+        .info(lambda: "stat" > info)
+        .warn(lambda: "stat" > warn)
+        .crit(lambda: "stat" > crit)
+
+// Alert: write alert events to the given log file
+alert
+    .log('/tmp/conntrack_alert_log.txt')
diff --git a/cluster_metrics/kapacitor_files/tickscripts/cpu_alert_stream.tick b/cluster_metrics/kapacitor_files/tickscripts/cpu_alert_stream.tick
new file mode 100644
index 00000000..87b0ea68
--- /dev/null
+++ b/cluster_metrics/kapacitor_files/tickscripts/cpu_alert_stream.tick
@@ -0,0 +1,46 @@
+// cpu_alert_stream: per-host alert on the windowed mean of usage_idle; a level triggers when idle falls BELOW its threshold
+
+// metric: usage_idle
+// available_fields: "usage_guest","usage_guest_nice","usage_user","usage_iowait", "usage_irq","usage_nice","usage_softirq","usage_steal","usage_system"
+
+// TELEGRAF CONFIGURATION
+// [[inputs.cpu]]
+//   percpu = true
+//   totalcpu = true
+//   fielddrop = ["time_*"]
+
+// DEFINE: kapacitor define cpu_alert_stream -type stream -tick cpu_alert_stream.tick -dbrp telegraf.autogen
+// ENABLE: kapacitor enable cpu_alert_stream
+
+// Parameters: usage_idle floors for each level (crit is the lowest), plus the window length and emit interval
+var info = 21
+var warn = 11
+var crit = 2
+var period = 10s
+var every = 10s
+
+// Dataframe: mean of usage_idle per host over each window, exposed as "stat". NOTE(review): grouping is by host only, so per-CPU and total series presumably share one group - confirm
+var data = stream
+    |from()
+        .database('telegraf')
+        .retentionPolicy('autogen')
+        .measurement('cpu')
+        .groupBy('host')
+    |window()
+        .period(period)
+        .every(every)
+    |mean('usage_idle')
+        .as('stat')
+
+// Thresholds: comparisons use "<" because the monitored value is idle time, not usage
+var alert = data
+    |alert()
+        .id('{{ index .Tags "host"}}/cpu_used')
+        .message('{{ .ID }}:{{ index .Fields "stat" }}')
+        .info(lambda: "stat" < info)
+        .warn(lambda: "stat" < warn)
+        .crit(lambda: "stat" < crit)
+
+// Alert: write alert events to the given log file
+alert
+    .log('/tmp/cpu_alert_log.txt')
diff --git a/cluster_metrics/kapacitor_files/tickscripts/disk_alert_stream.tick b/cluster_metrics/kapacitor_files/tickscripts/disk_alert_stream.tick
new file mode 100644
index 00000000..0dca9f75
--- /dev/null
+++ b/cluster_metrics/kapacitor_files/tickscripts/disk_alert_stream.tick
@@ -0,0 +1,44 @@
+// disk_alert_stream: per-host alert on the windowed mean of used_percent
+
+// metric: used_percent
+// available_fields: "free","inodes_free","inodes_total","inodes_used","total","used"
+
+// TELEGRAF CONFIGURATION
+// [[inputs.disk]]
+//   ignore_fs = ["tmpfs", "devtmpfs"]
+
+// DEFINE: kapacitor define disk_alert_stream -type stream -tick disk_alert_stream.tick -dbrp telegraf.autogen
+// ENABLE: kapacitor enable disk_alert_stream
+
+// Parameters: alert levels compared against "stat", plus the window length and emit interval
+var info = 69
+var warn = 79
+var crit = 89
+var period = 10s
+var every = 10s
+
+// Dataframe: mean of used_percent per host over each window, exposed as "stat". NOTE(review): grouping is by host only, so all disks of a host are presumably averaged together - confirm
+var data = stream
+    |from()
+        .database('telegraf')
+        .retentionPolicy('autogen')
+        .measurement('disk')
+        .groupBy('host')
+    |window()
+        .period(period)
+        .every(every)
+    |mean('used_percent')
+        .as('stat')
+
+// Thresholds: a level triggers when "stat" is greater than its parameter
+var alert = data
+    |alert()
+        .id('{{ index .Tags "host"}}/disk_used')
+        .message('{{ .ID }}:{{ index .Fields "stat" }}')
+        .info(lambda: "stat" > info)
+        .warn(lambda: "stat" > warn)
+        .crit(lambda: "stat" > crit)
+
+// Alert: write alert events to the given log file
+alert
+    .log('/tmp/disk_alert_log.txt')
diff --git a/cluster_metrics/kapacitor_files/tickscripts/interface_speed_alert_stream.tick 
b/cluster_metrics/kapacitor_files/tickscripts/interface_speed_alert_stream.tick
new file mode 100644
index 00000000..b36aef39
--- /dev/null
+++ b/cluster_metrics/kapacitor_files/tickscripts/interface_speed_alert_stream.tick
@@ -0,0 +1,75 @@
+// interface_speed_alert_stream: per host and interface alert comparing received plus sent bytes with a limit derived from the interface's max_speed
+
+// metric: bytes_recv,bytes_sent
+// available_fields: "bytes_recv","bytes_sent"
+
+// TELEGRAF CONFIGURATION
+// [[inputs.exec]]
+//   commands = ['python /opt/telegraf/maxspeed_interface.py']
+//   timeout = "15s"
+//   data_format = "influx"
+
+// DEFINE: kapacitor define interface_speed_alert_stream -type stream -tick interface_speed_alert_stream.tick -dbrp telegraf.autogen
+// ENABLE: kapacitor enable interface_speed_alert_stream
+
+// Parameters: info/warn/crit multiply the max_speed-derived limit in the alert lambdas; unit is the derivative time unit. NOTE(review): 29.0/39.0/59.0 read like percentages but are applied without a /100 - confirm
+
+var info = 29.0
+var warn = 39.0
+var crit = 59.0
+var unit = 1s
+
+// Dataframe: 'net' counters and 'interface_speed' readings, both grouped by host and interface. NOTE(review): the derivatives below are stored as 'value' while the mean() calls that follow read 'bytes_recv'/'bytes_sent' - confirm the intended field
+var rawdata = stream
+    |from()
+        .database('telegraf')
+        .retentionPolicy('autogen')
+        .measurement('net')
+        .groupBy('host', 'interface')
+
+var interface_speed = stream
+    |from()
+        .database('telegraf')
+        .retentionPolicy('autogen')
+        .measurement('interface_speed')
+        .groupBy('host', 'interface')
+
+var max_speed = interface_speed
+    |mean('max_speed')
+        .as('value')
+
+var bytes_recv1 = rawdata
+    |derivative('bytes_recv')
+        .as('value')
+        .nonNegative()
+        .unit(unit)
+
+var bytes_recv = bytes_recv1
+    |mean('bytes_recv')
+        .as('value')
+
+var bytes_sent1 = rawdata
+    |derivative('bytes_sent')
+        .as('value')
+        .nonNegative()
+        .unit(unit)
+
+var bytes_sent = bytes_sent1
+    |mean('bytes_sent')
+        .as('value')
+
+var data = bytes_recv
+    |join(max_speed,bytes_sent)
+        .as('bytes_recv', 'max_speed','bytes_sent')
+
+var alert = data
+    |alert()
+        .id('{{ index .Tags "host"}}/net_stats')
+        .message('{{ .ID }}: bytes_sent:{{ index .Fields "bytes_sent.value"}} bytes_recv:{{ index .Fields "bytes_recv.value"}} max_speed:{{ index .Fields "max_speed.value" }}')
+        .info(lambda: "bytes_recv.value"+"bytes_sent.value" > "max_speed.value"*0.125*1000000.0*info)
+        .warn(lambda: "bytes_recv.value"+"bytes_sent.value" > "max_speed.value"*0.125*1000000.0*warn)
+        .crit(lambda: "bytes_recv.value"+"bytes_sent.value" > "max_speed.value"*0.125*1000000.0*crit)
+
+// Alert: write alert events to the given log file
+ alert
+    .log('/tmp/interface_speed_alert_log.txt')
diff --git a/cluster_metrics/kapacitor_files/tickscripts/mem_alert_stream.tick b/cluster_metrics/kapacitor_files/tickscripts/mem_alert_stream.tick
new file mode 100644
index 00000000..8ed7c0ba
--- /dev/null
+++ b/cluster_metrics/kapacitor_files/tickscripts/mem_alert_stream.tick
@@ -0,0 +1,43 @@
+// mem_alert_stream: per-host alert on the windowed mean of used_percent
+
+// metric: used_percent
+// available_fields: "active","available","available_percent","buffered","cached","free","inactive","total","used"
+
+// TELEGRAF CONFIGURATION
+// [[inputs.mem]]
+
+// DEFINE: kapacitor define mem_alert_stream -type stream -tick mem_alert_stream.tick -dbrp telegraf.autogen
+// ENABLE: kapacitor enable mem_alert_stream
+
+// Parameters: alert levels compared against "stat", plus the window length and emit interval
+var info = 79
+var warn = 89
+var crit = 98
+var period = 10s
+var every = 10s
+
+// Dataframe: mean of used_percent per host over each window, exposed as "stat"
+var data = stream
+    |from()
+        .database('telegraf')
+        .retentionPolicy('autogen')
+        .measurement('mem')
+        .groupBy('host')
+    |window()
+        .period(period)
+        .every(every)
+    |mean('used_percent')
+        .as('stat')
+
+// Thresholds: a level triggers when "stat" is greater than its parameter
+var alert = data
+    |alert()
+        .id('{{ index .Tags "host"}}/mem_used')
+        .message('{{ .ID }}:{{ index .Fields "stat" }}')
+        .info(lambda: "stat" > info)
+        .warn(lambda: "stat" > warn)
+        .crit(lambda: "stat" > crit)
+
+// Alert: write alert events to the given log file
+alert
+    .log('/tmp/mem_alert_log.txt')
diff --git a/cluster_metrics/kapacitor_files/tickscripts/net_alert_stream.tick b/cluster_metrics/kapacitor_files/tickscripts/net_alert_stream.tick
new file mode 100644
index 00000000..63605db2
--- /dev/null
+++ b/cluster_metrics/kapacitor_files/tickscripts/net_alert_stream.tick
@@ -0,0 +1,66 @@
+// net_alert_stream: per host and interface alert on the rate of interface errors and drops
+
+// metric: err_in, err_out, drop_in, drop_out
+// available_fields: "bytes_recv","bytes_sent","packets_recv","packets_sent"
+// NOTE: More fields are available 
with the `[[inputs.net]]` plugin on linux
+
+// TELEGRAF CONFIGURATION
+// [[inputs.net]]
+
+// DEFINE: kapacitor define net_alert_stream -type stream -tick net_alert_stream.tick -dbrp telegraf.autogen
+// ENABLE: kapacitor enable net_alert_stream
+
+// Parameters: info/warn/crit are compared with each derivative below; unit is the derivative time unit
+var info = 50
+var warn = 75
+var crit = 90
+var unit = 1s
+
+// Dataframe: non-negative derivatives of err_in, err_out, drop_in and drop_out from 'net', grouped by host and interface, then joined
+var rawdata = stream
+    |from()
+        .database('telegraf')
+        .retentionPolicy('autogen')
+        .measurement('net')
+        .groupBy('host', 'interface')
+
+var err_in = rawdata
+    |derivative('err_in')
+        .as('value')
+        .nonNegative()
+        .unit(unit)
+
+var err_out = rawdata
+    |derivative('err_out')
+        .as('value')
+        .nonNegative()
+        .unit(unit)
+
+var drop_in = rawdata
+    |derivative('drop_in')
+        .as('value')
+        .nonNegative()
+        .unit(unit)
+
+var drop_out = rawdata
+    |derivative('drop_out')
+        .as('value')
+        .nonNegative()
+        .unit(unit)
+
+var data = err_in
+    |join(err_out, drop_in, drop_out)
+        .as('err_in', 'err_out', 'drop_in', 'drop_out')
+
+// Thresholds: a level triggers when any of the four values exceeds it. NOTE(review): the alert id carries only the host tag although data is grouped by host and interface - confirm that interfaces sharing one id is intended
+var alert = data
+    |alert()
+        .id('{{ index .Tags "host"}}/net_stats')
+        .message('{{ .ID }}: err_in:{{ index .Fields "err_in.value" }} err_out:{{ index .Fields "err_out.value" }} drop_in:{{ index .Fields "drop_in.value" }} drop_out:{{ index .Fields "drop_out.value" }}')
+        .info(lambda: "err_in.value" > info OR "err_out.value" > info OR "drop_in.value" > info OR "drop_out.value" > info)
+        .warn(lambda: "err_in.value" > warn OR "err_out.value" > warn OR "drop_in.value" > warn OR "drop_out.value" > warn)
+        .crit(lambda: "err_in.value" > crit OR "err_out.value" > crit OR "drop_in.value" > crit OR "drop_out.value" > crit)
+
+// Alert: write alert events to the given log file
+alert
+    .log('/tmp/net_alert_log.txt')
diff --git a/cluster_metrics/kapacitor_files/tickscripts/netstat_alert_stream.tick b/cluster_metrics/kapacitor_files/tickscripts/netstat_alert_stream.tick
new file mode 100644
index 00000000..e0acbadf
--- /dev/null
+++ b/cluster_metrics/kapacitor_files/tickscripts/netstat_alert_stream.tick
@@ -0,0 +1,49 @@
+// 
netstat_alert_stream: per-host alert on the windowed mean of tcp_established and on its sigma() value
+
+// metric: 'tcp_established'
+// available_fields: "tcp_close","tcp_close_wait","tcp_closing","tcp_established","tcp_fin_wait1","tcp_fin_wait2","tcp_last_ack","tcp_listen","tcp_none","tcp_syn_recv","tcp_syn_sent","tcp_time_wait","udp_socket"
+
+// TELEGRAF CONFIGURATION
+// [[inputs.netstat]]
+
+// DEFINE: kapacitor define netstat_alert_stream -type stream -tick netstat_alert_stream.tick -dbrp telegraf.autogen
+// ENABLE: kapacitor enable netstat_alert_stream
+
+// Parameters: info/warn/crit are limits for "stat"; infoSig/warnSig/critSig are limits for "sigma"; period/every set the window length and emit interval
+var info = 20
+var warn = 40
+var crit = 60
+var infoSig = 2.5
+var warnSig = 3
+var critSig = 3.5
+var period = 10s
+var every = 10s
+
+// Dataframe: mean of tcp_established per host over each window, exposed as "stat"
+var data = stream
+    |from()
+        .database('telegraf')
+        .retentionPolicy('autogen')
+        .measurement('netstat')
+        .groupBy('host')
+    |window()
+        .period(period)
+        .every(every)
+    |mean('tcp_established')
+        .as('stat')
+
+// Thresholds: sigma("stat") is stored as "sigma" next to "stat"; a level triggers when either value exceeds its limit
+var alert = data
+    |eval(lambda: sigma("stat"))
+        .as('sigma')
+        .keep()
+    |alert()
+        .id('{{ index .Tags "host"}}/tcp_conns')
+        .message('{{ .ID }}:{{ index .Fields "stat" }}')
+        .info(lambda: "stat" > info OR "sigma" > infoSig)
+        .warn(lambda: "stat" > warn OR "sigma" > warnSig)
+        .crit(lambda: "stat" > crit OR "sigma" > critSig)
+
+// Alert: write alert events to the given log file
+alert
+    .log('/tmp/netstat_alert_stream_log.txt')
diff --git a/cluster_metrics/playbook-kapacitor.yml b/cluster_metrics/playbook-kapacitor.yml
index c7944050..e65fa4ab 100644
--- a/cluster_metrics/playbook-kapacitor.yml
+++ b/cluster_metrics/playbook-kapacitor.yml
@@ -31,12 +31,13 @@
       template:
         src: templates/kapacitor.conf.j2
         dest: /etc/kapacitor/kapacitor.conf
-    - name: Enable and restart kapacitor
-      service:
-        name: "kapacitor"
-        enabled: true
-        state: restarted
     - name: Start kapacitor server
-      shell: kapacitord -config /etc/kapacitor/kapacitor.conf -log-file /var/log/kapacitor/kapacitor.log &
+      shell: kapacitord config > /etc/kapacitor/kapacitor.conf;kapacitord -config /etc/kapacitor/kapacitor.conf -log-file /var/log/kapacitor/kapacitor.log &
+    - name: Copy tickscripts
+      copy:
+        src: /opt/openstack-ansible-ops/cluster_metrics/kapacitor_files
+        dest: /opt/kapacitor/
+    - name: Execute tickscripts
+      shell: chmod 755 /opt/kapacitor/kapacitor_files/*.*;bash /opt/kapacitor/kapacitor_files/run_all.sh
   vars_files:
     - vars.yml
diff --git a/cluster_metrics/templates/telegraf.conf.j2 b/cluster_metrics/templates/telegraf.conf.j2
index 580a956e..b33613a5 100644
--- a/cluster_metrics/templates/telegraf.conf.j2
+++ b/cluster_metrics/templates/telegraf.conf.j2
@@ -60,6 +60,11 @@
   data_format = "influx"
 {% endif %}
 
+[[inputs.conntrack]]
+  files = ["ip_conntrack_count","ip_conntrack_max",
+           "nf_conntrack_count","nf_conntrack_max"]
+  dirs = ["/proc/sys/net/ipv4/netfilter","/proc/sys/net/netfilter"]
+
 {% if inventory_hostname in groups['all_containers'] %}
 [[inputs.net]]
 