Added kapacitor scripts

Added kapacitor tickscripts to trigger alerts based on certain
thresholds.

Change-Id: I66d1b1e58d279405637d9a2f06b3aae19fa29cc3
Signed-off-by: Kevin Carter <kevin.carter@rackspace.com>
This commit is contained in:
Nish Patwa 2016-09-29 05:02:53 +00:00 committed by Nish Patwa(nishpatwa007)
parent 6c45b23c4a
commit f0b26e6301
12 changed files with 465 additions and 6 deletions

View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Copyright 2016, Rackspace US, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Delete every task currently registered with Kapacitor.
#
# `kapacitor list tasks` prints a column-header row followed by one row per
# task, with the task ID in the first column. The header row (NR == 1) and any
# blank rows are skipped so only real task IDs are collected.
#
# The IDs are read into a real array (one element per line). When no tasks
# exist the array is empty and the loop below simply does nothing, instead of
# invoking `kapacitor delete tasks` without an ID.
mapfile -t tasks < <(kapacitor list tasks | awk 'NR > 1 && NF { print $1 }')

for task in "${tasks[@]}"; do
  kapacitor delete tasks "${task}"
done

View File

@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Copyright 2016, Rackspace US, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Define and enable one Kapacitor task per TICKscript found under the
# tickscripts directory.
#
# - The task name is the script's file name without its directory and without
#   the ".tick" suffix (e.g. ".../cpu_alert_stream.tick" -> "cpu_alert_stream").
#   A file name containing extra dots keeps its full stem.
# - A script whose path contains "batch" is defined as a batch task; every
#   other script is defined as a stream task.
# - All tasks use the telegraf.autogen database/retention policy.
# - A task is only enabled when its definition succeeded.
tick_dir=/opt/kapacitor/kapacitor_files/tickscripts

# find emits NUL-delimited paths on fd 3 so that paths containing whitespace
# survive intact and the kapacitor commands keep the script's own stdin.
while IFS= read -r -d '' -u 3 tick_file; do
  printf '%s\n' "${tick_file}"

  task_name=${tick_file##*/}
  task_name=${task_name%.tick}

  if [[ ${tick_file} == *"batch"* ]]; then
    task_type=batch
  else
    task_type=stream
  fi

  if kapacitor define "${task_name}" -type "${task_type}" -tick "${tick_file}" -dbrp telegraf.autogen; then
    kapacitor enable "${task_name}"
  else
    printf 'Failed to define task %s from %s; not enabling it\n' \
      "${task_name}" "${tick_file}" >&2
  fi
done 3< <(find "${tick_dir}" -type f -name '*.tick' -print0)

View File

@ -0,0 +1,46 @@
// cinder_vg_alert_stream
//
// Stream task: for each host, takes the mean of `cinder_used_percentage`
// from the `cinder` measurement over a 10s window (emitted every 10s) and
// raises info/warn/crit alerts when that mean exceeds the thresholds below.
// Alerts are written to /tmp/cinder_alert_log.txt.
//
// metric: cinder_used_percentage
// available_fields: "cinder_free_capacity_gb","cinder_total_capacity_gb","cinder_used_percentage"
// TELEGRAF CONFIGURATION
// [[inputs.exec]]
//   commands = ['python /opt/telegraf/cinder_data.py']
//   timeout = "15s"
//   data_format = "influx"
// DEFINE: kapacitor define cinder_vg_alert_stream -type stream -tick cinder_vg_alert_stream.tick -dbrp telegraf.autogen
//         (adjust the -tick path to wherever this file is deployed)
// ENABLE: kapacitor enable cinder_vg_alert_stream

// Alert thresholds, compared against the windowed mean ("stat")
var infoLevel = 69
var warnLevel = 79
var critLevel = 89

// Window length and emit interval
var windowPeriod = 10s
var windowEvery = 10s

// Per-host windowed mean, exposed as field "stat"
var usage = stream
    |from()
        .database('telegraf')
        .retentionPolicy('autogen')
        .measurement('cinder')
        .groupBy('host')
    |window()
        .period(windowPeriod)
        .every(windowEvery)
    |mean('cinder_used_percentage')
        .as('stat')

// Thresholds and log handler on a single alert node
usage
    |alert()
        .id('{{ index .Tags "host"}}/cinder_used')
        .message('{{ .ID }}:{{ index .Fields "stat" }}')
        .info(lambda: "stat" > infoLevel)
        .warn(lambda: "stat" > warnLevel)
        .crit(lambda: "stat" > critLevel)
        .log('/tmp/cinder_alert_log.txt')

View File

@ -0,0 +1,39 @@
// conntrack_alert_batch
//
// Batch task: every 10s queries the last 10s of the `conntrack` measurement
// and computes, per host, mean(ip_conntrack_count) / mean(ip_conntrack_max)
// * 100 as field "stat". Raises info/warn/crit alerts when "stat" exceeds the
// thresholds below. Alerts are written to /tmp/conntrack_alert_log.txt.
//
// metric: {ip_conntrack_count,ip_conntrack_max}
// TELEGRAF CONFIGURATION
// [[inputs.conntrack]]
//   files = ["ip_conntrack_count","ip_conntrack_max",
//            "nf_conntrack_count","nf_conntrack_max"]
//   dirs = ["/proc/sys/net/ipv4/netfilter","/proc/sys/net/netfilter"]
// DEFINE: kapacitor define conntrack_alert_batch -type batch -tick conntrack_alert_batch.tick -dbrp telegraf.autogen
// ENABLE: kapacitor enable conntrack_alert_batch

// Alert thresholds, compared against "stat"
var infoLevel = 69
var warnLevel = 79
var critLevel = 89

// Query range and schedule
var queryPeriod = 10s
var queryEvery = 10s

// Per-host connection-table usage, exposed as field "stat"
var usage = batch
    |query('''select (mean(ip_conntrack_count)/mean(ip_conntrack_max))*100 as stat from "telegraf"."autogen".conntrack''')
        .period(queryPeriod)
        .every(queryEvery)
        .groupBy('host')

// Thresholds and log handler on a single alert node
usage
    |alert()
        .id('{{ index .Tags "host"}}/connection_used')
        .message('{{ .ID }}:{{ index .Fields "stat" }}')
        .info(lambda: "stat" > infoLevel)
        .warn(lambda: "stat" > warnLevel)
        .crit(lambda: "stat" > critLevel)
        .log('/tmp/conntrack_alert_log.txt')

View File

@ -0,0 +1,46 @@
// cpu_alert_stream
//
// Stream task: for each host, takes the mean of `usage_idle` from the `cpu`
// measurement over a 10s window (emitted every 10s) and raises
// info/warn/crit alerts when that mean drops BELOW the thresholds below.
// Alerts are written to /tmp/cpu_alert_log.txt.
//
// metric: usage_idle
// available_fields: "usage_guest","usage_guest_nice","usage_user","usage_iowait", "usage_irq","usage_nice","usage_softirq","usage_steal","usage_system"
// TELEGRAF CONFIGURATION
// [[inputs.cpu]]
//   percpu = true
//   totalcpu = true
//   fielddrop = ["time_*"]
// DEFINE: kapacitor define cpu_alert_stream -type stream -tick cpu/cpu_alert_stream.tick -dbrp telegraf.autogen
// ENABLE: kapacitor enable cpu_alert_stream

// Alert thresholds on idle CPU: a lower value is more severe
var infoLevel = 21
var warnLevel = 11
var critLevel = 2

// Window length and emit interval
var windowPeriod = 10s
var windowEvery = 10s

// Per-host windowed mean of usage_idle, exposed as field "stat"
var idle = stream
    |from()
        .database('telegraf')
        .retentionPolicy('autogen')
        .measurement('cpu')
        .groupBy('host')
    |window()
        .period(windowPeriod)
        .every(windowEvery)
    |mean('usage_idle')
        .as('stat')

// Thresholds and log handler on a single alert node
idle
    |alert()
        .id('{{ index .Tags "host"}}/cpu_used')
        .message('{{ .ID }}:{{ index .Fields "stat" }}')
        .info(lambda: "stat" < infoLevel)
        .warn(lambda: "stat" < warnLevel)
        .crit(lambda: "stat" < critLevel)
        .log('/tmp/cpu_alert_log.txt')

View File

@ -0,0 +1,44 @@
// disk_alert_stream
//
// Stream task: for each host, takes the mean of `used_percent` from the
// `disk` measurement over a 10s window (emitted every 10s) and raises
// info/warn/crit alerts when that mean exceeds the thresholds below.
// Alerts are written to /tmp/disk_alert_log.txt.
// NOTE(review): grouping is by host only, so the mean covers every `disk`
// point the host reports in the window — confirm that is intended.
//
// metric: used_percent
// available_fields: "free","inodes_free","inodes_total","inodes_used","total","used"
// TELEGRAF CONFIGURATION
// [[inputs.disk]]
//   ignore_fs = ["tmpfs", "devtmpfs"]
// DEFINE: kapacitor define disk_alert_stream -type stream -tick disk/disk_alert_stream.tick -dbrp telegraf.autogen
// ENABLE: kapacitor enable disk_alert_stream

// Alert thresholds, compared against the windowed mean ("stat")
var infoLevel = 69
var warnLevel = 79
var critLevel = 89

// Window length and emit interval
var windowPeriod = 10s
var windowEvery = 10s

// Per-host windowed mean of used_percent, exposed as field "stat"
var usage = stream
    |from()
        .database('telegraf')
        .retentionPolicy('autogen')
        .measurement('disk')
        .groupBy('host')
    |window()
        .period(windowPeriod)
        .every(windowEvery)
    |mean('used_percent')
        .as('stat')

// Thresholds and log handler on a single alert node
usage
    |alert()
        .id('{{ index .Tags "host"}}/disk_used')
        .message('{{ .ID }}:{{ index .Fields "stat" }}')
        .info(lambda: "stat" > infoLevel)
        .warn(lambda: "stat" > warnLevel)
        .crit(lambda: "stat" > critLevel)
        .log('/tmp/disk_alert_log.txt')

View File

@ -0,0 +1,75 @@
// interface_speed_alert_stream
//
// Stream task: per host and interface, derives the receive and send byte
// rates (bytes per `unit`) from the `net` measurement, joins them with the
// interface's reported `max_speed` from the `interface_speed` measurement,
// and raises info/warn/crit alerts when recv + sent exceeds the given
// percentage of the link capacity. Alerts are written to
// /tmp/interface_speed_alert_log.txt.
//
// metric: bytes_recv,bytes_sent
// available_fields: "bytes_recv","bytes_sent"
// TELEGRAF CONFIGURATION
// [[inputs.exec]]
//   commands = ['python /opt/telegraf/maxspeed_interface.py']
//   timeout = "15s"
//   data_format = "influx"
// DEFINE: kapacitor define interface_speed_alert_stream -type stream -tick interface_speed_alert_stream.tick -dbrp telegraf.autogen
//         (adjust the -tick path to wherever this file is deployed)
// ENABLE: kapacitor enable interface_speed_alert_stream

// Parameters
// Thresholds are percentages of link capacity; the lambdas divide by 100.
var info = 29.0
var warn = 39.0
var crit = 59.0
// Time unit for the derivative: rates are expressed per second
var unit = 1s

// Dataframe
var rawdata = stream
    |from()
        .database('telegraf')
        .retentionPolicy('autogen')
        .measurement('net')
        .groupBy('host', 'interface')

var interface_speed = stream
    |from()
        .database('telegraf')
        .retentionPolicy('autogen')
        .measurement('interface_speed')
        .groupBy('host', 'interface')

var max_speed = interface_speed
    |mean('max_speed')
        .as('value')

var bytes_recv1 = rawdata
    |derivative('bytes_recv')
        .as('value')
        .nonNegative()
        .unit(unit)

// The derivative above is stored in field 'value', so the mean must read
// 'value'; 'bytes_recv' would refer to the raw counter, not the rate.
var bytes_recv = bytes_recv1
    |mean('value')
        .as('value')

var bytes_sent1 = rawdata
    |derivative('bytes_sent')
        .as('value')
        .nonNegative()
        .unit(unit)

// Same as for bytes_recv: average the derived rate, not the raw counter.
var bytes_sent = bytes_sent1
    |mean('value')
        .as('value')

// After the join, fields are prefixed: bytes_recv.value, max_speed.value,
// bytes_sent.value.
var data = bytes_recv
    |join(max_speed, bytes_sent)
        .as('bytes_recv', 'max_speed', 'bytes_sent')

// Thresholds
// "max_speed.value"*0.125*1000000.0 converts the reported speed to bytes per
// second — this assumes max_speed is in Mbit/s; TODO confirm against
// maxspeed_interface.py. The result is scaled by <threshold>/100.
var alert = data
    |alert()
        .id('{{ index .Tags "host"}}/net_stats')
        .message('{{ .ID }}: bytes_sent:{{ index .Fields "bytes_sent.value"}} bytes_recv:{{ index .Fields "bytes_recv.value"}} max_speed:{{ index .Fields "max_speed.value" }}')
        .info(lambda: "bytes_recv.value"+"bytes_sent.value" > "max_speed.value"*0.125*1000000.0*info/100.0)
        .warn(lambda: "bytes_recv.value"+"bytes_sent.value" > "max_speed.value"*0.125*1000000.0*warn/100.0)
        .crit(lambda: "bytes_recv.value"+"bytes_sent.value" > "max_speed.value"*0.125*1000000.0*crit/100.0)

// Alert
alert
    .log('/tmp/interface_speed_alert_log.txt')

View File

@ -0,0 +1,43 @@
// mem_alert_stream
//
// Stream task: for each host, takes the mean of `used_percent` from the
// `mem` measurement over a 10s window (emitted every 10s) and raises
// info/warn/crit alerts when that mean exceeds the thresholds below.
// Alerts are written to /tmp/mem_alert_log.txt.
//
// metric: used_percent
// available_fields: "active","available","available_percent","buffered","cached","free","inactive","total","used"
// TELEGRAF CONFIGURATION
// [[inputs.mem]]
// DEFINE: kapacitor define mem_alert_stream -type stream -tick mem/mem_alert_stream.tick -dbrp telegraf.autogen
// ENABLE: kapacitor enable mem_alert_stream

// Alert thresholds, compared against the windowed mean ("stat")
var infoLevel = 79
var warnLevel = 89
var critLevel = 98

// Window length and emit interval
var windowPeriod = 10s
var windowEvery = 10s

// Per-host windowed mean of used_percent, exposed as field "stat"
var usage = stream
    |from()
        .database('telegraf')
        .retentionPolicy('autogen')
        .measurement('mem')
        .groupBy('host')
    |window()
        .period(windowPeriod)
        .every(windowEvery)
    |mean('used_percent')
        .as('stat')

// Thresholds and log handler on a single alert node
usage
    |alert()
        .id('{{ index .Tags "host"}}/mem_used')
        .message('{{ .ID }}:{{ index .Fields "stat" }}')
        .info(lambda: "stat" > infoLevel)
        .warn(lambda: "stat" > warnLevel)
        .crit(lambda: "stat" > critLevel)
        .log('/tmp/mem_alert_log.txt')

View File

@ -0,0 +1,66 @@
// net_alert_stream
//
// Stream task: per host and interface, derives the non-negative rate of
// change (per `rateUnit`) of err_in, err_out, drop_in and drop_out from the
// `net` measurement, joins the four rates, and raises info/warn/crit alerts
// when ANY of them exceeds the corresponding threshold. Alerts are written
// to /tmp/net_alert_log.txt.
//
// metric: err_in, err_out, drop_in, drop_out
// available_fields: "bytes_recv","bytes_sent","packets_recv","packets_sent"
// NOTE: More fields are available with the `[[inputs.net]]` plugin on linux
// TELEGRAF CONFIGURATION
// [[inputs.net]]
// DEFINE: kapacitor define net_alert_stream -type stream -tick net/net_alert_stream.tick -dbrp telegraf.autogen
// ENABLE: kapacitor enable net_alert_stream

// Alert thresholds, compared against each derived rate
var infoLevel = 50
var warnLevel = 75
var critLevel = 90

// Time unit for the derivatives
var rateUnit = 1s

// Source points, grouped per host and interface
var netPoints = stream
    |from()
        .database('telegraf')
        .retentionPolicy('autogen')
        .measurement('net')
        .groupBy('host', 'interface')

// One derivative per counter; each stores its rate in field 'value'
var errInRate = netPoints
    |derivative('err_in')
        .as('value')
        .nonNegative()
        .unit(rateUnit)

var errOutRate = netPoints
    |derivative('err_out')
        .as('value')
        .nonNegative()
        .unit(rateUnit)

var dropInRate = netPoints
    |derivative('drop_in')
        .as('value')
        .nonNegative()
        .unit(rateUnit)

var dropOutRate = netPoints
    |derivative('drop_out')
        .as('value')
        .nonNegative()
        .unit(rateUnit)

// Joined fields are prefixed: err_in.value, err_out.value, drop_in.value,
// drop_out.value
var rates = errInRate
    |join(errOutRate, dropInRate, dropOutRate)
        .as('err_in', 'err_out', 'drop_in', 'drop_out')

// Thresholds and log handler on a single alert node
rates
    |alert()
        .id('{{ index .Tags "host"}}/net_stats')
        .message('{{ .ID }}: err_in:{{ index .Fields "err_in.value" }} err_out:{{ index .Fields "err_out.value" }} drop_in:{{ index .Fields "drop_in.value" }} drop_out:{{ index .Fields "drop_out.value" }}')
        .info(lambda: "err_in.value" > infoLevel OR "err_out.value" > infoLevel OR "drop_in.value" > infoLevel OR "drop_out.value" > infoLevel)
        .warn(lambda: "err_in.value" > warnLevel OR "err_out.value" > warnLevel OR "drop_in.value" > warnLevel OR "drop_out.value" > warnLevel)
        .crit(lambda: "err_in.value" > critLevel OR "err_out.value" > critLevel OR "drop_in.value" > critLevel OR "drop_out.value" > critLevel)
        .log('/tmp/net_alert_log.txt')

View File

@ -0,0 +1,49 @@
// netstat_alert_stream
//
// Stream task: for each host, takes the mean of `tcp_established` from the
// `netstat` measurement over a 10s window (emitted every 10s) as field
// "stat", adds sigma("stat") as field "sigma", and raises info/warn/crit
// alerts when EITHER "stat" exceeds its absolute threshold OR "sigma"
// exceeds its sigma threshold. Alerts are written to
// /tmp/netstat_alert_stream_log.txt.
//
// metric: 'tcp_established'
// available_fields: "tcp_close","tcp_close_wait","tcp_closing","tcp_established","tcp_fin_wait1","tcp_fin_wait2","tcp_last_ack","tcp_listen","tcp_none","tcp_syn_recv","tcp_syn_sent","tcp_time_wait","udp_socket"
// TELEGRAF CONFIGURATION
// [[inputs.netstat]]
// DEFINE: kapacitor define netstat_alert_stream -type stream -tick netstat/netstat_alert_stream.tick -dbrp telegraf.autogen
// ENABLE: kapacitor enable netstat_alert_stream

// Absolute thresholds, compared against "stat"
var infoLevel = 20
var warnLevel = 40
var critLevel = 60

// Sigma thresholds, compared against "sigma"
var infoSigma = 2.5
var warnSigma = 3
var critSigma = 3.5

// Window length and emit interval
var windowPeriod = 10s
var windowEvery = 10s

// Per-host windowed mean of tcp_established, exposed as field "stat"
var established = stream
    |from()
        .database('telegraf')
        .retentionPolicy('autogen')
        .measurement('netstat')
        .groupBy('host')
    |window()
        .period(windowPeriod)
        .every(windowEvery)
    |mean('tcp_established')
        .as('stat')

// Add "sigma" next to "stat" (keep() retains the existing fields), then
// apply thresholds and the log handler on a single alert node
established
    |eval(lambda: sigma("stat"))
        .as('sigma')
        .keep()
    |alert()
        .id('{{ index .Tags "host"}}/tcp_conns')
        .message('{{ .ID }}:{{ index .Fields "stat" }}')
        .info(lambda: "stat" > infoLevel OR "sigma" > infoSigma)
        .warn(lambda: "stat" > warnLevel OR "sigma" > warnSigma)
        .crit(lambda: "stat" > critLevel OR "sigma" > critSigma)
        .log('/tmp/netstat_alert_stream_log.txt')

View File

@ -31,12 +31,13 @@
template:
src: templates/kapacitor.conf.j2
dest: /etc/kapacitor/kapacitor.conf
- name: Enable and restart kapacitor
service:
name: "kapacitor"
enabled: true
state: restarted
- name: Start kapacitor server
shell: kapacitord -config /etc/kapacitor/kapacitor.conf -log-file /var/log/kapacitor/kapacitor.log &
shell: kapacitord config > /etc/kapacitor/kapacitor.conf;kapacitord -config /etc/kapacitor/kapacitor.conf -log-file /var/log/kapacitor/kapacitor.log &
- name: Copy tickscripts
copy:
src: /opt/openstack-ansible-ops/cluster_metrics/kapacitor_files
dest: /opt/kapacitor/
- name: Execute tickscripts
shell: chmod 755 /opt/kapacitor/kapacitor_files/*.*;bash /opt/kapacitor/kapacitor_files/run_all.sh
vars_files:
- vars.yml

View File

@ -60,6 +60,11 @@
data_format = "influx"
{% endif %}
[[inputs.conntrack]]
files = ["ip_conntrack_count","ip_conntrack_max",
"nf_conntrack_count","nf_conntrack_max"]
dirs = ["/proc/sys/net/ipv4/netfilter","/proc/sys/net/netfilter"]
{% if inventory_hostname in groups['all_containers'] %}
[[inputs.net]]