Merge "Report Tool: Package and add plugins/correlator"
commit 882eab3cd4

@@ -2941,6 +2941,56 @@ collect_subclouds()
     fi
 }

############################################################################
#
# Name : get_report_tool
#
# Purpose : Fetch report tool from current host
#
# Parameters: $1 - local path destination
#
############################################################################

function get_report_tool()
{
    local local_dest=${1}

    mkdir -p ${local_dest}
    cp -r /usr/local/bin/report/tool ${local_dest}

    local rc=${?}
    if [ ${rc} -ne ${PASS} ] ; then
        report_error "failed to get report tool from /usr/local/bin" ${rc}
    else
        ilog "copied report tool from host"
    fi
}

############################################################################
#
# Name : get_report_plugins
#
# Purpose : Fetch plugins for report tool from current host
#
# Parameters: $1 - local path destination
#
############################################################################

function get_report_plugins()
{
    local local_dest=${1}

    mkdir -p ${local_dest}
    cp -r /etc/collect/plugins ${local_dest}

    local rc=${?}
    if [ ${rc} -ne ${PASS} ] ; then
        report_error "failed to get report plugins from /etc/collect" ${rc}
    else
        ilog "copied plugins for report tool from host"
    fi
}

############################################################################
#
# Handle subcloud and system hosts batched collect
@@ -3031,6 +3081,17 @@ echo -n "creating ${COLLECT_TYPE} tarball ${TARBALL_NAME} ... "

remove_file_local ${COLLECT_ERROR_LOG}
remove_file_local ${HOST_COLLECT_ERROR_LOG}

get_report_tool ${COLLECT_DIR}/report
get_report_plugins ${COLLECT_DIR}/report

cd ${COLLECT_DIR}
tar -czf report_tool.tgz report
rc=${?}
if [ ${rc} -ne ${PASS} ] ; then
    report_error "failed to tar report tool" ${rc}
else
    rm -r report
fi

/usr/bin/expect << EOF
log_user ${USER_LOG_MODE}
81  tools/collector/debian-scripts/report/README  Normal file

@@ -0,0 +1,81 @@
Refer to the report.py file header for a description of the tool.

Example:

Consider the following collect bundle structure

SELECT_NODES_20220527.193605
├── controller-0_20220527.193605
│   ├── etc
│   ├── root
│   └── var
├── controller-1_20220527.193605
│   ├── etc
│   ├── root
│   └── var
├── report
│   ├── plugins (where the plugin files will be placed)
│   │   ├── alarm
│   │   ├── substring
│   │   └── ...
│   ├── tool (where the tool will be placed)
│   └── output (where the output files will be placed)


> cat plugins/alarm

algorithm=alarm
alarm_exclude=400., 800.
entity_exclude=subsystem=vim

> cat plugins/substring

algorithm=substring
files=var/log/mtcAgent.log, var/log/sm.log
hosts=controllers
substring=operation failed
substring=Failed to send message

> report/tool/report.py --start 20220501 --end 20220530

Running the command above will populate the report folder with output files.
The tool also provides default values; more details are in 'report.py -h'.

The substring algorithm creates an output file for every host of the
specified host type. The files will contain log events within the
provided date range containing the substrings 'operation failed' and 'Failed
to send message'.

The alarm algorithm creates two output files: 'log' and 'alarm'.
'log' contains customer log messages created within the provided date range,
and 'alarm' contains system alarms created within the provided date range, as
long as the alarm ids and entity ids are not included in the alarm plugin file.

For more detailed information about an algorithm use 'report.py <algorithm> -h'.

Here is the report directory after running the above command

report
├── output
│   └── SELECT_NODES_20220527.193605 (collect bundle that the report tool was run on)
│       ├── plugins (output files for plugins)
│       │   ├── alarm
│       │   └── ...
│       ├── correlator_failures
│       ├── correlator_events
│       ├── correlator_state_changes
│       ├── report.log (log file for report tool)
│       └── untar.log (log file for untarring collect bundle and host tar files)
├── plugins (where the plugin files are)
└── tool (where the report tool is)

The report tool also allows users to point it at any collect bundle and
have it automatically extract the tarball and tar files for each host
before running.

> report/tool/report.py -d CGTS-19143

Users may specify if they want the correlator to only find events
and state changes for a specific host.

> report/tool/report.py --hostname controller-0
@@ -9,8 +9,12 @@
 # Algorithm string constants
 ALARM = "alarm"
 AUDIT = "audit"
-PROCESS_FAILURE = "process_failure"
-PUPPET = "puppet"
+DAEMON_FAILURES = "daemon_failures"
+HEARTBEAT_LOSS = "heartbeat_loss"
+MAINTENANCE_ERR = "maintenance_errors"
+PROCESS_FAILURES = "process_failures"
+PUPPET_ERRORS = "puppet_errors"
+STATE_CHANGES = "state_changes"
 SUBSTRING = "substring"
-SWACT = "swact"
+SWACT_ACTIVITY = "swact_activity"
 SYSTEM_INFO = "system_info"
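For illustration only: a plugin file that selects one of the new algorithm constants above could follow the same format as the README's plugin samples. This is a hypothetical example, not part of the commit; the exact fields each algorithm accepts are defined by the plugin parser, which is not shown in this diff.

> cat plugins/heartbeat_loss

algorithm=heartbeat_loss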
583  tools/collector/debian-scripts/report/correlator.py  Executable file

@@ -0,0 +1,583 @@
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
########################################################################
#
# This file contains the Correlator class.
# The Correlator class contains algorithms that search for failures.
#
# The Correlator class reads through all the output files created by
# the plugins and detects failures. A summary of the failures and their
# causes is printed to standard output and an output file is created
# in the report directory.
#
# TODO: Modularize code and separate methods into their own files
#
########################################################################

from datetime import datetime
from datetime import timedelta
import logging
import os
import re


logger = logging.getLogger(__name__)


class Correlator:
    def __init__(self, plugin_output_dir):
        """Constructor for the Correlator class

        Parameters:
            plugin_output_dir (string): Path to directory with output files
                                        from plugins
        """
        self.plugin_output_dir = plugin_output_dir

    def run(self, hostname):
        """Searches through the output files created by the plugins for
        failures and determines their causes, as well as extracts significant
        events and state changes

        Errors:
            FileNotFoundError
        """
        failures = []
        try:
            failures += self.uncontrolled_swact()
        except FileNotFoundError as e:
            logger.error(e)

        try:
            failures += self.mtc_errors()
        except FileNotFoundError as e:
            logger.error(e)

        events = []
        try:
            events += self.get_events(hostname)
        except FileNotFoundError as e:
            logger.error(e)

        alarms = []
        try:
            alarms += self.get_alarms(hostname)
        except FileNotFoundError as e:
            logger.error(e)

        state_changes = []
        try:
            state_changes += self.get_state_changes(hostname)
        except FileNotFoundError as e:
            logger.error(e)

        return (sorted(failures), sorted(events), sorted(alarms),
                sorted(state_changes))

    def uncontrolled_swact(self):
        """Searches through the output file created by the swact activity
        plugin for uncontrolled swacts and determines their causes through
        other indicators, like the log "Neighbour [..] is now in the down"

        Errors:
            FileNotFoundError
        """
        data = []

        # Variables to keep track of indicators for failure causes
        start_time = end_time = svc_failed = None
        ctrlr_down = None       # Active controller that went down, causing swact
        ctrlr_svc_fail = None   # Active controller where service failed
        ctrlr_link_down = None  # Orig. active controller when link went down
        hb_loss = active_failed = go_active_failed = link_down = False

        # Open output file from swact activity plugin and read it
        file_path = os.path.join(self.plugin_output_dir, "swact_activity")

        with open(file_path, "r") as swact_activity:
            for line in swact_activity:
                if "Uncontrolled swact" in line and not start_time:
                    start_time = datetime.strptime(line[0:19],
                                                   "%Y-%m-%dT%H:%M:%S")
                    if ("Host from active to failed, Peer from standby to "
                            "active" in line):
                        link_down = True
                        ctrlr_link_down = re.findall(
                            r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) "
                            "sm:", line)[0]
                elif (re.search("Neighbor (.+) is now in the down", line)
                        and start_time and not ctrlr_down):
                    ctrlr_down = re.findall(
                        r"Neighbor \((.+)\) received event", line)[0]
                elif (re.search("Service (.+) is failed and has reached max "
                                "failures", line) and not svc_failed):
                    svc_failed = re.findall(
                        r"Service \((.+)\) is failed", line)[0]
                    ctrlr_svc_fail = re.findall(
                        r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:",
                        line)[0]
                elif (svc_failed and re.search(
                        "active-failed\\s+\\| disabling-failed\\s+\\| "
                        + svc_failed, line)):
                    if re.search(r"\| go-active-failed\s+\|", line):
                        go_active_failed = True
                    else:
                        active_failed = True
                elif "Swact update" in line and start_time and not end_time:
                    end_time = datetime.strptime(line[0:19],
                                                 "%Y-%m-%dT%H:%M:%S")
                    if ctrlr_down:
                        try:
                            hb_loss = self.search_hb_loss(
                                start_time, end_time, ctrlr_down)
                        except FileNotFoundError as e:
                            logger.error(e)

                    start_time = start_time.strftime("%Y-%m-%dT%H:%M:%S")
                    end_time = end_time.strftime("%Y-%m-%dT%H:%M:%S")
                    if link_down:
                        data.append(start_time + " to " + end_time
                                    + " Uncontrolled swact, refer to SM logs "
                                    "for in-depth analysis, original active "
                                    "controller: " + ctrlr_link_down + "\n")
                    elif ctrlr_down:
                        if hb_loss:
                            data.append(start_time + " to " + end_time
                                        + " Uncontrolled swact due to "
                                        "spontaneous reset of active "
                                        "controller " + ctrlr_down + "\n")
                        else:
                            data.append(start_time + " to " + end_time
                                        + " Uncontrolled swact likely due to "
                                        "spontaneous reset of active "
                                        "controller " + ctrlr_down + "\n")
                    elif svc_failed:
                        if active_failed and go_active_failed:
                            data.append(start_time + " to " + end_time
                                        + " Uncontrolled swact due to service "
                                        "failure (" + svc_failed + ") twice "
                                        "in 2 minutes was unsuccessful so "
                                        "\"bounced back\" to original active "
                                        "controller " + ctrlr_svc_fail + "\n")
                        elif active_failed:
                            data.append(start_time + " to " + end_time
                                        + " Uncontrolled swact due to service "
                                        "failure (" + svc_failed + ") twice "
                                        "in 2 minutes on active controller "
                                        + ctrlr_svc_fail + "\n")
                        else:
                            data.append(start_time + " to " + end_time
                                        + " Uncontrolled swact likely due to "
                                        "service failure (" + svc_failed
                                        + ") twice in 2 minutes on active "
                                        "controller " + ctrlr_svc_fail + "\n")

                    start_time = end_time = svc_failed = None
                    ctrlr_down = ctrlr_svc_fail = ctrlr_link_down = None
                    hb_loss = active_failed = go_active_failed = False
                    link_down = False

        return data

    def mtc_errors(self):
        """Searches through the output file created by the maintenance errors
        plugin for failures and determines their causes through other
        indicators, like the log "Loss Of Communication for 5 seconds"

        Errors:
            FileNotFoundError
        """
        data = []

        # Variables to keep track of indicators for failure causes
        goenable_start = goenable_end = goenable_host = None
        goenable_tst_f = config_tst_f = None  # Tests failed
        config_start = config_end = config_host = puppet_error = None
        hb_loss_start = hb_loss_end = hb_loss_host = None
        daemon_fail = comm_loss = auto_recov_dis = False

        # Open output file from maintenance errors plugin and read it
        file_path = os.path.join(self.plugin_output_dir, "maintenance_errors")

        with open(file_path, "r") as mtc:
            for line in mtc:
                if "auto recovery disabled" in line and not auto_recov_dis:
                    # Check if previous failure recorded was go-enable,
                    # configuration or heartbeat failure
                    if (data and
                            re.search(r"Go-enable|[cC]onfiguration|Heartbeat",
                                      data[-1])):
                        host = re.findall(r"failure on ([^\s]+)", data[-1])
                        # Check if host in auto recovery disabled mode is same
                        # as host with previous failure
                        if (host and re.search(
                                host[0] + " auto recovery disabled", line)):
                            old = data[-1].split("due", 1)
                            if len(old) == 1:
                                data[-1] = (data[-1][:-1]
                                            + " (auto recovery disabled)\n")
                            else:
                                data[-1] = (old[0]
                                            + "(auto recovery disabled) due"
                                            + old[1])
                            auto_recov_dis = True
                elif "GOENABLED Failed" in line and not goenable_start:
                    goenable_start, auto_recov_dis = line[0:19], False
                    goenable_host = re.findall(
                        "Error : (.+) got GOENABLED Failed", line)[0]
                elif ("configuration failed or incomplete" in line
                        and not config_start):
                    config_start = datetime.strptime(line[0:19],
                                                     "%Y-%m-%dT%H:%M:%S")
                    auto_recov_dis = False
                    config_host = re.findall(
                        "Error : (.+) configuration failed", line)[0]
                elif "Heartbeat Loss" in line:
                    # Check if previous failure recorded was heartbeat loss
                    # due to missing heartbeat messages
                    if ("(during recovery soak)" in line and data and
                            re.search("missing heartbeat messages", data[-1])):
                        host = re.findall(
                            "failure on (.+) due to", data[-1])[0]
                        # Check if host with heartbeat loss failure is the same
                        # as host with previous failure
                        if (re.search(host + " (.+) Heartbeat Loss (.+) "
                                      "\\(during recovery soak\\)", line)):
                            old = data[-1]
                            data[-1] = (old[0:23] + line[0:19] + old[42:-1]
                                        + " (recovery over disabled due to "
                                        "heartbeat soak failure)\n")
                    else:
                        hb_loss_start = line[0:19]
                        comm_loss = auto_recov_dis = False
                        hb_loss_host = re.findall("Error : (.+) [CM]", line)[0]
                # Check if previous failure recorded was heartbeat loss due to
                # missing heartbeat messages
                elif ("regained MTCALIVE from host that has rebooted" in line
                        and data and re.search(r"Heartbeat loss failure (.+) "
                                               r"\(recovery over disabled\)",
                                               data[-1])):
                    host = re.findall("failure on (.+) due to", data[-1])[0]
                    if re.search(host + " regained MTCALIVE", line):
                        old = data[-1].split("due", 1)[0]
                        data[-1] = (old[0:23] + line[0:19] + old[42:]
                                    + "due to uncontrolled reboot\n")
                elif (hb_loss_start and not comm_loss and hb_loss_host and
                        re.search(hb_loss_host + " Loss Of Communication for 5 "
                                  "seconds", line)):
                    comm_loss = True
                elif re.search("mtcClient --- (.+)Error : FAILED:", line):
                    if goenable_start and not goenable_tst_f:
                        goenable_tst_f = re.findall(
                            r"Error : FAILED: (.+) \(\d", line)[0]
                    elif config_start and not config_tst_f:
                        config_tst_f = re.findall(
                            r"Error : FAILED: (.+) \(\d", line)[0]
                elif (goenable_host and not goenable_end and
                        re.search(goenable_host + " Task: In-Test Failure, "
                                  "threshold reached", line)):
                    goenable_end = line[0:19]
                    if goenable_tst_f:
                        data.append(goenable_start + " to " + goenable_end
                                    + " Go-enable test failure on "
                                    + goenable_host + " due to failing of "
                                    + goenable_tst_f + "\n")
                    else:
                        data.append(goenable_start + " to " + goenable_end
                                    + " Go-enable test failure on "
                                    + goenable_host + " due to unknown test "
                                    "failing\n")

                    goenable_start = goenable_end = goenable_host = None
                    goenable_tst_f = None
                elif (config_host and not config_end and
                        re.search(config_host + " Task: Configuration failure, "
                                  "threshold reached", line)):
                    config_end = datetime.strptime(line[0:19],
                                                   "%Y-%m-%dT%H:%M:%S")
                    if (config_tst_f
                            != "/etc/goenabled.d/config_goenabled_check.sh"):
                        try:
                            daemon_fail = self.search_daemon_fail(
                                config_start, config_end, config_host)
                        except FileNotFoundError as e:
                            logger.error(e)

                    if (config_tst_f ==
                            "/etc/goenabled.d/config_goenabled_check.sh"
                            or daemon_fail):
                        try:
                            puppet_error = self.search_puppet_error(
                                config_start, config_end)
                        except FileNotFoundError as e:
                            logger.error(e)

                        config_start = config_start.strftime(
                            "%Y-%m-%dT%H:%M:%S")
                        config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S")
                        if puppet_error:
                            data.append(config_start + " to " + config_end
                                        + " Configuration failure on "
                                        + config_host + " due to:\n"
                                        + puppet_error)
                        else:
                            data.append(config_start + " to " + config_end
                                        + " Configuration failure on "
                                        + config_host
                                        + " due to unknown cause\n")
                    else:
                        config_start = config_start.strftime(
                            "%Y-%m-%dT%H:%M:%S")
                        config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S")
                        data.append(config_start + " to " + config_end
                                    + " Possible configuration failure on "
                                    + config_host + "\n")

                    config_start = config_end = config_host = None
                    config_tst_f = puppet_error = None
                    daemon_fail = False
                elif (hb_loss_start and not hb_loss_end and hb_loss_host and
                        re.search(hb_loss_host + " Connectivity Recovered ",
                                  line)):
                    hb_loss_end = line[0:19]
                    data.append(hb_loss_start + " to " + hb_loss_end
                                + " Heartbeat loss failure on " + hb_loss_host
                                + " due to too many missing heartbeat "
                                "messages\n")

                    hb_loss_start = hb_loss_end = hb_loss_host = None
                    comm_loss = False
                elif (hb_loss_start and comm_loss and not hb_loss_end and
                        hb_loss_host and re.search(
                            hb_loss_host + " Graceful Recovery Wait", line)):
                    hb_loss_end = line[0:19]
                    data.append(hb_loss_start + " to " + hb_loss_end
                                + " Heartbeat loss failure on " + hb_loss_host
                                + " due to too many missing heartbeat "
                                "messages (recovery over disabled)\n")

                    hb_loss_start = hb_loss_end = hb_loss_host = None
                    comm_loss = False

        return data

    def search_hb_loss(self, start_time, end_time, host):
        """Searches through the output file created by the heartbeat loss
        plugin for "Heartbeat Loss" message from host between one minute before
        start_time and end_time

        Errors:
            FileNotFoundError
        """
        hb_loss = False

        # Open output file from heartbeat loss plugin and read it
        file_path = os.path.join(self.plugin_output_dir, "heartbeat_loss")

        with open(file_path, "r") as heartbeat_loss:
            for line in heartbeat_loss:
                if (re.search("Error : " + host + " (.+) Heartbeat Loss ",
                              line)):
                    date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
                    if (date >= start_time - timedelta(minutes=1)
                            and date <= end_time):
                        hb_loss = True
                        break

        return hb_loss

    def search_daemon_fail(self, start_time, end_time, host):
        """Searches through the output file created by the daemon failures
        plugin for "Failed to run the puppet manifest" message from host
        between 10 seconds before start_time and end_time

        Errors:
            FileNotFoundError
        """
        daemon_fail = False

        # Open output file from daemon failures plugin and read it
        file_path = os.path.join(self.plugin_output_dir, "daemon_failures")

        with open(file_path, "r") as daemon_failures:
            for line in daemon_failures:
                if (re.search("\\d " + host
                              + " (.+) Failed to run the puppet manifest",
                              line)):
                    date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
                    if (date >= start_time - timedelta(seconds=10)
                            and date <= end_time):
                        daemon_fail = True
                        break

        return daemon_fail

    def search_puppet_error(self, start_time, end_time):
        """Searches through the output file created by the puppet errors
        plugin for "Error:" message between 10 seconds before start_time and
        end_time and returns it

        Errors:
            FileNotFoundError
        """
        puppet_log = None

        # Open output file from puppet errors plugin and read it
        file_path = os.path.join(self.plugin_output_dir, "puppet_errors")

        with open(file_path, "r") as puppet_errors:
            for line in puppet_errors:
                if "Error: " in line:
                    date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
                    if (date >= start_time - timedelta(seconds=10)
                            and date <= end_time):
                        puppet_log = line
                        break

        return puppet_log

    def get_events(self, hostname):
        """Searches through the output files created by the plugins for
        significant events and summarizes them, such as "force failed by SM"

        Errors:
            FileNotFoundError
        """
        data = []

        # Variables to keep track of details for events
        mnfa_start, mnfa_hist = None, ""

        # Open output file from maintenance errors plugin and read it
        file_path = os.path.join(self.plugin_output_dir, "maintenance_errors")

        with open(file_path, "r") as mtc:
            for line in mtc:
                if "force failed by SM" in line:
                    host = re.findall("Error : (.+) is being", line)[0]
                    if hostname == "all" or host == hostname:
                        data.append(line[0:19] + " " + host
                                    + " force failed by SM\n")
                elif "Graceful Recovery Failed" in line:
                    host = re.findall("Info : (.+) Task:", line)[0]
                    if hostname == "all" or host == hostname:
                        data.append(line[0:19] + " " + host
                                    + " graceful recovery failed\n")
                elif "MNFA ENTER" in line:
                    mnfa_start = datetime.strptime(line[0:19],
                                                   "%Y-%m-%dT%H:%M:%S")
                elif "MNFA POOL" in line:
                    pool_hosts = len(line.split("MNFA POOL: ")[1].split())
                    if mnfa_start:
                        mnfa_hist += (" " + str(pool_hosts))
                    else:
                        data_len = len(data)
                        for n in range(0, data_len):
                            event = data[data_len - 1 - n]
                            if "Multi-node failure" in event:
                                temp = " " + str(pool_hosts) + ")\n"
                                data[data_len - 1 - n] = event[:-2] + temp
                                break
                elif "MNFA EXIT" in line:
                    mnfa_duration = datetime.strptime(line[0:19],
                                                      "%Y-%m-%dT%H:%M:%S")
                    mnfa_duration -= mnfa_start
                    mnfa_start = mnfa_start.strftime("%Y-%m-%dT%H:%M:%S")
                    data.append(mnfa_start + " Multi-node failure avoidance "
                                + "(duration: " + str(mnfa_duration)
                                + "; history:" + mnfa_hist + ")\n")

                    mnfa_start, mnfa_hist = None, ""

        # Open output file from swact activity plugin and read it
        file_path = os.path.join(self.plugin_output_dir, "swact_activity")

        with open(file_path, "r") as swact_activity:
            for line in swact_activity:
                if (re.search("Service (.+) is failed and has reached max "
                              "failures", line)):
                    host = re.findall(
                        r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:",
                        line)[0]
                    svc_failed = re.findall(
                        r"Service \((.+)\) is failed", line)[0]
                    if hostname == "all" or host == hostname:
                        data.append(line[0:19] + " " + host
                                    + " service failure (" + svc_failed
                                    + ")\n")

        return data

    def get_alarms(self, hostname):
        """Searches through the 'alarm' output file created by the alarm plugin
        and summarizes which alarms were found as well as the number of times
        they were set and cleared

        Errors:
            FileNotFoundError
        """
        data = []

        # Open 'alarm' output file from alarm plugin and read it
        file_path = os.path.join(self.plugin_output_dir, "alarm")

        with open(file_path, "r") as alarm:
            extract = False
            for line in alarm:
                if re.search(" \\d", line) and extract:
                    if line.split()[2] == "set":
                        data[-1]["set"] += 1
                    else:
                        data[-1]["clear"] += 1
                elif hostname == "all" or hostname in line:
                    extract = True
                    alarm = {
                        "name": line[:-1],
                        "set": 0,
                        "clear": 0,
                    }

                    data.append(alarm)
                else:
                    extract = False

        temp = []
        for entry in data:
            temp.append(entry["name"] + " - set: " + str(entry["set"])
                        + ", clear: " + str(entry["clear"]) + "\n")
        data = temp

        return data

    def get_state_changes(self, hostname):
        """Searches through the output files created by the state changes
        plugin and summarizes the changes of state of the hosts, such as
        "is ENABLED"

        Errors:
            FileNotFoundError
        """
        data = []

        # Open output file from state changes plugin and read it
        file_path = os.path.join(self.plugin_output_dir, "state_changes")

        with open(file_path, "r") as state_changes:
            for line in state_changes:
                if "is ENABLED" in line:
                    host = re.findall("Info : (.+) is ENABLED", line)[0]
                    state = re.findall("is (.+)\n", line)[0].lower()
                    if hostname == "all" or hostname in host:
                        data.append(line[0:19] + " " + host + " " + state
                                    + "\n")
                elif "locked-disabled" in line:
                    host = re.findall(
                        "Info : (.+) u?n?locked-disabled", line)[0]
                    if hostname == "all" or host == hostname:
                        data.append(line[0:19] + " " + host + " disabled\n")

        return data
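A minimal sketch of driving the Correlator class directly, assuming a plugin output directory laid out as in the README above. The bundle name and path are illustrative, not part of the commit; the constructor signature, the run() call, and the four returned lists come from the file shown here.

```python
import logging

from correlator import Correlator

logging.basicConfig(level=logging.INFO)

# Directory that already holds plugin output files such as
# "swact_activity", "maintenance_errors", "alarm" and "state_changes"
# (hypothetical path following the README's example bundle)
plugin_output_dir = "report/output/SELECT_NODES_20220527.193605/plugins"

correlator = Correlator(plugin_output_dir)

# "all" matches every host; passing a hostname narrows events/state changes
failures, events, alarms, state_changes = correlator.run("all")

print("failures:", len(failures))
print("events:", len(events))
print("alarms:", len(alarms))
print("state changes:", len(state_changes))
```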
869  tools/collector/debian-scripts/report/execution_engine.py  Executable file

@@ -0,0 +1,869 @@
########################################################################
|
||||||
|
#
|
||||||
|
# Copyright (c) 2022 Wind River Systems, Inc.
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
#
|
||||||
|
########################################################################
|
||||||
|
#
|
||||||
|
# This file contains the ExecutionEngine class.
|
||||||
|
# The ExecutionEngine class contains all the available algorithms.
|
||||||
|
#
|
||||||
|
# The ExecutionEngine class runs plugins and gathers relevant logs and
|
||||||
|
# information, creating output files in the report directory.
|
||||||
|
#
|
||||||
|
# TODO: Modularize code and separate plugin algorithms into their own
|
||||||
|
# files
|
||||||
|
#
|
||||||
|
########################################################################
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
import gzip
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import tarfile
|
||||||
|
|
||||||
|
import algorithms
|
||||||
|
from correlator import Correlator
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ExecutionEngine:
|
||||||
|
def __init__(self, opts, output_directory):
|
||||||
|
"""Constructor for the ExecutionEngine class
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
opts (dictionary): Options from command line
|
||||||
|
"""
|
||||||
|
self.opts = opts
|
||||||
|
self.hosts = {"controllers": {}, "workers": {}, "storages": {}}
|
||||||
|
self.active_controller_directory = None
|
||||||
|
|
||||||
|
# Uncompresses host tar files if not already done
|
||||||
|
with open(os.path.join(output_directory, "untar.log"), "a") as logfile:
|
||||||
|
for obj in (os.scandir(self.opts.directory)):
|
||||||
|
info = os.path.splitext(obj.name)
|
||||||
|
if (obj.is_file() and obj.name != "report_tool.tgz" and
|
||||||
|
tarfile.is_tarfile(obj.path) and not
|
||||||
|
os.path.isdir(os.path.join(self.opts.directory,
|
||||||
|
info[0]))):
|
||||||
|
try:
|
||||||
|
subprocess.run(["tar", "xzfC", obj.path,
|
||||||
|
self.opts.directory],
|
||||||
|
stderr=logfile, check=True)
|
||||||
|
subprocess.run(["echo", "uncompressed", obj.name],
|
||||||
|
check=True)
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
logger.error(e)
|
||||||
|
|
||||||
|
for folder in (f.path for f in os.scandir(self.opts.directory)):
|
||||||
|
database_path = os.path.join(folder, "var", "extra", "database")
|
||||||
|
host_info_path = os.path.join(folder, "var", "extra", "host.info")
|
||||||
|
|
||||||
|
if os.path.isdir(database_path) and os.listdir(database_path):
|
||||||
|
self.active_controller_directory = folder
|
||||||
|
|
||||||
|
if os.path.exists(host_info_path):
|
||||||
|
hostname, subfunction = self._extract_subfunction(
|
||||||
|
host_info_path)
|
||||||
|
if "controller" in subfunction:
|
||||||
|
self.hosts["controllers"][hostname] = folder
|
||||||
|
elif "worker" in subfunction:
|
||||||
|
self.hosts["workers"][hostname] = folder
|
||||||
|
elif "storage" in subfunction:
|
||||||
|
self.hosts["storages"][hostname] = folder
|
||||||
|
|
||||||
|
if not self.active_controller_directory:
|
||||||
|
raise ValueError("Active controller not found")
|
||||||
|
|
||||||
|
def execute(self, plugins, output_directory):
|
||||||
|
"""Run a list of plugins
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
plugins (Plugin list): List of plugins to run
|
||||||
|
|
||||||
|
Errors:
|
||||||
|
FileNotFoundError
|
||||||
|
"""
|
||||||
|
plugin_output_dir = os.path.join(output_directory, "plugins")
|
||||||
|
os.makedirs(plugin_output_dir, exist_ok=True)
|
||||||
|
|
||||||
|
for plugin in plugins:
|
||||||
|
processing = "Processing plugin: " + os.path.basename(plugin.file)
|
||||||
|
hosts = {}
|
||||||
|
if (
|
||||||
|
plugin.state["hosts"] and len(plugin.state["hosts"]) >= 1
|
||||||
|
): # if host list is given
|
||||||
|
logger.info(
|
||||||
|
f"Processing plugin: {os.path.basename(plugin.file)}")
|
||||||
|
|
||||||
|
for h in plugin.state["hosts"]:
|
||||||
|
if h == "all":
|
||||||
|
hosts.update(self.hosts["workers"])
|
||||||
|
hosts.update(self.hosts["storages"])
|
||||||
|
hosts.update(self.hosts["controllers"])
|
||||||
|
else:
|
||||||
|
hosts.update(self.hosts[h])
|
||||||
|
|
||||||
|
for hostname, folderpath in hosts.items():
|
||||||
|
|
||||||
|
events = []
|
||||||
|
if plugin.state["algorithm"] == algorithms.SUBSTRING:
|
||||||
|
events = self.substring(
|
||||||
|
plugin.state["substring"],
|
||||||
|
[
|
||||||
|
os.path.join(folderpath, file)
|
||||||
|
for file in plugin.state["files"]
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
# creating output file
|
||||||
|
output_file = os.path.join(
|
||||||
|
plugin_output_dir,
|
||||||
|
f"substring_{hostname}",
|
||||||
|
)
|
||||||
|
if self.opts.verbose:
|
||||||
|
logger.info("output at "
|
||||||
|
+ os.path.relpath(output_file))
|
||||||
|
with open(output_file, "w") as file:
|
||||||
|
file.write(
|
||||||
|
f"Date range: {self.opts.start} until "
|
||||||
|
f"{self.opts.end}\n"
|
||||||
|
)
|
||||||
|
file.write(
|
||||||
|
f"substrings: "
|
||||||
|
f"{' '.join(plugin.state['substring'])}\n"
|
||||||
|
)
|
||||||
|
for line in events:
|
||||||
|
if line[-1] == "\n":
|
||||||
|
file.write(line)
|
||||||
|
else:
|
||||||
|
file.write(line + "\n")
|
||||||
|
else:
|
||||||
|
if plugin.state["algorithm"] == algorithms.SYSTEM_INFO:
|
||||||
|
info = self.system_info()
|
||||||
|
system_info_output = os.path.join(plugin_output_dir,
|
||||||
|
"system_info")
|
||||||
|
with open(system_info_output, "w") as file:
|
||||||
|
for i in info:
|
||||||
|
file.write(i + "\n")
|
||||||
|
|
||||||
|
for k, v in self.hosts.items():
|
||||||
|
file.write(f"{k}: {','.join(v.keys())}\n")
|
||||||
|
if self.opts.verbose:
|
||||||
|
logger.info(processing + ", output at "
|
||||||
|
+ os.path.relpath(system_info_output))
|
||||||
|
else:
|
||||||
|
logger.info(processing)
|
||||||
|
|
||||||
|
elif plugin.state["algorithm"] == algorithms.AUDIT:
|
||||||
|
hosts = {}
|
||||||
|
hosts.update(self.hosts["workers"])
|
||||||
|
hosts.update(self.hosts["storages"])
|
||||||
|
hosts.update(self.hosts["controllers"])
|
||||||
|
|
||||||
|
for hostname, folderpath in hosts.items():
|
||||||
|
self._create_output_file(
|
||||||
|
f"{hostname}_audit",
|
||||||
|
plugin_output_dir,
|
||||||
|
self.audit(
|
||||||
|
plugin.state["start"],
|
||||||
|
plugin.state["end"],
|
||||||
|
os.path.join(
|
||||||
|
folderpath, "var", "log", "dcmanager",
|
||||||
|
"audit.log"
|
||||||
|
),
|
||||||
|
),
|
||||||
|
processing,
|
||||||
|
)
|
||||||
|
|
||||||
|
elif plugin.state["algorithm"] == algorithms.SWACT_ACTIVITY:
|
||||||
|
self._create_output_file(
|
||||||
|
"swact_activity", plugin_output_dir,
|
||||||
|
self.swact_activity(), processing
|
||||||
|
)
|
||||||
|
|
||||||
|
elif plugin.state["algorithm"] == algorithms.PUPPET_ERRORS:
|
||||||
|
self._create_output_file(
|
||||||
|
"puppet_errors", plugin_output_dir,
|
||||||
|
self.puppet_errors(), processing
|
||||||
|
)
|
||||||
|
|
||||||
|
elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURES:
|
||||||
|
self._create_output_file(
|
||||||
|
"process_failures", plugin_output_dir,
|
||||||
|
self.process_failures(), processing
|
||||||
|
)
|
||||||
|
|
||||||
|
elif plugin.state["algorithm"] == algorithms.ALARM:
|
||||||
|
alarms, logs = self.alarm(
|
||||||
|
plugin.state["alarm_exclude"],
|
||||||
|
plugin.state["entity_exclude"]
|
||||||
|
)
|
||||||
|
alarm_output = os.path.join(plugin_output_dir, "alarm")
|
||||||
|
log_output = os.path.join(plugin_output_dir, "log")
|
||||||
|
|
||||||
|
# creating output alarm file
|
||||||
|
with open(alarm_output, "w") as file:
|
||||||
|
for k, v in alarms.items():
|
||||||
|
file.write(f"{k}:\n")
|
||||||
|
for date in v["dates"]:
|
||||||
|
file.write(f" {date}\n")
|
||||||
|
|
||||||
|
# creating output log file
|
||||||
|
with open(log_output, "w") as file:
|
||||||
|
for k, v in logs.items():
|
||||||
|
file.write(f"{k}: {v['count']}\n")
|
||||||
|
file.write("\n")
|
||||||
|
for k, v in logs.items():
|
||||||
|
file.write(f"{k}:\n")
|
||||||
|
for date in v["dates"]:
|
||||||
|
file.write(f" {date}\n")
|
||||||
|
if self.opts.verbose:
|
||||||
|
logger.info(processing + ", output at "
|
||||||
|
+ os.path.relpath(alarm_output)
|
||||||
|
+ ", " + os.path.relpath(log_output))
|
||||||
|
else:
|
||||||
|
logger.info(processing)
|
||||||
|
elif plugin.state["algorithm"] == algorithms.HEARTBEAT_LOSS:
|
||||||
|
self._create_output_file(
|
||||||
|
"heartbeat_loss", plugin_output_dir,
|
||||||
|
self.heartbeat_loss(), processing
|
||||||
|
)
|
||||||
|
elif plugin.state["algorithm"] == algorithms.MAINTENANCE_ERR:
|
||||||
|
self._create_output_file(
|
||||||
|
"maintenance_errors", plugin_output_dir,
|
||||||
|
self.maintenance_errors(), processing
|
||||||
|
)
|
||||||
|
elif plugin.state["algorithm"] == algorithms.DAEMON_FAILURES:
|
||||||
|
self._create_output_file(
|
||||||
|
"daemon_failures", plugin_output_dir,
|
||||||
|
self.daemon_failures(), processing
|
||||||
|
)
|
||||||
|
elif plugin.state["algorithm"] == algorithms.STATE_CHANGES:
|
||||||
|
self._create_output_file(
|
||||||
|
"state_changes", plugin_output_dir,
|
||||||
|
self.state_changes(), processing
|
||||||
|
)
|
||||||
|
|
||||||
|
if not self.opts.verbose:
|
||||||
|
logger.info("Output files for plugins can be found at " +
|
||||||
|
os.path.relpath(plugin_output_dir))
|
||||||
|
|
||||||
|
# Running the correlator and printing the output from it
|
||||||
|
self.run_correlator(output_directory, plugin_output_dir)
|
||||||
|
|
||||||
|
# Built-in algorithms ------------------------------
|
||||||
|
def alarm(self, alarm_exclude=[], entity_exclude=[]):
|
||||||
|
"""Alarm algorithm
|
||||||
|
Gathers list of alarms and customer logs
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
alarm_exclude (string list) : List of alarm id patterns to not
|
||||||
|
search for
|
||||||
|
entity_exclude (string list): List of entity id patterns to not
|
||||||
|
search for
|
||||||
|
"""
|
||||||
|
alarm_data = {}
|
||||||
|
log_data = {}
|
||||||
|
|
||||||
|
with open(
|
||||||
|
os.path.join(
|
||||||
|
self.active_controller_directory,
|
||||||
|
"var", "extra", "database", "fm.db.sql.txt"
|
||||||
|
)
|
||||||
|
) as file:
|
||||||
|
start = False
|
||||||
|
for line in file:
|
||||||
|
# start of event log
|
||||||
|
if re.search(r"COPY (public\.)?event_log", line):
|
||||||
|
start = True
|
||||||
|
elif start and line == "\\.\n":
|
||||||
|
break
|
||||||
|
elif start:
|
||||||
|
entry = re.split(r"\t", line)
|
||||||
|
|
||||||
|
INDEX_ALARM_ID = 5
|
||||||
|
INDEX_ACTION = 6
|
||||||
|
INDEX_ENTITY_ID = 8
|
||||||
|
INDEX_ALARM_DATE = 9
|
||||||
|
INDEX_SEVERITY = 10
|
||||||
|
|
||||||
|
alarm_id = entry[INDEX_ALARM_ID]
|
||||||
|
entity_id = entry[INDEX_ENTITY_ID]
|
||||||
|
action = entry[INDEX_ACTION]
|
||||||
|
severity = entry[INDEX_SEVERITY]
|
||||||
|
alarm_date = entry[INDEX_ALARM_DATE]
|
||||||
|
|
||||||
|
entry_date = alarm_date.replace(
|
||||||
|
" ", "T"
|
||||||
|
) # making time format of alarm the same
|
||||||
|
if (self.opts.start <= entry_date
|
||||||
|
and entry_date <= self.opts.end):
|
||||||
|
cont = True
|
||||||
|
# Checks if the alarm is in the user specified list of
|
||||||
|
# alarm or entity ids
|
||||||
|
for id in alarm_exclude:
|
||||||
|
if id in alarm_id:
|
||||||
|
cont = False
|
||||||
|
break
|
||||||
|
|
||||||
|
for entity in entity_exclude:
|
||||||
|
if entity in entity_id:
|
||||||
|
cont = False
|
||||||
|
break
|
||||||
|
|
||||||
|
if not cont:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
if action == "log":
|
||||||
|
log_info = log_data[
|
||||||
|
f"{alarm_id} {entity_id} {severity}"
|
||||||
|
]
|
||||||
|
log_info["count"] += 1
|
||||||
|
log_info["dates"].append(alarm_date)
|
||||||
|
else:
|
||||||
|
alarm_info = alarm_data[
|
||||||
|
f"{alarm_id} {entity_id} {severity}"
|
||||||
|
]
|
||||||
|
alarm_info["dates"].append(
|
||||||
|
f"{alarm_date} {action}")
|
||||||
|
except KeyError:
|
||||||
|
if entry[6] != "log":
|
||||||
|
alarm_data[
|
||||||
|
f"{alarm_id} {entity_id} {severity}"
|
||||||
|
] = {
|
||||||
|
"dates": [f"{alarm_date} {action}"],
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
log_data[
|
||||||
|
f"{alarm_id} {entity_id} {severity}"
|
||||||
|
] = {
|
||||||
|
"count": 1,
|
||||||
|
"dates": [alarm_date],
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v in alarm_data.items():
|
||||||
|
v["dates"] = sorted(v["dates"])
|
||||||
|
temp = []
|
||||||
|
temp.append(v["dates"][0])
|
||||||
|
for i in range(1, len(v["dates"])):
|
||||||
|
if v["dates"][i].split()[2] != v["dates"][i-1].split()[2]:
|
||||||
|
temp.append(v["dates"][i])
|
||||||
|
v["dates"] = temp
|
||||||
|
|
||||||
|
for _, v in log_data.items():
|
||||||
|
v["dates"] = sorted(v["dates"])
|
||||||
|
|
||||||
|
return alarm_data, log_data
|
||||||
|
|
||||||
|
def substring(self, substr, files):
|
||||||
|
"""Substring algorithm
|
||||||
|
Looks for substrings within files
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
substr (string list): List of substrings to look for
|
||||||
|
files (string list): List of absolute filepaths to search in
|
||||||
|
|
||||||
|
Errors:
|
||||||
|
FileNotFoundError
|
||||||
|
"""
|
||||||
|
# don't analyze older files, continue with current file
|
||||||
|
CONTINUE_CURRENT = 0
|
||||||
|
# analyze older files, continue with current file
|
||||||
|
CONTINUE_CURRENT_OLD = 1
|
||||||
|
|
||||||
|
data = []
|
||||||
|
for file in files:
|
||||||
|
try:
|
||||||
|
if not os.path.exists(file):
|
||||||
|
if (re.search("controller-1_(.+)/var/log/mtcAgent.log",
|
||||||
|
file)):
|
||||||
|
continue
|
||||||
|
raise FileNotFoundError(f"File not found: {file}")
|
||||||
|
cont = True
|
||||||
|
# Searching through file
|
||||||
|
command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """
|
||||||
|
f"""{file} 2>/dev/null""")
|
||||||
|
status = self._continue(file)
|
||||||
|
|
||||||
|
if (status == CONTINUE_CURRENT
|
||||||
|
or status == CONTINUE_CURRENT_OLD):
|
||||||
|
# continue with current file
|
||||||
|
if status == CONTINUE_CURRENT:
|
||||||
|
cont = False
|
||||||
|
self._evaluate_substring(data, command)
|
||||||
|
|
||||||
|
# Searching through rotated log files that aren't compressed
|
||||||
|
n = 1
|
||||||
|
while os.path.exists(f"{file}.{n}") and cont:
|
||||||
|
command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """
|
||||||
|
f"""{file}.{n} 2>/dev/null""")
|
||||||
|
status = self._continue(f"{file}.{n}")
|
||||||
|
|
||||||
|
if (status == CONTINUE_CURRENT
|
||||||
|
or status == CONTINUE_CURRENT_OLD):
|
||||||
|
if status == CONTINUE_CURRENT:
|
||||||
|
cont = False
|
||||||
|
self._evaluate_substring(data, command)
|
||||||
|
|
||||||
|
n += 1
|
||||||
|
|
||||||
|
# Searching through rotated log files
|
||||||
|
while os.path.exists(f"{file}.{n}.gz") and cont:
|
||||||
|
command = (f"""zgrep -E "{'|'.join(s for s in substr)}" """
|
||||||
|
f"""{file}.{n}.gz 2>/dev/null""")
|
||||||
|
status = self._continue(f"{file}.{n}.gz", compressed=True)
|
||||||
|
|
||||||
|
if (status == CONTINUE_CURRENT
|
||||||
|
or status == CONTINUE_CURRENT_OLD):
|
||||||
|
if status == CONTINUE_CURRENT:
|
||||||
|
cont = False
|
||||||
|
self._evaluate_substring(data, command)
|
||||||
|
|
||||||
|
n += 1
|
||||||
|
|
||||||
|
except FileNotFoundError as e:
|
||||||
|
logger.error(e)
|
||||||
|
continue
|
||||||
|
|
||||||
|
return sorted(data)
|
||||||
|
|
||||||
|
def system_info(self):
|
||||||
|
"""System info algorithm
|
||||||
|
Presents basic information about the system
|
||||||
|
"""
|
||||||
|
data = []
|
||||||
|
with open(
|
||||||
|
os.path.join(
|
||||||
|
self.active_controller_directory, "etc", "platform",
|
||||||
|
"platform.conf"
|
||||||
|
)
|
||||||
|
) as file:
|
||||||
|
for line in file:
|
||||||
|
if "system_mode" in line:
|
||||||
|
data.append(
|
||||||
|
f"System Mode: "
|
||||||
|
f"{re.match('^system_mode=(.*)', line).group(1)}"
|
||||||
|
)
|
||||||
|
elif "system_type" in line:
|
||||||
|
data.append(
|
||||||
|
f"System Type: "
|
||||||
|
f"{re.match('^system_type=(.*)', line).group(1)}"
|
||||||
|
)
|
||||||
|
elif "distributed_cloud_role" in line:
|
||||||
|
role = re.match('^distributed_cloud_role=(.*)',
|
||||||
|
line).group(1)
|
||||||
|
data.append(f"Distributed cloud role: {role}")
|
||||||
|
elif "sw_version" in line:
|
||||||
|
data.append(
|
||||||
|
f"SW Version: "
|
||||||
|
f"{re.match('^sw_version=(.*)', line).group(1)}"
|
||||||
|
)
|
||||||
|
with open(
|
||||||
|
os.path.join(self.active_controller_directory, "etc", "build.info")
|
||||||
|
) as file:
|
||||||
|
for line in file:
|
||||||
|
if "BUILD_TYPE" in line:
|
||||||
|
data.append(
|
||||||
|
f"Build Type: "
|
||||||
|
f"{re.match('^BUILD_TYPE=(.*)', line).group(1)}"
|
||||||
|
)
|
||||||
|
elif re.match("^OS=(.*)", line):
|
||||||
|
data.append(f"OS: {re.match('^OS=(.*)', line).group(1)}")
|
||||||
|
|
||||||
|
return data
|
||||||
|
|
||||||
|
def swact_activity(self):
|
||||||
|
"""Swact activity algorithm
|
||||||
|
Presents all swacting activity in the system
|
||||||
|
"""
|
||||||
|
data = []
|
||||||
|
sm_files = []
|
||||||
|
sm_customer_files = []
|
||||||
|
swact_start = None
|
||||||
|
swact_in_progress = False
|
||||||
|
swact_end = None
|
||||||
|
|
||||||
|
for _, folder in self.hosts["controllers"].items():
|
||||||
|
sm_path = os.path.join(folder, "var", "log", "sm.log")
|
||||||
|
sm_files.append(sm_path)
|
||||||
|
sm_customer_path = os.path.join(folder, "var", "log",
|
||||||
|
"sm-customer.log")
|
||||||
|
sm_customer_files.append(sm_customer_path)
|
||||||
|
|
||||||
|
sm_substrings = ["Uncontrolled swact", "Swact has started,",
|
||||||
|
"Neighbor (.+) is now in the down",
|
||||||
|
"Service (.+) has reached max failures",
|
||||||
|
"Swact update"]
|
||||||
|
data = self.substring(sm_substrings, sm_files)
|
||||||
|
|
||||||
|
for i, line in enumerate(data):
|
||||||
|
if "Swact has started," in line and not swact_in_progress:
|
||||||
|
swact_in_progress = True
|
||||||
|
swact_start = datetime.strptime(line[0:19],
|
||||||
|
"%Y-%m-%dT%H:%M:%S")
|
||||||
|
elif "Swact update" in line and swact_in_progress:
|
||||||
|
swact_in_progress = False
|
||||||
|
swact_end = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||||
|
line += f" SWACT TOOK {swact_end - swact_start} \n"
|
||||||
|
data[i] = line
|
||||||
|
|
||||||
|
sm_customer_substrings = [
|
||||||
|
"swact", "active-failed\\s+\\| disabling-failed\\s+\\|"
|
||||||
|
]
|
||||||
|
data += self.substring(sm_customer_substrings, sm_customer_files)
|
||||||
|
|
||||||
|
return sorted(data)
|
||||||
|
|
||||||
|
def puppet_errors(self):
|
||||||
|
"""Puppet errors algorithm
|
||||||
|
Presents log errors from puppet logs
|
||||||
|
"""
|
||||||
|
data = []
|
||||||
|
for host_type in self.hosts.keys():
|
||||||
|
for _, folder in self.hosts[host_type].items():
|
||||||
|
puppet_folder = os.path.join(folder, "var", "log", "puppet")
|
||||||
|
command = (f"""grep -rh "[m ]Error: " {puppet_folder} """
|
||||||
|
f"""2>/dev/null""")
|
||||||
|
self._evaluate_substring(data, command)
|
||||||
|
return sorted(data)
|
||||||
|
|
||||||
|
def process_failures(self):
|
||||||
|
"""Process failures algorithm
|
||||||
|
Presents log errors from pmond
|
||||||
|
"""
|
||||||
|
data = []
|
||||||
|
files = []
|
||||||
|
for host_type in self.hosts.keys():
|
||||||
|
for _, folder in self.hosts[host_type].items():
|
||||||
|
pmond = os.path.join(folder, "var", "log", "pmond.log")
|
||||||
|
files.append(pmond)
|
||||||
|
|
||||||
|
data = self.substring(["Error :"], files)
|
||||||
|
|
||||||
|
return data
|
||||||
|
|
||||||
|
def heartbeat_loss(self):
|
||||||
|
"""Heartbeat loss algorithm
|
||||||
|
Presents all heartbeat loss error messages in the system
|
||||||
|
"""
|
||||||
|
data = []
|
||||||
|
hb_files = []
|
||||||
|
|
||||||
|
for _, folder in self.hosts["controllers"].items():
|
||||||
|
hb_path = os.path.join(folder, "var", "log", "hbsAgent.log")
|
||||||
|
hb_files.append(hb_path)
|
||||||
|
|
||||||
|
hb_substrings = ["Heartbeat Loss"]
|
||||||
|
data = self.substring(hb_substrings, hb_files)
|
||||||
|
|
||||||
|
return sorted(data)
|
||||||
|
|
||||||
|
def maintenance_errors(self):
|
||||||
|
"""Maintenance errors algorithm
|
||||||
|
Presents maintenance errors and other relevant log messages in system
|
||||||
|
"""
|
||||||
|
data = []
|
||||||
|
mtc_files = []
|
||||||
|
|
||||||
|
for _, folder in self.hosts["controllers"].items():
|
||||||
|
agent = os.path.join(folder, "var", "log", "mtcAgent.log")
|
||||||
|
mtc_files.append(agent)
|
||||||
|
|
||||||
|
for host_type in self.hosts.keys():
|
||||||
|
for _, folder in self.hosts[host_type].items():
|
||||||
|
client = os.path.join(folder, "var", "log", "mtcClient.log")
|
||||||
|
mtc_files.append(client)
|
||||||
|
|
||||||
|
mtc_substrings = ["Error : ", "Configuration failure",
|
||||||
|
"In-Test Failure", "Loss Of Communication",
|
||||||
|
"Graceful Recovery Wait ",
|
||||||
|
"regained MTCALIVE from host that has rebooted",
|
||||||
|
"Connectivity Recovered ; ",
|
||||||
|
"auto recovery disabled", "Graceful Recovery Failed",
|
||||||
|
"MNFA ENTER", "MNFA EXIT", "MNFA POOL"]
|
||||||
|
data = self.substring(mtc_substrings, mtc_files)
|
||||||
|
|
||||||
|
return sorted(data)
|
||||||
|
|
||||||
|
def daemon_failures(self):
|
||||||
|
"""Daemon failures algorithm
|
||||||
|
Presents all failed puppet manifest messages in the system
|
||||||
|
"""
|
||||||
|
data = []
|
||||||
|
daemon_files = []
|
||||||
|
|
||||||
|
for host_type in self.hosts.keys():
|
||||||
|
for _, folder in self.hosts[host_type].items():
|
||||||
|
daemon_path = os.path.join(folder, "var", "log", "daemon.log")
|
||||||
|
daemon_files.append(daemon_path)
|
||||||
|
|
||||||
|
daemon_substrings = ["Failed to run the puppet manifest"]
|
||||||
|
data = self.substring(daemon_substrings, daemon_files)
|
||||||
|
|
||||||
|
return sorted(data)
|
||||||
|
|
||||||
|
def state_changes(self):
|
||||||
|
"""State changes algorithm
|
||||||
|
Presents all messages in the system regarding the state of hosts
|
||||||
|
"""
|
||||||
|
data = []
|
||||||
|
sc_files = []
|
||||||
|
|
||||||
|
for _, folder in self.hosts["controllers"].items():
|
||||||
|
sc_path = os.path.join(folder, "var", "log", "mtcAgent.log")
|
||||||
|
sc_files.append(sc_path)
|
||||||
|
|
||||||
|
sc_substrings = ["is ENABLED", "allStateChange (.+)locked-disabled"]
|
||||||
|
data = self.substring(sc_substrings, sc_files)
|
||||||
|
|
||||||
|
return sorted(data)
|
||||||
|
|
||||||
|
def audit(self, start, end, audit_log_path):
|
||||||
|
"""Counts audit events in dcmanager within a specified date range
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
start (string) : start date in YYYY-MM-DD HH:MM:SS format
|
||||||
|
end (string) : end date in YYYY-MM-DD HH:MM:SS format
|
||||||
|
audit_log_path (string) : absolute path of augit log file
|
||||||
|
"""
|
||||||
|
if not shutil.which("lnav"):
|
||||||
|
raise ValueError("Lnav program not found")
|
||||||
|
|
||||||
|
SECONDS_PER_HOUR = 3600
|
||||||
|
fmt = "%Y-%m-%d %H:%M:%S"
|
||||||
|
|
||||||
|
d1 = datetime.strptime(start, fmt)
|
||||||
|
d2 = datetime.strptime(end, fmt)
|
||||||
|
seconds = (d2 - d1).total_seconds()
|
||||||
|
|
||||||
|
log_texts = [
|
||||||
|
"Triggered subcloud audit%",
|
||||||
|
"Trigger patch audit%",
|
||||||
|
"Trigger load audit%",
|
||||||
|
"Triggered firmware audit%",
|
||||||
|
"Triggered kubernetes audit%",
|
||||||
|
# Counts sum of audits from all subclouds
|
||||||
|
]
|
||||||
|
INDEX_MIDDLE_WORD = 1
|
||||||
|
data = [("These rates and totals represent the sum of audits from "
|
||||||
|
+ "all subclouds")]
|
||||||
|
|
||||||
|
def command(text):
|
||||||
|
|
||||||
|
return (
|
||||||
|
f'lnav -R -n -c ";SELECT count(log_body) AS '
|
||||||
|
f'{text.split(" ")[INDEX_MIDDLE_WORD]}_total from '
|
||||||
|
f'openstack_log WHERE '
|
||||||
|
f'(log_time > \\"{start}\\" AND not log_time > \\"{end}\\")'
|
||||||
|
f' AND log_body like \\"{text}\\"" "{audit_log_path}"'
|
||||||
|
)
|
||||||
|
|
||||||
|
for text in log_texts:
|
||||||
|
p = subprocess.Popen(command(text), shell=True,
|
||||||
|
stdout=subprocess.PIPE)
|
||||||
|
for line in p.stdout:
|
||||||
|
line = line.decode("utf-8").strip()
|
||||||
|
if line.isnumeric():
|
||||||
|
data.append(
|
||||||
|
f"rate "
|
||||||
|
f"{round((int(line)/seconds * SECONDS_PER_HOUR), 3)} "
|
||||||
|
f"per hour. total: {line}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
data.append(line)
|
||||||
|
return data
|
||||||
|
|
||||||
|
    # -----------------------------------

    def run_correlator(self, output_directory, plugin_output_dir):
        """Runs the correlator and prints the results differently based on if
        the tool was run with or without the verbose option

        Parameters:
            output_directory (string)  : directory to place output files from
                                         correlator
            plugin_output_dir (string) : directory with output files from
                                         plugins
        """
        correlator = Correlator(plugin_output_dir)
        failures, events, alarms, state_changes = correlator.run(
            self.opts.hostname)
        failures_len, events_len = len(failures), len(events)
        alarms_len, state_changes_len = len(alarms), len(state_changes)
        failures.append("\nTotal failures found: " + str(failures_len) + "\n")
        events.append("\nTotal events found: " + str(events_len) + "\n")
        alarms.append("\nTotal alarms found: " + str(alarms_len) + "\n")
        state_changes.append("\nTotal state changes found: "
                             + str(state_changes_len) + "\n")

        logger.info("\nRunning correlator...")
        self._create_output_file("correlator_failures", output_directory,
                                 failures, "")
        self._create_output_file("correlator_events", output_directory,
                                 events, "")
        self._create_output_file("correlator_alarms", output_directory,
                                 alarms, "")
        self._create_output_file("correlator_state_changes", output_directory,
                                 state_changes, "")

        if not self.opts.verbose:
            logger.info("Output can be found at "
                        + os.path.relpath(output_directory) + "\n")
            logger.info("Failures: " + str(failures_len))
            for f in failures[:-1]:
                if "Uncontrolled swact" in f:
                    logger.info(f[0:19] + " "
                                + re.findall("active controller:? (.+)\n",
                                             f)[0] + " uncontrolled swact")
                elif "failure on" in f:
                    host = re.findall(r"failure on ([^\s]+) ", f)[0]
                    logger.info(f[0:19] + " " + host + " "
                                + re.findall("^(.+) failure on ",
                                             f[43:])[0].lower() + " failure")
                else:
                    logger.info(f[:-1])
            if failures_len != 0:
                logger.info("\nEvents: " + str(events_len))
            else:
                logger.info("Events: " + str(events_len))
            logger.info("Alarms: " + str(alarms_len))
            logger.info("State Changes: " + str(state_changes_len))
        else:
            logger.info("\nFailures: " + str(failures_len))
            for f in failures[:-1]:
                logger.info(f[:-1])

            # Dictionary to keep track of number of times events happens on
            # each host
            events_summ = {}
            for e in events[:-1]:
                k = e[20:-1].split(" (", 1)[0]
                if not events_summ.get(k):
                    events_summ[k] = 1
                else:
                    events_summ[k] += 1

            if failures_len != 0:
                logger.info("\nEvents: " + str(events_len))
            else:
                logger.info("Events: " + str(events_len))
            for k, v in sorted(events_summ.items()):
                logger.info(k + ": " + str(v) + " time(s)")

            if events_len != 0:
                logger.info("\nAlarms: " + str(alarms_len))
            else:
                logger.info("Alarms: " + str(alarms_len))
            logger.info("The full list of alarms can be found at "
                        + os.path.relpath(output_directory)
                        + "/correlator_alarms")

            # Dictionary to keep track of number of times state changes
            # happens on each host
            state_changes_summ = {}
            for s in state_changes[:-1]:
                k = s[20:-1]
                if "enabled" in k:
                    k = k.split("enabled", 1)[0] + "enabled"
                if not state_changes_summ.get(k):
                    state_changes_summ[k] = 1
                else:
                    state_changes_summ[k] += 1

            if alarms_len != 0:
                logger.info("\nState Changes: " + str(state_changes_len))
            else:
                logger.info("State Changes: " + str(state_changes_len))
            for k, v in sorted(state_changes_summ.items()):
                logger.info(k + ": " + str(v) + " time(s)")
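The verbose-mode summaries above are plain occurrence counters keyed on the event text after its 19-character timestamp. A minimal, self-contained sketch of the same idea (the sample correlator lines are hypothetical):

    from collections import Counter

    events = [
        "2022-05-27 19:40:01 controller-1 heartbeat loss (during swact)",
        "2022-05-27 19:42:10 controller-1 heartbeat loss (during swact)",
        "2022-05-27 20:01:33 compute-0 configuration failure",
    ]
    # same keying as run_correlator: strip the timestamp, keep text up to " ("
    summary = Counter(e[20:].split(" (", 1)[0] for e in events)
    for k, v in sorted(summary.items()):
        print(f"{k}: {v} time(s)")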
    def _continue(self, file, compressed=False):
        # don't analyze older files, continue with current file
        CONTINUE_CURRENT = 0
        # analyze older files, continue with current file
        CONTINUE_CURRENT_OLD = 1
        # don't analyze current file, continue to older files
        CONTINUE_OLD = 2

        # check date of first log event and compare with provided
        # start, end dates
        first = ""

        if not compressed:
            with open(file) as f:
                line = f.readline()
                first = line[0:19]
        else:
            with gzip.open(file, "rb") as f:
                line = f.readline().decode("utf-8")
                first = line[0:19]
        try:
            datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
            first = line[0:19]
        except ValueError:
            return CONTINUE_CURRENT_OLD

        if first < self.opts.start:
            return CONTINUE_CURRENT
        elif first < self.opts.end and first > self.opts.start:
            return CONTINUE_CURRENT_OLD
        elif first > self.opts.end:
            return CONTINUE_OLD
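A rough sketch of how these return codes drive a scan across a live log and its rotated .N.gz copies; engine and scan_file are hypothetical stand-ins, and the real control flow in the substring algorithm differs slightly:

    import os

    CONTINUE_CURRENT, CONTINUE_CURRENT_OLD, CONTINUE_OLD = 0, 1, 2

    def scan_chain(engine, path, scan_file):
        status = engine._continue(path)
        if status in (CONTINUE_CURRENT, CONTINUE_CURRENT_OLD):
            scan_file(path)  # current file overlaps the date range
        n = 1
        # keep walking older rotated files only while they may still be in range
        while status != CONTINUE_CURRENT and os.path.exists(f"{path}.{n}.gz"):
            status = engine._continue(f"{path}.{n}.gz", compressed=True)
            if status in (CONTINUE_CURRENT, CONTINUE_CURRENT_OLD):
                scan_file(f"{path}.{n}.gz")
            n += 1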
    def _evaluate_substring(self, data, command):
        p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
        for line in p.stdout:
            line = line.decode("utf-8")
            # different date locations for log events
            dates = [line[0:19], line[2:21]]
            for date in dates:
                try:
                    datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
                    if date > self.opts.start and date < self.opts.end:
                        if line[0] == "|":  # sm-customer.log edge case
                            line = line[1:].strip()
                            line = re.sub("\\s+", " ", line)
                        data.append(line)
                    break
                except ValueError:
                    if date == dates[-1]:
                        data.append(line)
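The two slice offsets correspond to the two timestamp positions seen in the collected logs: ordinary lines that begin with the timestamp, and sm-customer.log lines that begin with a table separator. Hypothetical examples:

    # line[0:19] -> "2022-05-27T19:36:05" for "2022-05-27T19:36:05.123 controller-0 ..."
    # line[2:21] -> "2022-05-27T19:36:05" for "| 2022-05-27T19:36:05.123 | swact requested ..."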
    def _extract_subfunction(self, host_info_path):
        GROUP_ONE = 1
        with open(host_info_path) as file:
            for line in file:
                hostname_match = re.match(
                    r"\s*hostname =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line)
                subfunction_match = re.match(
                    r"\s*subfunction =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line)
                if subfunction_match:
                    subfunction = subfunction_match.group(GROUP_ONE)
                if hostname_match:
                    hostname = hostname_match.group(GROUP_ONE)
        return hostname, subfunction
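A hedged illustration of the host.info lines those patterns are written to accept, with hypothetical values; group(GROUP_ONE) captures the bare value whether or not it is quoted:

    # hostname => "controller-0",       -> hostname == "controller-0"
    # subfunction => controller,worker  -> subfunction == "controller,worker"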
    def _create_output_file(self, filename, directory, data, processing):
        with open(os.path.join(directory, filename), "w") as file:
            for i in data:
                if i[-1] == "\n":
                    file.write(i)
                else:
                    file.write(i + "\n")
        if self.opts.verbose:
            output = ("output at "
                      + os.path.relpath(os.path.join(directory, filename)))
            if processing == "":
                logger.info(output)
            else:
                logger.info(processing + ", " + output)
        elif processing != "":
            logger.info(processing)
@ -40,8 +40,8 @@ class Plugin:
|
|||||||
"files": [],
|
"files": [],
|
||||||
"hosts": [],
|
"hosts": [],
|
||||||
"substring": [],
|
"substring": [],
|
||||||
"alarm_ids": [],
|
"alarm_exclude": [],
|
||||||
"entity_ids": [],
|
"entity_exclude": [],
|
||||||
"start": None,
|
"start": None,
|
||||||
"end": None,
|
"end": None,
|
||||||
}
|
}
|
||||||
@ -93,10 +93,11 @@ class Plugin:
|
|||||||
self.state["substring"].append(data[1])
|
self.state["substring"].append(data[1])
|
||||||
elif label == "hosts":
|
elif label == "hosts":
|
||||||
self.state["hosts"] = value.replace(" ", "").split(",")
|
self.state["hosts"] = value.replace(" ", "").split(",")
|
||||||
elif label == "alarm_ids":
|
elif label == "alarm_exclude":
|
||||||
self.state["alarm_ids"] = value.replace(" ", "").split(",")
|
self.state["alarm_exclude"] = value.replace(" ", "").split(",")
|
||||||
elif label == "entity_ids":
|
elif label == "entity_exclude":
|
||||||
self.state["entity_ids"] = value.replace(" ", "").split(",")
|
self.state["entity_exclude"] = value.replace(
|
||||||
|
" ", "").split(",")
|
||||||
elif label == "files":
|
elif label == "files":
|
||||||
self.state["files"] = value.replace(" ", "").split(",")
|
self.state["files"] = value.replace(" ", "").split(",")
|
||||||
elif label == "start":
|
elif label == "start":
|
||||||
@ -117,74 +118,77 @@ class Plugin:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
plugin_name = os.path.basename(self.file)
|
plugin_name = os.path.basename(self.file)
|
||||||
|
HOSTS_ERR = f"plugin: {plugin_name} should not have hosts specified"
|
||||||
|
|
||||||
if self.state["algorithm"] == algorithms.SUBSTRING:
|
if self.state["algorithm"] == algorithms.SUBSTRING:
|
||||||
if len(self.state["files"]) == 0:
|
self.validate_state(plugin_name, "files")
|
||||||
raise ValueError(
|
self.validate_state(plugin_name, "hosts")
|
||||||
f"plugin: {plugin_name} needs files specified for substring algorithm"
|
self.validate_state(plugin_name, "substring")
|
||||||
)
|
|
||||||
if len(self.state["hosts"]) == 0:
|
|
||||||
raise ValueError(
|
|
||||||
f"plugin: {plugin_name} needs hosts specified for substring algorithm"
|
|
||||||
)
|
|
||||||
if len(self.state["substring"]) == 0:
|
|
||||||
raise ValueError(
|
|
||||||
f"plugin: {plugin_name} need substring specified for substring algorithm"
|
|
||||||
)
|
|
||||||
elif self.state["algorithm"] == algorithms.ALARM:
|
elif self.state["algorithm"] == algorithms.ALARM:
|
||||||
if len(self.state["hosts"]) > 0:
|
if len(self.state["hosts"]) > 0:
|
||||||
raise ValueError(
|
raise ValueError(HOSTS_ERR)
|
||||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
|
||||||
)
|
|
||||||
elif self.state["algorithm"] == algorithms.SYSTEM_INFO:
|
elif self.state["algorithm"] == algorithms.SYSTEM_INFO:
|
||||||
if len(self.state["hosts"]) > 0:
|
if len(self.state["hosts"]) > 0:
|
||||||
raise ValueError(
|
raise ValueError(HOSTS_ERR)
|
||||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
elif self.state["algorithm"] == algorithms.SWACT_ACTIVITY:
|
||||||
)
|
|
||||||
elif self.state["algorithm"] == algorithms.SWACT:
|
|
||||||
if len(self.state["hosts"]) > 0:
|
if len(self.state["hosts"]) > 0:
|
||||||
raise ValueError(
|
raise ValueError(HOSTS_ERR)
|
||||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
elif self.state["algorithm"] == algorithms.PUPPET_ERRORS:
|
||||||
)
|
|
||||||
elif self.state["algorithm"] == algorithms.PUPPET:
|
|
||||||
if len(self.state["hosts"]) > 0:
|
if len(self.state["hosts"]) > 0:
|
||||||
raise ValueError(
|
raise ValueError(HOSTS_ERR)
|
||||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
elif self.state["algorithm"] == algorithms.PROCESS_FAILURES:
|
||||||
)
|
|
||||||
elif self.state["algorithm"] == algorithms.PROCESS_FAILURE:
|
|
||||||
if len(self.state["hosts"]) > 0:
|
if len(self.state["hosts"]) > 0:
|
||||||
raise ValueError(
|
raise ValueError(HOSTS_ERR)
|
||||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
elif self.state["algorithm"] == algorithms.HEARTBEAT_LOSS:
|
||||||
)
|
if len(self.state["hosts"]) > 0:
|
||||||
|
raise ValueError(HOSTS_ERR)
|
||||||
|
elif self.state["algorithm"] == algorithms.MAINTENANCE_ERR:
|
||||||
|
if len(self.state["hosts"]) > 0:
|
||||||
|
raise ValueError(HOSTS_ERR)
|
||||||
|
elif self.state["algorithm"] == algorithms.DAEMON_FAILURES:
|
||||||
|
if len(self.state["hosts"]) > 0:
|
||||||
|
raise ValueError(HOSTS_ERR)
|
||||||
|
elif self.state["algorithm"] == algorithms.STATE_CHANGES:
|
||||||
|
if len(self.state["hosts"]) > 0:
|
||||||
|
raise ValueError(HOSTS_ERR)
|
||||||
elif self.state["algorithm"] == algorithms.AUDIT:
|
elif self.state["algorithm"] == algorithms.AUDIT:
|
||||||
if len(self.state["hosts"]) > 0:
|
if len(self.state["hosts"]) > 0:
|
||||||
raise ValueError(
|
raise ValueError(HOSTS_ERR)
|
||||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
datetime.strptime(self.state["start"], "%Y-%m-%d %H:%M:%S")
|
datetime.strptime(self.state["start"], "%Y-%m-%d %H:%M:%S")
|
||||||
except:
|
except:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"plugin : {plugin_name} needs a start time in YYYY-MM-DD HH:MM:SS format"
|
f"plugin : {plugin_name} needs a start time in YYYY-MM-DD "
|
||||||
|
f"HH:MM:SS format"
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
datetime.strptime(self.state["end"], "%Y-%m-%d %H:%M:%S")
|
datetime.strptime(self.state["end"], "%Y-%m-%d %H:%M:%S")
|
||||||
except:
|
except:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"plugin : {plugin_name} needs an end time in YYYY-MM-DD HH:MM:SS format"
|
f"plugin : {plugin_name} needs an end time in YYYY-MM-DD "
|
||||||
|
f"HH:MM:SS format"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"plugin: {plugin_name} unknown algorithm {self.state['algorithm']}"
|
f"plugin: {plugin_name} unknown algorithm "
|
||||||
|
f"{self.state['algorithm']}"
|
||||||
)
|
)
|
||||||
|
|
||||||
for host in self.state["hosts"]:
|
for host in self.state["hosts"]:
|
||||||
if host not in ["controllers", "workers", "storages", "all"]:
|
if host not in ["controllers", "workers", "storages", "all"]:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"host not recognized: '{host}', accepted hosts are 'controllers', 'workers', 'storages', 'all'"
|
f"host not recognized: '{host}', accepted hosts are "
|
||||||
|
f"'controllers', 'workers', 'storages', 'all'"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def validate_state(self, plugin_name, key):
|
||||||
|
if len(self.state[key]) == 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"plugin: {plugin_name} needs {key} specified for "
|
||||||
|
f"substring algorithm"
|
||||||
|
)
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
return f"{json.dumps(self.state)} File: {self.file}"
|
return f"{json.dumps(self.state)} File: {self.file}"
|
3
tools/collector/debian-scripts/report/plugins/alarm
Executable file
3
tools/collector/debian-scripts/report/plugins/alarm
Executable file
@ -0,0 +1,3 @@
|
|||||||
|
algorithm=alarm
|
||||||
|
alarm_exclude=400., 800.
|
||||||
|
entity_exclude=subsystem=vim
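Per the new parser help text, these labels are exclusion patterns: alarms whose id matches an alarm_exclude pattern, or whose entity instance id matches an entity_exclude pattern, are dropped rather than selected. A minimal sketch of that filtering idea (the real logic lives in the execution engine and may differ in detail):

    # hedged sketch: substring matching assumed for both exclusion lists
    def keep(alarm_id, entity_id, alarm_exclude, entity_exclude):
        if any(pat in alarm_id for pat in alarm_exclude):
            return False
        if any(pat in entity_id for pat in entity_exclude):
            return False
        return True

    # with this plugin file: keep("400.005", "host=controller-0",
    #                             ["400.", "800."], ["subsystem=vim"]) -> False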
|
1
tools/collector/debian-scripts/report/plugins/daemon_failures
Executable file
1
tools/collector/debian-scripts/report/plugins/daemon_failures
Executable file
@ -0,0 +1 @@
|
|||||||
|
algorithm=daemon_failures
|
1
tools/collector/debian-scripts/report/plugins/heartbeat_loss
Executable file
1
tools/collector/debian-scripts/report/plugins/heartbeat_loss
Executable file
@ -0,0 +1 @@
|
|||||||
|
algorithm=heartbeat_loss
|
1
tools/collector/debian-scripts/report/plugins/maintenance_errors
Executable file
1
tools/collector/debian-scripts/report/plugins/maintenance_errors
Executable file
@ -0,0 +1 @@
|
|||||||
|
algorithm=maintenance_errors
|
1
tools/collector/debian-scripts/report/plugins/process_failures
Executable file
1
tools/collector/debian-scripts/report/plugins/process_failures
Executable file
@ -0,0 +1 @@
|
|||||||
|
algorithm=process_failures
|
1
tools/collector/debian-scripts/report/plugins/puppet_errors
Executable file
1
tools/collector/debian-scripts/report/plugins/puppet_errors
Executable file
@ -0,0 +1 @@
|
|||||||
|
algorithm=puppet_errors
|
@ -0,0 +1 @@
|
|||||||
|
algorithm=state_changes
|
5
tools/collector/debian-scripts/report/plugins/substring
Executable file
5
tools/collector/debian-scripts/report/plugins/substring
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
algorithm=substring
|
||||||
|
files=var/log/mtcAgent.log, var/log/sm.log
|
||||||
|
hosts=controllers
|
||||||
|
substring=operation failed
|
||||||
|
substring=Failed to send message
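The same search can be run ad hoc through the substring subparser that report.py defines; an equivalent command-line form would be roughly:

    > report.py substring --files var/log/mtcAgent.log var/log/sm.log \
          --hosts controllers \
          --substring "operation failed" "Failed to send message"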
|
1
tools/collector/debian-scripts/report/plugins/swact_activity
Executable file
1
tools/collector/debian-scripts/report/plugins/swact_activity
Executable file
@ -0,0 +1 @@
|
|||||||
|
algorithm=swact_activity
|
1
tools/collector/debian-scripts/report/plugins/system_info
Executable file
1
tools/collector/debian-scripts/report/plugins/system_info
Executable file
@ -0,0 +1 @@
|
|||||||
|
algorithm=system_info
|
360
tools/collector/debian-scripts/report/report.py
Executable file
360
tools/collector/debian-scripts/report/report.py
Executable file
@ -0,0 +1,360 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
########################################################################
|
||||||
|
#
|
||||||
|
# Copyright (c) 2022 Wind River Systems, Inc.
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
#
|
||||||
|
########################################################################
|
||||||
|
#
|
||||||
|
# Description: The Report tool is used to gather relevant log events
|
||||||
|
# and information about the system from a collect bundle.
|
||||||
|
#
|
||||||
|
# The report tool allows user-created plugins which decide relevance
|
||||||
|
# for log events. Plugins contain an algorithm label which instructs the
|
||||||
|
# tool what information to search and how to search for it.
|
||||||
|
#
|
||||||
|
# The report tool requires the collect bundle and host tarballs to be
|
||||||
|
# untarred.
|
||||||
|
#
|
||||||
|
# The report tool reads user plugins from the report directory in the
|
||||||
|
# top level of the collect bundle, and outputs files containing
# relevant logs to this directory as well.
|
||||||
|
#
|
||||||
|
# Typical Usage:
|
||||||
|
# command line functionality
|
||||||
|
# ------------------------------- ----------------------------------
|
||||||
|
# > report.py - Run all plugins in directory
|
||||||
|
# > report.py [plugin ...] - Run only specified plugins
|
||||||
|
# > report.py <algorithm> [labels] - Run algorithm with labels
|
||||||
|
# > report.py --help - help message
|
||||||
|
# > report.py <algorithm> --help - algorithm specific help
|
||||||
|
#
|
||||||
|
# See --help output for a complete list of full and abbreviated
|
||||||
|
# command line options and examples of plugins.
|
||||||
|
#
|
||||||
|
# Refer to README file for more usage and output examples
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from cmath import log
|
||||||
|
from datetime import datetime
|
||||||
|
from datetime import timedelta
|
||||||
|
from datetime import timezone
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
from execution_engine import ExecutionEngine
|
||||||
|
from plugin import Plugin
|
||||||
|
|
||||||
|
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
base_dir = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
parent_dir = os.path.dirname(base_dir)
|
||||||
|
default_path = os.path.dirname(parent_dir)
|
||||||
|
plugins = []
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Log Event Reporter",
|
||||||
|
epilog="Place plugins in 'plugins' directory found in 'report' directory "
|
||||||
|
"at top level of collect bundle.\nOutput files will be placed in 'report' "
|
||||||
|
"directory.\nThis tool will create a report.log and untar.log file along "
|
||||||
|
"with other output files.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-v",
|
||||||
|
"--verbose",
|
||||||
|
action="store_true",
|
||||||
|
help="Verbose output",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-s",
|
||||||
|
"--start",
|
||||||
|
default="20000101",
|
||||||
|
help="Specify a start date in YYYYMMDD format for analysis "
|
||||||
|
"(default:20000101)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-e",
|
||||||
|
"--end",
|
||||||
|
default=datetime.strftime(now + timedelta(days=1), "%Y%m%d"),
|
||||||
|
help="Specify an end date in YYYYMMDD format for analysis "
|
||||||
|
"(default: current date)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-p",
|
||||||
|
"--plugin",
|
||||||
|
default=None,
|
||||||
|
nargs="*",
|
||||||
|
help="Specify what plugins to run (default: runs every plugin in plugins "
|
||||||
|
"folder)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-d",
|
||||||
|
"--directory",
|
||||||
|
default=default_path,
|
||||||
|
help="Specify top level of collect bundle to analyze "
|
||||||
|
"(default: two levels above tool directory)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--hostname",
|
||||||
|
default="all",
|
||||||
|
help="Specify host for correlator to find significant events and state "
|
||||||
|
"changes for (default: all hosts)",
|
||||||
|
)
|
||||||
|
subparsers = parser.add_subparsers(help="algorithms", dest="algorithm")
|
||||||
|
|
||||||
|
# substring algorithm arguments
|
||||||
|
parser_substring = subparsers.add_parser(
|
||||||
|
"substring",
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
|
help="""Searches through specified files for lines containing specified
|
||||||
|
substring. There will be an output file for each host of the host
|
||||||
|
type specified.""",
|
||||||
|
epilog="Plugin file example:\n"
|
||||||
|
" algorithm=substring\n"
|
||||||
|
" files=var/log/mtcAgent.log, var/log/sm.log\n"
|
||||||
|
" hosts=controllers\n"
|
||||||
|
" substring=operation failed\n"
|
||||||
|
" substring=Failed to send message",
|
||||||
|
)
|
||||||
|
substring_required = parser_substring.add_argument_group("required arguments")
|
||||||
|
substring_required.add_argument(
|
||||||
|
"--files",
|
||||||
|
required=True,
|
||||||
|
nargs="+",
|
||||||
|
help="Files to perform substring analysis on (required)",
|
||||||
|
)
|
||||||
|
substring_required.add_argument(
|
||||||
|
"--substring", nargs="+", required=True,
|
||||||
|
help="Substrings to search for (required)"
|
||||||
|
)
|
||||||
|
substring_required.add_argument(
|
||||||
|
"--hosts",
|
||||||
|
choices=["controllers", "workers", "storages", "all"],
|
||||||
|
required=True,
|
||||||
|
nargs="+",
|
||||||
|
help="Host types to perform analysis on (required)",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# alarm algorithm arguments
|
||||||
|
parser_alarm = subparsers.add_parser(
|
||||||
|
"alarm",
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
|
help="Searches through fm.db.sql.txt for alarms and logs except for those "
|
||||||
|
"specified. There are 2 output files: 'alarm', and 'log'",
|
||||||
|
epilog="Plugin file example:\n"
|
||||||
|
" algorithm=alarm\n"
|
||||||
|
" alarm_exclude=400., 800.\n"
|
||||||
|
" entity_exclude=subsystem=vim\n",
|
||||||
|
)
|
||||||
|
parser_alarm.add_argument(
|
||||||
|
"--alarm_exclude",
|
||||||
|
nargs="+",
|
||||||
|
required=False,
|
||||||
|
default=[],
|
||||||
|
help="Alarm id patterns to not search for (not required)",
|
||||||
|
)
|
||||||
|
parser_alarm.add_argument(
|
||||||
|
"--entity_exclude",
|
||||||
|
nargs="+",
|
||||||
|
required=False,
|
||||||
|
default=[],
|
||||||
|
help="Entity id patterns to not search for (not required)",
|
||||||
|
)
|
||||||
|
|
||||||
|
# system info algorithm
|
||||||
|
parser_system_info = subparsers.add_parser(
|
||||||
|
"system_info",
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
|
help="Presents information about the system",
|
||||||
|
epilog="Plugin file example:\n" " algorithm=system_info\n",
|
||||||
|
)
|
||||||
|
|
||||||
|
# swact activity algorithm
|
||||||
|
parser_swact_activity = subparsers.add_parser(
|
||||||
|
"swact_activity",
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
|
help="Presents system swacting activity",
|
||||||
|
epilog="Plugin file example:\n" " algorithm=swact_activity\n",
|
||||||
|
)
|
||||||
|
|
||||||
|
# puppet errors algorithm
|
||||||
|
parser_puppet_errors = subparsers.add_parser(
|
||||||
|
"puppet_errors",
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
|
help="Presents any puppet errors",
|
||||||
|
epilog="Plugin file example:\n" " algorithm=puppet_errors\n",
|
||||||
|
)
|
||||||
|
|
||||||
|
# process failures algorithm
|
||||||
|
parser_process_failures = subparsers.add_parser(
|
||||||
|
"process_failures",
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
|
help="Presents any process failures from pmond.log",
|
||||||
|
epilog="Plugin file example:\n" " algorithm=process_failures\n",
|
||||||
|
)
|
||||||
|
|
||||||
|
# daemon failures algorithm
|
||||||
|
parser_daemon_failures = subparsers.add_parser(
|
||||||
|
"daemon_failures",
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
|
help="Presents any puppet manifest failures from daemon.log",
|
||||||
|
epilog="Plugin file example:\n" " algorithm=daemon_failures\n",
|
||||||
|
)
|
||||||
|
|
||||||
|
# heartbeat loss algorithm
|
||||||
|
parser_heartbeat_loss = subparsers.add_parser(
|
||||||
|
"heartbeat_loss",
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
|
help="Presents any heartbeat loss error messages from hbsAgent.log",
|
||||||
|
epilog="Plugin file example:\n" " algorithm=heartbeat_loss\n",
|
||||||
|
)
|
||||||
|
|
||||||
|
# maintenance errors algorithm
|
||||||
|
parser_maintenance_errors = subparsers.add_parser(
|
||||||
|
"maintenance_errors",
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
|
help="Presents errors and other relevant messages from mtcAgent.log and "
|
||||||
|
"mtcClient.log",
|
||||||
|
epilog="Plugin file example:\n" " algorithm=maintenance_errors\n",
|
||||||
|
)
|
||||||
|
|
||||||
|
# state changes algorithm
|
||||||
|
parser_state_changes = subparsers.add_parser(
|
||||||
|
"state_changes",
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
|
help="Presents any messages from mtcAgent.log regarding the state of "
|
||||||
|
"hosts, such as enabled/disabled",
|
||||||
|
epilog="Plugin file example:\n" " algorithm=state_changes\n",
|
||||||
|
)
|
||||||
|
|
||||||
|
# audit algorithm
|
||||||
|
parser_audit = subparsers.add_parser(
|
||||||
|
"audit",
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
|
help="Presents information about audit events in dcmanager.\n"
|
||||||
|
"The rates and totals represents the sum of audits on all subclouds ",
|
||||||
|
epilog="Plugin file example:\n"
|
||||||
|
" algorithm=audit\n"
|
||||||
|
" start=2022-06-01 10:00:00\n"
|
||||||
|
" end=2022-06-01 04:00:00\n",
|
||||||
|
)
|
||||||
|
parser_audit.add_argument(
|
||||||
|
"--start",
|
||||||
|
required=False,
|
||||||
|
default=datetime.strftime(now - timedelta(days=7), "%Y-%m-%d %H:%M:%S"),
|
||||||
|
type=str,
|
||||||
|
help="Specify a start date in YYYY-MM-DD HH:MM:SS format for analysis "
|
||||||
|
"(not required, default: 1 week ago)"
|
||||||
|
)
|
||||||
|
parser_audit.add_argument(
|
||||||
|
"--end",
|
||||||
|
required=False,
|
||||||
|
default=datetime.strftime(now, "%Y-%m-%d %H:%M:%S"),
|
||||||
|
type=str,
|
||||||
|
help="Specify an end date in YYYY-MM-DD HH:MM:SS format for analysis "
|
||||||
|
"(not required, default: today)"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
args.start = datetime.strptime(args.start, "%Y%m%d").strftime(
|
||||||
|
"%Y-%m-%dT%H:%M:%S")
|
||||||
|
args.end = datetime.strptime(args.end, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")
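For reference, this widens the CLI's compact YYYYMMDD dates into the timestamp form the engine compares against log lines, for example:

    >>> datetime.strptime("20220501", "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")
    '2022-05-01T00:00:00'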
|
||||||
|
|
||||||
|
if args.directory.endswith("/"):
|
||||||
|
output_directory = os.path.join(
|
||||||
|
default_path, "report", "output",
|
||||||
|
os.path.basename(os.path.dirname(args.directory))
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
output_directory = os.path.join(
|
||||||
|
default_path, "report", "output", os.path.basename(args.directory)
|
||||||
|
)
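As a concrete illustration (bundle name borrowed from the README example, so hypothetical here): analyzing a directory named SELECT_NODES_20220527.193605 makes output_directory

    <default_path>/report/output/SELECT_NODES_20220527.193605

where default_path is two levels above the tool directory, normally the top of the collect bundle.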
|
||||||
|
|
||||||
|
# creating report log
|
||||||
|
os.makedirs(output_directory, exist_ok=True)
|
||||||
|
open(os.path.join(output_directory, "report.log"), "w").close()
|
||||||
|
|
||||||
|
# setting up logger
|
||||||
|
formatter = logging.Formatter("%(message)s")
|
||||||
|
logger = logging.getLogger()
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
filename=os.path.join(output_directory, "report.log"),
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s %(levelname)s: %(message)s",
|
||||||
|
datefmt="%Y-%m-%dT%H:%M:%S",
|
||||||
|
)
|
||||||
|
logging.Formatter.converter = time.gmtime
|
||||||
|
|
||||||
|
ch = logging.StreamHandler()
|
||||||
|
ch.setLevel(logging.INFO)
|
||||||
|
ch.setFormatter(formatter)
|
||||||
|
|
||||||
|
logger.addHandler(ch)
|
||||||
|
|
||||||
|
if not os.path.isdir(args.directory):
|
||||||
|
sys.exit("Top level of collect bundle given to analyze is not a directory")
|
||||||
|
else:
|
||||||
|
for obj in (os.scandir(args.directory)):
|
||||||
|
info = os.path.splitext(obj.name)
|
||||||
|
|
||||||
|
# TODO: ask user which file to report on if more than one tarball in
|
||||||
|
# directory
|
||||||
|
# Check if collect tarball is in given directory and extracts it if
|
||||||
|
# not already done
|
||||||
|
if (obj.is_file() and info[1] == ".tar"):
|
||||||
|
try:
|
||||||
|
result = subprocess.check_output(["tar", "tf", obj.path],
|
||||||
|
encoding="UTF-8")
|
||||||
|
result = result.split("\n", 1)
|
||||||
|
if not os.path.isdir(os.path.join(args.directory,
|
||||||
|
os.path.dirname(result[0]))):
|
||||||
|
subprocess.run(["tar", "xfC", obj.path, args.directory],
|
||||||
|
check=True)
|
||||||
|
subprocess.run(["echo", "extracted", obj.name], check=True)
|
||||||
|
args.directory = os.path.join(args.directory,
|
||||||
|
os.path.dirname(result[0]))
|
||||||
|
break
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
logger.error(e)
|
||||||
|
|
||||||
|
try:
|
||||||
|
engine = ExecutionEngine(args, output_directory)
|
||||||
|
except ValueError as e:
|
||||||
|
logger.error(str(e))
|
||||||
|
sys.exit("Confirm you are running the report tool on a collect bundle")
|
||||||
|
|
||||||
|
if args.algorithm:
|
||||||
|
plugins.append(Plugin(opts=vars(args)))
|
||||||
|
else:
|
||||||
|
if args.plugin:
|
||||||
|
for p in args.plugin:
|
||||||
|
path = os.path.join(default_path, "report", "plugins", p)
|
||||||
|
if os.path.exists(path):
|
||||||
|
try:
|
||||||
|
plugins.append(Plugin(path))
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(str(e))
|
||||||
|
|
||||||
|
else:
|
||||||
|
logger.warning(f"{p} plugin does not exist")
|
||||||
|
else:
|
||||||
|
path = os.path.join(default_path, "report", "plugins")
|
||||||
|
if not os.path.exists(path):
|
||||||
|
os.mkdir(path)
|
||||||
|
logger.error("Plugins folder is empty")
|
||||||
|
else:
|
||||||
|
for file in os.listdir(path):
|
||||||
|
try:
|
||||||
|
plugins.append(Plugin(os.path.join(path, file)))
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(str(e))
|
||||||
|
|
||||||
|
engine.execute(plugins, output_directory)
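Taken together, a typical run against an untarred collect bundle mirrors the README example (dates illustrative):

    > report/tool/report.py --start 20220501 --end 20220530

Output files and report.log then land under the report/output/<bundle name> directory computed above.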
|
@ -3,3 +3,4 @@ etc/collect.d/* /etc/collect.d
|
|||||||
usr/local/sbin/* /usr/local/sbin
|
usr/local/sbin/* /usr/local/sbin
|
||||||
usr/local/bin/collect /usr/local/bin
|
usr/local/bin/collect /usr/local/bin
|
||||||
usr/sbin/collect /usr/sbin
|
usr/sbin/collect /usr/sbin
|
||||||
|
/usr/local/bin/report/* /usr/local/bin/report
|
||||||
|
@ -13,8 +13,10 @@ override_dh_auto_install:
|
|||||||
|
|
||||||
install -m 755 -d $(SYSCONFDIR)/collect.d
|
install -m 755 -d $(SYSCONFDIR)/collect.d
|
||||||
install -m 755 -d $(SYSCONFDIR)/collect
|
install -m 755 -d $(SYSCONFDIR)/collect
|
||||||
|
install -m 755 -d $(SYSCONFDIR)/collect/plugins # Report Tool
|
||||||
install -m 755 -d $(ROOT)/usr/local/sbin
|
install -m 755 -d $(ROOT)/usr/local/sbin
|
||||||
install -m 755 -d $(ROOT)/usr/local/bin
|
install -m 755 -d $(ROOT)/usr/local/bin
|
||||||
|
install -m 755 -d $(ROOT)/usr/local/bin/report/tool # Report Tool
|
||||||
install -m 755 -d $(SBINDIR)
|
install -m 755 -d $(SBINDIR)
|
||||||
|
|
||||||
install -m 755 -p collect $(ROOT)/usr/local/sbin/collect
|
install -m 755 -p collect $(ROOT)/usr/local/sbin/collect
|
||||||
@ -26,6 +28,24 @@ override_dh_auto_install:
|
|||||||
install -m 755 -p expect_done $(ROOT)/usr/local/sbin/expect_done
|
install -m 755 -p expect_done $(ROOT)/usr/local/sbin/expect_done
|
||||||
install -m 755 -p mariadb-cli.sh $(ROOT)/usr/local/sbin/mariadb-cli
|
install -m 755 -p mariadb-cli.sh $(ROOT)/usr/local/sbin/mariadb-cli
|
||||||
|
|
||||||
|
# Report Tool
|
||||||
|
install -m 755 -p report/report.py $(ROOT)/usr/local/bin/report/tool/report.py
|
||||||
|
install -m 755 -p report/execution_engine.py $(ROOT)/usr/local/bin/report/tool/execution_engine.py
|
||||||
|
install -m 755 -p report/algorithms.py $(ROOT)/usr/local/bin/report/tool/algorithms.py
|
||||||
|
install -m 755 -p report/plugin.py $(ROOT)/usr/local/bin/report/tool/plugin.py
|
||||||
|
install -m 755 -p report/correlator.py $(ROOT)/usr/local/bin/report/tool/correlator.py
|
||||||
|
install -m 755 -p report/README $(ROOT)/usr/local/bin/report/tool/README
|
||||||
|
install -m 755 -p report/plugins/alarm $(SYSCONFDIR)/collect/plugins/alarm
|
||||||
|
install -m 755 -p report/plugins/daemon_failures $(SYSCONFDIR)/collect/plugins/daemon_failures
|
||||||
|
install -m 755 -p report/plugins/heartbeat_loss $(SYSCONFDIR)/collect/plugins/heartbeat_loss
|
||||||
|
install -m 755 -p report/plugins/maintenance_errors $(SYSCONFDIR)/collect/plugins/maintenance_errors
|
||||||
|
install -m 755 -p report/plugins/process_failures $(SYSCONFDIR)/collect/plugins/process_failures
|
||||||
|
install -m 755 -p report/plugins/puppet_errors $(SYSCONFDIR)/collect/plugins/puppet_errors
|
||||||
|
install -m 755 -p report/plugins/state_changes $(SYSCONFDIR)/collect/plugins/state_changes
|
||||||
|
install -m 755 -p report/plugins/substring $(SYSCONFDIR)/collect/plugins/substring
|
||||||
|
install -m 755 -p report/plugins/swact_activity $(SYSCONFDIR)/collect/plugins/swact_activity
|
||||||
|
install -m 755 -p report/plugins/system_info $(SYSCONFDIR)/collect/plugins/system_info
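For orientation, the Report Tool install lines above amount to this layout on the host:

    /usr/local/bin/report/tool/
        report.py  execution_engine.py  algorithms.py  plugin.py  correlator.py  README
    /etc/collect/plugins/
        alarm  daemon_failures  heartbeat_loss  maintenance_errors  process_failures
        puppet_errors  state_changes  substring  swact_activity  system_info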
|
||||||
|
|
||||||
install -m 755 -p collect_sysinv.sh $(SYSCONFDIR)/collect.d/collect_sysinv
|
install -m 755 -p collect_sysinv.sh $(SYSCONFDIR)/collect.d/collect_sysinv
|
||||||
install -m 755 -p collect_psqldb.sh $(SYSCONFDIR)/collect.d/collect_psqldb
|
install -m 755 -p collect_psqldb.sh $(SYSCONFDIR)/collect.d/collect_psqldb
|
||||||
install -m 755 -p collect_mariadb.sh $(SYSCONFDIR)/collect.d/collect_mariadb
|
install -m 755 -p collect_mariadb.sh $(SYSCONFDIR)/collect.d/collect_mariadb
|
||||||
|
@ -1,62 +0,0 @@
|
|||||||
Refer to report.py file header for a description of the tool
|
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
Consider the following collect bundle structure
|
|
||||||
|
|
||||||
SELECT_NODES_20220527.193605
|
|
||||||
├── controller-0_20220527.193605
|
|
||||||
│ ├── etc
|
|
||||||
│ ├── root
|
|
||||||
│ └── var
|
|
||||||
├── controller-1_20220527.193605
|
|
||||||
│ ├── etc
|
|
||||||
│ ├── root
|
|
||||||
│ └── var
|
|
||||||
├── plugins (where the plugin files will be placed)
|
|
||||||
│ ├── alarm_plugin_example
|
|
||||||
│ └── substring_plugin_example
|
|
||||||
├── report
|
|
||||||
└── tool (where the tool will be placed)
|
|
||||||
└── output (where the output files will be placed)
|
|
||||||
|
|
||||||
|
|
||||||
> cat plugins/alarm_plugin_example
|
|
||||||
|
|
||||||
algorithm=alarm
|
|
||||||
alarm_ids=400.,401.
|
|
||||||
entity_ids = host=controller-0
|
|
||||||
|
|
||||||
> cat plugins/substring_plugin_example
|
|
||||||
|
|
||||||
algorithm=substring
|
|
||||||
files=var/log/mtcAgent.log
|
|
||||||
hosts=controllers
|
|
||||||
substring=operation failed
|
|
||||||
|
|
||||||
> report/tool/report.py --start 20220501 --end 20220530
|
|
||||||
|
|
||||||
Running the command above will populate the report folder with output files.
|
|
||||||
The tool also provides default values, more details are in 'report.py -h'.
|
|
||||||
|
|
||||||
The substring algorithm creates an output file for every host of the
|
|
||||||
specified host type. The files will contain log events within the
|
|
||||||
provided date range containing the substring 'operation failed'.
|
|
||||||
|
|
||||||
The alarm algorithm creates two output file: 'log' and 'alarm'
|
|
||||||
'log' contains customer log messages created within the provided date range,
|
|
||||||
and 'alarm' contains system alarms created within the provided date range.
|
|
||||||
|
|
||||||
For more detailed information about an algorithm use 'report.py <algorithm> -h'.
|
|
||||||
|
|
||||||
Here is the report directory after running the above command
|
|
||||||
|
|
||||||
report
|
|
||||||
├── output
|
|
||||||
│ └── 20220815.140008 (time in utc when tool was ran)
|
|
||||||
│ ├── alarm
|
|
||||||
│ ├── controller-0_substring_plugin_example_substring
|
|
||||||
│ ├── controller-1_substring_plugin_example_substring
|
|
||||||
│ ├── report.log (log file for report tool)
|
|
||||||
│ └── log
|
|
||||||
└── tool (where the report tool is)
|
|
@ -1,546 +0,0 @@
|
|||||||
########################################################################
|
|
||||||
#
|
|
||||||
# Copyright (c) 2022 Wind River Systems, Inc.
|
|
||||||
#
|
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
#
|
|
||||||
########################################################################
|
|
||||||
#
|
|
||||||
# This file contains the ExecutionEngine class.
|
|
||||||
# The ExecutionEngine class contains all the available algorithms.
|
|
||||||
#
|
|
||||||
# The ExecutionEngine class runs plugins and gathers relevant logs and
|
|
||||||
# information, creating output files in the report directory.
|
|
||||||
#
|
|
||||||
########################################################################
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
import gzip
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import shutil
|
|
||||||
import subprocess
|
|
||||||
|
|
||||||
import algorithms
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class ExecutionEngine:
|
|
||||||
def __init__(self, opts):
|
|
||||||
"""Constructor for the ExecutionEngine class
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
opts (dictionary): Options from command line
|
|
||||||
"""
|
|
||||||
self.opts = opts
|
|
||||||
self.hosts = {"controllers": {}, "workers": {}, "storages": {}}
|
|
||||||
self.active_controller_directory = None
|
|
||||||
|
|
||||||
for folder in (f.path for f in os.scandir(self.opts.directory)):
|
|
||||||
database_path = os.path.join(folder, "var", "extra", "database")
|
|
||||||
host_info_path = os.path.join(folder, "var", "extra", "host.info")
|
|
||||||
|
|
||||||
if os.path.isdir(database_path) and os.listdir(database_path):
|
|
||||||
self.active_controller_directory = folder
|
|
||||||
|
|
||||||
if os.path.exists(host_info_path):
|
|
||||||
hostname, subfunction = self._extract_subfunction(host_info_path)
|
|
||||||
if "controller" in subfunction:
|
|
||||||
self.hosts["controllers"][hostname] = folder
|
|
||||||
elif "worker" in subfunction:
|
|
||||||
self.hosts["workers"][hostname] = folder
|
|
||||||
elif "storage" in subfunction:
|
|
||||||
self.hosts["storages"][hostname] = folder
|
|
||||||
|
|
||||||
if not self.active_controller_directory:
|
|
||||||
raise ValueError("Active controller not found")
|
|
||||||
|
|
||||||
def execute(self, plugins, output_directory):
|
|
||||||
"""Run a list of plugins
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
plugins (Plugin list): List of plugins to run
|
|
||||||
|
|
||||||
Errors:
|
|
||||||
FileNotFoundError
|
|
||||||
"""
|
|
||||||
|
|
||||||
for plugin in plugins:
|
|
||||||
logger.info(f"Processing plugin: {os.path.basename(plugin.file)}")
|
|
||||||
hosts = {}
|
|
||||||
if (
|
|
||||||
plugin.state["hosts"] and len(plugin.state["hosts"]) >= 1
|
|
||||||
): # if host list is given
|
|
||||||
for h in plugin.state["hosts"]:
|
|
||||||
if h == "all":
|
|
||||||
hosts.update(self.hosts["workers"])
|
|
||||||
hosts.update(self.hosts["storages"])
|
|
||||||
hosts.update(self.hosts["controllers"])
|
|
||||||
else:
|
|
||||||
hosts.update(self.hosts[h])
|
|
||||||
|
|
||||||
for hostname, folderpath in hosts.items():
|
|
||||||
|
|
||||||
events = []
|
|
||||||
if plugin.state["algorithm"] == algorithms.SUBSTRING:
|
|
||||||
try:
|
|
||||||
events = self.substring(
|
|
||||||
plugin.state["substring"],
|
|
||||||
[
|
|
||||||
os.path.join(folderpath, file)
|
|
||||||
for file in plugin.state["files"]
|
|
||||||
],
|
|
||||||
)
|
|
||||||
except FileNotFoundError as e:
|
|
||||||
logger.error(e)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# creating output file
|
|
||||||
output_file = os.path.join(
|
|
||||||
output_directory,
|
|
||||||
f"{hostname}_{os.path.basename(plugin.file)}_{plugin.state['algorithm']}",
|
|
||||||
)
|
|
||||||
logger.info("output at " + output_file)
|
|
||||||
with open(output_file, "w") as file:
|
|
||||||
file.write(
|
|
||||||
f"Date range: {self.opts.start} until {self.opts.end}\n"
|
|
||||||
)
|
|
||||||
file.write(
|
|
||||||
f"substrings: {' '.join(plugin.state['substring'])}\n"
|
|
||||||
)
|
|
||||||
for line in events:
|
|
||||||
file.write(line + "\n")
|
|
||||||
else:
|
|
||||||
if plugin.state["algorithm"] == algorithms.SYSTEM_INFO:
|
|
||||||
info = self.system_info()
|
|
||||||
system_info_output = os.path.join(output_directory, "system_info")
|
|
||||||
with open(system_info_output, "w") as file:
|
|
||||||
for i in info:
|
|
||||||
file.write(i + "\n")
|
|
||||||
|
|
||||||
for k, v in self.hosts.items():
|
|
||||||
file.write(f"{k}: {','.join(v.keys())}\n")
|
|
||||||
logger.info("output at " + system_info_output)
|
|
||||||
|
|
||||||
elif plugin.state["algorithm"] == algorithms.AUDIT:
|
|
||||||
hosts = {}
|
|
||||||
hosts.update(self.hosts["workers"])
|
|
||||||
hosts.update(self.hosts["storages"])
|
|
||||||
hosts.update(self.hosts["controllers"])
|
|
||||||
|
|
||||||
for hostname, folderpath in hosts.items():
|
|
||||||
self._create_output_file(
|
|
||||||
f"{hostname}_audit",
|
|
||||||
output_directory,
|
|
||||||
self.audit(
|
|
||||||
plugin.state["start"],
|
|
||||||
plugin.state["end"],
|
|
||||||
os.path.join(
|
|
||||||
folderpath, "var", "log", "dcmanager", "audit.log"
|
|
||||||
),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
elif plugin.state["algorithm"] == algorithms.SWACT:
|
|
||||||
self._create_output_file(
|
|
||||||
"swact_activity", output_directory, self.swact()
|
|
||||||
)
|
|
||||||
|
|
||||||
elif plugin.state["algorithm"] == algorithms.PUPPET:
|
|
||||||
self._create_output_file(
|
|
||||||
"puppet_errors", output_directory, self.puppet()
|
|
||||||
)
|
|
||||||
|
|
||||||
elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURE:
|
|
||||||
self._create_output_file(
|
|
||||||
"process_failures", output_directory, self.process_failure()
|
|
||||||
)
|
|
||||||
|
|
||||||
elif plugin.state["algorithm"] == algorithms.ALARM:
|
|
||||||
alarms, logs = self.alarm(
|
|
||||||
plugin.state["alarm_ids"], plugin.state["entity_ids"]
|
|
||||||
)
|
|
||||||
alarm_output = os.path.join(output_directory, "alarm")
|
|
||||||
log_output = os.path.join(output_directory, "log")
|
|
||||||
os.makedirs(os.path.dirname(log_output), exist_ok=True)
|
|
||||||
|
|
||||||
# creating output alarm file
|
|
||||||
with open(alarm_output, "w") as file:
|
|
||||||
for k, v in alarms.items():
|
|
||||||
file.write(f"{k} {v['count']}\n")
|
|
||||||
file.write("\n")
|
|
||||||
for k, v in alarms.items():
|
|
||||||
file.write(f"{k}\n")
|
|
||||||
for date in v["dates"]:
|
|
||||||
file.write(f" {date}\n")
|
|
||||||
|
|
||||||
# creating output log file
|
|
||||||
with open(log_output, "w") as file:
|
|
||||||
for k, v in logs.items():
|
|
||||||
file.write(f"{k} {v['count']}\n")
|
|
||||||
file.write("\n")
|
|
||||||
for k, v in logs.items():
|
|
||||||
file.write(f"{k}\n")
|
|
||||||
for date in v["dates"]:
|
|
||||||
file.write(f" {date}\n")
|
|
||||||
logger.info("output at " + alarm_output)
|
|
||||||
logger.info("output at " + log_output)
|
|
||||||
|
|
||||||
# Built-in algorithms ------------------------------
|
|
||||||
def alarm(self, alarm_ids=[], entity_ids=[]):
|
|
||||||
"""Alarm algorithm
|
|
||||||
Gathers list of alarms and customer logs
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
alarm_ids (string list) : List of alarm id patterns to search for
|
|
||||||
entity_ids (string list): List of entity id patterns to search for
|
|
||||||
"""
|
|
||||||
alarm_data = {}
|
|
||||||
log_data = {}
|
|
||||||
with open(
|
|
||||||
os.path.join(
|
|
||||||
self.active_controller_directory,
|
|
||||||
"var",
|
|
||||||
"extra",
|
|
||||||
"database",
|
|
||||||
"fm.db.sql.txt",
|
|
||||||
)
|
|
||||||
) as file:
|
|
||||||
start = False
|
|
||||||
for line in file:
|
|
||||||
# start of event log
|
|
||||||
if "COPY event_log" in line:
|
|
||||||
start = True
|
|
||||||
elif start and line == "\\.\n":
|
|
||||||
break
|
|
||||||
elif start:
|
|
||||||
entry = re.split(r"\t", line)
|
|
||||||
|
|
||||||
INDEX_ALARM_ID = 5
|
|
||||||
INDEX_ACTION = 6
|
|
||||||
INDEX_ENTITY_ID = 8
|
|
||||||
INDEX_ALARM_DATE = 9
|
|
||||||
INDEX_SEVERITY = 10
|
|
||||||
|
|
||||||
alarm_id = entry[INDEX_ALARM_ID]
|
|
||||||
entity_id = entry[INDEX_ENTITY_ID]
|
|
||||||
action = entry[INDEX_ACTION]
|
|
||||||
severity = entry[INDEX_SEVERITY]
|
|
||||||
alarm_date = entry[INDEX_ALARM_DATE]
|
|
||||||
|
|
||||||
entry_date = alarm_date.replace(
|
|
||||||
" ", "T"
|
|
||||||
) # making time format of alarm the same
|
|
||||||
if self.opts.start <= entry_date and entry_date <= self.opts.end:
|
|
||||||
# if the alarm is not in the user specified list of alarm or entity ids
|
|
||||||
for id in alarm_ids:
|
|
||||||
if id in alarm_id:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
if len(alarm_ids) > 0:
|
|
||||||
continue
|
|
||||||
|
|
||||||
for entity in entity_ids:
|
|
||||||
if entity in entity_id:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
if len(entity_ids) > 0:
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
|
||||||
if action == "log":
|
|
||||||
log_info = log_data[
|
|
||||||
f"{alarm_id} {entity_id} {severity}"
|
|
||||||
]
|
|
||||||
log_info["count"] += 1
|
|
||||||
log_info["dates"].append(alarm_date)
|
|
||||||
else:
|
|
||||||
alarm_info = alarm_data[
|
|
||||||
f"{alarm_id} {entity_id} {severity}"
|
|
||||||
]
|
|
||||||
alarm_info["count"] += 1
|
|
||||||
alarm_info["dates"].append(f"{alarm_date} {action}")
|
|
||||||
except KeyError:
|
|
||||||
if entry[6] != "log":
|
|
||||||
alarm_data[f"{alarm_id} {entity_id} {severity}"] = {
|
|
||||||
"count": 1,
|
|
||||||
"dates": [f"{alarm_date} {action}"],
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
log_data[f"{alarm_id} {entity_id} {severity}"] = {
|
|
||||||
"count": 1,
|
|
||||||
"dates": [alarm_date],
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, v in alarm_data.items():
|
|
||||||
v["dates"] = sorted(v["dates"])
|
|
||||||
|
|
||||||
for _, v in log_data.items():
|
|
||||||
v["dates"] = sorted(v["dates"])
|
|
||||||
|
|
||||||
return alarm_data, log_data
|
|
||||||
|
|
||||||
def substring(self, substr, files):
|
|
||||||
"""Substring algorithm
|
|
||||||
Looks for substrings within files
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
substr (string list): List of substrings to look for
|
|
||||||
files (string list): List of absolute filepaths to search in
|
|
||||||
|
|
||||||
Errors:
|
|
||||||
FileNotFoundError
|
|
||||||
"""
|
|
||||||
CONTINUE_CURRENT = 0 # don't analyze older files, continue with current file
|
|
||||||
CONTINUE_CURRENT_OLD = 1 # analyze older files, continue with current file
|
|
||||||
|
|
||||||
data = []
|
|
||||||
for file in files:
|
|
||||||
if not os.path.exists(file):
|
|
||||||
raise FileNotFoundError(f"File not found: {file}")
|
|
||||||
cont = True
|
|
||||||
# Searching through file
|
|
||||||
command = f"""grep -Ea "{'|'.join(s for s in substr)}" {file}"""
|
|
||||||
status = self._continue(file)
|
|
||||||
|
|
||||||
if (
|
|
||||||
status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD
|
|
||||||
): # continue with current file
|
|
||||||
if status == CONTINUE_CURRENT:
|
|
||||||
cont = False
|
|
||||||
self._evaluate_substring(data, command)
|
|
||||||
|
|
||||||
# Searching through rotated log files
|
|
||||||
n = 1
|
|
||||||
while os.path.exists(f"{file}.{n}.gz") and cont:
|
|
||||||
command = f"""zgrep -E "{'|'.join(s for s in substr)}" {file}.{n}.gz"""
|
|
||||||
status = self._continue(f"{file}.{n}.gz", compressed=True)
|
|
||||||
|
|
||||||
if status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD:
|
|
||||||
if status == CONTINUE_CURRENT:
|
|
||||||
cont = False
|
|
||||||
self._evaluate_substring(data, command)
|
|
||||||
|
|
||||||
n += 1
|
|
||||||
|
|
||||||
return sorted(data)
|
|
||||||
|
|
||||||
def system_info(self):
|
|
||||||
"""System info algorithm
|
|
||||||
Presents basic information about the system
|
|
||||||
"""
|
|
||||||
data = []
|
|
||||||
with open(
|
|
||||||
os.path.join(
|
|
||||||
self.active_controller_directory, "etc", "platform", "platform.conf"
|
|
||||||
)
|
|
||||||
) as file:
|
|
||||||
for line in file:
|
|
||||||
if "system_mode" in line:
|
|
||||||
data.append(
|
|
||||||
f"System Mode: {re.match('^system_mode=(.*)', line).group(1)}"
|
|
||||||
)
|
|
||||||
elif "system_type" in line:
|
|
||||||
data.append(
|
|
||||||
f"System Type: {re.match('^system_type=(.*)', line).group(1)}"
|
|
||||||
)
|
|
||||||
elif "distributed_cloud_role" in line:
|
|
||||||
data.append(
|
|
||||||
f"Distributed cloud role: {re.match('^distributed_cloud_role=(.*)', line).group(1)}"
|
|
||||||
)
|
|
||||||
elif "sw_version" in line:
|
|
||||||
data.append(
|
|
||||||
f"SW Version: {re.match('^sw_version=(.*)', line).group(1)}"
|
|
||||||
)
|
|
||||||
with open(
|
|
||||||
os.path.join(self.active_controller_directory, "etc", "build.info")
|
|
||||||
) as file:
|
|
||||||
for line in file:
|
|
||||||
if "BUILD_TYPE" in line:
|
|
||||||
data.append(
|
|
||||||
f"Build Type: {re.match('^BUILD_TYPE=(.*)', line).group(1)}"
|
|
||||||
)
|
|
||||||
elif re.match("^OS=(.*)", line):
|
|
||||||
data.append(f"OS: {re.match('^OS=(.*)', line).group(1)}")
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
def swact(self):
|
|
||||||
"""Swact activity algorithm
|
|
||||||
Presents all swacting activity in the system
|
|
||||||
"""
|
|
||||||
data = []
|
|
||||||
sm_files = []
|
|
||||||
sm_customer_files = []
|
|
||||||
swact_start = None
|
|
||||||
swact_in_progress = False
|
|
||||||
swact_end = None
|
|
||||||
|
|
||||||
for _, folder in self.hosts["controllers"].items():
|
|
||||||
sm_path = os.path.join(folder, "var", "log", "sm.log")
|
|
||||||
sm_files.append(sm_path)
|
|
||||||
|
|
||||||
sm_substrings = ["Swact has started,", "Swact update"]
|
|
||||||
data = self.substring(sm_substrings, sm_files)
|
|
||||||
|
|
||||||
for i, line in enumerate(data):
|
|
||||||
if "Swact has started," in line and not swact_in_progress:
|
|
||||||
swact_in_progress = True
|
|
||||||
swact_start = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
|
||||||
elif "Swact update" in line and swact_in_progress:
|
|
||||||
swact_in_progress = False
|
|
||||||
swact_end = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
|
||||||
line += f" SWACT TOOK {swact_end - swact_start} \n"
|
|
||||||
data[i] = line
|
|
||||||
|
|
||||||
for _, folder in self.hosts["controllers"].items():
|
|
||||||
sm_customer_path = os.path.join(folder, "var", "log", "sm-customer.log")
|
|
||||||
sm_customer_files.append(sm_customer_path)
|
|
||||||
|
|
||||||
sm_customer_substrings = ["swact"]
|
|
||||||
data += self.substring(sm_customer_substrings, sm_customer_files)
|
|
||||||
|
|
||||||
return sorted(data)
|
|
||||||
|
|
||||||
def puppet(self):
|
|
||||||
"""Puppet error algorithm
|
|
||||||
Presents log errors from puppet logs
|
|
||||||
"""
|
|
||||||
data = []
|
|
||||||
for _, folder in self.hosts["controllers"].items():
|
|
||||||
puppet_folder = os.path.join(folder, "var", "log", "puppet")
|
|
||||||
command = f"grep -rh 'Error:' {puppet_folder}"
|
|
||||||
self._evaluate_substring(data, command)
|
|
||||||
return sorted(data)
|
|
||||||
|
|
||||||
def process_failure(self):
|
|
||||||
"""Process failure algorithm
|
|
||||||
Presents log errors from pmond
|
|
||||||
"""
|
|
||||||
data = []
|
|
||||||
files = []
|
|
||||||
for host_type in self.hosts.keys():
|
|
||||||
for _, folder in self.hosts[host_type].items():
|
|
||||||
pmond = os.path.join(folder, "var", "log", "pmond.log")
|
|
||||||
files.append(pmond)
|
|
||||||
data = self.substring(["Error :"], files)
|
|
||||||
return data
|
|
||||||
|
|
||||||
def audit(self, start, end, audit_log_path):
|
|
||||||
"""Counts audit events in dcmanager within a specified date range
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
start (string) : start date in YYYY-MM-DD HH:MM:SS format
|
|
||||||
end (string) : end date in YYYY-MM-DD HH:MM:SS format
|
|
||||||
audit_log_path (string) : absolute path of augit log file
|
|
||||||
"""
|
|
||||||
if not shutil.which("lnav"):
|
|
||||||
raise ValueError("Lnav program not found")
|
|
||||||
|
|
||||||
SECONDS_PER_HOUR = 3600
|
|
||||||
fmt = "%Y-%m-%d %H:%M:%S"
|
|
||||||
|
|
||||||
d1 = datetime.strptime(start, fmt)
|
|
||||||
d2 = datetime.strptime(end, fmt)
|
|
||||||
seconds = (d2 - d1).total_seconds()
|
|
||||||
|
|
||||||
log_texts = [
|
|
||||||
"Triggered subcloud audit%",
|
|
||||||
"Trigger patch audit%",
|
|
||||||
"Trigger load audit%",
|
|
||||||
"Triggered firmware audit%",
|
|
||||||
"Triggered kubernetes audit%",
|
|
||||||
# Counts sum of audits from all subclouds
|
|
||||||
]
|
|
||||||
INDEX_MIDDLE_WORD = 1
|
|
||||||
data = ["These rates and totals represent the sum of audits from all subclouds"]
|
|
||||||
|
|
||||||
def command(text):
|
|
||||||
|
|
||||||
return (
|
|
||||||
f'lnav -R -n -c ";SELECT count(log_body) AS {text.split(" ")[INDEX_MIDDLE_WORD]}_total'
|
|
||||||
f' from openstack_log WHERE (log_time > \\"{start}\\" AND not log_time > \\"{end}\\")'
|
|
||||||
f' AND log_body like \\"{text}\\"" "{audit_log_path}"'
|
|
||||||
)
|
|
||||||
|
|
||||||
for text in log_texts:
|
|
||||||
p = subprocess.Popen(command(text), shell=True, stdout=subprocess.PIPE)
|
|
||||||
for line in p.stdout:
|
|
||||||
line = line.decode("utf-8").strip()
|
|
||||||
if line.isnumeric():
|
|
||||||
data.append(
|
|
||||||
f"rate {round((int(line)/seconds * SECONDS_PER_HOUR), 3)} per hour. total: {line}"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
data.append(line)
|
|
||||||
return data
|
|
||||||
|
|
||||||
# -----------------------------------
|
|
||||||
|
|
||||||
def _continue(self, file, compressed=False):
|
|
||||||
CONTINUE_CURRENT = 0 # don't analyze older files, continue with current file
|
|
||||||
CONTINUE_CURRENT_OLD = 1 # analyze older files, continue with current file
|
|
||||||
CONTINUE_OLD = 2 # don't analyze current file, continue to older files
|
|
||||||
|
|
||||||
# check date of first log event and compare with provided start end dates
|
|
||||||
first = ""
|
|
||||||
|
|
||||||
if not compressed:
|
|
||||||
with open(file) as f:
|
|
||||||
line = f.readline()
|
|
||||||
first = line[0:19]
|
|
||||||
else:
|
|
||||||
with gzip.open(file, "rb") as f:
|
|
||||||
line = f.readline().decode("utf-8")
|
|
||||||
first = line[0:19]
|
|
||||||
try:
|
|
||||||
datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
|
||||||
first = line[0:19]
|
|
||||||
except ValueError:
|
|
||||||
return CONTINUE_CURRENT_OLD
|
|
||||||
|
|
||||||
if first < self.opts.start:
|
|
||||||
return CONTINUE_CURRENT
|
|
||||||
elif first < self.opts.end and first > self.opts.start:
|
|
||||||
return CONTINUE_CURRENT_OLD
|
|
||||||
elif first > self.opts.end:
|
|
||||||
return CONTINUE_OLD
|
|
||||||
|
|
||||||
def _evaluate_substring(self, data, command):
|
|
||||||
p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
|
|
||||||
for line in p.stdout:
|
|
||||||
line = line.decode("utf-8")
|
|
||||||
dates = [line[0:19], line[2:21]] # different date locations for log events
|
|
||||||
for date in dates:
|
|
||||||
try:
|
|
||||||
datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
|
|
||||||
if date > self.opts.start and date < self.opts.end:
|
|
||||||
if line[0] == "|": # sm-customer.log edge case
|
|
||||||
line = line.replace("|", "").strip()
|
|
||||||
line = re.sub("\s+", " ", line)
|
|
||||||
data.append(line)
|
|
||||||
break
|
|
||||||
except ValueError:
|
|
||||||
if date == dates[-1]:
|
|
||||||
data.append(line)

    def _extract_subfunction(self, host_info_path):
        GROUP_ONE = 1
        # default to None so host info files missing either entry don't raise
        hostname, subfunction = None, None
        with open(host_info_path) as file:
            for line in file:
                hostname_match = re.match("^hostname => (.+)", line)
                subfunction_match = re.match("^subfunction => (.+)", line)
                if subfunction_match:
                    subfunction = subfunction_match.group(GROUP_ONE)
                if hostname_match:
                    hostname = hostname_match.group(GROUP_ONE)
        return hostname, subfunction

    def _create_output_file(self, filename, directory, events):
        with open(os.path.join(directory, filename), "w") as file:
            for i in events:
                file.write(i + "\n")
        logger.info("output at " + os.path.join(directory, filename))
@ -1,257 +0,0 @@
#!/usr/bin/env python3
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
########################################################################
#
# Description: The Report tool is used to gather relevant log events
#              and information about the system from a collect bundle.
#
# The report tool allows user-created plugins, which decide relevance
# for log events. Plugins contain an algorithm label which instructs the
# tool what information to search for and how to search for it.
#
# The report tool requires the collect bundle and host tarballs to be
# untarred.
#
# The report tool reads user plugins from a plugins directory in the
# top level of the collect bundle, and outputs files containing
# relevant logs to a report directory in the top level as well.
#
# Typical Usage:
#  command line                       functionality
#  -------------------------------    ----------------------------------
# > report.py                         - Run all plugins in directory
# > report.py [plugin ...]            - Run only specified plugins
# > report.py <algorithm> [labels]    - Run algorithm with labels
# > report.py --help                  - help message
# > report.py <algorithm> --help      - algorithm specific help
#
# See --help output for a complete list of full and abbreviated
# command line options and examples of plugins.
#
# Refer to README file for more usage and output examples
#
#######################################################################
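As a hedged, concrete illustration of that layout (all names hypothetical): dropping a file such as plugins/swact_search into the top level of the untarred bundle with the contents

    algorithm=substring
    files=sm.log, mtcAgent.log
    hosts=controllers
    substring=Swact in progress

and then running report.py with no arguments would run that plugin along with any others present, writing report.log and one output file per controller under report/output/<timestamp>/.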

import argparse
from datetime import datetime
from datetime import timezone
import logging
import os
import sys
import time

from execution_engine import ExecutionEngine
from plugin import Plugin


now = datetime.now(timezone.utc)
base_dir = os.path.realpath(__file__)
default_path = os.path.join(os.path.dirname(base_dir), "..", "..")
plugins = []

parser = argparse.ArgumentParser(
    description="Log Event Reporter",
    epilog="Place plugins in 'plugins' directory at top level of collect bundle. Output files will be placed in 'report' directory."
    "\nThis tool will create a report.log file along with other output files",
)
parser.add_argument(
    "-s",
    "--start",
    default="20000101",
    help="Specify a start date in YYYYMMDD format for analysis (default: 20000101)",
)
parser.add_argument(
    "-e",
    "--end",
    default=datetime.strftime(now, "%Y%m%d"),
    help="Specify an end date in YYYYMMDD format for analysis (default: current date)",
)
parser.add_argument(
    "-p",
    "--plugin",
    default=None,
    nargs="*",
    help="Specify what plugins to run (default: runs every plugin in plugins folder)",
)
parser.add_argument(
    "-d",
    "--directory",
    default=default_path,
    help="Specify top level of collect bundle to analyze (default: two levels above current location)",
)
subparsers = parser.add_subparsers(help="algorithms", dest="algorithm")

# substring algorithm arguments
parser_substring = subparsers.add_parser(
    "substring",
    formatter_class=argparse.RawTextHelpFormatter,
    help="""Searches through specified files for lines containing specified substring.
There will be an output file for each host of the host type specified.""",
    epilog="Plugin file example:\n"
    " algorithm=substring\n"
    " files=mtcAgent.log, sm.log\n"
    " hosts=controllers, workers\n"
    " substring=Swact in progress\n"
    " substring=Swact update",
)
substring_required = parser_substring.add_argument_group("required arguments")
substring_required.add_argument(
    "--files",
    required=True,
    nargs="+",
    help="Files to perform substring analysis on (required)",
)
substring_required.add_argument(
    "--substring", nargs="+", required=True, help="Substrings to search for (required)"
)
substring_required.add_argument(
    "--hosts",
    choices=["controllers", "workers", "storages", "all"],
    required=True,
    nargs="+",
    help="Host types to perform analysis on (required)",
)
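For reference, a hypothetical direct invocation of the substring algorithm that mirrors the epilog example above, skipping the plugin file entirely:

    report.py substring --files mtcAgent.log sm.log --hosts controllers --substring "Swact in progress"

Each of --files, --hosts and --substring accepts multiple values, so several files, host types or substrings can be listed in one run.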

# alarm algorithm arguments
parser_alarm = subparsers.add_parser(
    "alarm",
    formatter_class=argparse.RawTextHelpFormatter,
    help="Searches through fm.db.sql.txt for alarms and logs. There are 2 output files: 'alarm', and 'log'",
    epilog="Plugin file example:\n"
    " algorithm=alarm\n"
    " alarm_ids=400.005,200.004\n"
    " entity_ids= host=controller-0,host=controller-1\n",
)
parser_alarm.add_argument(
    "--alarm_ids",
    nargs="+",
    required=False,
    default=[],
    help="Alarm id patterns to search for (not required)",
)
parser_alarm.add_argument(
    "--entity_ids",
    nargs="+",
    required=False,
    default=[],
    help="Entity id patterns to search for (not required)",
)

# system info algorithm
parser_system_info = subparsers.add_parser(
    "system_info",
    formatter_class=argparse.RawTextHelpFormatter,
    help="Presents information about the system",
    epilog="Plugin file example:\n" " algorithm=system_info\n",
)

# swact activity algorithm
parser_swact = subparsers.add_parser(
    "swact",
    formatter_class=argparse.RawTextHelpFormatter,
    help="Presents system swacting activity",
    epilog="Plugin file example:\n" " algorithm=swact\n",
)

# puppet errors algorithm
parser_puppet = subparsers.add_parser(
    "puppet",
    formatter_class=argparse.RawTextHelpFormatter,
    help="Presents any puppet errors",
    epilog="Plugin file example:\n" " algorithm=puppet\n",
)

# process failure algorithm
parser_process_failure = subparsers.add_parser(
    "process_failure",
    formatter_class=argparse.RawTextHelpFormatter,
    help="Presents any process failures from pmond.log",
    epilog="Plugin file example:\n" " algorithm=process_failure\n",
)

# audit algorithm
parser_audit = subparsers.add_parser(
    "audit",
    formatter_class=argparse.RawTextHelpFormatter,
    help="Presents information about audit events in dcmanager.\n"
    "The rates and totals represent the sum of audits on all subclouds",
    epilog="Plugin file example:\n"
    " algorithm=audit\n"
    " start=2022-06-01 10:00:00\n"
    " end=2022-06-02 04:00:00\n",
)
parser_audit_required = parser_audit.add_argument_group("required arguments")
parser_audit_required.add_argument("--start", required=True)
parser_audit_required.add_argument(
    "--end",
    required=True,
)
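A hypothetical command line equivalent of the audit plugin example above:

    report.py audit --start "2022-06-01 10:00:00" --end "2022-06-02 04:00:00"

The span between --start and --end is the window used when the audit code reports its per-hour rates.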


args = parser.parse_args()

# normalize the YYYYMMDD command line dates to the ISO form used when
# comparing against log timestamps, e.g. "20220601" -> "2022-06-01T00:00:00"
args.start = datetime.strptime(args.start, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")
args.end = datetime.strptime(args.end, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")

output_directory = os.path.join(
    args.directory, "report", "output", now.strftime("%Y%m%d.%H%M%S")
)

# creating report log
os.makedirs(output_directory)
open(os.path.join(output_directory, "report.log"), "w").close()

# setting up logger
formatter = logging.Formatter("%(message)s")
logger = logging.getLogger()

logging.basicConfig(
    filename=os.path.join(output_directory, "report.log"),
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
)
logging.Formatter.converter = time.gmtime

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

logger.addHandler(ch)

try:
    engine = ExecutionEngine(args)
except ValueError as e:
    logger.error(str(e))
    # without an engine there is nothing to run; stop here instead of
    # failing later with a NameError
    sys.exit(1)

if args.algorithm:
    plugins.append(Plugin(opts=vars(args)))
else:
    if args.plugin:
        for p in args.plugin:
            path = os.path.join(args.directory, "plugins", p)
            if os.path.exists(path):
                try:
                    plugins.append(Plugin(path))
                except Exception as e:
                    logger.error(str(e))
            else:
                logger.warning(f"{p} plugin does not exist")
    else:
        path = os.path.join(args.directory, "plugins")
        if not os.path.exists(path):
            os.mkdir(path)
            logger.error("Plugins folder is empty")
        else:
            for file in os.listdir(path):
                try:
                    plugins.append(Plugin(os.path.join(path, file)))
                except Exception as e:
                    logger.error(str(e))

engine.execute(plugins, output_directory)
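Putting the pieces together, a hedged end-to-end example (paths and plugin names hypothetical) that runs only two chosen plugins over the first week of June:

    report.py --start 20220601 --end 20220608 --plugin swact_search alarm_check --directory /scratch/MY_BUNDLE

Dates use the YYYYMMDD form expected by -s/-e; --plugin names must match files in the bundle's plugins directory, and --directory points at the top level of the untarred bundle (it defaults to two levels above the tool's own location).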