Merge "Report Tool: Package and add plugins/correlator"
commit 882eab3cd4
@@ -2941,6 +2941,56 @@ collect_subclouds()
    fi
}

############################################################################
#
# Name       : get_report_tool
#
# Purpose    : Fetch report tool from current host
#
# Parameters : $1 - local path destination
#
############################################################################

function get_report_tool()
{
    local local_dest=${1}

    mkdir -p ${local_dest}
    cp -r /usr/local/bin/report/tool ${local_dest}

    local rc=${?}
    if [ ${rc} -ne ${PASS} ] ; then
        report_error "failed to get report tool from /usr/local/bin" ${rc}
    else
        ilog "copied report tool from host"
    fi
}

############################################################################
#
# Name       : get_report_plugins
#
# Purpose    : Fetch plugins for report tool from current host
#
# Parameters : $1 - local path destination
#
############################################################################

function get_report_plugins()
{
    local local_dest=${1}

    mkdir -p ${local_dest}
    cp -r /etc/collect/plugins ${local_dest}

    local rc=${?}
    if [ ${rc} -ne ${PASS} ] ; then
        report_error "failed to get report plugins from /etc/collect" ${rc}
    else
        ilog "copied plugins for report tool from host"
    fi
}

############################################################################
#
# Handle subcloud and system hosts batched collect

@@ -3031,6 +3081,17 @@ echo -n "creating ${COLLECT_TYPE} tarball ${TARBALL_NAME} ... "

remove_file_local ${COLLECT_ERROR_LOG}
remove_file_local ${HOST_COLLECT_ERROR_LOG}
get_report_tool ${COLLECT_DIR}/report
get_report_plugins ${COLLECT_DIR}/report

cd ${COLLECT_DIR}
tar -czf report_tool.tgz report
rc=${?}
if [ ${rc} -ne ${PASS} ] ; then
    report_error "failed to tar report tool" ${rc}
else
    rm -r report
fi

/usr/bin/expect << EOF
log_user ${USER_LOG_MODE}

tools/collector/debian-scripts/report/README (new file, 81 lines)
@@ -0,0 +1,81 @@
Refer to the report.py file header for a description of the tool.

Example:

Consider the following collect bundle structure:

SELECT_NODES_20220527.193605
├── controller-0_20220527.193605
│   ├── etc
│   ├── root
│   └── var
├── controller-1_20220527.193605
│   ├── etc
│   ├── root
│   └── var
├── report
│   ├── plugins (where the plugin files will be placed)
│   │   ├── alarm
│   │   ├── substring
│   │   └── ...
│   ├── tool (where the tool will be placed)
│   └── output (where the output files will be placed)


> cat plugins/alarm

algorithm=alarm
alarm_exclude=400., 800.
entity_exclude=subsystem=vim

> cat plugins/substring

algorithm=substring
files=var/log/mtcAgent.log, var/log/sm.log
hosts=controllers
substring=operation failed
substring=Failed to send message

> report/tool/report.py --start 20220501 --end 20220530

Running the command above will populate the report folder with output files.
The tool also provides default values; more details are in 'report.py -h'.

The substring algorithm creates an output file for every host of the
specified host type. The files will contain log events within the
provided date range containing the substrings 'operation failed' and
'Failed to send message'.

The alarm algorithm creates two output files: 'log' and 'alarm'.
'log' contains customer log messages created within the provided date range,
and 'alarm' contains system alarms created within the provided date range, as
long as the alarm ids and entity ids are not included in the alarm plugin file.

For more detailed information about an algorithm use 'report.py <algorithm> -h'.

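A plugin is just a text file of label=value lines like the two shown above.
As an illustrative sketch (not one of the files shipped in this change), a
minimal plugin selecting the built-in system_info algorithm needs only the
algorithm label, since that algorithm takes no files and must not list hosts:

> cat plugins/system_info

algorithm=system_info
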
Here is the report directory after running the above command:

report
├── output
│   └── SELECT_NODES_20220527.193605 (collect bundle that the report tool was run on)
│       ├── plugins (output files for plugins)
│       │   ├── alarm
│       │   └── ...
│       ├── correlator_failures
│       ├── correlator_events
│       ├── correlator_state_changes
│       ├── report.log (log file for report tool)
│       └── untar.log (log file for untarring collect bundle and host tar files)
├── plugins (where the plugin files are)
└── tool (where the report tool is)

The report tool also allows users to point it at any collect bundle and
have it automatically extract the tarball and tar files for each host
before running.

> report/tool/report.py -d CGTS-19143

Users may specify if they want the correlator to only find events
and state changes for a specific host.

> report/tool/report.py --hostname controller-0
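
These options can be combined in one invocation; for example (an illustrative
command line using the bundle name from this example):

> report/tool/report.py -d SELECT_NODES_20220527.193605 --hostname controller-0 --start 20220501 --end 20220530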

@@ -9,8 +9,12 @@
# Algorithm string constants
ALARM = "alarm"
AUDIT = "audit"
PROCESS_FAILURE = "process_failure"
PUPPET = "puppet"
DAEMON_FAILURES = "daemon_failures"
HEARTBEAT_LOSS = "heartbeat_loss"
MAINTENANCE_ERR = "maintenance_errors"
PROCESS_FAILURES = "process_failures"
PUPPET_ERRORS = "puppet_errors"
STATE_CHANGES = "state_changes"
SUBSTRING = "substring"
SWACT = "swact"
SWACT_ACTIVITY = "swact_activity"
SYSTEM_INFO = "system_info"
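
For orientation: each constant above is a value a plugin file may supply for
its 'algorithm' label, and is what the execution engine dispatches on. As an
illustrative example (not part of the diff), a plugin file containing only

    algorithm=swact_activity

selects algorithms.SWACT_ACTIVITY.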

tools/collector/debian-scripts/report/correlator.py (new executable file, 583 lines)
@@ -0,0 +1,583 @@
|
||||
########################################################################
|
||||
#
|
||||
# Copyright (c) 2022 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
########################################################################
|
||||
#
|
||||
# This file contains the Correlator class.
|
||||
# The Correlator class contains algorithms that search for failures.
|
||||
#
|
||||
# The Correlator class reads through all the output files created by
|
||||
# the plugins and detects failures. A summary of the failures and their
|
||||
# causes is printed to standard output and an output file is created
|
||||
# in the report directory.
|
||||
#
|
||||
# TODO: Modularize code and separate methods into their own files
|
||||
#
|
||||
########################################################################
|
||||
|
||||
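# Illustrative usage (a sketch, not part of the committed file): the
# execution engine points this class at the directory holding the plugin
# output files and consumes the four sorted lists it returns, roughly as
#
#     correlator = Correlator(plugin_output_dir)
#     failures, events, alarms, state_changes = correlator.run("all")
#
# where "all" (or a specific hostname) corresponds to the --hostname option.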
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Correlator:
|
||||
def __init__(self, plugin_output_dir):
|
||||
"""Constructor for the Correlator class
|
||||
|
||||
Parameters:
|
||||
plugin_output_dir (string): Path to directory with output files
|
||||
from plugins
|
||||
"""
|
||||
self.plugin_output_dir = plugin_output_dir
|
||||
|
||||
def run(self, hostname):
|
||||
"""Searches through the output files created by the plugins for
|
||||
failures and determines their causes, as well as extracts significant
|
||||
events and state changes
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
failures = []
|
||||
try:
|
||||
failures += self.uncontrolled_swact()
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
|
||||
try:
|
||||
failures += self.mtc_errors()
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
|
||||
events = []
|
||||
try:
|
||||
events += self.get_events(hostname)
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
|
||||
alarms = []
|
||||
try:
|
||||
alarms += self.get_alarms(hostname)
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
|
||||
state_changes = []
|
||||
try:
|
||||
state_changes += self.get_state_changes(hostname)
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
|
||||
return (sorted(failures), sorted(events), sorted(alarms),
|
||||
sorted(state_changes))
|
||||
|
||||
def uncontrolled_swact(self):
|
||||
"""Searches through the output file created by the swact activity
|
||||
plugin for uncontrolled swacts and determines their causes through
|
||||
other indicators, like the log "Neighbor (...) is now in the down"
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
data = []
|
||||
|
||||
# Variables to keep track of indicators for failure causes
|
||||
start_time = end_time = svc_failed = None
|
||||
ctrlr_down = None # Active controller that went down, causing swact
|
||||
ctrlr_svc_fail = None # Active controller where service failed
|
||||
ctrlr_link_down = None # Orig. active controller when link went down
|
||||
hb_loss = active_failed = go_active_failed = link_down = False
|
||||
|
||||
# Open output file from swact activity plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "swact_activity")
|
||||
|
||||
with open(file_path, "r") as swact_activity:
|
||||
for line in swact_activity:
|
||||
if "Uncontrolled swact" in line and not start_time:
|
||||
start_time = datetime.strptime(line[0:19],
|
||||
"%Y-%m-%dT%H:%M:%S")
|
||||
if ("Host from active to failed, Peer from standby to "
|
||||
"active" in line):
|
||||
link_down = True
|
||||
ctrlr_link_down = re.findall(
|
||||
r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) "
|
||||
"sm:", line)[0]
|
||||
elif (re.search("Neighbor (.+) is now in the down", line)
|
||||
and start_time and not ctrlr_down):
|
||||
ctrlr_down = re.findall(
|
||||
r"Neighbor \((.+)\) received event", line)[0]
|
||||
elif (re.search("Service (.+) is failed and has reached max "
|
||||
"failures", line) and not svc_failed):
|
||||
svc_failed = re.findall(
|
||||
r"Service \((.+)\) is failed", line)[0]
|
||||
ctrlr_svc_fail = re.findall(
|
||||
r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:",
|
||||
line)[0]
|
||||
elif (svc_failed and re.search(
|
||||
"active-failed\\s+\\| disabling-failed\\s+\\| "
|
||||
+ svc_failed, line)):
|
||||
if re.search(r"\| go-active-failed\s+\|", line):
|
||||
go_active_failed = True
|
||||
else:
|
||||
active_failed = True
|
||||
elif "Swact update" in line and start_time and not end_time:
|
||||
end_time = datetime.strptime(line[0:19],
|
||||
"%Y-%m-%dT%H:%M:%S")
|
||||
if ctrlr_down:
|
||||
try:
|
||||
hb_loss = self.search_hb_loss(
|
||||
start_time, end_time, ctrlr_down)
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
|
||||
start_time = start_time.strftime("%Y-%m-%dT%H:%M:%S")
|
||||
end_time = end_time.strftime("%Y-%m-%dT%H:%M:%S")
|
||||
if link_down:
|
||||
data.append(start_time + " to " + end_time
|
||||
+ " Uncontrolled swact, refer to SM logs "
|
||||
"for in-depth analysis, original active "
|
||||
"controller: " + ctrlr_link_down + "\n")
|
||||
elif ctrlr_down:
|
||||
if hb_loss:
|
||||
data.append(start_time + " to " + end_time
|
||||
+ " Uncontrolled swact due to "
|
||||
"spontaneous reset of active "
|
||||
"controller " + ctrlr_down + "\n")
|
||||
else:
|
||||
data.append(start_time + " to " + end_time
|
||||
+ " Uncontrolled swact likely due to "
|
||||
"spontaneous reset of active "
|
||||
"controller " + ctrlr_down + "\n")
|
||||
elif svc_failed:
|
||||
if active_failed and go_active_failed:
|
||||
data.append(start_time + " to " + end_time
|
||||
+ " Uncontrolled swact due to service "
|
||||
"failure (" + svc_failed + ") twice "
|
||||
"in 2 minutes was unsuccessful so "
|
||||
"\"bounced back\" to original active "
|
||||
"controller " + ctrlr_svc_fail + "\n")
|
||||
elif active_failed:
|
||||
data.append(start_time + " to " + end_time
|
||||
+ " Uncontrolled swact due to service "
|
||||
"failure (" + svc_failed + ") twice "
|
||||
"in 2 minutes on active controller "
|
||||
+ ctrlr_svc_fail + "\n")
|
||||
else:
|
||||
data.append(start_time + " to " + end_time
|
||||
+ " Uncontrolled swact likely due to "
|
||||
"service failure (" + svc_failed
|
||||
+ ") twice in 2 minutes on active "
|
||||
"controller " + ctrlr_svc_fail + "\n")
|
||||
|
||||
start_time = end_time = svc_failed = None
|
||||
ctrlr_down = ctrlr_svc_fail = ctrlr_link_down = None
|
||||
hb_loss = active_failed = go_active_failed = False
|
||||
link_down = False
|
||||
|
||||
return data
|
||||
|
||||
def mtc_errors(self):
|
||||
"""Searches through the output file created by the maintenance errors
|
||||
plugin for failures and determines their causes through other
|
||||
indicators, like the log "Loss Of Communication for 5 seconds"
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
data = []
|
||||
|
||||
# Variables to keep track of indicators for failure causes
|
||||
goenable_start = goenable_end = goenable_host = None
|
||||
goenable_tst_f = config_tst_f = None # Tests failed
|
||||
config_start = config_end = config_host = puppet_error = None
|
||||
hb_loss_start = hb_loss_end = hb_loss_host = None
|
||||
daemon_fail = comm_loss = auto_recov_dis = False
|
||||
|
||||
# Open output file from maintenance errors plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "maintenance_errors")
|
||||
|
||||
with open(file_path, "r") as mtc:
|
||||
for line in mtc:
|
||||
if "auto recovery disabled" in line and not auto_recov_dis:
|
||||
# Check if previous failure recorded was go-enable,
|
||||
# configuration or heartbeat failure
|
||||
if (data and
|
||||
re.search(r"Go-enable|[cC]onfiguration|Heartbeat",
|
||||
data[-1])):
|
||||
host = re.findall(r"failure on ([^\s]+)", data[-1])
|
||||
# Check if host in auto recovery disabled mode is same
|
||||
# as host with previous failure
|
||||
if (host and re.search(
|
||||
host[0] + " auto recovery disabled", line)):
|
||||
old = data[-1].split("due", 1)
|
||||
if len(old) == 1:
|
||||
data[-1] = (data[-1][:-1]
|
||||
+ " (auto recovery disabled)\n")
|
||||
else:
|
||||
data[-1] = (old[0]
|
||||
+ "(auto recovery disabled) due"
|
||||
+ old[1])
|
||||
auto_recov_dis = True
|
||||
elif "GOENABLED Failed" in line and not goenable_start:
|
||||
goenable_start, auto_recov_dis = line[0:19], False
|
||||
goenable_host = re.findall(
|
||||
"Error : (.+) got GOENABLED Failed", line)[0]
|
||||
elif ("configuration failed or incomplete" in line
|
||||
and not config_start):
|
||||
config_start = datetime.strptime(line[0:19],
|
||||
"%Y-%m-%dT%H:%M:%S")
|
||||
auto_recov_dis = False
|
||||
config_host = re.findall(
|
||||
"Error : (.+) configuration failed", line)[0]
|
||||
elif "Heartbeat Loss" in line:
|
||||
# Check if previous failure recorded was heartbeat loss
|
||||
# due to missing heartbeat messages
|
||||
if ("(during recovery soak)" in line and data and
|
||||
re.search("missing heartbeat messages", data[-1])):
|
||||
host = re.findall(
|
||||
"failure on (.+) due to", data[-1])[0]
|
||||
# Check if host with heartbeat loss failure is the same
|
||||
# as host with previous failure
|
||||
if (re.search(host + " (.+) Heartbeat Loss (.+) "
|
||||
"\\(during recovery soak\\)", line)):
|
||||
old = data[-1]
|
||||
data[-1] = (old[0:23] + line[0:19] + old[42:-1]
|
||||
+ " (recovery over disabled due to "
|
||||
"heartbeat soak failure)\n")
|
||||
else:
|
||||
hb_loss_start = line[0:19]
|
||||
comm_loss = auto_recov_dis = False
|
||||
hb_loss_host = re.findall("Error : (.+) [CM]", line)[0]
|
||||
# Check if previous failure recorded was heartbeat loss due to
|
||||
# missing heartbeat messages
|
||||
elif ("regained MTCALIVE from host that has rebooted" in line
|
||||
and data and re.search(r"Heartbeat loss failure (.+) "
|
||||
r"\(recovery over disabled\)",
|
||||
data[-1])):
|
||||
host = re.findall("failure on (.+) due to", data[-1])[0]
|
||||
if re.search(host + " regained MTCALIVE", line):
|
||||
old = data[-1].split("due", 1)[0]
|
||||
data[-1] = (old[0:23] + line[0:19] + old[42:]
|
||||
+ "due to uncontrolled reboot\n")
|
||||
elif (hb_loss_start and not comm_loss and hb_loss_host and
|
||||
re.search(hb_loss_host + " Loss Of Communication for 5 "
|
||||
"seconds", line)):
|
||||
comm_loss = True
|
||||
elif re.search("mtcClient --- (.+)Error : FAILED:", line):
|
||||
if goenable_start and not goenable_tst_f:
|
||||
goenable_tst_f = re.findall(
|
||||
r"Error : FAILED: (.+) \(\d", line)[0]
|
||||
elif config_start and not config_tst_f:
|
||||
config_tst_f = re.findall(
|
||||
r"Error : FAILED: (.+) \(\d", line)[0]
|
||||
elif (goenable_host and not goenable_end and
|
||||
re.search(goenable_host + " Task: In-Test Failure, "
|
||||
"threshold reached", line)):
|
||||
goenable_end = line[0:19]
|
||||
if goenable_tst_f:
|
||||
data.append(goenable_start + " to " + goenable_end
|
||||
+ " Go-enable test failure on "
|
||||
+ goenable_host + " due to failing of "
|
||||
+ goenable_tst_f + "\n")
|
||||
else:
|
||||
data.append(goenable_start + " to " + goenable_end
|
||||
+ " Go-enable test failure on "
|
||||
+ goenable_host + " due to unknown test "
|
||||
"failing\n")
|
||||
|
||||
goenable_start = goenable_end = goenable_host = None
|
||||
goenable_tst_f = None
|
||||
elif (config_host and not config_end and
|
||||
re.search(config_host + " Task: Configuration failure, "
|
||||
"threshold reached", line)):
|
||||
config_end = datetime.strptime(line[0:19],
|
||||
"%Y-%m-%dT%H:%M:%S")
|
||||
if (config_tst_f
|
||||
!= "/etc/goenabled.d/config_goenabled_check.sh"):
|
||||
try:
|
||||
daemon_fail = self.search_daemon_fail(
|
||||
config_start, config_end, config_host)
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
|
||||
if (config_tst_f ==
|
||||
"/etc/goenabled.d/config_goenabled_check.sh"
|
||||
or daemon_fail):
|
||||
try:
|
||||
puppet_error = self.search_puppet_error(
|
||||
config_start, config_end)
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
|
||||
config_start = config_start.strftime(
|
||||
"%Y-%m-%dT%H:%M:%S")
|
||||
config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S")
|
||||
if puppet_error:
|
||||
data.append(config_start + " to " + config_end
|
||||
+ " Configuration failure on "
|
||||
+ config_host + " due to:\n"
|
||||
+ puppet_error)
|
||||
else:
|
||||
data.append(config_start + " to " + config_end
|
||||
+ " Configuration failure on "
|
||||
+ config_host
|
||||
+ " due to unknown cause\n")
|
||||
else:
|
||||
config_start = config_start.strftime(
|
||||
"%Y-%m-%dT%H:%M:%S")
|
||||
config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S")
|
||||
data.append(config_start + " to " + config_end
|
||||
+ " Possible configuration failure on "
|
||||
+ config_host + "\n")
|
||||
|
||||
config_start = config_end = config_host = None
|
||||
config_tst_f = puppet_error = None
|
||||
daemon_fail = False
|
||||
elif (hb_loss_start and not hb_loss_end and hb_loss_host and
|
||||
re.search(hb_loss_host + " Connectivity Recovered ",
|
||||
line)):
|
||||
hb_loss_end = line[0:19]
|
||||
data.append(hb_loss_start + " to " + hb_loss_end
|
||||
+ " Heartbeat loss failure on " + hb_loss_host
|
||||
+ " due to too many missing heartbeat "
|
||||
"messages\n")
|
||||
|
||||
hb_loss_start = hb_loss_end = hb_loss_host = None
|
||||
comm_loss = False
|
||||
elif (hb_loss_start and comm_loss and not hb_loss_end and
|
||||
hb_loss_host and re.search(
|
||||
hb_loss_host + " Graceful Recovery Wait", line)):
|
||||
hb_loss_end = line[0:19]
|
||||
data.append(hb_loss_start + " to " + hb_loss_end
|
||||
+ " Heartbeat loss failure on " + hb_loss_host
|
||||
+ " due to too many missing heartbeat "
|
||||
"messages (recovery over disabled)\n")
|
||||
|
||||
hb_loss_start = hb_loss_end = hb_loss_host = None
|
||||
comm_loss = False
|
||||
|
||||
return data
|
||||
|
||||
def search_hb_loss(self, start_time, end_time, host):
|
||||
"""Searches through the output file created by the heartbeat loss
|
||||
plugin for "Heartbeat Loss" message from host between one minute before
|
||||
start_time and end_time
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
hb_loss = False
|
||||
|
||||
# Open output file from heartbeat loss plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "heartbeat_loss")
|
||||
|
||||
with open(file_path, "r") as heartbeat_loss:
|
||||
for line in heartbeat_loss:
|
||||
if (re.search("Error : " + host + " (.+) Heartbeat Loss ",
|
||||
line)):
|
||||
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
if (date >= start_time - timedelta(minutes=1)
|
||||
and date <= end_time):
|
||||
hb_loss = True
|
||||
break
|
||||
|
||||
return hb_loss
|
||||
|
||||
def search_daemon_fail(self, start_time, end_time, host):
|
||||
"""Searches through the output file created by the daemon failures
|
||||
plugin for "Failed to run the puppet manifest" message from host
|
||||
between 10 seconds before start_time and end_time
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
daemon_fail = False
|
||||
|
||||
# Open output file from daemon failures plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "daemon_failures")
|
||||
|
||||
with open(file_path, "r") as daemon_failures:
|
||||
for line in daemon_failures:
|
||||
if (re.search("\\d " + host
|
||||
+ " (.+) Failed to run the puppet manifest",
|
||||
line)):
|
||||
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
if (date >= start_time - timedelta(seconds=10)
|
||||
and date <= end_time):
|
||||
daemon_fail = True
|
||||
break
|
||||
|
||||
return daemon_fail
|
||||
|
||||
def search_puppet_error(self, start_time, end_time):
|
||||
"""Searches through the output file created by the puppet errors
|
||||
plugin for "Error:" message between 10 seconds before start_time and
|
||||
end_time and returns it
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
puppet_log = None
|
||||
|
||||
# Open output file from puppet errors plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "puppet_errors")
|
||||
|
||||
with open(file_path, "r") as puppet_errors:
|
||||
for line in puppet_errors:
|
||||
if "Error: " in line:
|
||||
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
if (date >= start_time - timedelta(seconds=10)
|
||||
and date <= end_time):
|
||||
puppet_log = line
|
||||
break
|
||||
|
||||
return puppet_log
|
||||
|
||||
def get_events(self, hostname):
|
||||
"""Searches through the output files created by the plugins for
|
||||
significant events and summarizes them, such as "force failed by SM"
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
data = []
|
||||
|
||||
# Variables to keep track of details for events
|
||||
mnfa_start, mnfa_hist = None, ""
|
||||
|
||||
# Open output file from maintenance errors plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "maintenance_errors")
|
||||
|
||||
with open(file_path, "r") as mtc:
|
||||
for line in mtc:
|
||||
if "force failed by SM" in line:
|
||||
host = re.findall("Error : (.+) is being", line)[0]
|
||||
if hostname == "all" or host == hostname:
|
||||
data.append(line[0:19] + " " + host
|
||||
+ " force failed by SM\n")
|
||||
elif "Graceful Recovery Failed" in line:
|
||||
host = re.findall("Info : (.+) Task:", line)[0]
|
||||
if hostname == "all" or host == hostname:
|
||||
data.append(line[0:19] + " " + host
|
||||
+ " graceful recovery failed\n")
|
||||
elif "MNFA ENTER" in line:
|
||||
mnfa_start = datetime.strptime(line[0:19],
|
||||
"%Y-%m-%dT%H:%M:%S")
|
||||
elif "MNFA POOL" in line:
|
||||
pool_hosts = len(line.split("MNFA POOL: ")[1].split())
|
||||
if mnfa_start:
|
||||
mnfa_hist += (" " + str(pool_hosts))
|
||||
else:
|
||||
data_len = len(data)
|
||||
for n in range(0, data_len):
|
||||
event = data[data_len - 1 - n]
|
||||
if "Multi-node failure" in event:
|
||||
temp = " " + str(pool_hosts) + ")\n"
|
||||
data[data_len - 1 - n] = event[:-2] + temp
|
||||
break
|
||||
elif "MNFA EXIT" in line:
|
||||
mnfa_duration = datetime.strptime(line[0:19],
|
||||
"%Y-%m-%dT%H:%M:%S")
|
||||
mnfa_duration -= mnfa_start
|
||||
mnfa_start = mnfa_start.strftime("%Y-%m-%dT%H:%M:%S")
|
||||
data.append(mnfa_start + " Multi-node failure avoidance "
|
||||
+ "(duration: " + str(mnfa_duration)
|
||||
+ "; history:" + mnfa_hist + ")\n")
|
||||
|
||||
mnfa_start, mnfa_hist = None, ""
|
||||
|
||||
# Open output file from swact activity plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "swact_activity")
|
||||
|
||||
with open(file_path, "r") as swact_activity:
|
||||
for line in swact_activity:
|
||||
if (re.search("Service (.+) is failed and has reached max "
|
||||
"failures", line)):
|
||||
host = re.findall(
|
||||
r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:",
|
||||
line)[0]
|
||||
svc_failed = re.findall(
|
||||
r"Service \((.+)\) is failed", line)[0]
|
||||
if hostname == "all" or host == hostname:
|
||||
data.append(line[0:19] + " " + host
|
||||
+ " service failure (" + svc_failed
|
||||
+ ")\n")
|
||||
|
||||
return data
|
||||
|
||||
def get_alarms(self, hostname):
|
||||
"""Searches through the 'alarm' output file created by the alarm plugin
|
||||
and summarizes which alarms were found as well as the number of times
|
||||
they were set and cleared
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
data = []
|
||||
|
||||
# Open 'alarm' output file from alarm plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "alarm")
|
||||
|
||||
with open(file_path, "r") as alarm:
|
||||
extract = False
|
||||
for line in alarm:
|
||||
if re.search(" \\d", line) and extract:
|
||||
if line.split()[2] == "set":
|
||||
data[-1]["set"] += 1
|
||||
else:
|
||||
data[-1]["clear"] += 1
|
||||
elif hostname == "all" or hostname in line:
|
||||
extract = True
|
||||
alarm = {
|
||||
"name": line[:-1],
|
||||
"set": 0,
|
||||
"clear": 0,
|
||||
}
|
||||
|
||||
data.append(alarm)
|
||||
else:
|
||||
extract = False
|
||||
|
||||
temp = []
|
||||
for entry in data:
|
||||
temp.append(entry["name"] + " - set: " + str(entry["set"])
|
||||
+ ", clear: " + str(entry["clear"]) + "\n")
|
||||
data = temp
|
||||
|
||||
return data
|
||||
|
||||
def get_state_changes(self, hostname):
|
||||
"""Searches through the output files created by the state changes
|
||||
plugin and summarizes the changes of state of the hosts, such as
|
||||
"is ENABLED"
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
data = []
|
||||
|
||||
# Open output file from state changes plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "state_changes")
|
||||
|
||||
with open(file_path, "r") as state_changes:
|
||||
for line in state_changes:
|
||||
if "is ENABLED" in line:
|
||||
host = re.findall("Info : (.+) is ENABLED", line)[0]
|
||||
state = re.findall("is (.+)\n", line)[0].lower()
|
||||
if hostname == "all" or hostname in host:
|
||||
data.append(line[0:19] + " " + host + " " + state
|
||||
+ "\n")
|
||||
elif "locked-disabled" in line:
|
||||
host = re.findall(
|
||||
"Info : (.+) u?n?locked-disabled", line)[0]
|
||||
if hostname == "all" or host == hostname:
|
||||
data.append(line[0:19] + " " + host + " disabled\n")
|
||||
|
||||
return data
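
For orientation, a failure summary line assembled by uncontrolled_swact()
above has this general shape (timestamps and hostname are illustrative):

    2022-05-27T19:36:05 to 2022-05-27T19:38:10 Uncontrolled swact due to spontaneous reset of active controller controller-0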

tools/collector/debian-scripts/report/execution_engine.py (new executable file, 869 lines)
@@ -0,0 +1,869 @@
|
||||
########################################################################
|
||||
#
|
||||
# Copyright (c) 2022 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
########################################################################
|
||||
#
|
||||
# This file contains the ExecutionEngine class.
|
||||
# The ExecutionEngine class contains all the available algorithms.
|
||||
#
|
||||
# The ExecutionEngine class runs plugins and gathers relevant logs and
|
||||
# information, creating output files in the report directory.
|
||||
#
|
||||
# TODO: Modularize code and separate plugin algorithms into their own
|
||||
# files
|
||||
#
|
||||
########################################################################
|
||||
|
||||
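# Illustrative usage (a sketch, not part of the committed file): report.py
# parses the command-line options and the plugin files, then drives the
# engine roughly as
#
#     engine = ExecutionEngine(opts, output_dir)
#     engine.execute(plugins, output_dir)
#
# where opts carries the directory, start, end, hostname and verbose
# settings referenced throughout this class.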
from datetime import datetime
|
||||
import gzip
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tarfile
|
||||
|
||||
import algorithms
|
||||
from correlator import Correlator
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExecutionEngine:
|
||||
def __init__(self, opts, output_directory):
|
||||
"""Constructor for the ExecutionEngine class
|
||||
|
||||
Parameters:
|
||||
opts (dictionary): Options from command line
|
||||
"""
|
||||
self.opts = opts
|
||||
self.hosts = {"controllers": {}, "workers": {}, "storages": {}}
|
||||
self.active_controller_directory = None
|
||||
|
||||
# Uncompresses host tar files if not already done
|
||||
with open(os.path.join(output_directory, "untar.log"), "a") as logfile:
|
||||
for obj in (os.scandir(self.opts.directory)):
|
||||
info = os.path.splitext(obj.name)
|
||||
if (obj.is_file() and obj.name != "report_tool.tgz" and
|
||||
tarfile.is_tarfile(obj.path) and not
|
||||
os.path.isdir(os.path.join(self.opts.directory,
|
||||
info[0]))):
|
||||
try:
|
||||
subprocess.run(["tar", "xzfC", obj.path,
|
||||
self.opts.directory],
|
||||
stderr=logfile, check=True)
|
||||
subprocess.run(["echo", "uncompressed", obj.name],
|
||||
check=True)
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(e)
|
||||
|
||||
for folder in (f.path for f in os.scandir(self.opts.directory)):
|
||||
database_path = os.path.join(folder, "var", "extra", "database")
|
||||
host_info_path = os.path.join(folder, "var", "extra", "host.info")
|
||||
|
||||
if os.path.isdir(database_path) and os.listdir(database_path):
|
||||
self.active_controller_directory = folder
|
||||
|
||||
if os.path.exists(host_info_path):
|
||||
hostname, subfunction = self._extract_subfunction(
|
||||
host_info_path)
|
||||
if "controller" in subfunction:
|
||||
self.hosts["controllers"][hostname] = folder
|
||||
elif "worker" in subfunction:
|
||||
self.hosts["workers"][hostname] = folder
|
||||
elif "storage" in subfunction:
|
||||
self.hosts["storages"][hostname] = folder
|
||||
|
||||
if not self.active_controller_directory:
|
||||
raise ValueError("Active controller not found")
|
||||
|
||||
def execute(self, plugins, output_directory):
|
||||
"""Run a list of plugins
|
||||
|
||||
Parameters:
|
||||
plugins (Plugin list): List of plugins to run
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
plugin_output_dir = os.path.join(output_directory, "plugins")
|
||||
os.makedirs(plugin_output_dir, exist_ok=True)
|
||||
|
||||
for plugin in plugins:
|
||||
processing = "Processing plugin: " + os.path.basename(plugin.file)
|
||||
hosts = {}
|
||||
if (
|
||||
plugin.state["hosts"] and len(plugin.state["hosts"]) >= 1
|
||||
): # if host list is given
|
||||
logger.info(
|
||||
f"Processing plugin: {os.path.basename(plugin.file)}")
|
||||
|
||||
for h in plugin.state["hosts"]:
|
||||
if h == "all":
|
||||
hosts.update(self.hosts["workers"])
|
||||
hosts.update(self.hosts["storages"])
|
||||
hosts.update(self.hosts["controllers"])
|
||||
else:
|
||||
hosts.update(self.hosts[h])
|
||||
|
||||
for hostname, folderpath in hosts.items():
|
||||
|
||||
events = []
|
||||
if plugin.state["algorithm"] == algorithms.SUBSTRING:
|
||||
events = self.substring(
|
||||
plugin.state["substring"],
|
||||
[
|
||||
os.path.join(folderpath, file)
|
||||
for file in plugin.state["files"]
|
||||
],
|
||||
)
|
||||
|
||||
# creating output file
|
||||
output_file = os.path.join(
|
||||
plugin_output_dir,
|
||||
f"substring_{hostname}",
|
||||
)
|
||||
if self.opts.verbose:
|
||||
logger.info("output at "
|
||||
+ os.path.relpath(output_file))
|
||||
with open(output_file, "w") as file:
|
||||
file.write(
|
||||
f"Date range: {self.opts.start} until "
|
||||
f"{self.opts.end}\n"
|
||||
)
|
||||
file.write(
|
||||
f"substrings: "
|
||||
f"{' '.join(plugin.state['substring'])}\n"
|
||||
)
|
||||
for line in events:
|
||||
if line[-1] == "\n":
|
||||
file.write(line)
|
||||
else:
|
||||
file.write(line + "\n")
|
||||
else:
|
||||
if plugin.state["algorithm"] == algorithms.SYSTEM_INFO:
|
||||
info = self.system_info()
|
||||
system_info_output = os.path.join(plugin_output_dir,
|
||||
"system_info")
|
||||
with open(system_info_output, "w") as file:
|
||||
for i in info:
|
||||
file.write(i + "\n")
|
||||
|
||||
for k, v in self.hosts.items():
|
||||
file.write(f"{k}: {','.join(v.keys())}\n")
|
||||
if self.opts.verbose:
|
||||
logger.info(processing + ", output at "
|
||||
+ os.path.relpath(system_info_output))
|
||||
else:
|
||||
logger.info(processing)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.AUDIT:
|
||||
hosts = {}
|
||||
hosts.update(self.hosts["workers"])
|
||||
hosts.update(self.hosts["storages"])
|
||||
hosts.update(self.hosts["controllers"])
|
||||
|
||||
for hostname, folderpath in hosts.items():
|
||||
self._create_output_file(
|
||||
f"{hostname}_audit",
|
||||
plugin_output_dir,
|
||||
self.audit(
|
||||
plugin.state["start"],
|
||||
plugin.state["end"],
|
||||
os.path.join(
|
||||
folderpath, "var", "log", "dcmanager",
|
||||
"audit.log"
|
||||
),
|
||||
),
|
||||
processing,
|
||||
)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.SWACT_ACTIVITY:
|
||||
self._create_output_file(
|
||||
"swact_activity", plugin_output_dir,
|
||||
self.swact_activity(), processing
|
||||
)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.PUPPET_ERRORS:
|
||||
self._create_output_file(
|
||||
"puppet_errors", plugin_output_dir,
|
||||
self.puppet_errors(), processing
|
||||
)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURES:
|
||||
self._create_output_file(
|
||||
"process_failures", plugin_output_dir,
|
||||
self.process_failures(), processing
|
||||
)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.ALARM:
|
||||
alarms, logs = self.alarm(
|
||||
plugin.state["alarm_exclude"],
|
||||
plugin.state["entity_exclude"]
|
||||
)
|
||||
alarm_output = os.path.join(plugin_output_dir, "alarm")
|
||||
log_output = os.path.join(plugin_output_dir, "log")
|
||||
|
||||
# creating output alarm file
|
||||
with open(alarm_output, "w") as file:
|
||||
for k, v in alarms.items():
|
||||
file.write(f"{k}:\n")
|
||||
for date in v["dates"]:
|
||||
file.write(f" {date}\n")
|
||||
|
||||
# creating output log file
|
||||
with open(log_output, "w") as file:
|
||||
for k, v in logs.items():
|
||||
file.write(f"{k}: {v['count']}\n")
|
||||
file.write("\n")
|
||||
for k, v in logs.items():
|
||||
file.write(f"{k}:\n")
|
||||
for date in v["dates"]:
|
||||
file.write(f" {date}\n")
|
||||
if self.opts.verbose:
|
||||
logger.info(processing + ", output at "
|
||||
+ os.path.relpath(alarm_output)
|
||||
+ ", " + os.path.relpath(log_output))
|
||||
else:
|
||||
logger.info(processing)
|
||||
elif plugin.state["algorithm"] == algorithms.HEARTBEAT_LOSS:
|
||||
self._create_output_file(
|
||||
"heartbeat_loss", plugin_output_dir,
|
||||
self.heartbeat_loss(), processing
|
||||
)
|
||||
elif plugin.state["algorithm"] == algorithms.MAINTENANCE_ERR:
|
||||
self._create_output_file(
|
||||
"maintenance_errors", plugin_output_dir,
|
||||
self.maintenance_errors(), processing
|
||||
)
|
||||
elif plugin.state["algorithm"] == algorithms.DAEMON_FAILURES:
|
||||
self._create_output_file(
|
||||
"daemon_failures", plugin_output_dir,
|
||||
self.daemon_failures(), processing
|
||||
)
|
||||
elif plugin.state["algorithm"] == algorithms.STATE_CHANGES:
|
||||
self._create_output_file(
|
||||
"state_changes", plugin_output_dir,
|
||||
self.state_changes(), processing
|
||||
)
|
||||
|
||||
if not self.opts.verbose:
|
||||
logger.info("Output files for plugins can be found at " +
|
||||
os.path.relpath(plugin_output_dir))
|
||||
|
||||
# Running the correlator and printing the output from it
|
||||
self.run_correlator(output_directory, plugin_output_dir)
|
||||
|
||||
# Built-in algorithms ------------------------------
|
||||
def alarm(self, alarm_exclude=[], entity_exclude=[]):
|
||||
"""Alarm algorithm
|
||||
Gathers list of alarms and customer logs
|
||||
|
||||
Parameters:
|
||||
alarm_exclude (string list) : List of alarm id patterns to not
|
||||
search for
|
||||
entity_exclude (string list): List of entity id patterns to not
|
||||
search for
|
||||
"""
|
||||
alarm_data = {}
|
||||
log_data = {}
|
||||
|
||||
with open(
|
||||
os.path.join(
|
||||
self.active_controller_directory,
|
||||
"var", "extra", "database", "fm.db.sql.txt"
|
||||
)
|
||||
) as file:
|
||||
start = False
|
||||
for line in file:
|
||||
# start of event log
|
||||
if re.search(r"COPY (public\.)?event_log", line):
|
||||
start = True
|
||||
elif start and line == "\\.\n":
|
||||
break
|
||||
elif start:
|
||||
entry = re.split(r"\t", line)
|
||||
|
||||
INDEX_ALARM_ID = 5
|
||||
INDEX_ACTION = 6
|
||||
INDEX_ENTITY_ID = 8
|
||||
INDEX_ALARM_DATE = 9
|
||||
INDEX_SEVERITY = 10
|
||||
|
||||
alarm_id = entry[INDEX_ALARM_ID]
|
||||
entity_id = entry[INDEX_ENTITY_ID]
|
||||
action = entry[INDEX_ACTION]
|
||||
severity = entry[INDEX_SEVERITY]
|
||||
alarm_date = entry[INDEX_ALARM_DATE]
|
||||
|
||||
entry_date = alarm_date.replace(
|
||||
" ", "T"
|
||||
) # making time format of alarm the same
|
||||
if (self.opts.start <= entry_date
|
||||
and entry_date <= self.opts.end):
|
||||
cont = True
|
||||
# Checks if the alarm is in the user specified list of
|
||||
# alarm or entity ids
|
||||
for id in alarm_exclude:
|
||||
if id in alarm_id:
|
||||
cont = False
|
||||
break
|
||||
|
||||
for entity in entity_exclude:
|
||||
if entity in entity_id:
|
||||
cont = False
|
||||
break
|
||||
|
||||
if not cont:
|
||||
continue
|
||||
|
||||
try:
|
||||
if action == "log":
|
||||
log_info = log_data[
|
||||
f"{alarm_id} {entity_id} {severity}"
|
||||
]
|
||||
log_info["count"] += 1
|
||||
log_info["dates"].append(alarm_date)
|
||||
else:
|
||||
alarm_info = alarm_data[
|
||||
f"{alarm_id} {entity_id} {severity}"
|
||||
]
|
||||
alarm_info["dates"].append(
|
||||
f"{alarm_date} {action}")
|
||||
except KeyError:
|
||||
if entry[6] != "log":
|
||||
alarm_data[
|
||||
f"{alarm_id} {entity_id} {severity}"
|
||||
] = {
|
||||
"dates": [f"{alarm_date} {action}"],
|
||||
}
|
||||
else:
|
||||
log_data[
|
||||
f"{alarm_id} {entity_id} {severity}"
|
||||
] = {
|
||||
"count": 1,
|
||||
"dates": [alarm_date],
|
||||
}
|
||||
|
||||
for _, v in alarm_data.items():
|
||||
v["dates"] = sorted(v["dates"])
|
||||
temp = []
|
||||
temp.append(v["dates"][0])
|
||||
for i in range(1, len(v["dates"])):
|
||||
if v["dates"][i].split()[2] != v["dates"][i-1].split()[2]:
|
||||
temp.append(v["dates"][i])
|
||||
v["dates"] = temp
|
||||
|
||||
for _, v in log_data.items():
|
||||
v["dates"] = sorted(v["dates"])
|
||||
|
||||
return alarm_data, log_data
|
||||
|
||||
def substring(self, substr, files):
|
||||
"""Substring algorithm
|
||||
Looks for substrings within files
|
||||
|
||||
Parameters:
|
||||
substr (string list): List of substrings to look for
|
||||
files (string list): List of absolute filepaths to search in
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
# don't analyze older files, continue with current file
|
||||
CONTINUE_CURRENT = 0
|
||||
# analyze older files, continue with current file
|
||||
CONTINUE_CURRENT_OLD = 1
|
||||
|
||||
data = []
|
||||
for file in files:
|
||||
try:
|
||||
if not os.path.exists(file):
|
||||
if (re.search("controller-1_(.+)/var/log/mtcAgent.log",
|
||||
file)):
|
||||
continue
|
||||
raise FileNotFoundError(f"File not found: {file}")
|
||||
cont = True
|
||||
# Searching through file
|
||||
command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """
|
||||
f"""{file} 2>/dev/null""")
|
||||
status = self._continue(file)
|
||||
|
||||
if (status == CONTINUE_CURRENT
|
||||
or status == CONTINUE_CURRENT_OLD):
|
||||
# continue with current file
|
||||
if status == CONTINUE_CURRENT:
|
||||
cont = False
|
||||
self._evaluate_substring(data, command)
|
||||
|
||||
# Searching through rotated log files that aren't compressed
|
||||
n = 1
|
||||
while os.path.exists(f"{file}.{n}") and cont:
|
||||
command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """
|
||||
f"""{file}.{n} 2>/dev/null""")
|
||||
status = self._continue(f"{file}.{n}")
|
||||
|
||||
if (status == CONTINUE_CURRENT
|
||||
or status == CONTINUE_CURRENT_OLD):
|
||||
if status == CONTINUE_CURRENT:
|
||||
cont = False
|
||||
self._evaluate_substring(data, command)
|
||||
|
||||
n += 1
|
||||
|
||||
# Searching through rotated log files
|
||||
while os.path.exists(f"{file}.{n}.gz") and cont:
|
||||
command = (f"""zgrep -E "{'|'.join(s for s in substr)}" """
|
||||
f"""{file}.{n}.gz 2>/dev/null""")
|
||||
status = self._continue(f"{file}.{n}.gz", compressed=True)
|
||||
|
||||
if (status == CONTINUE_CURRENT
|
||||
or status == CONTINUE_CURRENT_OLD):
|
||||
if status == CONTINUE_CURRENT:
|
||||
cont = False
|
||||
self._evaluate_substring(data, command)
|
||||
|
||||
n += 1
|
||||
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
continue
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def system_info(self):
|
||||
"""System info algorithm
|
||||
Presents basic information about the system
|
||||
"""
|
||||
data = []
|
||||
with open(
|
||||
os.path.join(
|
||||
self.active_controller_directory, "etc", "platform",
|
||||
"platform.conf"
|
||||
)
|
||||
) as file:
|
||||
for line in file:
|
||||
if "system_mode" in line:
|
||||
data.append(
|
||||
f"System Mode: "
|
||||
f"{re.match('^system_mode=(.*)', line).group(1)}"
|
||||
)
|
||||
elif "system_type" in line:
|
||||
data.append(
|
||||
f"System Type: "
|
||||
f"{re.match('^system_type=(.*)', line).group(1)}"
|
||||
)
|
||||
elif "distributed_cloud_role" in line:
|
||||
role = re.match('^distributed_cloud_role=(.*)',
|
||||
line).group(1)
|
||||
data.append(f"Distributed cloud role: {role}")
|
||||
elif "sw_version" in line:
|
||||
data.append(
|
||||
f"SW Version: "
|
||||
f"{re.match('^sw_version=(.*)', line).group(1)}"
|
||||
)
|
||||
with open(
|
||||
os.path.join(self.active_controller_directory, "etc", "build.info")
|
||||
) as file:
|
||||
for line in file:
|
||||
if "BUILD_TYPE" in line:
|
||||
data.append(
|
||||
f"Build Type: "
|
||||
f"{re.match('^BUILD_TYPE=(.*)', line).group(1)}"
|
||||
)
|
||||
elif re.match("^OS=(.*)", line):
|
||||
data.append(f"OS: {re.match('^OS=(.*)', line).group(1)}")
|
||||
|
||||
return data
|
||||
|
||||
def swact_activity(self):
|
||||
"""Swact activity algorithm
|
||||
Presents all swacting activity in the system
|
||||
"""
|
||||
data = []
|
||||
sm_files = []
|
||||
sm_customer_files = []
|
||||
swact_start = None
|
||||
swact_in_progress = False
|
||||
swact_end = None
|
||||
|
||||
for _, folder in self.hosts["controllers"].items():
|
||||
sm_path = os.path.join(folder, "var", "log", "sm.log")
|
||||
sm_files.append(sm_path)
|
||||
sm_customer_path = os.path.join(folder, "var", "log",
|
||||
"sm-customer.log")
|
||||
sm_customer_files.append(sm_customer_path)
|
||||
|
||||
sm_substrings = ["Uncontrolled swact", "Swact has started,",
|
||||
"Neighbor (.+) is now in the down",
|
||||
"Service (.+) has reached max failures",
|
||||
"Swact update"]
|
||||
data = self.substring(sm_substrings, sm_files)
|
||||
|
||||
for i, line in enumerate(data):
|
||||
if "Swact has started," in line and not swact_in_progress:
|
||||
swact_in_progress = True
|
||||
swact_start = datetime.strptime(line[0:19],
|
||||
"%Y-%m-%dT%H:%M:%S")
|
||||
elif "Swact update" in line and swact_in_progress:
|
||||
swact_in_progress = False
|
||||
swact_end = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
line += f" SWACT TOOK {swact_end - swact_start} \n"
|
||||
data[i] = line
|
||||
|
||||
sm_customer_substrings = [
|
||||
"swact", "active-failed\\s+\\| disabling-failed\\s+\\|"
|
||||
]
|
||||
data += self.substring(sm_customer_substrings, sm_customer_files)
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def puppet_errors(self):
|
||||
"""Puppet errors algorithm
|
||||
Presents log errors from puppet logs
|
||||
"""
|
||||
data = []
|
||||
for host_type in self.hosts.keys():
|
||||
for _, folder in self.hosts[host_type].items():
|
||||
puppet_folder = os.path.join(folder, "var", "log", "puppet")
|
||||
command = (f"""grep -rh "[m ]Error: " {puppet_folder} """
|
||||
f"""2>/dev/null""")
|
||||
self._evaluate_substring(data, command)
|
||||
return sorted(data)
|
||||
|
||||
def process_failures(self):
|
||||
"""Process failures algorithm
|
||||
Presents log errors from pmond
|
||||
"""
|
||||
data = []
|
||||
files = []
|
||||
for host_type in self.hosts.keys():
|
||||
for _, folder in self.hosts[host_type].items():
|
||||
pmond = os.path.join(folder, "var", "log", "pmond.log")
|
||||
files.append(pmond)
|
||||
|
||||
data = self.substring(["Error :"], files)
|
||||
|
||||
return data
|
||||
|
||||
def heartbeat_loss(self):
|
||||
"""Heartbeat loss algorithm
|
||||
Presents all heartbeat loss error messages in the system
|
||||
"""
|
||||
data = []
|
||||
hb_files = []
|
||||
|
||||
for _, folder in self.hosts["controllers"].items():
|
||||
hb_path = os.path.join(folder, "var", "log", "hbsAgent.log")
|
||||
hb_files.append(hb_path)
|
||||
|
||||
hb_substrings = ["Heartbeat Loss"]
|
||||
data = self.substring(hb_substrings, hb_files)
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def maintenance_errors(self):
|
||||
"""Maintenance errors algorithm
|
||||
Presents maintenance errors and other relevant log messages in system
|
||||
"""
|
||||
data = []
|
||||
mtc_files = []
|
||||
|
||||
for _, folder in self.hosts["controllers"].items():
|
||||
agent = os.path.join(folder, "var", "log", "mtcAgent.log")
|
||||
mtc_files.append(agent)
|
||||
|
||||
for host_type in self.hosts.keys():
|
||||
for _, folder in self.hosts[host_type].items():
|
||||
client = os.path.join(folder, "var", "log", "mtcClient.log")
|
||||
mtc_files.append(client)
|
||||
|
||||
mtc_substrings = ["Error : ", "Configuration failure",
|
||||
"In-Test Failure", "Loss Of Communication",
|
||||
"Graceful Recovery Wait ",
|
||||
"regained MTCALIVE from host that has rebooted",
|
||||
"Connectivity Recovered ; ",
|
||||
"auto recovery disabled", "Graceful Recovery Failed",
|
||||
"MNFA ENTER", "MNFA EXIT", "MNFA POOL"]
|
||||
data = self.substring(mtc_substrings, mtc_files)
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def daemon_failures(self):
|
||||
"""Daemon failures algorithm
|
||||
Presents all failed puppet manifest messages in the system
|
||||
"""
|
||||
data = []
|
||||
daemon_files = []
|
||||
|
||||
for host_type in self.hosts.keys():
|
||||
for _, folder in self.hosts[host_type].items():
|
||||
daemon_path = os.path.join(folder, "var", "log", "daemon.log")
|
||||
daemon_files.append(daemon_path)
|
||||
|
||||
daemon_substrings = ["Failed to run the puppet manifest"]
|
||||
data = self.substring(daemon_substrings, daemon_files)
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def state_changes(self):
|
||||
"""State changes algorithm
|
||||
Presents all messages in the system regarding the state of hosts
|
||||
"""
|
||||
data = []
|
||||
sc_files = []
|
||||
|
||||
for _, folder in self.hosts["controllers"].items():
|
||||
sc_path = os.path.join(folder, "var", "log", "mtcAgent.log")
|
||||
sc_files.append(sc_path)
|
||||
|
||||
sc_substrings = ["is ENABLED", "allStateChange (.+)locked-disabled"]
|
||||
data = self.substring(sc_substrings, sc_files)
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def audit(self, start, end, audit_log_path):
|
||||
"""Counts audit events in dcmanager within a specified date range
|
||||
|
||||
Parameters:
|
||||
start (string) : start date in YYYY-MM-DD HH:MM:SS format
|
||||
end (string) : end date in YYYY-MM-DD HH:MM:SS format
|
||||
audit_log_path (string) : absolute path of audit log file
|
||||
"""
|
||||
if not shutil.which("lnav"):
|
||||
raise ValueError("Lnav program not found")
|
||||
|
||||
SECONDS_PER_HOUR = 3600
|
||||
fmt = "%Y-%m-%d %H:%M:%S"
|
||||
|
||||
d1 = datetime.strptime(start, fmt)
|
||||
d2 = datetime.strptime(end, fmt)
|
||||
seconds = (d2 - d1).total_seconds()
|
||||
|
||||
log_texts = [
|
||||
"Triggered subcloud audit%",
|
||||
"Trigger patch audit%",
|
||||
"Trigger load audit%",
|
||||
"Triggered firmware audit%",
|
||||
"Triggered kubernetes audit%",
|
||||
# Counts sum of audits from all subclouds
|
||||
]
|
||||
INDEX_MIDDLE_WORD = 1
|
||||
data = [("These rates and totals represent the sum of audits from "
|
||||
+ "all subclouds")]
|
||||
|
||||
def command(text):
|
||||
|
||||
return (
|
||||
f'lnav -R -n -c ";SELECT count(log_body) AS '
|
||||
f'{text.split(" ")[INDEX_MIDDLE_WORD]}_total from '
|
||||
f'openstack_log WHERE '
|
||||
f'(log_time > \\"{start}\\" AND not log_time > \\"{end}\\")'
|
||||
f' AND log_body like \\"{text}\\"" "{audit_log_path}"'
|
||||
)
|
||||
|
||||
for text in log_texts:
|
||||
p = subprocess.Popen(command(text), shell=True,
|
||||
stdout=subprocess.PIPE)
|
||||
for line in p.stdout:
|
||||
line = line.decode("utf-8").strip()
|
||||
if line.isnumeric():
|
||||
data.append(
|
||||
f"rate "
|
||||
f"{round((int(line)/seconds * SECONDS_PER_HOUR), 3)} "
|
||||
f"per hour. total: {line}"
|
||||
)
|
||||
else:
|
||||
data.append(line)
|
||||
return data
|
||||
|
||||
# -----------------------------------
|
||||
|
||||
def run_correlator(self, output_directory, plugin_output_dir):
|
||||
"""Runs the correlator and prints the results differently based on if
|
||||
the tool was run with or without the verbose option
|
||||
|
||||
Parameters:
|
||||
output_directory (string) : directory to place output files from
|
||||
correlator
|
||||
plugin_output_dir (string) : directory with output files from
|
||||
plugins
|
||||
"""
|
||||
correlator = Correlator(plugin_output_dir)
|
||||
failures, events, alarms, state_changes = correlator.run(
|
||||
self.opts.hostname)
|
||||
failures_len, events_len = len(failures), len(events)
|
||||
alarms_len, state_changes_len = len(alarms), len(state_changes)
|
||||
failures.append("\nTotal failures found: " + str(failures_len) + "\n")
|
||||
events.append("\nTotal events found: " + str(events_len) + "\n")
|
||||
alarms.append("\nTotal alarms found: " + str(alarms_len) + "\n")
|
||||
state_changes.append("\nTotal state changes found: "
|
||||
+ str(state_changes_len) + "\n")
|
||||
|
||||
logger.info("\nRunning correlator...")
|
||||
self._create_output_file("correlator_failures", output_directory,
|
||||
failures, "")
|
||||
self._create_output_file("correlator_events", output_directory,
|
||||
events, "")
|
||||
self._create_output_file("correlator_alarms", output_directory,
|
||||
alarms, "")
|
||||
self._create_output_file("correlator_state_changes", output_directory,
|
||||
state_changes, "")
|
||||
|
||||
if not self.opts.verbose:
|
||||
logger.info("Output can be found at "
|
||||
+ os.path.relpath(output_directory) + "\n")
|
||||
logger.info("Failures: " + str(failures_len))
|
||||
for f in failures[:-1]:
|
||||
if "Uncontrolled swact" in f:
|
||||
logger.info(f[0:19] + " "
|
||||
+ re.findall("active controller:? (.+)\n",
|
||||
f)[0] + " uncontrolled swact")
|
||||
elif "failure on" in f:
|
||||
host = re.findall(r"failure on ([^\s]+) ", f)[0]
|
||||
logger.info(f[0:19] + " " + host + " "
|
||||
+ re.findall("^(.+) failure on ",
|
||||
f[43:])[0].lower() + " failure")
|
||||
else:
|
||||
logger.info(f[:-1])
|
||||
if failures_len != 0:
|
||||
logger.info("\nEvents: " + str(events_len))
|
||||
else:
|
||||
logger.info("Events: " + str(events_len))
|
||||
logger.info("Alarms: " + str(alarms_len))
|
||||
logger.info("State Changes: " + str(state_changes_len))
|
||||
else:
|
||||
logger.info("\nFailures: " + str(failures_len))
|
||||
for f in failures[:-1]:
|
||||
logger.info(f[:-1])
|
||||
|
||||
# Dictionary to keep track of number of times events happens on
|
||||
# each host
|
||||
events_summ = {}
|
||||
for e in events[:-1]:
|
||||
k = e[20:-1].split(" (", 1)[0]
|
||||
if not events_summ.get(k):
|
||||
events_summ[k] = 1
|
||||
else:
|
||||
events_summ[k] += 1
|
||||
|
||||
if failures_len != 0:
|
||||
logger.info("\nEvents: " + str(events_len))
|
||||
else:
|
||||
logger.info("Events: " + str(events_len))
|
||||
for k, v in sorted(events_summ.items()):
|
||||
logger.info(k + ": " + str(v) + " time(s)")
|
||||
|
||||
if events_len != 0:
|
||||
logger.info("\nAlarms: " + str(alarms_len))
|
||||
else:
|
||||
logger.info("Alarms: " + str(alarms_len))
|
||||
logger.info("The full list of alarms can be found at "
|
||||
+ os.path.relpath(output_directory)
|
||||
+ "/correlator_alarms")
|
||||
|
||||
# Dictionary to keep track of number of times state changes
|
||||
# happens on each host
|
||||
state_changes_summ = {}
|
||||
for s in state_changes[:-1]:
|
||||
k = s[20:-1]
|
||||
if "enabled" in k:
|
||||
k = k.split("enabled", 1)[0] + "enabled"
|
||||
if not state_changes_summ.get(k):
|
||||
state_changes_summ[k] = 1
|
||||
else:
|
||||
state_changes_summ[k] += 1
|
||||
|
||||
if alarms_len != 0:
|
||||
logger.info("\nState Changes: " + str(state_changes_len))
|
||||
else:
|
||||
logger.info("State Changes: " + str(state_changes_len))
|
||||
for k, v in sorted(state_changes_summ.items()):
|
||||
logger.info(k + ": " + str(v) + " time(s)")
|
||||
|
||||
def _continue(self, file, compressed=False):
|
||||
# don't analyze older files, continue with current file
|
||||
CONTINUE_CURRENT = 0
|
||||
# analyze older files, continue with current file
|
||||
CONTINUE_CURRENT_OLD = 1
|
||||
# don't analyze current file, continue to older files
|
||||
CONTINUE_OLD = 2
|
||||
|
||||
# check date of first log event and compare with provided
|
||||
# start, end dates
|
||||
first = ""
|
||||
|
||||
if not compressed:
|
||||
with open(file) as f:
|
||||
line = f.readline()
|
||||
first = line[0:19]
|
||||
else:
|
||||
with gzip.open(file, "rb") as f:
|
||||
line = f.readline().decode("utf-8")
|
||||
first = line[0:19]
|
||||
try:
|
||||
datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
first = line[0:19]
|
||||
except ValueError:
|
||||
return CONTINUE_CURRENT_OLD
|
||||
|
||||
if first < self.opts.start:
|
||||
return CONTINUE_CURRENT
|
||||
elif first < self.opts.end and first > self.opts.start:
|
||||
return CONTINUE_CURRENT_OLD
|
||||
elif first > self.opts.end:
|
||||
return CONTINUE_OLD
|
||||
|
||||
def _evaluate_substring(self, data, command):
|
||||
p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
|
||||
for line in p.stdout:
|
||||
line = line.decode("utf-8")
|
||||
# different date locations for log events
|
||||
dates = [line[0:19], line[2:21]]
|
||||
for date in dates:
|
||||
try:
|
||||
datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
|
||||
if date > self.opts.start and date < self.opts.end:
|
||||
if line[0] == "|": # sm-customer.log edge case
|
||||
line = line[1:].strip()
|
||||
line = re.sub("\\s+", " ", line)
|
||||
data.append(line)
|
||||
break
|
||||
except ValueError:
|
||||
if date == dates[-1]:
|
||||
data.append(line)
|
||||
|
||||
def _extract_subfunction(self, host_info_path):
|
||||
GROUP_ONE = 1
|
||||
with open(host_info_path) as file:
|
||||
for line in file:
|
||||
hostname_match = re.match(
|
||||
r"\s*hostname =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line)
|
||||
subfunction_match = re.match(
|
||||
r"\s*subfunction =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line)
|
||||
if subfunction_match:
|
||||
subfunction = subfunction_match.group(GROUP_ONE)
|
||||
if hostname_match:
|
||||
hostname = hostname_match.group(GROUP_ONE)
|
||||
return hostname, subfunction
|
||||
|
||||
def _create_output_file(self, filename, directory, data, processing):
|
||||
with open(os.path.join(directory, filename), "w") as file:
|
||||
for i in data:
|
||||
if i[-1] == "\n":
|
||||
file.write(i)
|
||||
else:
|
||||
file.write(i + "\n")
|
||||
if self.opts.verbose:
|
||||
output = ("output at "
|
||||
+ os.path.relpath(os.path.join(directory, filename)))
|
||||
if processing == "":
|
||||
logger.info(output)
|
||||
else:
|
||||
logger.info(processing + ", " + output)
|
||||
elif processing != "":
|
||||
logger.info(processing)
|

@@ -40,8 +40,8 @@ class Plugin:
            "files": [],
            "hosts": [],
            "substring": [],
            "alarm_ids": [],
            "entity_ids": [],
            "alarm_exclude": [],
            "entity_exclude": [],
            "start": None,
            "end": None,
        }
@ -93,10 +93,11 @@ class Plugin:
|
||||
self.state["substring"].append(data[1])
|
||||
elif label == "hosts":
|
||||
self.state["hosts"] = value.replace(" ", "").split(",")
|
||||
elif label == "alarm_ids":
|
||||
self.state["alarm_ids"] = value.replace(" ", "").split(",")
|
||||
elif label == "entity_ids":
|
||||
self.state["entity_ids"] = value.replace(" ", "").split(",")
|
||||
elif label == "alarm_exclude":
|
||||
self.state["alarm_exclude"] = value.replace(" ", "").split(",")
|
||||
elif label == "entity_exclude":
|
||||
self.state["entity_exclude"] = value.replace(
|
||||
" ", "").split(",")
|
||||
elif label == "files":
|
||||
self.state["files"] = value.replace(" ", "").split(",")
|
||||
elif label == "start":
|
||||
@ -117,74 +118,77 @@ class Plugin:
|
||||
"""
|
||||
|
||||
plugin_name = os.path.basename(self.file)
|
||||
HOSTS_ERR = f"plugin: {plugin_name} should not have hosts specified"
|
||||
|
||||
if self.state["algorithm"] == algorithms.SUBSTRING:
|
||||
if len(self.state["files"]) == 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} needs files specified for substring algorithm"
|
||||
)
|
||||
if len(self.state["hosts"]) == 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} needs hosts specified for substring algorithm"
|
||||
)
|
||||
if len(self.state["substring"]) == 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} need substring specified for substring algorithm"
|
||||
)
|
||||
self.validate_state(plugin_name, "files")
|
||||
self.validate_state(plugin_name, "hosts")
|
||||
self.validate_state(plugin_name, "substring")
|
||||
elif self.state["algorithm"] == algorithms.ALARM:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
||||
)
|
||||
raise ValueError(HOSTS_ERR)
|
||||
elif self.state["algorithm"] == algorithms.SYSTEM_INFO:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.SWACT:
|
||||
raise ValueError(HOSTS_ERR)
|
||||
elif self.state["algorithm"] == algorithms.SWACT_ACTIVITY:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.PUPPET:
|
||||
raise ValueError(HOSTS_ERR)
|
||||
elif self.state["algorithm"] == algorithms.PUPPET_ERRORS:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.PROCESS_FAILURE:
|
||||
raise ValueError(HOSTS_ERR)
|
||||
elif self.state["algorithm"] == algorithms.PROCESS_FAILURES:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
||||
)
|
||||
raise ValueError(HOSTS_ERR)
|
||||
elif self.state["algorithm"] == algorithms.HEARTBEAT_LOSS:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(HOSTS_ERR)
|
||||
elif self.state["algorithm"] == algorithms.MAINTENANCE_ERR:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(HOSTS_ERR)
|
||||
elif self.state["algorithm"] == algorithms.DAEMON_FAILURES:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(HOSTS_ERR)
|
||||
elif self.state["algorithm"] == algorithms.STATE_CHANGES:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(HOSTS_ERR)
|
||||
elif self.state["algorithm"] == algorithms.AUDIT:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
||||
)
|
||||
raise ValueError(HOSTS_ERR)
|
||||
|
||||
try:
|
||||
datetime.strptime(self.state["start"], "%Y-%m-%d %H:%M:%S")
|
||||
except:
|
||||
raise ValueError(
|
||||
f"plugin : {plugin_name} needs a start time in YYYY-MM-DD HH:MM:SS format"
|
||||
f"plugin : {plugin_name} needs a start time in YYYY-MM-DD "
|
||||
f"HH:MM:SS format"
|
||||
)
|
||||
|
||||
try:
|
||||
datetime.strptime(self.state["end"], "%Y-%m-%d %H:%M:%S")
|
||||
except:
|
||||
raise ValueError(
|
||||
f"plugin : {plugin_name} needs an end time in YYYY-MM-DD HH:MM:SS format"
|
||||
f"plugin : {plugin_name} needs an end time in YYYY-MM-DD "
|
||||
f"HH:MM:SS format"
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} unknown algorithm {self.state['algorithm']}"
|
||||
f"plugin: {plugin_name} unknown algorithm "
|
||||
f"{self.state['algorithm']}"
|
||||
)
|
||||
|
||||
for host in self.state["hosts"]:
|
||||
if host not in ["controllers", "workers", "storages", "all"]:
|
||||
raise ValueError(
|
||||
f"host not recognized: '{host}', accepted hosts are 'controllers', 'workers', 'storages', 'all'"
|
||||
f"host not recognized: '{host}', accepted hosts are "
|
||||
f"'controllers', 'workers', 'storages', 'all'"
|
||||
)
|
||||
|
||||
def validate_state(self, plugin_name, key):
|
||||
if len(self.state[key]) == 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} needs {key} specified for "
|
||||
f"substring algorithm"
|
||||
)
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"{json.dumps(self.state)} File: {self.file}"
|
3
tools/collector/debian-scripts/report/plugins/alarm
Executable file
@ -0,0 +1,3 @@
|
||||
algorithm=alarm
|
||||
alarm_exclude=400., 800.
|
||||
entity_exclude=subsystem=vim
|
1
tools/collector/debian-scripts/report/plugins/daemon_failures
Executable file
@ -0,0 +1 @@
|
||||
algorithm=daemon_failures
|
1
tools/collector/debian-scripts/report/plugins/heartbeat_loss
Executable file
@ -0,0 +1 @@
|
||||
algorithm=heartbeat_loss
|
1
tools/collector/debian-scripts/report/plugins/maintenance_errors
Executable file
@ -0,0 +1 @@
|
||||
algorithm=maintenance_errors
|
1
tools/collector/debian-scripts/report/plugins/process_failures
Executable file
@ -0,0 +1 @@
|
||||
algorithm=process_failures
|
1
tools/collector/debian-scripts/report/plugins/puppet_errors
Executable file
@ -0,0 +1 @@
|
||||
algorithm=puppet_errors
|
@ -0,0 +1 @@
|
||||
algorithm=state_changes
|
5
tools/collector/debian-scripts/report/plugins/substring
Executable file
@ -0,0 +1,5 @@
|
||||
algorithm=substring
|
||||
files=var/log/mtcAgent.log, var/log/sm.log
|
||||
hosts=controllers
|
||||
substring=operation failed
|
||||
substring=Failed to send message
|
1
tools/collector/debian-scripts/report/plugins/swact_activity
Executable file
@ -0,0 +1 @@
|
||||
algorithm=swact_activity
|
1
tools/collector/debian-scripts/report/plugins/system_info
Executable file
@ -0,0 +1 @@
|
||||
algorithm=system_info
|
360
tools/collector/debian-scripts/report/report.py
Executable file
@ -0,0 +1,360 @@
|
||||
#!/usr/bin/env python3
|
||||
########################################################################
|
||||
#
|
||||
# Copyright (c) 2022 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
########################################################################
|
||||
#
|
||||
# Description: The Report tool is used to gather relevant log events
|
||||
# and information about the system from a collect bundle.
|
||||
#
|
||||
# The report tool allows user created plugins which decides relevance
|
||||
# for log events. Plugins contain an algorithm label which instructs the
|
||||
# tool what information to search and how to search for it.
|
||||
#
|
||||
# The report tool requires the collect bundle and host tarballs to be
|
||||
# untarred.
|
||||
#
|
||||
# The report tool reads user plugins from the report directory in the
# top level of the collect bundle, and outputs files containing
# relevant logs to this directory as well.
|
||||
#
|
||||
# Typical Usage:
|
||||
# command line functionality
|
||||
# ------------------------------- ----------------------------------
|
||||
# > report.py - Run all plugins in directory
|
||||
# > report.py [plugin ...] - Run only specified plugins
|
||||
# > report.py <algorithm> [labels] - Run algorithm with labels
|
||||
# > report.py --help - help message
|
||||
# > report.py <algorithm> --help - algorithm specific help
|
||||
#
|
||||
# See --help output for a complete list of full and abbreviated
|
||||
# command line options and examples of plugins.
|
||||
#
|
||||
# Refer to README file for more usage and output examples
|
||||
#######################################################################
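# Example of running one algorithm directly from the command line; the
# bundle path and search string below are placeholders only:
#
#   > report.py -d /path/to/collect/bundle substring \
#         --files var/log/sm.log --hosts controllers \
#         --substring "Swact has started"
#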
|
||||
|
||||
import argparse
|
||||
from cmath import log
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from datetime import timezone
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
from execution_engine import ExecutionEngine
|
||||
from plugin import Plugin
|
||||
|
||||
|
||||
now = datetime.now(timezone.utc)
|
||||
base_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
parent_dir = os.path.dirname(base_dir)
|
||||
default_path = os.path.dirname(parent_dir)
|
||||
plugins = []
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Log Event Reporter",
|
||||
epilog="Place plugins in 'plugins' directory found in 'report' directory "
|
||||
"at top level of collect bundle.\nOutput files will be placed in 'report' "
|
||||
"directory.\nThis tool will create a report.log and untar.log file along "
|
||||
"with other output files.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="Verbose output",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--start",
|
||||
default="20000101",
|
||||
help="Specify a start date in YYYYMMDD format for analysis "
|
||||
"(default:20000101)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-e",
|
||||
"--end",
|
||||
default=datetime.strftime(now + timedelta(days=1), "%Y%m%d"),
|
||||
help="Specify an end date in YYYYMMDD format for analysis "
|
||||
"(default: current date)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-p",
|
||||
"--plugin",
|
||||
default=None,
|
||||
nargs="*",
|
||||
help="Specify what plugins to run (default: runs every plugin in plugins "
|
||||
"folder)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
"--directory",
|
||||
default=default_path,
|
||||
help="Specify top level of collect bundle to analyze "
|
||||
"(default: two levels above tool directory)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hostname",
|
||||
default="all",
|
||||
help="Specify host for correlator to find significant events and state "
|
||||
"changes for (default: all hosts)",
|
||||
)
|
||||
subparsers = parser.add_subparsers(help="algorithms", dest="algorithm")
|
||||
|
||||
# substring algorithm arguments
|
||||
parser_substring = subparsers.add_parser(
|
||||
"substring",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="""Searches through specified files for lines containing specified
|
||||
substring. There will be an output file for each host of the host
|
||||
type specified.""",
|
||||
epilog="Plugin file example:\n"
|
||||
" algorithm=substring\n"
|
||||
" files=var/log/mtcAgent.log, var/log/sm.log\n"
|
||||
" hosts=controllers\n"
|
||||
" substring=operation failed\n"
|
||||
" substring=Failed to send message",
|
||||
)
|
||||
substring_required = parser_substring.add_argument_group("required arguments")
|
||||
substring_required.add_argument(
|
||||
"--files",
|
||||
required=True,
|
||||
nargs="+",
|
||||
help="Files to perform substring analysis on (required)",
|
||||
)
|
||||
substring_required.add_argument(
|
||||
"--substring", nargs="+", required=True,
|
||||
help="Substrings to search for (required)"
|
||||
)
|
||||
substring_required.add_argument(
|
||||
"--hosts",
|
||||
choices=["controllers", "workers", "storages", "all"],
|
||||
required=True,
|
||||
nargs="+",
|
||||
help="Host types to perform analysis on (required)",
|
||||
)
|
||||
|
||||
|
||||
# alarm algorithm arguments
|
||||
parser_alarm = subparsers.add_parser(
|
||||
"alarm",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Searches through fm.db.sql.txt for alarms and logs except for those "
|
||||
"specified. There are 2 output files: 'alarm', and 'log'",
|
||||
epilog="Plugin file example:\n"
|
||||
" algorithm=alarm\n"
|
||||
" alarm_exclude=400., 800.\n"
|
||||
" entity_exclude=subsystem=vim\n",
|
||||
)
|
||||
parser_alarm.add_argument(
|
||||
"--alarm_exclude",
|
||||
nargs="+",
|
||||
required=False,
|
||||
default=[],
|
||||
help="Alarm id patterns to not search for (not required)",
|
||||
)
|
||||
parser_alarm.add_argument(
|
||||
"--entity_exclude",
|
||||
nargs="+",
|
||||
required=False,
|
||||
default=[],
|
||||
help="Entity id patterns to not search for (not required)",
|
||||
)
|
||||
|
||||
# system info algorithm
|
||||
parser_system_info = subparsers.add_parser(
|
||||
"system_info",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents information about the system",
|
||||
epilog="Plugin file example:\n" " algorithm=system_info\n",
|
||||
)
|
||||
|
||||
# swact activity algorithm
|
||||
parser_swact_activity = subparsers.add_parser(
|
||||
"swact_activity",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents system swacting activity",
|
||||
epilog="Plugin file example:\n" " algorithm=swact_activity\n",
|
||||
)
|
||||
|
||||
# puppet errors algorithm
|
||||
parser_puppet_errors = subparsers.add_parser(
|
||||
"puppet_errors",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents any puppet errors",
|
||||
epilog="Plugin file example:\n" " algorithm=puppet_errors\n",
|
||||
)
|
||||
|
||||
# process failures algorithm
|
||||
parser_process_failures = subparsers.add_parser(
|
||||
"process_failures",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents any process failures from pmond.log",
|
||||
epilog="Plugin file example:\n" " algorithm=process_failures\n",
|
||||
)
|
||||
|
||||
# daemon failures algorithm
|
||||
parser_daemon_failures = subparsers.add_parser(
|
||||
"daemon_failures",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents any puppet manifest failures from daemon.log",
|
||||
epilog="Plugin file example:\n" " algorithm=daemon_failures\n",
|
||||
)
|
||||
|
||||
# heartbeat loss algorithm
|
||||
parser_heartbeat_loss = subparsers.add_parser(
|
||||
"heartbeat_loss",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents any heartbeat loss error messages from hbsAgent.log",
|
||||
epilog="Plugin file example:\n" " algorithm=heartbeat_loss\n",
|
||||
)
|
||||
|
||||
# maintenance errors algorithm
|
||||
parser_maintenance_errors = subparsers.add_parser(
|
||||
"maintenance_errors",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents errors and other relevant messages from mtcAgent.log and "
|
||||
"mtcClient.log",
|
||||
epilog="Plugin file example:\n" " algorithm=maintenance_errors\n",
|
||||
)
|
||||
|
||||
# state changes algorithm
|
||||
parser_state_changes = subparsers.add_parser(
|
||||
"state_changes",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents any messages from mtcAgent.log regarding the state of "
|
||||
"hosts, such as enabled/disabled",
|
||||
epilog="Plugin file example:\n" " algorithm=state_changes\n",
|
||||
)
|
||||
|
||||
# audit algorithm
|
||||
parser_audit = subparsers.add_parser(
|
||||
"audit",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents information about audit events in dcmanager.\n"
|
||||
"The rates and totals represents the sum of audits on all subclouds ",
|
||||
epilog="Plugin file example:\n"
|
||||
" algorithm=audit\n"
|
||||
" start=2022-06-01 10:00:00\n"
|
||||
" end=2022-06-01 04:00:00\n",
|
||||
)
|
||||
parser_audit.add_argument(
|
||||
"--start",
|
||||
required=False,
|
||||
default=datetime.strftime(now - timedelta(days=7), "%Y-%m-%d %H:%M:%S"),
|
||||
type=str,
|
||||
help="Specify a start date in YYYY-MM-DD HH:MM:SS format for analysis "
|
||||
"(not required, default: 1 week ago)"
|
||||
)
|
||||
parser_audit.add_argument(
|
||||
"--end",
|
||||
required=False,
|
||||
default=datetime.strftime(now, "%Y-%m-%d %H:%M:%S"),
|
||||
type=str,
|
||||
help="Specify an end date in YYYY-MM-DD HH:MM:SS format for analysis "
|
||||
"(not required, default: today)"
|
||||
)
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
args.start = datetime.strptime(args.start, "%Y%m%d").strftime(
|
||||
"%Y-%m-%dT%H:%M:%S")
|
||||
args.end = datetime.strptime(args.end, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
||||
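# output files are written under report/output/<bundle name> relative to
# the tool location; a trailing '/' on the supplied directory is handled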
if args.directory.endswith("/"):
|
||||
output_directory = os.path.join(
|
||||
default_path, "report", "output",
|
||||
os.path.basename(os.path.dirname(args.directory))
|
||||
)
|
||||
else:
|
||||
output_directory = os.path.join(
|
||||
default_path, "report", "output", os.path.basename(args.directory)
|
||||
)
|
||||
|
||||
# creating report log
|
||||
os.makedirs(output_directory, exist_ok=True)
|
||||
open(os.path.join(output_directory, "report.log"), "w").close()
|
||||
|
||||
# setting up logger
|
||||
formatter = logging.Formatter("%(message)s")
|
||||
logger = logging.getLogger()
|
||||
|
||||
logging.basicConfig(
|
||||
filename=os.path.join(output_directory, "report.log"),
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s: %(message)s",
|
||||
datefmt="%Y-%m-%dT%H:%M:%S",
|
||||
)
|
||||
logging.Formatter.converter = time.gmtime
|
||||
|
||||
ch = logging.StreamHandler()
|
||||
ch.setLevel(logging.INFO)
|
||||
ch.setFormatter(formatter)
|
||||
|
||||
logger.addHandler(ch)
|
||||
|
||||
if not os.path.isdir(args.directory):
|
||||
sys.exit("Top level of collect bundle given to analyze is not a directory")
|
||||
else:
|
||||
for obj in (os.scandir(args.directory)):
|
||||
info = os.path.splitext(obj.name)
|
||||
|
||||
# TODO: ask user which file to report on if more than one tarball in
|
||||
# directory
|
||||
# Check if a collect tarball is in the given directory and extract it
# if not already done
|
||||
if (obj.is_file() and info[1] == ".tar"):
|
||||
try:
|
||||
result = subprocess.check_output(["tar", "tf", obj.path],
|
||||
encoding="UTF-8")
|
||||
result = result.split("\n", 1)
|
||||
if not os.path.isdir(os.path.join(args.directory,
|
||||
os.path.dirname(result[0]))):
|
||||
subprocess.run(["tar", "xfC", obj.path, args.directory],
|
||||
check=True)
|
||||
subprocess.run(["echo", "extracted", obj.name], check=True)
|
||||
args.directory = os.path.join(args.directory,
|
||||
os.path.dirname(result[0]))
|
||||
break
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(e)
|
||||
|
||||
try:
|
||||
engine = ExecutionEngine(args, output_directory)
|
||||
except ValueError as e:
|
||||
logger.error(str(e))
|
||||
sys.exit("Confirm you are running the report tool on a collect bundle")
|
||||
|
||||
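# an algorithm named on the command line is run as a single plugin;
# otherwise plugins are loaded from the report/plugins directory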
if args.algorithm:
|
||||
plugins.append(Plugin(opts=vars(args)))
|
||||
else:
|
||||
if args.plugin:
|
||||
for p in args.plugin:
|
||||
path = os.path.join(default_path, "report", "plugins", p)
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
plugins.append(Plugin(path))
|
||||
except Exception as e:
|
||||
logger.error(str(e))
|
||||
|
||||
else:
|
||||
logger.warning(f"{p} plugin does not exist")
|
||||
else:
|
||||
path = os.path.join(default_path, "report", "plugins")
|
||||
if not os.path.exists(path):
|
||||
os.mkdir(path)
|
||||
logger.error("Plugins folder is empty")
|
||||
else:
|
||||
for file in os.listdir(path):
|
||||
try:
|
||||
plugins.append(Plugin(os.path.join(path, file)))
|
||||
except Exception as e:
|
||||
logger.error(str(e))
|
||||
|
||||
engine.execute(plugins, output_directory)
|
@ -3,3 +3,4 @@ etc/collect.d/* /etc/collect.d
|
||||
usr/local/sbin/* /usr/local/sbin
|
||||
usr/local/bin/collect /usr/local/bin
|
||||
usr/sbin/collect /usr/sbin
|
||||
/usr/local/bin/report/* /usr/local/bin/report
|
||||
|
@ -13,8 +13,10 @@ override_dh_auto_install:
|
||||
|
||||
install -m 755 -d $(SYSCONFDIR)/collect.d
|
||||
install -m 755 -d $(SYSCONFDIR)/collect
|
||||
install -m 755 -d $(SYSCONFDIR)/collect/plugins # Report Tool
|
||||
install -m 755 -d $(ROOT)/usr/local/sbin
|
||||
install -m 755 -d $(ROOT)/usr/local/bin
|
||||
install -m 755 -d $(ROOT)/usr/local/bin/report/tool # Report Tool
|
||||
install -m 755 -d $(SBINDIR)
|
||||
|
||||
install -m 755 -p collect $(ROOT)/usr/local/sbin/collect
|
||||
@ -26,6 +28,24 @@ override_dh_auto_install:
|
||||
install -m 755 -p expect_done $(ROOT)/usr/local/sbin/expect_done
|
||||
install -m 755 -p mariadb-cli.sh $(ROOT)/usr/local/sbin/mariadb-cli
|
||||
|
||||
# Report Tool
|
||||
install -m 755 -p report/report.py $(ROOT)/usr/local/bin/report/tool/report.py
|
||||
install -m 755 -p report/execution_engine.py $(ROOT)/usr/local/bin/report/tool/execution_engine.py
|
||||
install -m 755 -p report/algorithms.py $(ROOT)/usr/local/bin/report/tool/algorithms.py
|
||||
install -m 755 -p report/plugin.py $(ROOT)/usr/local/bin/report/tool/plugin.py
|
||||
install -m 755 -p report/correlator.py $(ROOT)/usr/local/bin/report/tool/correlator.py
|
||||
install -m 755 -p report/README $(ROOT)/usr/local/bin/report/tool/README
|
||||
install -m 755 -p report/plugins/alarm $(SYSCONFDIR)/collect/plugins/alarm
|
||||
install -m 755 -p report/plugins/daemon_failures $(SYSCONFDIR)/collect/plugins/daemon_failures
|
||||
install -m 755 -p report/plugins/heartbeat_loss $(SYSCONFDIR)/collect/plugins/heartbeat_loss
|
||||
install -m 755 -p report/plugins/maintenance_errors $(SYSCONFDIR)/collect/plugins/maintenance_errors
|
||||
install -m 755 -p report/plugins/process_failures $(SYSCONFDIR)/collect/plugins/process_failures
|
||||
install -m 755 -p report/plugins/puppet_errors $(SYSCONFDIR)/collect/plugins/puppet_errors
|
||||
install -m 755 -p report/plugins/state_changes $(SYSCONFDIR)/collect/plugins/state_changes
|
||||
install -m 755 -p report/plugins/substring $(SYSCONFDIR)/collect/plugins/substring
|
||||
install -m 755 -p report/plugins/swact_activity $(SYSCONFDIR)/collect/plugins/swact_activity
|
||||
install -m 755 -p report/plugins/system_info $(SYSCONFDIR)/collect/plugins/system_info
|
||||
|
||||
install -m 755 -p collect_sysinv.sh $(SYSCONFDIR)/collect.d/collect_sysinv
|
||||
install -m 755 -p collect_psqldb.sh $(SYSCONFDIR)/collect.d/collect_psqldb
|
||||
install -m 755 -p collect_mariadb.sh $(SYSCONFDIR)/collect.d/collect_mariadb
|
||||
|
@ -1,62 +0,0 @@
|
||||
Refer to report.py file header for a description of the tool
|
||||
|
||||
Example:
|
||||
|
||||
Consider the following collect bundle structure
|
||||
|
||||
SELECT_NODES_20220527.193605
|
||||
├── controller-0_20220527.193605
|
||||
│ ├── etc
|
||||
│ ├── root
|
||||
│ └── var
|
||||
├── controller-1_20220527.193605
|
||||
│ ├── etc
|
||||
│ ├── root
|
||||
│ └── var
|
||||
├── plugins (where the plugin files will be placed)
|
||||
│ ├── alarm_plugin_example
|
||||
│ └── substring_plugin_example
|
||||
├── report
|
||||
└── tool (where the tool will be placed)
|
||||
└── output (where the output files will be placed)
|
||||
|
||||
|
||||
> cat plugins/alarm_plugin_example
|
||||
|
||||
algorithm=alarm
|
||||
alarm_ids=400.,401.
|
||||
entity_ids = host=controller-0
|
||||
|
||||
> cat plugins/substring_plugin_example
|
||||
|
||||
algorithm=substring
|
||||
files=var/log/mtcAgent.log
|
||||
hosts=controllers
|
||||
substring=operation failed
|
||||
|
||||
> report/tool/report.py --start 20220501 --end 20220530
|
||||
|
||||
Running the command above will populate the report folder with output files.
|
||||
The tool also provides default values, more details are in 'report.py -h'.
|
||||
|
||||
The substring algorithm creates an output file for every host of the
|
||||
specified host type. The files will contain log events within the
|
||||
provided date range containing the substring 'operation failed'.
|
||||
|
||||
The alarm algorithm creates two output file: 'log' and 'alarm'
|
||||
'log' contains customer log messages created within the provided date range,
|
||||
and 'alarm' contains system alarms created within the provided date range.
|
||||
|
||||
For more detailed information about an algorithm use 'report.py <algorithm> -h'.
|
||||
|
||||
Here is the report directory after running the above command
|
||||
|
||||
report
|
||||
├── output
|
||||
│ └── 20220815.140008 (time in utc when tool was ran)
|
||||
│ ├── alarm
|
||||
│ ├── controller-0_substring_plugin_example_substring
|
||||
│ ├── controller-1_substring_plugin_example_substring
|
||||
│ ├── report.log (log file for report tool)
|
||||
│ └── log
|
||||
└── tool (where the report tool is)
|
@ -1,546 +0,0 @@
|
||||
########################################################################
|
||||
#
|
||||
# Copyright (c) 2022 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
########################################################################
|
||||
#
|
||||
# This file contains the ExecutionEngine class.
|
||||
# The ExecutionEngine class contains all the available algorithms.
|
||||
#
|
||||
# The ExecutionEngine class runs plugins and gathers relevant logs and
|
||||
# information, creating output files in the report directory.
|
||||
#
|
||||
########################################################################
|
||||
|
||||
from datetime import datetime
|
||||
import gzip
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
import algorithms
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExecutionEngine:
|
||||
def __init__(self, opts):
|
||||
"""Constructor for the ExecutionEngine class
|
||||
|
||||
Parameters:
|
||||
opts (dictionary): Options from command line
|
||||
"""
|
||||
self.opts = opts
|
||||
self.hosts = {"controllers": {}, "workers": {}, "storages": {}}
|
||||
self.active_controller_directory = None
|
||||
|
||||
for folder in (f.path for f in os.scandir(self.opts.directory)):
|
||||
database_path = os.path.join(folder, "var", "extra", "database")
|
||||
host_info_path = os.path.join(folder, "var", "extra", "host.info")
|
||||
|
||||
if os.path.isdir(database_path) and os.listdir(database_path):
|
||||
self.active_controller_directory = folder
|
||||
|
||||
if os.path.exists(host_info_path):
|
||||
hostname, subfunction = self._extract_subfunction(host_info_path)
|
||||
if "controller" in subfunction:
|
||||
self.hosts["controllers"][hostname] = folder
|
||||
elif "worker" in subfunction:
|
||||
self.hosts["workers"][hostname] = folder
|
||||
elif "storage" in subfunction:
|
||||
self.hosts["storages"][hostname] = folder
|
||||
|
||||
if not self.active_controller_directory:
|
||||
raise ValueError("Active controller not found")
|
||||
|
||||
def execute(self, plugins, output_directory):
|
||||
"""Run a list of plugins
|
||||
|
||||
Parameters:
|
||||
plugins (Plugin list): List of plugins to run
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
|
||||
for plugin in plugins:
|
||||
logger.info(f"Processing plugin: {os.path.basename(plugin.file)}")
|
||||
hosts = {}
|
||||
if (
|
||||
plugin.state["hosts"] and len(plugin.state["hosts"]) >= 1
|
||||
): # if host list is given
|
||||
for h in plugin.state["hosts"]:
|
||||
if h == "all":
|
||||
hosts.update(self.hosts["workers"])
|
||||
hosts.update(self.hosts["storages"])
|
||||
hosts.update(self.hosts["controllers"])
|
||||
else:
|
||||
hosts.update(self.hosts[h])
|
||||
|
||||
for hostname, folderpath in hosts.items():
|
||||
|
||||
events = []
|
||||
if plugin.state["algorithm"] == algorithms.SUBSTRING:
|
||||
try:
|
||||
events = self.substring(
|
||||
plugin.state["substring"],
|
||||
[
|
||||
os.path.join(folderpath, file)
|
||||
for file in plugin.state["files"]
|
||||
],
|
||||
)
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
continue
|
||||
|
||||
# creating output file
|
||||
output_file = os.path.join(
|
||||
output_directory,
|
||||
f"{hostname}_{os.path.basename(plugin.file)}_{plugin.state['algorithm']}",
|
||||
)
|
||||
logger.info("output at " + output_file)
|
||||
with open(output_file, "w") as file:
|
||||
file.write(
|
||||
f"Date range: {self.opts.start} until {self.opts.end}\n"
|
||||
)
|
||||
file.write(
|
||||
f"substrings: {' '.join(plugin.state['substring'])}\n"
|
||||
)
|
||||
for line in events:
|
||||
file.write(line + "\n")
|
||||
else:
|
||||
if plugin.state["algorithm"] == algorithms.SYSTEM_INFO:
|
||||
info = self.system_info()
|
||||
system_info_output = os.path.join(output_directory, "system_info")
|
||||
with open(system_info_output, "w") as file:
|
||||
for i in info:
|
||||
file.write(i + "\n")
|
||||
|
||||
for k, v in self.hosts.items():
|
||||
file.write(f"{k}: {','.join(v.keys())}\n")
|
||||
logger.info("output at " + system_info_output)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.AUDIT:
|
||||
hosts = {}
|
||||
hosts.update(self.hosts["workers"])
|
||||
hosts.update(self.hosts["storages"])
|
||||
hosts.update(self.hosts["controllers"])
|
||||
|
||||
for hostname, folderpath in hosts.items():
|
||||
self._create_output_file(
|
||||
f"{hostname}_audit",
|
||||
output_directory,
|
||||
self.audit(
|
||||
plugin.state["start"],
|
||||
plugin.state["end"],
|
||||
os.path.join(
|
||||
folderpath, "var", "log", "dcmanager", "audit.log"
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.SWACT:
|
||||
self._create_output_file(
|
||||
"swact_activity", output_directory, self.swact()
|
||||
)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.PUPPET:
|
||||
self._create_output_file(
|
||||
"puppet_errors", output_directory, self.puppet()
|
||||
)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURE:
|
||||
self._create_output_file(
|
||||
"process_failures", output_directory, self.process_failure()
|
||||
)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.ALARM:
|
||||
alarms, logs = self.alarm(
|
||||
plugin.state["alarm_ids"], plugin.state["entity_ids"]
|
||||
)
|
||||
alarm_output = os.path.join(output_directory, "alarm")
|
||||
log_output = os.path.join(output_directory, "log")
|
||||
os.makedirs(os.path.dirname(log_output), exist_ok=True)
|
||||
|
||||
# creating output alarm file
|
||||
with open(alarm_output, "w") as file:
|
||||
for k, v in alarms.items():
|
||||
file.write(f"{k} {v['count']}\n")
|
||||
file.write("\n")
|
||||
for k, v in alarms.items():
|
||||
file.write(f"{k}\n")
|
||||
for date in v["dates"]:
|
||||
file.write(f" {date}\n")
|
||||
|
||||
# creating output log file
|
||||
with open(log_output, "w") as file:
|
||||
for k, v in logs.items():
|
||||
file.write(f"{k} {v['count']}\n")
|
||||
file.write("\n")
|
||||
for k, v in logs.items():
|
||||
file.write(f"{k}\n")
|
||||
for date in v["dates"]:
|
||||
file.write(f" {date}\n")
|
||||
logger.info("output at " + alarm_output)
|
||||
logger.info("output at " + log_output)
|
||||
|
||||
# Built-in algorithms ------------------------------
|
||||
def alarm(self, alarm_ids=[], entity_ids=[]):
|
||||
"""Alarm algorithm
|
||||
Gathers list of alarms and customer logs
|
||||
|
||||
Parameters:
|
||||
alarm_ids (string list) : List of alarm id patterns to search for
|
||||
entity_ids (string list): List of entity id patterns to search for
|
||||
"""
|
||||
alarm_data = {}
|
||||
log_data = {}
|
||||
with open(
|
||||
os.path.join(
|
||||
self.active_controller_directory,
|
||||
"var",
|
||||
"extra",
|
||||
"database",
|
||||
"fm.db.sql.txt",
|
||||
)
|
||||
) as file:
|
||||
start = False
|
||||
for line in file:
|
||||
# start of event log
|
||||
if "COPY event_log" in line:
|
||||
start = True
|
||||
elif start and line == "\\.\n":
|
||||
break
|
||||
elif start:
|
||||
entry = re.split(r"\t", line)
|
||||
|
||||
INDEX_ALARM_ID = 5
|
||||
INDEX_ACTION = 6
|
||||
INDEX_ENTITY_ID = 8
|
||||
INDEX_ALARM_DATE = 9
|
||||
INDEX_SEVERITY = 10
|
||||
|
||||
alarm_id = entry[INDEX_ALARM_ID]
|
||||
entity_id = entry[INDEX_ENTITY_ID]
|
||||
action = entry[INDEX_ACTION]
|
||||
severity = entry[INDEX_SEVERITY]
|
||||
alarm_date = entry[INDEX_ALARM_DATE]
|
||||
|
||||
entry_date = alarm_date.replace(
|
||||
" ", "T"
|
||||
) # making time format of alarm the same
|
||||
if self.opts.start <= entry_date and entry_date <= self.opts.end:
|
||||
# if the alarm is not in the user specified list of alarm or entity ids
|
||||
for id in alarm_ids:
|
||||
if id in alarm_id:
|
||||
break
|
||||
else:
|
||||
if len(alarm_ids) > 0:
|
||||
continue
|
||||
|
||||
for entity in entity_ids:
|
||||
if entity in entity_id:
|
||||
break
|
||||
else:
|
||||
if len(entity_ids) > 0:
|
||||
continue
|
||||
|
||||
try:
|
||||
if action == "log":
|
||||
log_info = log_data[
|
||||
f"{alarm_id} {entity_id} {severity}"
|
||||
]
|
||||
log_info["count"] += 1
|
||||
log_info["dates"].append(alarm_date)
|
||||
else:
|
||||
alarm_info = alarm_data[
|
||||
f"{alarm_id} {entity_id} {severity}"
|
||||
]
|
||||
alarm_info["count"] += 1
|
||||
alarm_info["dates"].append(f"{alarm_date} {action}")
|
||||
except KeyError:
|
||||
if entry[6] != "log":
|
||||
alarm_data[f"{alarm_id} {entity_id} {severity}"] = {
|
||||
"count": 1,
|
||||
"dates": [f"{alarm_date} {action}"],
|
||||
}
|
||||
else:
|
||||
log_data[f"{alarm_id} {entity_id} {severity}"] = {
|
||||
"count": 1,
|
||||
"dates": [alarm_date],
|
||||
}
|
||||
|
||||
for _, v in alarm_data.items():
|
||||
v["dates"] = sorted(v["dates"])
|
||||
|
||||
for _, v in log_data.items():
|
||||
v["dates"] = sorted(v["dates"])
|
||||
|
||||
return alarm_data, log_data
|
||||
|
||||
def substring(self, substr, files):
|
||||
"""Substring algorithm
|
||||
Looks for substrings within files
|
||||
|
||||
Parameters:
|
||||
substr (string list): List of substrings to look for
|
||||
files (string list): List of absolute filepaths to search in
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
CONTINUE_CURRENT = 0 # don't analyze older files, continue with current file
|
||||
CONTINUE_CURRENT_OLD = 1 # analyze older files, continue with current file
|
||||
|
||||
data = []
|
||||
for file in files:
|
||||
if not os.path.exists(file):
|
||||
raise FileNotFoundError(f"File not found: {file}")
|
||||
cont = True
|
||||
# Searching through file
|
||||
command = f"""grep -Ea "{'|'.join(s for s in substr)}" {file}"""
|
||||
status = self._continue(file)
|
||||
|
||||
if (
|
||||
status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD
|
||||
): # continue with current file
|
||||
if status == CONTINUE_CURRENT:
|
||||
cont = False
|
||||
self._evaluate_substring(data, command)
|
||||
|
||||
# Searching through rotated log files
|
||||
n = 1
|
||||
while os.path.exists(f"{file}.{n}.gz") and cont:
|
||||
command = f"""zgrep -E "{'|'.join(s for s in substr)}" {file}.{n}.gz"""
|
||||
status = self._continue(f"{file}.{n}.gz", compressed=True)
|
||||
|
||||
if status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD:
|
||||
if status == CONTINUE_CURRENT:
|
||||
cont = False
|
||||
self._evaluate_substring(data, command)
|
||||
|
||||
n += 1
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def system_info(self):
|
||||
"""System info algorithm
|
||||
Presents basic information about the system
|
||||
"""
|
||||
data = []
|
||||
with open(
|
||||
os.path.join(
|
||||
self.active_controller_directory, "etc", "platform", "platform.conf"
|
||||
)
|
||||
) as file:
|
||||
for line in file:
|
||||
if "system_mode" in line:
|
||||
data.append(
|
||||
f"System Mode: {re.match('^system_mode=(.*)', line).group(1)}"
|
||||
)
|
||||
elif "system_type" in line:
|
||||
data.append(
|
||||
f"System Type: {re.match('^system_type=(.*)', line).group(1)}"
|
||||
)
|
||||
elif "distributed_cloud_role" in line:
|
||||
data.append(
|
||||
f"Distributed cloud role: {re.match('^distributed_cloud_role=(.*)', line).group(1)}"
|
||||
)
|
||||
elif "sw_version" in line:
|
||||
data.append(
|
||||
f"SW Version: {re.match('^sw_version=(.*)', line).group(1)}"
|
||||
)
|
||||
with open(
|
||||
os.path.join(self.active_controller_directory, "etc", "build.info")
|
||||
) as file:
|
||||
for line in file:
|
||||
if "BUILD_TYPE" in line:
|
||||
data.append(
|
||||
f"Build Type: {re.match('^BUILD_TYPE=(.*)', line).group(1)}"
|
||||
)
|
||||
elif re.match("^OS=(.*)", line):
|
||||
data.append(f"OS: {re.match('^OS=(.*)', line).group(1)}")
|
||||
|
||||
return data
|
||||
|
||||
def swact(self):
|
||||
"""Swact activity algorithm
|
||||
Presents all swacting activity in the system
|
||||
"""
|
||||
data = []
|
||||
sm_files = []
|
||||
sm_customer_files = []
|
||||
swact_start = None
|
||||
swact_in_progress = False
|
||||
swact_end = None
|
||||
|
||||
for _, folder in self.hosts["controllers"].items():
|
||||
sm_path = os.path.join(folder, "var", "log", "sm.log")
|
||||
sm_files.append(sm_path)
|
||||
|
||||
sm_substrings = ["Swact has started,", "Swact update"]
|
||||
data = self.substring(sm_substrings, sm_files)
|
||||
|
||||
for i, line in enumerate(data):
|
||||
if "Swact has started," in line and not swact_in_progress:
|
||||
swact_in_progress = True
|
||||
swact_start = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
elif "Swact update" in line and swact_in_progress:
|
||||
swact_in_progress = False
|
||||
swact_end = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
line += f" SWACT TOOK {swact_end - swact_start} \n"
|
||||
data[i] = line
|
||||
|
||||
for _, folder in self.hosts["controllers"].items():
|
||||
sm_customer_path = os.path.join(folder, "var", "log", "sm-customer.log")
|
||||
sm_customer_files.append(sm_customer_path)
|
||||
|
||||
sm_customer_substrings = ["swact"]
|
||||
data += self.substring(sm_customer_substrings, sm_customer_files)
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def puppet(self):
|
||||
"""Puppet error algorithm
|
||||
Presents log errors from puppet logs
|
||||
"""
|
||||
data = []
|
||||
for _, folder in self.hosts["controllers"].items():
|
||||
puppet_folder = os.path.join(folder, "var", "log", "puppet")
|
||||
command = f"grep -rh 'Error:' {puppet_folder}"
|
||||
self._evaluate_substring(data, command)
|
||||
return sorted(data)
|
||||
|
||||
def process_failure(self):
|
||||
"""Process failure algorithm
|
||||
Presents log errors from pmond
|
||||
"""
|
||||
data = []
|
||||
files = []
|
||||
for host_type in self.hosts.keys():
|
||||
for _, folder in self.hosts[host_type].items():
|
||||
pmond = os.path.join(folder, "var", "log", "pmond.log")
|
||||
files.append(pmond)
|
||||
data = self.substring(["Error :"], files)
|
||||
return data
|
||||
|
||||
def audit(self, start, end, audit_log_path):
|
||||
"""Counts audit events in dcmanager within a specified date range
|
||||
|
||||
Parameters:
|
||||
start (string) : start date in YYYY-MM-DD HH:MM:SS format
|
||||
end (string) : end date in YYYY-MM-DD HH:MM:SS format
|
||||
audit_log_path (string) : absolute path of augit log file
|
||||
"""
|
||||
if not shutil.which("lnav"):
|
||||
raise ValueError("Lnav program not found")
|
||||
|
||||
SECONDS_PER_HOUR = 3600
|
||||
fmt = "%Y-%m-%d %H:%M:%S"
|
||||
|
||||
d1 = datetime.strptime(start, fmt)
|
||||
d2 = datetime.strptime(end, fmt)
|
||||
seconds = (d2 - d1).total_seconds()
|
||||
|
||||
log_texts = [
|
||||
"Triggered subcloud audit%",
|
||||
"Trigger patch audit%",
|
||||
"Trigger load audit%",
|
||||
"Triggered firmware audit%",
|
||||
"Triggered kubernetes audit%",
|
||||
# Counts sum of audits from all subclouds
|
||||
]
|
||||
INDEX_MIDDLE_WORD = 1
|
||||
data = ["These rates and totals represent the sum of audits from all subclouds"]
|
||||
|
||||
def command(text):
|
||||
|
||||
return (
|
||||
f'lnav -R -n -c ";SELECT count(log_body) AS {text.split(" ")[INDEX_MIDDLE_WORD]}_total'
|
||||
f' from openstack_log WHERE (log_time > \\"{start}\\" AND not log_time > \\"{end}\\")'
|
||||
f' AND log_body like \\"{text}\\"" "{audit_log_path}"'
|
||||
)
|
||||
|
||||
for text in log_texts:
|
||||
p = subprocess.Popen(command(text), shell=True, stdout=subprocess.PIPE)
|
||||
for line in p.stdout:
|
||||
line = line.decode("utf-8").strip()
|
||||
if line.isnumeric():
|
||||
data.append(
|
||||
f"rate {round((int(line)/seconds * SECONDS_PER_HOUR), 3)} per hour. total: {line}"
|
||||
)
|
||||
else:
|
||||
data.append(line)
|
||||
return data
|
||||
|
||||
# -----------------------------------
|
||||
|
||||
def _continue(self, file, compressed=False):
|
||||
CONTINUE_CURRENT = 0 # don't analyze older files, continue with current file
|
||||
CONTINUE_CURRENT_OLD = 1 # analyze older files, continue with current file
|
||||
CONTINUE_OLD = 2 # don't analyze current file, continue to older files
|
||||
|
||||
# check date of first log event and compare with provided start end dates
|
||||
first = ""
|
||||
|
||||
if not compressed:
|
||||
with open(file) as f:
|
||||
line = f.readline()
|
||||
first = line[0:19]
|
||||
else:
|
||||
with gzip.open(file, "rb") as f:
|
||||
line = f.readline().decode("utf-8")
|
||||
first = line[0:19]
|
||||
try:
|
||||
datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
first = line[0:19]
|
||||
except ValueError:
|
||||
return CONTINUE_CURRENT_OLD
|
||||
|
||||
if first < self.opts.start:
|
||||
return CONTINUE_CURRENT
|
||||
elif first < self.opts.end and first > self.opts.start:
|
||||
return CONTINUE_CURRENT_OLD
|
||||
elif first > self.opts.end:
|
||||
return CONTINUE_OLD
|
||||
|
||||
def _evaluate_substring(self, data, command):
|
||||
p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
|
||||
for line in p.stdout:
|
||||
line = line.decode("utf-8")
|
||||
dates = [line[0:19], line[2:21]] # different date locations for log events
|
||||
for date in dates:
|
||||
try:
|
||||
datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
|
||||
if date > self.opts.start and date < self.opts.end:
|
||||
if line[0] == "|": # sm-customer.log edge case
|
||||
line = line.replace("|", "").strip()
|
||||
line = re.sub("\s+", " ", line)
|
||||
data.append(line)
|
||||
break
|
||||
except ValueError:
|
||||
if date == dates[-1]:
|
||||
data.append(line)
|
||||
|
||||
def _extract_subfunction(self, host_info_path):
|
||||
GROUP_ONE = 1
|
||||
with open(host_info_path) as file:
|
||||
for line in file:
|
||||
hostname_match = re.match("^hostname => (.+)", line)
|
||||
subfunction_match = re.match("^subfunction => (.+)", line)
|
||||
if subfunction_match:
|
||||
subfunction = subfunction_match.group(GROUP_ONE)
|
||||
if hostname_match:
|
||||
hostname = hostname_match.group(GROUP_ONE)
|
||||
return hostname, subfunction
|
||||
|
||||
def _create_output_file(self, filename, directory, events):
|
||||
with open(os.path.join(directory, filename), "w") as file:
|
||||
for i in events:
|
||||
file.write(i + "\n")
|
||||
logger.info("output at " + os.path.join(directory, filename))
|
@ -1,257 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
########################################################################
|
||||
#
|
||||
# Copyright (c) 2022 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
########################################################################
|
||||
#
|
||||
# Description: The Report tool is used to gather relevant log events
|
||||
# and information about the system from a collect bundle.
|
||||
#
|
||||
# The report tool allows user created plugins which decides relevance
|
||||
# for log events. Plugins contain an algorithm label which instructs the
|
||||
# tool what information to search and how to search for it.
|
||||
#
|
||||
# The report tool requires the collect bundle and host tarballs to be
|
||||
# untarred.
|
||||
#
|
||||
# The report tool reads user plugins from a plugins directory in the
|
||||
# top level of the collect bundle, and outputs files containing
|
||||
# relevant logs to a report directory in the top level as well.
|
||||
#
|
||||
# Typical Usage:
|
||||
# command line functionality
|
||||
# ------------------------------- ----------------------------------
|
||||
# > report.py - Run all plugins in directory
|
||||
# > report.py [plugin ...] - Run only specified plugins
|
||||
# > report.py <algorithm> [labels] - Run algorithm with labels
|
||||
# > report.py --help - help message
|
||||
# > report.py <algorithm> --help - algorithm specific help
|
||||
#
|
||||
# See --help output for a complete list of full and abbreviated
|
||||
# command line options and examples of plugins.
|
||||
#
|
||||
# Refer to README file for more usage and output examples
|
||||
#######################################################################
|
||||
|
||||
import argparse
|
||||
from cmath import log
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
|
||||
from execution_engine import ExecutionEngine
|
||||
from plugin import Plugin
|
||||
|
||||
|
||||
now = datetime.now(timezone.utc)
|
||||
base_dir = os.path.realpath(__file__)
|
||||
default_path = os.path.join(os.path.dirname(base_dir), "..", "..")
|
||||
plugins = []
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Log Event Reporter",
|
||||
epilog="Place plugins in 'plugins' directory at top level of collect bundle. Output files will be placed in 'report' directory."
|
||||
"\nThis tool will create a report.log file along with other output files",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--start",
|
||||
default="20000101",
|
||||
help="Specify a start date in YYYYMMDD format for analysis (default:20000101)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-e",
|
||||
"--end",
|
||||
default=datetime.strftime(now, "%Y%m%d"),
|
||||
help="Specify an end date in YYYYMMDD format for analysis (default: current date)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-p",
|
||||
"--plugin",
|
||||
default=None,
|
||||
nargs="*",
|
||||
help="Specify what plugins to run (default: runs every plugin in plugins folder)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
"--directory",
|
||||
default=default_path,
|
||||
help="Specify top level of collect bundle to analyze (default: two levels above current location)",
|
||||
)
|
||||
subparsers = parser.add_subparsers(help="algorithms", dest="algorithm")
|
||||
|
||||
# substring algorithm arguments
|
||||
parser_substring = subparsers.add_parser(
|
||||
"substring",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="""Searches through specified files for lines containing specified substring.
|
||||
There will be an output file for each host of the host type specified.""",
|
||||
epilog="Plugin file example:\n"
|
||||
" algorithm=substring\n"
|
||||
" files=mtcAgent.log, sm.log\n"
|
||||
" hosts=controllers, workers\n"
|
||||
" substring=Swact in progress\n"
|
||||
" substring=Swact update",
|
||||
)
|
||||
substring_required = parser_substring.add_argument_group("required arguments")
|
||||
substring_required.add_argument(
|
||||
"--files",
|
||||
required=True,
|
||||
nargs="+",
|
||||
help="Files to perform substring analysis on (required)",
|
||||
)
|
||||
substring_required.add_argument(
|
||||
"--substring", nargs="+", required=True, help="Substrings to search for (required)"
|
||||
)
|
||||
substring_required.add_argument(
|
||||
"--hosts",
|
||||
choices=["controllers", "workers", "storages", "all"],
|
||||
required=True,
|
||||
nargs="+",
|
||||
help="Host types to perform analysis on (required)",
|
||||
)
|
||||
|
||||
|
||||
# alarm algorithm arguments
|
||||
parser_alarm = subparsers.add_parser(
|
||||
"alarm",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Searches through fm.db.sql.txt for alarms and logs. There are 2 output files: 'alarm', and 'log'",
|
||||
epilog="Plugin file example:\n"
|
||||
" algorithm=alarm\n"
|
||||
" alarm_ids=400.005,200.004\n"
|
||||
" entity_ids= host=controller-0,host=controller-1\n",
|
||||
)
|
||||
parser_alarm.add_argument(
|
||||
"--alarm_ids",
|
||||
nargs="+",
|
||||
required=False,
|
||||
default=[],
|
||||
help="Alarm id patterns to search for (not required)",
|
||||
)
|
||||
parser_alarm.add_argument(
|
||||
"--entity_ids",
|
||||
nargs="+",
|
||||
required=False,
|
||||
default=[],
|
||||
help="Entity id patterns to search for (not required)",
|
||||
)
|
||||
|
||||
# system info algorithm
|
||||
parser_system_info = subparsers.add_parser(
|
||||
"system_info",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents information about the system",
|
||||
epilog="Plugin file example:\n" " algorithm=system_info\n",
|
||||
)
|
||||
|
||||
# swact activity algorithm
|
||||
parser_swact = subparsers.add_parser(
|
||||
"swact",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents system swacting activity",
|
||||
epilog="Plugin file example:\n" " algorithm=swact\n",
|
||||
)
|
||||
|
||||
# puppet errors algorithm
|
||||
parser_puppet = subparsers.add_parser(
|
||||
"puppet",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents any puppet errors",
|
||||
epilog="Plugin file example:\n" " algorithm=puppet\n",
|
||||
)
|
||||
|
||||
# process failure algorithm
|
||||
parser_process_failure = subparsers.add_parser(
|
||||
"process_failure",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents any process failures from pmond.log",
|
||||
epilog="Plugin file example:\n" " algorithm=process_failure\n",
|
||||
)
|
||||
|
||||
# audit algorithm
|
||||
parser_audit = subparsers.add_parser(
|
||||
"audit",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents information about audit events in dcmanager.\n"
|
||||
"The rates and totals represents the sum of audits on all subclouds ",
|
||||
epilog="Plugin file example:\n"
|
||||
" algorithm=audit\n"
|
||||
" start=2022-06-01 10:00:00\n"
|
||||
" end=2022-06-01 04:00:00\n",
|
||||
)
|
||||
parser_audit_required = parser_audit.add_argument_group("required arguments")
|
||||
parser_audit_required.add_argument("--start", required=True)
|
||||
parser_audit_required.add_argument(
|
||||
"--end",
|
||||
required=True,
|
||||
)
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
args.start = datetime.strptime(args.start, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")
|
||||
args.end = datetime.strptime(args.end, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
||||
output_directory = os.path.join(
|
||||
args.directory, "report", "output", now.strftime("%Y%m%d.%H%M%S")
|
||||
)
|
||||
|
||||
# creating report log
|
||||
os.makedirs(output_directory)
|
||||
open(os.path.join(output_directory, "report.log"), "w").close()
|
||||
|
||||
# setting up logger
|
||||
formatter = logging.Formatter("%(message)s")
|
||||
logger = logging.getLogger()
|
||||
|
||||
logging.basicConfig(
|
||||
filename=os.path.join(output_directory, "report.log"),
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s: %(message)s",
|
||||
datefmt="%Y-%m-%dT%H:%M:%S",
|
||||
)
|
||||
logging.Formatter.converter = time.gmtime
|
||||
|
||||
ch = logging.StreamHandler()
|
||||
ch.setLevel(logging.INFO)
|
||||
ch.setFormatter(formatter)
|
||||
|
||||
logger.addHandler(ch)
|
||||
|
||||
try:
|
||||
engine = ExecutionEngine(args)
|
||||
except ValueError as e:
|
||||
logger.error(str(e))
|
||||
|
||||
if args.algorithm:
|
||||
plugins.append(Plugin(opts=vars(args)))
|
||||
else:
|
||||
if args.plugin:
|
||||
for p in args.plugin:
|
||||
path = os.path.join(args.directory, "plugins", p)
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
plugins.append(Plugin(path))
|
||||
except Exception as e:
|
||||
logger.error(str(e))
|
||||
|
||||
else:
|
||||
logger.warning(f"{p} plugin does not exist")
|
||||
else:
|
||||
path = os.path.join(args.directory, "plugins")
|
||||
if not os.path.exists(path):
|
||||
os.mkdir(path)
|
||||
logger.error("Plugins folder is empty")
|
||||
else:
|
||||
for file in os.listdir(path):
|
||||
try:
|
||||
plugins.append(Plugin(os.path.join(path, file)))
|
||||
except Exception as e:
|
||||
logger.error(str(e))
|
||||
|
||||
engine.execute(plugins, output_directory)
|