Files
charm-magpie/src/lib/charms/layer/magpie_tools.py
2023-09-27 08:50:34 +00:00

1141 lines
39 KiB
Python

# Copyright 2020 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import datetime
import json
import math
import os
import re
import shlex
import subprocess
import time
from collections.abc import Iterable
from typing import Tuple

import netifaces
import psutil
import yaml
from charmhelpers.core import hookenv
from charmhelpers.core.host import get_nic_mtu, service_start, service_running
from charmhelpers.fetch import apt_install
import charmhelpers.contrib.network.ip as ch_ip
from prometheus_client import Gauge, start_http_server
class Lldp():
    """
    Helper around ``lldpd``/``lldpcli`` for reading LLDP neighbour data.

    ``collect_data`` dumps the neighbour table as JSON into ``lldp_out``;
    ``data`` parses that file once and caches the result on the instance.
    """
    enabled = False
    # Parsed JSON read from ``lldp_out``; filled in lazily by data().
    parsed_data = None

    def __init__(self):
        self.lldp_out = '/home/ubuntu/lldp_output.' +\
            hookenv.application_name() + '.txt'

    def install(self):
        """Install the ``lldpd`` package."""
        apt_install("lldpd")

    def disable_i40e_lldp_agent(self):
        """
        Write ``lldp stop`` to the ``command`` file of every i40e device.

        Only the directories directly below ``/sys/kernel/debug/i40e`` are
        visited: the command path is always built from that root, so
        descending further (as a recursive walk would) could only produce
        paths that do not exist.
        """
        path = '/sys/kernel/debug/i40e'
        if os.path.isdir(path):
            hookenv.log('Disabling NIC internal LLDP agent', 'INFO')
            for entry in sorted(os.listdir(path)):
                device_dir = os.path.join(path, entry)
                if not os.path.isdir(device_dir):
                    continue
                with open(os.path.join(device_dir, 'command'), "w") as fh:
                    fh.write('lldp stop')

    def enable(self):
        """Start ``lldpd`` if needed and wait for neighbour data to arrive."""
        self.disable_i40e_lldp_agent()
        if not service_running('lldpd'):
            service_start('lldpd')
            hookenv.log('Waiting to collect LLDP data', 'INFO')
            # Give the freshly started daemon time to hear from neighbours.
            time.sleep(30)

    def collect_data(self):
        """Dump the neighbour details as JSON to stdout and ``lldp_out``."""
        cmd = "lldpcli show neighbors details -f json | tee " + self.lldp_out
        os.system(cmd)

    def data(self):
        """
        Return the parsed content of ``lldp_out``, reading it on first use.

        :returns: the decoded JSON document
        """
        if not self.parsed_data:
            with open(self.lldp_out, 'r') as f:
                self.parsed_data = json.load(f)
        return self.parsed_data

    def get_interface(self, iface):
        """
        Return the LLDP record for ``iface`` or None when there is none.

        :param iface: local interface name
        :type iface: str
        """
        interfaces = self.data().get('lldp', {}).get('interface', [])
        # NOTE(review): the compact "json" output format appears to collapse
        # a single-element list into a plain object; accept both shapes.
        if isinstance(interfaces, dict):
            interfaces = [interfaces]
        for entry in interfaces:
            if iface in entry:
                return entry[iface]
        return None

    def get_interface_vlan(self, iface):
        """Return the ``vlan-id`` advertised on ``iface`` or None."""
        try:
            return self.get_interface(iface)['vlan']['vlan-id']
        except (KeyError, TypeError):
            hookenv.log('No LLDP data for {}'.format(iface), 'INFO')
            return None

    def get_interface_port_descr(self, iface):
        """Return the port description advertised on ``iface`` or None."""
        try:
            return self.get_interface(iface)['port']['descr']
        except (KeyError, TypeError):
            hookenv.log('No LLDP data for {}'.format(iface), 'INFO')
            return None
async def run(cmd):
    """
    Run ``cmd`` through the shell and return what it wrote to stdout.

    When the command writes nothing to stdout, anything it wrote to stderr
    is printed and an empty string is returned, so callers can always treat
    the result as text.

    :param cmd: shell command line
    :type cmd: str
    :returns: decoded stdout, or '' when stdout was empty
    :rtype: str
    """
    proc = await asyncio.create_subprocess_shell(
        cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE)
    stdout, stderr = await proc.communicate()
    if stdout:
        return stdout.decode()
    if stderr:
        print('[stderr]')
        print(stderr.decode())
    return ''
async def run_iperf(node_name, ip, iperf_batch_time, concurrency):
    """
    Run an iperf client against ``ip`` and aggregate its CSV report.

    The following is an example of the output
    $ iperf -c 192.168.2.1 -t 10 --port 5001 -P 2 --reportstyle c
    19700101000000,192.168.2.2,60266,192.168.2.1,5001,2,0.0-10.1,95158332,75301087
    19700101000000,192.168.2.2,60268,192.168.2.1,5001,1,0.0-10.1,161742908,127989222

    First field: timestamp of the iperf run
                 As of right now with iperf 2.1.7 (Jammy), the timestamp
                 is always outputting 19700101000000
    Second field: source IP address
    Third field: source port
    Fourth field: destination address
    Fifth field: destination port
    Sixth field: Session number when in parallel, iperf doesn't
                 seem to reorder the output by the session number
    Seventh field: duration of test
    Eighth field: transferred bytes
    Ninth field: average speed in bits per second

    Lines that do not have nine fields, or whose numeric fields cannot be
    converted, are logged and skipped instead of aborting the whole run.

    :param node_name: unit name of the destination ('/' is replaced by '_')
    :type node_name: str
    :param ip: destination address
    :type ip: str
    :param iperf_batch_time: value passed to iperf ``-t``
    :param concurrency: value passed to iperf ``-P``
    :returns: aggregated results; ``transferred_bytes`` stays 0 when iperf
              produced no usable output
    :rtype: dict
    """
    node_name = node_name.replace('/', '_')
    cmd = "iperf -t{} -c {} --port 5001 -P{} --reportstyle c".format(
        iperf_batch_time,
        ip,
        concurrency)
    hookenv.log(cmd, 'INFO')
    out = await run(cmd)
    results = {'src_port': [],
               'dest_port': '',
               'dest_node': node_name,
               'session': [],
               'transferred_bytes': 0,
               'bits_per_second': 0,
               }
    src_ip = get_src_ip_from_dest(ip)
    interface = ch_ip.get_iface_from_addr(src_ip)
    src_mac = get_iface_mac(interface)
    results['src_interface'] = interface
    results['src_mac'] = src_mac
    results['src_ip'] = src_ip
    results['dest_ip'] = ip

    for line in (out or '').split():
        try:
            (timestamp, _line_src_ip, src_port, _line_dest_ip, dest_port,
             session, time_interval, xferred_bytes, bits_per_s) \
                = line.split(',')
            src_port = int(src_port)
            session_id = int(session)
            xferred_bytes = int(xferred_bytes)
            bits_per_s = int(bits_per_s)
        except ValueError:
            hookenv.log(
                'Ignoring unexpected iperf output: {}'.format(line),
                'WARNING')
            continue
        # On Focal, a supplementary line with the summarised total
        # needs to be ignored, this can be recognised with the
        # session being set to -1 on the line
        if session_id == -1:
            continue
        # The following values will be identical on each line,
        # so only set the fields once.
        if not results.get('timestamp'):
            results['timestamp'] = timestamp
            results['dest_port'] = dest_port
            results['time_interval'] = time_interval
            results['concurrency'] = concurrency
        # for now magpie only use one iperf server as
        # destination, it is not useful to record multiple
        # time the destination port since it is identical
        results['src_port'].append(src_port)
        results['session'].append(session_id)
        results['transferred_bytes'] += xferred_bytes
        results['bits_per_second'] += bits_per_s

    # Derived values are always present so consumers never hit a KeyError.
    results['GBytes_transferred'] = round(
        float(results['transferred_bytes'] / 1024**3),
        3,
    )
    # NOTE(review): divides by 1024**2 although the input is bits/s; kept
    # as-is because the reported figure may be relied upon elsewhere.
    results['Mbits_per_second'] = int(
        results['bits_per_second'] / 1024**2
    )
    # retrieve supplementary informations not provided by iperf
    results['dest_mac'] = (
        get_dest_mac(interface, results['dest_ip']) if out else ''
    )
    hookenv.log(
        f"Source: {results['src_ip']}, "
        f"Destination: {results['dest_ip']} "
        f"({results['dest_node']}) : "
        f"{results['GBytes_transferred']} GB, "
        f"{results['Mbits_per_second']} Mbps",
        'INFO'
    )
    return results
class Iperf():
    """
    Run iperf servers and clients and publish the results.

    Besides the simple server/client helpers this class implements the
    batched host check, which ramps the client concurrency over time and
    exposes bandwidth/concurrency as Prometheus gauges.
    """
    BATCH_CTRL_FILE = '/tmp/batch_hostcheck.ctrl'
    IPERF_BASE_PORT = 5001
    # Dict of prometheus metrics
    metrics = {}

    def __init__(self):
        self.iperf_out = '/home/ubuntu/iperf_output.' + \
            hookenv.application_name() + '.txt'

    def listen(self, cidr=None, port=None):
        """
        Start an iperf server in the background, logging to ``iperf_out``.

        :param cidr: network in which to pick the bind address; when not
                     given the first bind address of the ``magpie`` binding
                     is used
        :param port: listen port, defaults to ``IPERF_BASE_PORT``
        """
        port = port or self.IPERF_BASE_PORT
        if cidr:
            bind_address = ch_ip.get_address_in_network(cidr)
        else:
            bind_address = (
                hookenv.network_get('magpie')
                ['bind-addresses'][0]['addresses'][0]['address']
            )
        cmd = (
            "iperf -s -fm --port " + str(port) +
            " -B " + bind_address + " | tee " +
            self.iperf_out + " &"
        )
        os.system(cmd)

    def speed(self):
        """
        Return the speed figure from the last "bits" line of ``iperf_out``.

        :returns: the second-to-last space separated field of that line, or
                  "no iperf test results: failed" when no such line exists
        :rtype: str
        """
        last_match = None
        with open(self.iperf_out) as f:
            for line in f:
                if "bits" in line:
                    last_match = line
        if last_match is None:
            return "no iperf test results: failed"
        return last_match.rsplit(' ', 2)[1]

    def selfcheck(self):
        """Run a one second iperf client against localhost."""
        subprocess.check_output(["iperf", "-c", "localhost", "-t", "1"])

    def hostcheck(self, nodes, iperf_duration):
        """
        Run an iperf client against every node in turn.

        :param nodes: iterable of entries whose index 1 is the address
        :param iperf_duration: value passed to iperf ``-t``
        """
        # Wait for other nodes to start their servers...
        for node in nodes:
            msg = "checking iperf on {}".format(node[1])
            hookenv.log(msg)
            cmd = "iperf -t {} -c {} -P{}".format(iperf_duration, node[1],
                                                  min(8, self.num_cpus()))
            os.system(cmd)

    def get_increment(self, total_runtime, progression):
        """Return the time slice given to each step of ``progression``."""
        return datetime.timedelta(
            seconds=math.ceil(total_runtime / len(progression)))

    def get_plan(self, progression, increment):
        """
        Build a list of ``(start_time, concurrency)`` starting now.

        :param progression: concurrency values, in the order to run them
        :param increment: ``datetime.timedelta`` between two steps
        """
        now = datetime.datetime.now()
        return [
            (now + (index * increment), concurrency)
            for index, concurrency in enumerate(progression)
        ]

    def num_cpus(self):
        '''
        Compatibility wrapper for calculating the number of CPU's
        a unit has.

        @returns: int: number of CPU cores detected (at least 1)
        '''
        try:
            # cpu_count() may return None when it cannot be determined.
            return psutil.cpu_count() or 1
        except AttributeError:
            return psutil.NUM_CPUS

    def update_plan(self, plan, skip_to, increment):
        """
        Restart the plan from now, keeping concurrencies >= ``skip_to``.

        When no step satisfies ``skip_to`` the plan is returned unchanged:
        an empty plan would leave the batch without any concurrency value.
        """
        progression = [conc for (_time, conc) in plan if conc >= skip_to]
        if not progression:
            return plan
        return self.get_plan(progression, increment)

    def get_concurrency(self, plan):
        """Return the concurrency of the latest step that already started."""
        now = datetime.datetime.now()
        for (_time, conc) in reversed(plan):
            if _time < now:
                return conc

    def wipe_batch_ctrl_file(self):
        """Create or empty ``BATCH_CTRL_FILE``."""
        with open(self.BATCH_CTRL_FILE, "w") as ctrl_file:
            ctrl_file.truncate(0)

    def read_batch_ctrl_file(self):
        """Return the text content of ``BATCH_CTRL_FILE``."""
        with open(self.BATCH_CTRL_FILE, 'r') as ctrl_file:
            contents = ctrl_file.read()
        return contents

    def add_iperf_bandwidth_metric(self, src_unit, dst_unit, value, tag):
        """
        Set the ``magpie_iperf_bandwidth`` gauge for a unit pair.

        The gauge is created on first use and labelled with the model name,
        ``src``, ``dest`` and ``tag``.
        """
        if 'magpie_iperf_bandwidth' not in self.metrics:
            self.metrics['magpie_iperf_bandwidth'] = Gauge(
                'magpie_iperf_bandwidth',
                'magpie iperf bandwidth (bits/s)',
                ['model', 'src', 'dest', 'tag']
            )
        self.metrics['magpie_iperf_bandwidth'].labels(
            model=hookenv.model_name(),
            src=src_unit, dest=dst_unit,
            tag=tag).set(value)

    def add_iperf_concurrency_metric(self, src_unit, dst_unit, value, tag):
        """
        Set the ``magpie_iperf_concurrency`` gauge for a unit pair.

        The gauge is created on first use and labelled with the model name,
        ``src``, ``dest`` and ``tag``.
        """
        if 'magpie_iperf_concurrency' not in self.metrics:
            self.metrics['magpie_iperf_concurrency'] = Gauge(
                'magpie_iperf_concurrency',
                'magpie iperf process concurrency',
                ['model', 'src', 'dest', 'tag']
            )
        self.metrics['magpie_iperf_concurrency'].labels(
            model=hookenv.model_name(),
            src=src_unit, dest=dst_unit,
            tag=tag).set(value)

    def process_results(self, results, nodes, concurrency, tag):
        """
        Publish per-destination bandwidth and concurrency gauges.

        :param results: list of dicts as returned by ``run_iperf``
        :param nodes: mapping of destination ip to unit name
        :param concurrency: concurrency used for this batch
        :param tag: value of the ``tag`` label
        """
        bandwidth = {ip: 0 for ip in nodes.keys()}
        src_unit = hookenv.local_unit().replace('/', '_')
        for result in results:
            bandwidth[result['dest_ip']] += math.ceil(
                int(result['bits_per_second']))
        for ip, node_name in nodes.items():
            dst_unit = node_name.replace('/', '_')
            self.add_iperf_bandwidth_metric(
                src_unit,
                dst_unit,
                bandwidth[ip],
                tag)
            self.add_iperf_concurrency_metric(
                src_unit,
                dst_unit,
                concurrency,
                tag)

    def batch_hostcheck(self, nodes, total_runtime, iperf_batch_time=None,
                        progression=None, tag='default'):
        """
        Repeatedly run iperf against all nodes with growing concurrency.

        Each round reads ``BATCH_CTRL_FILE``; when it holds an integer the
        plan is restarted from the steps at or above that concurrency. The
        loop stops after ``total_runtime`` seconds or as soon as one
        destination transferred no data.

        :param nodes: mapping of destination ip to unit name
        :type nodes: dict
        :param total_runtime: overall duration in seconds
        :param iperf_batch_time: duration of one iperf run (default 60)
        :param progression: concurrency steps
                            (default [4, 8, 16, 24, 32, 40])
        :param tag: value of the ``tag`` metric label
        :returns: one list of ``run_iperf`` results per round
        :rtype: list
        """
        iperf_batch_time = iperf_batch_time or 60
        progression = progression or [4, 8, 16, 24, 32, 40]
        increment = self.get_increment(total_runtime, progression)
        plan = self.get_plan(progression, increment)
        finish_time = datetime.datetime.now() + datetime.timedelta(
            seconds=total_runtime)
        failure = False
        self.wipe_batch_ctrl_file()
        action_output = []
        # Prometheus target for scraping of the collected iperf metrics
        start_http_server(8088)

        # get_event_loop() raises RuntimeError on newer Python versions
        # when no loop has been set; create and register one in that case.
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        async def _run_batch(batch_concurrency):
            # gather() is awaited inside the loop that runs it.
            return await asyncio.gather(
                *[
                    run_iperf(
                        node_name,
                        ip,
                        iperf_batch_time,
                        batch_concurrency,
                    )
                    for ip, node_name in nodes.items()
                ]
            )

        while datetime.datetime.now() < finish_time and not failure:
            contents = self.read_batch_ctrl_file()
            if contents:
                try:
                    plan = self.update_plan(plan, int(contents), increment)
                    self.wipe_batch_ctrl_file()
                except ValueError:
                    # Not an integer (yet): ignore and retry next round.
                    pass
            concurrency = self.get_concurrency(plan)
            # nodes maps ip -> unit name; show the unit names.
            hookenv.status_set(
                'active',
                'Concurrency: {} Nodes: {}'.format(
                    concurrency,
                    ', '.join(str(name) for name in nodes.values())))
            results = loop.run_until_complete(_run_batch(concurrency))
            # the loop should be stopped if iperf does not work
            for result in results:
                if result['transferred_bytes'] == 0:
                    failure = True
            self.process_results(results, nodes, concurrency, tag)
            action_output.append(results)
        return action_output
def safe_status(workload, status):
    """
    Set the unit status unless the ``supress_status`` option is enabled.

    :param workload: workload state passed to ``hookenv.status_set``
    :param status: status message passed to ``hookenv.status_set``
    """
    if hookenv.config().get('supress_status'):
        return
    hookenv.status_set(workload, status)
async def ping(addr, timeout, count, interval, mtu=None) -> Tuple[int, int]:
    """
    Ping `addr` with provided options.

    Return a tuple of integers,
    where the first integer is the number of packets received,
    and the second integer is the number of packets transmitted.

    :param addr: ip address or hostname to ping
    :type addr: str
    :param timeout: timeout in seconds per icmp request
    :type timeout: Union[str, int, float]
    :param count: number of packets to send
    :type count: Union[str, int]
    :param interval: seconds between sending each packet
    :type interval: Union[str, int, float]
    :param mtu: optional packet size to send
    :type mtu: Union[str, int, None]
    :returns: a tuple of two integers
    :rtype: Tuple[int, int]
    """
    args = [
        "ping",
        "-c", str(count),
        "-i", str(interval),
        "-W", str(timeout),
    ]
    if mtu:
        # 28 is subtracted from the requested size to get the payload size
        # passed to -s; "-M do" forbids fragmentation.
        args.extend(("-M", "do", "-s", str(int(mtu) - 28)))
    args.append(str(addr))
    # The command goes through a shell, so quote every argument.
    cmd = " ".join(shlex.quote(arg) for arg in args)

    hookenv.log('Ping command: {}'.format(cmd), hookenv.DEBUG)
    # run() may yield nothing when ping printed only to stderr.
    stdout = await run(cmd) or ''
    hookenv.log(f'ping stdout {stdout}')
    match = re.search(
        r"(\d+)\s*packets\s+transmitted,\s*(\d+)\s*received",
        stdout
    )
    hookenv.log(f'match {match}')
    if match:
        return (int(match.group(2)), int(match.group(1)))

    hookenv.log(
        f"pinging {addr} failed with output: '{stdout}'",
        hookenv.DEBUG
    )
    # Honour the documented Tuple[int, int] even when count is a string.
    return (0, int(count))
def check_local_hostname():
    """
    Check that the local hostname can be looked up with ``getent hosts``.

    :returns: ``('', 0)`` when the lookup succeeds, otherwise
              ``(hostname, exit code of getent)``
    :rtype: Tuple[str, int]
    """
    local_hostname = subprocess.check_output(['hostname'])\
        .decode('utf-8').rstrip()
    hookenv.log('Looking up local hostname: {}'.format(local_hostname))
    try:
        # Only the exit status matters; the output is not used.
        subprocess.check_output(['getent', 'hosts', local_hostname])
    except subprocess.CalledProcessError as exc:
        return local_hostname, exc.returncode
    except FileNotFoundError:
        # Same code a shell reports for "command not found".
        return local_hostname, 127
    return '', 0
def check_local_mtu(required_mtu, iface_mtu):
    """
    Compare the interface MTU with the required MTU.

    :param required_mtu: required MTU, 0 disables the check
    :param iface_mtu: MTU of the interface
    :returns: 0 when the check is disabled, 100 when the interface MTU is
              between 0 and 12 bytes above the required one, 200 otherwise
    :rtype: int
    """
    if required_mtu == 0:
        return 0
    headroom = int(iface_mtu) - int(required_mtu)
    return 100 if 0 <= headroom <= 12 else 200
def status_for_speed_check(min_speed, speed, link_speed):
    """
    Build the part of the status line describing the iperf speed test.

    :param min_speed: raw value of min-speed from charm config, either a
                      number of mbit/s or a percentage of the link speed
    :type min_speed: str
    :param speed: speed in mbit/s from iperf
    :type speed: float
    :param link_speed: link speed in mbit/s
    :type link_speed: int
    :returns: status fragment starting with ', '
    :rtype: str
    """
    if re.match(r'^\d+%?$', min_speed) is None:
        return ', invalid min_speed: {!r}'.format(min_speed)

    threshold = int(min_speed.rstrip('%'))
    if threshold == 0:
        # A zero threshold only reports the measured speed.
        return ', {} mbit/s'.format(speed)

    if '%' in min_speed:
        # virtual link with no defined speed
        if link_speed < 0:
            hookenv.log(
                'link speed negative, so unable to '
                'calculate value for min_speed percentage',
                'INFO'
            )
            return ', speed failed: link speed undefined'
        # convert percentage to integer mbit/s
        threshold = threshold * link_speed // 100

    if threshold <= speed:
        return ', speed ok: {} mbit/s'.format(speed)
    return ', speed failed: {} < {} mbit/s'.format(speed, threshold)
def check_port_description(lldp):
    """
    Check that LLDP port descriptions mention the local hostname.

    Every entry of ``/sys/class/net`` except ``lo``, ``vnet*`` and
    ``veth*`` that is of type 'eth' and not down is looked up in the LLDP
    data; a port whose description does not contain the hostname is
    reported.

    :param lldp: object providing ``get_interface_port_descr(iface)``
    :returns: "ports ok" or "ports failed: <iface>:<descr> ..."
    :rtype: str
    """
    iface_dir = "/sys/class/net"
    local_hostname = subprocess.check_output(['hostname'])\
        .decode('utf-8').rstrip()
    mismatches = []
    for r, dirs, files in os.walk(iface_dir):
        for d in dirs:
            if d == 'lo' or d.startswith(('vnet', 'veth')):
                continue
            if check_iface_type(d) != 'eth' or check_iface_down(d):
                continue
            desc = lldp.get_interface_port_descr(d)
            hookenv.log("Port {} description {}".format(d, desc),
                        'INFO')
            # Plain substring test: the hostname is text, not a regular
            # expression, so characters such as '.' must match literally.
            if desc and local_hostname not in desc:
                mismatches.append("{}:{}".format(d, desc))
    if mismatches:
        return "ports failed: {}".format(' '.join(mismatches))
    return "ports ok"
def check_iface_type(iface):
    """
    Classify an interface from its sysfs ``uevent`` file.

    :param iface: interface name
    :returns: "complex" when the file mentions DEVTYPE, 'eth' otherwise
    :rtype: str
    """
    uevent_path = "{}/uevent".format("/sys/class/net/{}".format(iface))
    with open(uevent_path) as uevent:
        has_devtype = re.search('DEVTYPE', uevent.read()) is not None
    return "complex" if has_devtype else 'eth'
def check_iface_down(iface):
    """
    Tell whether an interface is down according to sysfs.

    ``operstate`` must contain 'up' and ``carrier`` must contain '1';
    ``carrier`` is only read when ``operstate`` passed.

    :param iface: interface name
    :returns: "down" when either check fails, None otherwise
    """
    iface_dir = "/sys/class/net/{}".format(iface)
    for attribute, expected in (('operstate', 'up'), ('carrier', '1')):
        with open("{}/{}".format(iface_dir, attribute)) as fos:
            if re.search(expected, fos.read()) is None:
                return "down"
    return None
def check_aggregator_id(bond_iface, slave_iface):
    """
    Compare the aggregator id of a bond with the one of a slave.

    :param bond_iface: name of the bond interface
    :param slave_iface: name of the slave interface
    :returns: "aggregate_id_mismatch" when the raw file contents differ,
              None otherwise
    """
    def _read(path):
        with open(path) as fos:
            return fos.read()

    bond_iface_dir = "/sys/class/net/{}/bonding".format(bond_iface)
    slave_iface_dir = "/sys/class/net/{}/bonding_slave".format(slave_iface)
    bond_aggr_value = _read("{}/ad_aggregator".format(bond_iface_dir))
    slave_aggr_value = _read("{}/ad_aggregator_id".format(slave_iface_dir))
    return (
        None if bond_aggr_value == slave_aggr_value
        else "aggregate_id_mismatch"
    )
def check_lacp_port_state(iface):
    """
    Compare the LACP actor and partner port state of a bond slave.

    With the ``lacp_passive_mode`` option set, a difference limited to the
    lowest bit (LACP activity) is accepted.

    :param iface: name of the slave interface
    :returns: "lacp_port_state_mismatch" or None
    """
    cfg = hookenv.config()
    iface_dir = "/sys/class/net/{}/bonding_slave".format(iface)
    with open("{}/ad_actor_oper_port_state".format(iface_dir)) as fos:
        actor_port_state = fos.read()
    with open("{}/ad_partner_oper_port_state".format(iface_dir)) as fos:
        partner_port_state = fos.read()

    if actor_port_state == partner_port_state:
        return None
    # check if this is an acceptable mismatch in the LACP activity mode
    # (1111_1110 bitmask to ignore LACP activity bit in comparison)
    if cfg.get('lacp_passive_mode') and \
            (int(actor_port_state) & 254) == (int(partner_port_state) & 254):
        return None
    return "lacp_port_state_mismatch"
def get_bond_mode(bond):
    """
    Return a normalised name for the mode of a bond.

    The content of the bond's ``bonding/mode`` file is searched for the
    known mode names, in a fixed order; the first hit wins.

    :param bond: name of the bond interface
    :returns: one of "balance_rr", "active_backup", "balance_xor",
              "broadcast", "lacp", "balance_tlb", "balance_alb" or 'others'
    :rtype: str
    """
    known_modes = (
        ('balance-rr', "balance_rr"),
        ('active-backup', "active_backup"),
        ('balance-xor', "balance_xor"),
        ('broadcast', "broadcast"),
        ('802.3ad', "lacp"),
        ('balance-tlb', "balance_tlb"),
        ('balance-alb', "balance_alb"),
    )
    bond_path = "/sys/class/net/{}".format(bond)
    with open("{}/bonding/mode".format(bond_path)) as fos:
        content = fos.read()
    for pattern, mode_name in known_modes:
        if re.search(pattern, content):
            return mode_name
    return 'others'
def check_bond(bond, lldp=None):
    """
    Check the health of one bond and of its slaves.

    :param bond: name of the bond interface
    :param lldp: optional object providing ``get_interface_vlan(iface)``,
                 used to verify all slaves advertise the same VLAN
    :returns: a short description of the first problem found, or None
    """
    bond_path = "/sys/class/net/{}".format(bond)
    if not os.path.isdir(bond_path):
        return "missing"
    if check_iface_down(bond):
        return "down"

    with open("{}/bonding/slaves".format(bond_path)) as fos:
        slaves = fos.read().split()

    reference_vlan = None
    for slave in slaves:
        if check_iface_down(slave):
            return "{} down".format(slave)
        if not lldp:
            continue
        slave_vlan = lldp.get_interface_vlan(slave)
        if not reference_vlan:
            # No VLAN recorded yet: this slave becomes the reference.
            reference_vlan = slave_vlan
        elif reference_vlan != slave_vlan:
            return "vlan mismatch"

    if get_bond_mode(bond) != "lacp":
        return None
    for slave in slaves:
        if check_aggregator_id(bond, slave):
            return "Aggregator ID mismatch"
    for slave in slaves:
        if check_lacp_port_state(slave):
            return "LACP port state mismatch"
    return None
def check_bonds(bonds, lldp=None):
    """
    Check a comma separated list of bonds.

    :param bonds: bond names separated by commas; surrounding whitespace
                  and empty names are ignored
    :type bonds: str
    :param lldp: optional LLDP helper forwarded to ``check_bond``
    :returns: "bonds ok" or "bonds failed: <bond>:<problem> ..."
    :rtype: str
    """
    failures = []
    for bond in (name.strip() for name in bonds.split(',')):
        if not bond:
            # e.g. a trailing comma; there is no interface to inspect
            continue
        bond_status = check_bond(bond, lldp)
        if bond_status:
            failures.append("{}:{}".format(bond, bond_status))
    if failures:
        return "bonds failed: {}".format(' '.join(failures))
    return "bonds ok"
def get_link_speed(iface):
    """
    Read the link speed of an interface from sysfs.

    :param iface: interface name
    :returns: the integer found in ``/sys/class/net/<iface>/speed``, or -1
              when the file cannot be read
    :rtype: int
    """
    speed_path = '/sys/class/net/{}/speed'.format(iface)
    try:
        with open(speed_path) as f:
            raw_speed = f.read()
            return int(raw_speed)
    except OSError as e:
        message = 'Unable to determine link speed for {}: {}'.format(
            iface, str(e))
        hookenv.log(message, hookenv.WARNING)
        return -1
def get_src_ip_from_dest(address):
    """
    Return the source address used to reach ``address``.

    Runs ``ip -j route get <address>`` and returns the ``prefsrc`` of the
    first route.

    :param address: destination ip address
    :type address: str
    :returns: the preferred source address, or "" when there is no route,
              no ``prefsrc`` or no parsable output
    :rtype: str
    """
    args = [
        "ip",
        "-j",
        "route",
        "get",
        address,
    ]
    iproute = subprocess.run(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    try:
        output = json.loads(iproute.stdout.decode())
    except ValueError:
        # On failure ip reports on stderr and stdout is not JSON.
        hookenv.log(
            'Unable to get route to {}: {}'.format(
                address, iproute.stderr.decode().strip()),
            hookenv.WARNING)
        return ""
    # if there is no results, iproute returns an empty list
    if output:
        return output[0].get('prefsrc', "")
    return ""
def get_iface_mac(iface):
    """
    Return the MAC address of a local interface.

    :param iface: interface name
    :returns: the first link-layer address of the interface, or "" when
              the interface is not known to ``netifaces``
    :rtype: str
    """
    if iface not in netifaces.interfaces():
        hookenv.log('Unable to retrieve MAC from interface {}'
                    .format(iface), hookenv.WARNING)
        return ""
    link_addresses = netifaces.ifaddresses(iface)[netifaces.AF_LINK]
    return link_addresses[0]['addr']
def get_dest_mac(iface, address):
    """
    Return the MAC address of a neighbour.

    Runs ``ip -j neigh show dev <iface> to <address>`` and returns the
    ``lladdr`` of the first entry.

    :param iface: local interface name
    :type iface: str
    :param address: neighbour ip address
    :type address: str
    :returns: the link-layer address, or "" when there is no entry, the
              entry carries no ``lladdr`` or the output cannot be parsed
    :rtype: str
    """
    args = [
        "ip",
        "-j",
        "neigh",
        "show",
        "dev",
        iface,
        "to",
        address,
    ]
    iproute = subprocess.run(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    try:
        output = json.loads(iproute.stdout.decode())
    except ValueError:
        # On failure ip reports on stderr and stdout is not JSON.
        hookenv.log(
            'Unable to get neighbour {} on {}: {}'.format(
                address, iface, iproute.stderr.decode().strip()),
            hookenv.WARNING)
        return ""
    # if there is no matched address, iproute returns an empty list
    if output:
        # an entry may exist without a resolved link-layer address
        return output[0].get('lladdr', "")
    return ""
def check_nodes(nodes, is_leader=False):
    """
    Run the configured network checks and set the unit status.

    Depending on the charm config this checks LLDP port descriptions,
    bonds, iperf speed and MTU, the local hostname, ICMP reachability and
    DNS. The fragments are concatenated into one status message; the
    workload is 'blocked' when that message contains 'failed', 'active'
    otherwise.

    :param nodes: peers to check, entries are indexed as
                  (unit name, address) by the helpers called here
    :param is_leader: whether this unit acts as the iperf client
    :type is_leader: bool
    :returns: the 'icmp' and 'dns' status fragments
    :rtype: dict
    """
    iperf_client = is_leader
    cfg = hookenv.config()
    local_ip = hookenv.network_get("magpie")['ingress-addresses'][0]
    iface_lines = subprocess.check_output(["ip", "route", "show", "to",
                                           "match", local_ip]).decode()
    iface_lines = iface_lines.split('\n')
    # Stop at the first route line that is not "via" a gateway; ``line``
    # is deliberately used after the loop.
    for line in iface_lines:
        if re.match('.* via .*', line) is None:
            break
    primary_iface = str(line).split('dev')[1].split(' ')[1]
    iface_mtu = get_nic_mtu(primary_iface)
    link_speed = get_link_speed(primary_iface)
    required_mtu = cfg.get('required_mtu')
    min_speed = cfg.get('min_speed')
    msg = "MTU for iface: {} is {}".format(primary_iface, iface_mtu)
    hookenv.log(msg, 'INFO')

    port_status = ""
    lldp = None
    if cfg.get('use_lldp'):
        lldp = Lldp()
        lldp.enable()
        lldp.collect_data()
        if cfg.get('check_port_description'):
            port_status = "{}, ".format(check_port_description(lldp))

    # An unset option means "no bond check".
    cfg_check_bonds = cfg.get('check_bonds') or ''
    # Attempt to find the bond interfaces and create a comma-delimited
    # string out of them for use by the check_bonds() call below.
    all_bonds_path = "/sys/class/net/bonding_masters"
    if cfg_check_bonds.upper() == "AUTO":
        if os.path.exists(all_bonds_path):
            with open(all_bonds_path) as fos:
                all_bonds = fos.read()
            if all_bonds:
                cfg_check_bonds = all_bonds.replace(' ', ',')
        else:
            hookenv.log('No bond interfaces available.', 'DEBUG')
            cfg_check_bonds = ""
    bond_status = ""
    # Perform actual bond checking
    if cfg_check_bonds:
        bond_status = "{}, ".format(check_bonds(cfg_check_bonds, lldp))

    if cfg.get('check_iperf'):
        hookenv.log("Running iperf test", 'INFO')
        iperf = Iperf()
        if iperf_client:
            iperf_status = ", iperf leader, mtu: {}".format(iface_mtu)
            iperf.hostcheck(nodes, cfg.get('iperf_duration'))
        else:
            ping_mtu = required_mtu or iface_mtu
            mtu_ping_failures = check_ping(nodes, mtu=ping_mtu)
            speed = iperf.speed()
            if mtu_ping_failures:
                iperf_status = ", network mtu check failed"
            # Make space for 8 or 12 byte variable overhead (TCP options)
            elif 0 <= (int(iface_mtu) - int(ping_mtu)) <= 12:
                iperf_status = ", net mtu ok: {}".format(iface_mtu)
            else:
                iperf_status = (
                    ", net mtu failed, mismatch: {} packet vs {} "
                    "on iface {}".format(ping_mtu, iface_mtu, primary_iface)
                )
            if "failed" not in speed:
                iperf_status += status_for_speed_check(
                    min_speed, float(speed), link_speed)
            else:
                iperf_status = iperf_status + ", iperf speed check failed"
    else:
        iperf_status = ""

    local_mtu_result = check_local_mtu(required_mtu, iface_mtu)
    if local_mtu_result == 100:
        iperf_status = iperf_status + \
            ", local mtu ok, required: {}".format(required_mtu)
    elif local_mtu_result == 200:
        iperf_status = iperf_status + \
            ", local mtu failed, required: {}, iface: {}".format(
                required_mtu, iface_mtu)
    hookenv.log('doing other things after iperf', 'INFO')

    hostname_status = ''
    if cfg.get('check_local_hostname'):
        lookup_result = check_local_hostname()
        local_hostname = subprocess.check_output(
            'hostname', shell=True).decode('utf-8').rstrip()
        if lookup_result[0] == '':
            hostname_status = ', local hostname ok ({})'.format(
                local_hostname)
            hookenv.log('Local hostname lookup OK: {}'.format(
                str(hostname_status)), 'INFO')
        else:
            hostname_status = ', local hostname failed ({})'.format(
                local_hostname)
            hookenv.log('Local hostname lookup FAILED: {}'.format(
                str(hostname_status)), 'ERROR')

    if cfg.get('ping_mesh_mode') or is_leader:
        unreachable_nodes = check_ping(nodes)
        if unreachable_nodes:
            ping_errors_text = (
                '; '.join(str(x) for x in unreachable_nodes)
                if (
                    isinstance(unreachable_nodes, Iterable)
                    and not isinstance(unreachable_nodes, str)
                ) else unreachable_nodes
            )
            icmp_message = 'icmp failed: {}'.format(ping_errors_text)
        else:
            icmp_message = 'icmp ok'
    else:
        icmp_message = 'icmp skipped'

    if cfg.get('check_dns'):
        no_dns = check_dns(nodes)
        hookenv.log("Units with DNS problems: " + str(no_dns))
    else:
        no_dns = ([], [], [])
    no_rev, no_fwd, no_match = no_dns
    if not (no_rev or no_fwd or no_match):
        dns_status = ', dns ok'
    else:
        # Label every non-empty failure list so none is shown as a bare
        # Python list.
        dns_parts = []
        if no_match:
            dns_parts.append(', match dns failed: ' + str(no_match))
        if no_rev:
            dns_parts.append(', rev dns failed: ' + str(no_rev))
        if no_fwd:
            dns_parts.append(', fwd dns failed: ' + str(no_fwd))
        dns_status = ''.join(dns_parts)

    check_status = '{}{}{}{}{}{}'.format(
        port_status, bond_status, icmp_message,
        str(hostname_status), str(dns_status), str(iperf_status))

    if 'failed' in check_status:
        workload = 'blocked'
    else:
        workload = 'active'
    safe_status(workload, check_status)
    reactive_state = {'icmp': icmp_message, 'dns': dns_status}
    return reactive_state
async def async_check_ping(node, mtu):
    """
    Ping one node using the ping options from the charm config.

    :param node: entry whose index 0 is the unit name ("app/N") and whose
                 index 1 is the address
    :param mtu: optional packet size forwarded to ``ping``
    :returns: "" when every packet came back, otherwise
              "<unit id>: <received>/<transmitted> packets received"
    :rtype: str
    """
    cfg = hookenv.config()
    ping_timeout = cfg.get('ping_timeout')
    ping_tries = cfg.get('ping_tries')
    ping_interval = cfg.get('ping_interval')
    unit_id = node[0].split('/')[1]
    hookenv.log('Pinging unit_id: ' + str(unit_id), 'INFO')
    received, transmitted = await ping(
        node[1], ping_timeout, ping_tries, ping_interval, mtu=mtu)

    if transmitted <= 0 or received != transmitted:
        hookenv.log(
            f'Ping FAILED for unit_id: {unit_id}. '
            f'{transmitted} packets transmitted, {received} received',
            hookenv.ERROR
        )
        return f"{unit_id}: {received}/{transmitted} packets received"

    hookenv.log(
        f'Ping OK for unit_id: {unit_id}. '
        f'{transmitted} packets transmitted, {received} received',
        hookenv.INFO
    )
    return ""
def check_ping(nodes, mtu=None):
    """
    Ping all nodes concurrently and report the ones with packet loss.

    :param nodes: iterable of node entries passed to ``async_check_ping``
    :param mtu: optional packet size to send
    :type mtu: Union[str, int, None]
    :returns: one "<unit id>: <received>/<transmitted> packets received"
              message per node that did not answer every ping; empty when
              all nodes are reachable
    :rtype: List[str]
    """
    # get_event_loop() raises RuntimeError on newer Python versions when
    # no loop has been set; create and register one in that case.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    async def _ping_all():
        # gather() is awaited inside the loop that runs it.
        return await asyncio.gather(
            *[async_check_ping(node, mtu) for node in nodes]
        )

    outcomes = loop.run_until_complete(_ping_all())
    return [message for message in outcomes if message]
def check_dns(nodes):
    """
    Check reverse and forward DNS for every node.

    For each node the address is reverse resolved, then every returned
    name is forward resolved until one of them gives back the original
    address.

    :param nodes: iterable of entries whose index 0 is the unit name
                  ("app/N") and whose index 1 is the address or hostname
    :returns: three lists of unit ids: reverse lookup failed, forward
              lookup failed, forward result did not match the address
    :rtype: Tuple[List[str], List[str], List[str]]
    """
    cfg = hookenv.config()
    dns_server = cfg.get('dns_server')
    dns_tries = cfg.get('dns_tries')
    dns_time = cfg.get('dns_time')
    ipv4_pattern = r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$"
    norev = []
    nofwd = []
    nomatch = []
    hookenv.log("DNS (ALL NODES): {}".format(nodes))
    for node in nodes:
        ip = node[1]
        if not re.match(ipv4_pattern, ip):
            hookenv.log("private-address appears to be a hostname: {},"
                        " attempting forward lookup...".format(ip), 'WARN')
            ip = forward_dns(ip, dns_server, dns_tries, dns_time)[0]
        else:
            hookenv.log('private-address appears to be an IP', 'INFO')
        unit_id = node[0].split('/')[1]
        hookenv.log("Reverse lookup for ip: {}, node: {},"
                    " unit_id: {}".format(ip, node[0], unit_id), 'INFO')
        reverse, r_stderr = reverse_dns(ip, dns_server, dns_tries, dns_time)
        hookenv.log("Reverse result for unit_id: {}, hostname: {},"
                    " exitcode: {}".format(unit_id, str(reverse),
                                           str(r_stderr)))
        if r_stderr:
            hookenv.log("Reverse FAILED for"
                        " unit_id: {}".format(unit_id), 'ERROR')
            if unit_id not in norev:
                norev.append(unit_id)
            continue

        hookenv.log("Reverse OK for unit_id: {}".format(unit_id), 'INFO')
        if unit_id in norev:
            norev.remove(unit_id)
        hookenv.log("Forward lookup for hostname: {}, node: {},"
                    " unit_id: {}".format(str(reverse), node[0], unit_id),
                    'INFO')
        for rev in reverse.split():
            forward, f_stderr = forward_dns(rev, dns_server,
                                            dns_tries, dns_time)
            hookenv.log("Forward result for unit_id: {}, ip: {},"
                        " exitcode: {}".format(unit_id, forward,
                                               str(f_stderr)))
            if f_stderr:
                hookenv.log("Forward FAILED for"
                            " unit_id: {}".format(unit_id), 'ERROR')
                if unit_id not in nofwd:
                    nofwd.append(unit_id)
                continue

            hookenv.log("Forward OK for"
                        " unit_id: {}".format(unit_id), 'INFO')
            # A later name succeeding clears an earlier failure.
            if unit_id in nofwd:
                nofwd.remove(unit_id)
            forward_ips = forward.splitlines()
            if ip not in forward_ips:
                # Only say "can not resolve" when the answer holds no
                # address at all.
                if not any(re.match(ipv4_pattern, candidate)
                           for candidate in forward_ips):
                    forward = "Can not resolve hostname to IP {}"\
                        .format(repr(forward))
                hookenv.log("Original IP and Forward MATCH FAILED for"
                            " unit_id: {}, Original: {}, Forward: {}"
                            .format(unit_id, ip, forward), 'ERROR')
                if unit_id not in nomatch:
                    nomatch.append(unit_id)
            else:
                hookenv.log("Original IP and Forward MATCH OK for"
                            " unit_id: {}, Original: {}, Forward: {}"
                            .format(unit_id, ip, forward_ips),
                            'INFO')
                if unit_id in nomatch:
                    nomatch.remove(unit_id)
                # One matching name is enough for this node.
                break
    return norev, nofwd, nomatch
def _execute_dig(cmd, dns_server, tries, timeout, lookup_type):
    """
    Run a dig command line and parse its YAML output.

    :param cmd: complete dig command line, executed through the shell
    :param dns_server: forwarded to ``parse_dig_yaml``
    :param tries: forwarded to ``parse_dig_yaml``
    :param timeout: forwarded to ``parse_dig_yaml``
    :param lookup_type: label used in messages; 'reverse' marks the query
                        as a reverse query for the parser
    :returns: ``(result, status)`` where status is 0 on success
    """
    try:
        raw_output = subprocess.check_output(cmd, shell=True)
        result, stderr = parse_dig_yaml(
            raw_output.decode('utf-8').rstrip(),
            dns_server,
            tries,
            timeout,
            is_reverse_query=(lookup_type == 'reverse'),
        )
    except subprocess.CalledProcessError as exc:
        stderr = exc.returncode
        result = lookup_type.title() + " DNS lookup error: " + str(exc.output)
    if result == '':
        # An empty answer counts as a failure.
        return 'No {} response'.format(lookup_type), 1
    return result, stderr
def resolve_cname(label, dns_server, tries, timeout, rec_type):
    """
    Resolve the target of a CNAME record with dig.

    :param label: name to look up; it comes from a DNS answer, so it is
                  shell-quoted before being placed on the command line
    :type label: str
    :param dns_server: optional server to query
    :param tries: value for dig ``+tries``
    :param timeout: value for dig ``+time``
    :param rec_type: optional record type to request
    :returns: ``(result, status)`` as returned by ``_execute_dig``
    """
    cmd = '/usr/bin/dig {} +yaml +tries={} +time={}'.format(
        shlex.quote(str(label)),
        tries,
        timeout,
    )
    if rec_type:
        cmd += ' ' + shlex.quote(str(rec_type))
    if dns_server:
        cmd = '{} @{}'.format(cmd, shlex.quote(str(dns_server)))
    return _execute_dig(cmd, dns_server, tries, timeout, 'cname')
def parse_dig_yaml(output, dns_server, tries, timeout, is_reverse_query=False):
    """
    Extract the answers from the YAML output of dig.

    PTR and CNAME values lose their trailing dot. CNAME answers are
    resolved further through ``resolve_cname``.

    :param output: YAML text printed by ``dig +yaml``
    :type output: str
    :param dns_server: forwarded to ``resolve_cname``
    :param tries: forwarded to ``resolve_cname``
    :param timeout: forwarded to ``resolve_cname``
    :param is_reverse_query: request PTR records when following a CNAME
    :returns: ``(result, status)``; result holds one answer per line and
              is '' when there is none; status is 0 on success, 2 when the
              output is not valid YAML, or the status of a failed CNAME
              resolution
    """
    try:
        responses = yaml.safe_load(output)
    except yaml.YAMLError:
        result = f"Cannot parse {output.encode('utf-8')} as YAML"
        stderr = 2
        return result, stderr

    records = []
    stderr = 0
    # Empty output loads as None: treat it as "no response".
    for response in responses or []:
        if response['type'] != 'MESSAGE':
            continue
        response_data = response['message']['response_message_data']
        for answer in response_data.get('ANSWER_SECTION', []):
            split_answer = answer.split(' ')
            rec_type = split_answer[3]
            rrdata = ' '.join(split_answer[4:])
            if rec_type in ('PTR', 'CNAME') and rrdata.endswith('.'):
                rrdata = rrdata[:-1]
            if rec_type == 'CNAME':
                cname_result, stderr = resolve_cname(
                    rrdata,
                    dns_server,
                    tries,
                    timeout,
                    'PTR' if is_reverse_query else '',
                )
                if stderr != 0:
                    return cname_result, stderr
                records.append(cname_result)
            else:
                records.append(rrdata)
    return '\n'.join(records), stderr
def reverse_dns(input, dns_server, tries, timeout):
    """
    Reverse resolve an address with ``dig -x``.

    :param input: address to look up; shell-quoted before use
    :type input: str
    :param dns_server: optional server to query
    :param tries: value for dig ``+tries``
    :param timeout: value for dig ``+time``
    :returns: ``(result, status)`` as returned by ``_execute_dig``
    """
    cmd = '/usr/bin/dig -x {} +yaml +tries={} +time={}'.format(
        shlex.quote(str(input)), tries, timeout)
    if dns_server:
        cmd = '{} @{}'.format(cmd, shlex.quote(str(dns_server)))
    hookenv.log('DNS Reverse command: {}'.format(cmd), 'DEBUG')
    return _execute_dig(cmd, dns_server, tries, timeout, 'reverse')
def forward_dns(input, dns_server, tries, timeout):
    """
    Forward resolve a name with dig.

    :param input: name to look up; it may come from a PTR answer, so it is
                  shell-quoted before being placed on the command line
    :type input: str
    :param dns_server: optional server to query
    :param tries: value for dig ``+tries``
    :param timeout: value for dig ``+time``
    :returns: ``(result, status)`` as returned by ``_execute_dig``
    """
    cmd = '/usr/bin/dig {} +yaml +tries={} +time={}'.format(
        shlex.quote(str(input)), tries, timeout)
    if dns_server:
        cmd = '{} @{}'.format(cmd, shlex.quote(str(dns_server)))
    hookenv.log('DNS Forward command: {}'.format(cmd), 'DEBUG')
    return _execute_dig(cmd, dns_server, tries, timeout, 'forward')