From d5e4c60d44d3f010e37a40994dd9d001409f9678 Mon Sep 17 00:00:00 2001
From: Federico Ceratto <federico.ceratto@hpe.com>
Date: Thu, 10 Mar 2016 15:53:55 +0000
Subject: [PATCH] Integration/stress test using a simulated network

Setup a devstack host
Setup simulated lossy network and run benchmarks

Change-Id: Icf492ab523e218af28ff0558e765bedffef9e623
---
 devstack/networking_test.py            | 663 +++++++++++++++++++++++++
 devstack/networking_test_monitor_tc.sh |   3 +
 2 files changed, 666 insertions(+)
 create mode 100755 devstack/networking_test.py
 create mode 100755 devstack/networking_test_monitor_tc.sh

diff --git a/devstack/networking_test.py b/devstack/networking_test.py
new file mode 100755
index 000000000..49ef76fb7
--- /dev/null
+++ b/devstack/networking_test.py
@@ -0,0 +1,663 @@
+#!/usr/bin/env python
+# Copyright 2016 Hewlett Packard Enterprise Development Company LP
+#
+# Author: Federico Ceratto <federico.ceratto@hpe.com>
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""
+    Network simulator
+    ~~~~~~~~~~~~~~~~~
+    Perform end-to-end stress tests on Designate on a simulated network
+    that displays high latency and packet loss (almost like real ones)
+
+    WARNING: this script is to be run on a disposable devstack VM
+    It requires sudo and it will configure /sbin/tc
+
+    Usage:
+    cd <designate_repo>/contrib/vagrant
+    ./setup_ubuntu_devstack
+    vagrant ssh ubuntu
+    source ~/devstack/openrc
+    /opt/stack/designate/devstack/networking_test.py
+    Monitor the logfiles
+"""
+
+from argparse import ArgumentParser
+from collections import OrderedDict
+from itertools import product
+from subprocess import check_output
+from subprocess import CalledProcessError
+from tempfile import NamedTemporaryFile
+from threading import Thread
+import json
+import logging
+import os
+import random
+import string
+import time
+
+import dns
+import dns.resolver
+
+log = logging.getLogger()  # root logger
+
+tc_path = '/sbin/tc'  # run through sudo by TrafficControl.run_tc
+sudo_path = '/usr/bin/sudo'
+iptables_restore_path = '/sbin/iptables-restore'  # also run through sudo
+designate_cli_path = '/usr/local/bin/designate'  # quota-update only
+openstack_cli = 'openstack'  # no absolute path: resolved by bash
+
+
+def gen_random_name(l):
+    return "".join(
+        random.choice(string.ascii_lowercase + string.digits)
+        for n in range(l)
+    )
+
+
+def parse_args():
+    ap = ArgumentParser()
+    ap.add_argument('-d', '--debug', action='store_true')
+    return ap.parse_args()
+
+
+def run_shell(cmd, env=None):
+    log.debug("  running %s" % cmd)
+    out = check_output(cmd, env=env, shell=True, executable='/bin/bash')
+    return [line.rstrip() for line in out.splitlines()]
+
+
+class DesignateCLI(object):
+    """Designate CLI runner
+    """
+
+    def __init__(self):
+        """Setup CLI handler"""
+        self._cli_env = {}
+        for k, v in sorted(os.environ.items()):
+            if k.startswith('OS_'):
+                log.debug("%s: %s", k, v)
+                self._cli_env[k] = v
+
+    def setup_quota(self, quota):
+        """Setup quota
+        """
+        user_id = self.run_json("token issue")["user_id"]
+
+        cmd = """quota-update
+        --domains %(quota)d
+        --domain-recordsets %(quota)d
+        --recordset-records %(quota)d
+        --domain-records %(quota)d
+        %(user_id)s """
+        cmd = ' '.join(cmd.split())
+        quotas = self.run_designate_cli_table(cmd % dict(quota=quota,
+                                                         user_id=user_id))
+        assert quotas['domain_records'] == str(quota)
+
+    def run(self, cmd):
+        """Run a openstack client command
+        """
+        return run_shell("%s %s" % (openstack_cli, cmd),
+                         env=self._cli_env)
+
+    def run_json(self, cmd):
+        """Run a openstack client command using JSON output
+
+        :returns: dict
+        :raises: CalledProcessError
+        """
+        cmd = "%s %s -f json" % (openstack_cli, cmd)
+        log.debug("  running %s" % cmd)
+        out = check_output(cmd, env=self._cli_env, shell=True,
+                           executable='/bin/bash')
+        return json.loads(out)
+
+    def runcsv(self, cmd):
+        """Run a command using the -f csv flag, parse the output
+        and return a list of dicts
+        """
+        cmdout = self.run(cmd + " -f csv")
+        header = [item.strip('"') for item in cmdout[0].split(',')]
+        output_rows = []
+        for line in cmdout[1:]:
+            rawvalues = line.split(',')
+            d = OrderedDict()
+            for k, v in zip(header, rawvalues):
+                if v.startswith('"') or v.endswith('"'):
+                    v = v.strip('"')
+                else:
+                    try:
+                        v = int(v)
+                    except ValueError:
+                        v = float(v)
+
+                d[k] = v
+
+            output_rows.append(d)
+
+        return output_rows
+
+    def run_designate_cli_table(self, cmd):
+        """Run a command in the designate cli expecting a table to be
+        returned and parse it into a dict
+        """
+        cmdout = run_shell("%s %s" % (designate_cli_path, cmd),
+                           env=self._cli_env)
+        out = {}
+        try:
+            for line in cmdout:
+                if not line.startswith('| '):
+                    continue
+                if not line.endswith(' |'):
+                    continue
+                k = line.split('|')[1].strip()
+                v = line.split('|')[2].strip()
+                out[k] = v
+        except Exception:
+            log.error("Unable to parse output into a dict:")
+            for line in out:
+                log.error(line)
+            log.error("-----------------------------------")
+            raise
+
+        return out
+
+
+class TrafficControl(object):
+    """Configure Linux Traffic Control to simulate a real network
+    """
+
+    protocol_marks = dict(
+        mysql=1,
+        dns_udp=2,
+        dns_tcp=3,
+    )
+
+    def run_tc(self, cmd):
+        return run_shell("%s %s %s" % (sudo_path, tc_path, cmd))
+
+    def _apply_iptables_conf(self, ipt_conf):
+        tf = NamedTemporaryFile()
+        tf.file.write(ipt_conf)
+        tf.file.flush()
+        run_shell("%s %s %s" % (sudo_path, iptables_restore_path, tf.name))
+        tf.file.close()
+
+    def cleanup_iptables_marking(self):
+        # Currently unneeded
+        ipt_conf = """
+*filter
+:INPUT ACCEPT [0:0]
+:FORWARD ACCEPT [0:0]
+:OUTPUT ACCEPT [0:0]
+COMMIT
+*mangle
+:PREROUTING ACCEPT [0:0]
+:INPUT ACCEPT [0:0]
+:FORWARD ACCEPT [0:0]
+:OUTPUT ACCEPT [0:0]
+:POSTROUTING ACCEPT [0:0]
+COMMIT
+"""
+        self._apply_iptables_conf(ipt_conf)
+
+    def setup_iptables_marking(self):
+        # Currently unneeded
+        ipt_conf = """
+*filter
+:INPUT ACCEPT [0:0]
+:FORWARD ACCEPT [0:0]
+:OUTPUT ACCEPT [0:0]
+COMMIT
+*mangle
+:PREROUTING ACCEPT [0:0]
+:INPUT ACCEPT [0:0]
+:FORWARD ACCEPT [0:0]
+:OUTPUT ACCEPT [0:0]
+:POSTROUTING ACCEPT [0:0]
+-A PREROUTING -i lo -p tcp -m tcp --dport 3306 -j MARK --set-xmark %(mysql)s
+-A PREROUTING -i lo -p tcp -m tcp --sport 3306 -j MARK --set-xmark %(mysql)s
+-A PREROUTING -i lo -p tcp -m tcp --dport 53 -j MARK --set-xmark %(dns_tcp)s
+-A PREROUTING -i lo -p tcp -m tcp --sport 53 -j MARK --set-xmark %(dns_tcp)s
+-A PREROUTING -i lo -p udp -m udp --dport 53 -j MARK --set-xmark %(dns_udp)s
+-A PREROUTING -i lo -p udp -m udp --sport 53 -j MARK --set-xmark %(dns_udp)s
+COMMIT
+"""
+        marks = dict((k, "0x%d/0xffffffff" % v)
+                     for k, v in self.protocol_marks.iteritems())
+        ipt_conf = ipt_conf % marks
+        self._apply_iptables_conf(ipt_conf)
+
+    def cleanup_tc(self):
+        """Clean up tc conf
+        """
+        out = self.run_tc('qdisc show dev lo')
+        if out:
+            log.debug("Cleaning up tc conf")
+            self.run_tc('qdisc del dev lo root')
+        else:
+            log.debug("No tc conf to be cleaned up")
+
+    def setup_tc(self, dns_latency_ms=0, dns_packet_loss_perc=0,
+                 db_latency_ms=1, db_packet_loss_perc=1):
+        """Setup traffic control
+        """
+        self.cleanup_tc()
+
+        # Create HTB at the root
+        self.run_tc("qdisc add dev lo handle 1: root htb")
+
+        self.run_tc("class add dev lo parent 1: classid 1:5 htb rate 1000Mbps")
+        self.run_tc("class add dev lo parent 1: classid 1:7 htb rate 1000Mbps")
+
+        # TCP DNS
+        self._setup_tc_block('1:8', 'tcp', 53, dns_latency_ms,
+                             dns_packet_loss_perc)
+        # UDP DNS
+        self._setup_tc_block('1:9', 'udp', 53, dns_latency_ms,
+                             dns_packet_loss_perc)
+        # TCP mDNS
+        self._setup_tc_block('1:10', 'tcp', 5354, dns_latency_ms,
+                             dns_packet_loss_perc)
+        # UDP mDNS
+        self._setup_tc_block('1:11', 'udp', 5354, dns_latency_ms,
+                             dns_packet_loss_perc)
+        # MySQL
+        self._setup_tc_block('1:12', 'tcp', 3306, 1, 1)
+
+        # RabbitMQ port: 5672
+        self._setup_tc_block('1:13', 'tcp', 5672, 1, 1)
+
+        # MemcacheD
+        self._setup_tc_block('1:14', 'tcp', 11211, 1, 1)
+
+    def _setup_tc_block(self, class_id, proto, port, latency_ms,
+                        packet_loss_perc):
+        """Setup tc htb entry, netem and filter"""
+        assert proto in ('tcp', 'udp')
+        cmd = "class add dev lo parent 1: classid %s htb rate 1000Mbps" % \
+            class_id
+        self.run_tc(cmd)
+        self._setup_netem(class_id, latency_ms, latency_ms, packet_loss_perc)
+        self._setup_filter(proto, 'sport %d' % port, class_id)
+        self._setup_filter(proto, 'dport %d' % port, class_id)
+
+    def _setup_netem(self, classid, latency1, latency2, loss_perc):
+        """Setup tc netem
+        """
+        # This could be done with the FireQOS tool instead:
+        # https://firehol.org/tutorial/fireqos-new-user/
+        cmd = ("qdisc add dev lo parent {cid} netem"
+               " corrupt 0.1%"
+               " delay {lat1}ms {lat2}ms distribution normal"
+               " duplicate 0.1%"
+               " loss {packet_loss_perc}%"
+               " reorder 25% 50%")
+        cmd = cmd.format(cid=classid, lat1=latency1, lat2=latency2,
+                         packet_loss_perc=loss_perc)
+        self.run_tc(cmd)
+
+    def _setup_filter(self, protocol, filter, flowid):
+        """Setup tc filter
+        """
+        protocol_nums = dict(tcp=6, udp=17)
+        pnum = protocol_nums[protocol]
+        cmd = "filter add dev lo protocol ip prio 1 u32 match ip protocol " \
+            "%(pnum)d 0xff match ip %(filter)s 0xffff flowid %(flowid)s"
+
+        self.run_tc(cmd % dict(pnum=pnum, filter=filter, flowid=flowid))
+
+
+class Digger(object):  # polls the nameserver until recorded goals are met
+    def __init__(self):
+        self.ns_ipaddr = self.get_nameserver_ipaddr()
+        self._setup_resolver()
+        self.max_probes_per_second = 30  # paces the sleeps in the prober
+        self.reset_goals()
+
+    @property
+    def prober_is_running(self):
+        try:
+            return self._prober_thread.is_alive()
+        except AttributeError:  # no prober thread has been created yet
+            return False
+
+    def _setup_resolver(self, timeout=1):
+        resolver = dns.resolver.Resolver(configure=False)
+        resolver.timeout = timeout
+        resolver.lifetime = timeout
+        resolver.nameservers = [self.ns_ipaddr]  # query only this server
+        self.resolver = resolver
+
+    def get_nameserver_ipaddr(self):
+        # FIXME: find a better way to do this
+        out = run_shell('sudo netstat -nlpt | grep pdns_server')
+        ipaddr = out[0].split()[3]  # 4th column of the first matching line
+        ipaddr = ipaddr.split(':', 1)[0]  # keep the text before the first ':'
+        log.debug("Resolver ipaddr: %s" % ipaddr)
+        return ipaddr
+
+    def query_a_record(self, record_name, timeout=3):  # timeout is unused
+        try:
+            answer = self.resolver.query(record_name, 'A')
+            if answer.rrset:
+                return answer.rrset[0].address
+        except Exception:  # any failure is reported as None
+            return None
+
+    def query_soa(self, zone_name, timeout=3):  # timeout is unused
+        try:
+            soa_answer = self.resolver.query(zone_name, 'SOA')
+            soa_serial = soa_answer[0].serial
+            return soa_serial
+        except Exception:  # any failure is reported as None
+            return None
+
+    def reset_goals(self):
+        assert not self.prober_is_running  # only reset while idle
+        self.goals = set()  # tuples: (type, name, expected value, time added)
+        self.summary = dict(
+            success_cnt=0,
+            total_time_to_success=0,
+        )
+
+    def add_goal(self, goal):
+        self.goals.add(goal + (time.time(), ))  # append the creation time
+
+    def _print_summary(self, final=True):
+        """Log a summary of the current run: at info level when `final`,
+        at debug level otherwise"""
+        remaining = len(self.goals)
+        success_cnt = self.summary['success_cnt']
+        try:
+            avg_t = (self.summary['total_time_to_success'] / success_cnt)
+            avg_t = ", avg time to success: %2.3fs" % avg_t
+        except ZeroDivisionError:  # no goal has been reached yet
+            avg_t = ''
+
+        logf = log.info if final else log.debug
+        logf("  test summary: success %3d, remaining %3d %s" % (
+            success_cnt, remaining, avg_t))
+
+    def _probe_resolver(self):
+        """Thread body: query the resolver for every pending goal, discard
+        the goals that are met, until allowed to stop or timed out"""
+        log.debug("Starting prober")
+        assert self.prober_is_running is True
+        self._progress_report_time = 0  # so that the first pass reports
+        now = time.time()
+        while (self.goals or not self.prober_can_stop) and \
+                now < self.prober_timeout_time:
+
+            for goal in tuple(self.goals):  # snapshot: the set changes below
+                goal_type = goal[0]
+                if goal_type == 'zone_serial_ge':
+                    goal_type, zone_name, serial, t0 = goal
+                    actual_serial = self.query_soa(zone_name)
+                    if actual_serial and actual_serial >= serial:
+                        deltat = time.time() - t0
+                        log.debug("  reached %s in %.3fs" % (repr(goal),
+                                                             deltat))
+                        self.goals.discard(goal)
+                        self.summary['success_cnt'] += 1
+                        self.summary['total_time_to_success'] += deltat
+
+                elif goal_type == 'record_a':
+                    goal_type, record_name, ipaddr, t0 = goal
+                    actual_ipaddr = self.query_a_record(record_name)
+                    if actual_ipaddr == ipaddr:
+                        deltat = time.time() - t0
+                        log.debug("  reached %s in %.3fs" % (repr(goal),
+                                                             deltat))
+                        self.goals.discard(goal)
+                        self.summary['success_cnt'] += 1
+                        self.summary['total_time_to_success'] += deltat
+
+                else:
+                    log.error("Unknown goal %r" % goal)
+
+                if time.time() < self.prober_timeout_time:
+                    time.sleep(1.0 / self.max_probes_per_second)
+                else:
+                    break  # timed out: leave the for loop
+
+                if time.time() > self._progress_report_time:
+                    self._print_summary(final=False)
+                    self._progress_report_time = time.time() + 10
+
+            time.sleep(1.0 / self.max_probes_per_second)
+            now = time.time()
+
+        if now > self.prober_timeout_time:  # also true after stop_prober()
+            log.info("prober timed out after %d s" % (
+                now - self.prober_start_time))
+
+        self._print_summary()
+
+    def probe_resolver(self, timeout=600):
+        """Probe the local resolver in a dedicated thread until all
+        goals have been achieved or timeout occurs
+        """
+        assert not self.prober_is_running
+        self.prober_can_stop = False
+        self.prober_start_time = time.time()
+        self.prober_timeout_time = self.prober_start_time + timeout
+        self._prober_thread = Thread(target=self._probe_resolver)
+        self._prober_thread.daemon = True
+        self._prober_thread.start()
+
+    def stop_prober(self):
+        self.prober_can_stop = True
+        self.prober_timeout_time = 0  # makes the prober timeout checks fail
+
+    def wait_on_prober(self):
+        self.prober_can_stop = True  # prober may exit once goals is empty
+        self._prober_thread.join()
+        assert self.prober_is_running is False
+
+
+def list_zones(cli):
+    zones = [z["name"] for z in cli.run_json('zone list')]
+    log.debug("Found zones: %r", zones)
+    return zones
+
+
+def delete_zone_by_name(cli, zn, ignore_missing=False):
+    if ignore_missing:
+        # Return if the zone is not present
+        zones = list_zones(cli)
+        if zn not in zones:
+            return
+
+    cli.run('zone delete %s' % zn)
+
+
+def create_and_probe_a_record(cli, digger, zone_id, record_name, ipaddr):
+    cli.run_json('recordset create %s %s --type A --records %s' %
+                 (zone_id, record_name, ipaddr))
+    digger.add_goal(('record_a', record_name, ipaddr))
+
+
+def delete_all_zones(cli):
+    zones = list_zones(cli)
+    log.info("%d zones to be deleted" % len(zones))
+    for zone in zones:
+        log.info("Deleting %s", zone)
+        delete_zone_by_name(cli, zone)
+
+
+def create_zone_with_retry_on_duplicate(cli, digger, zn, timeout=300,
+                                        dig=False):
+    """Create a zone, retry when a duplicate is found,
+    optionally monitor for propagation
+
+    :returns: dict
+    """
+    t0 = time.time()
+    timeout_time = timeout + t0
+    created = False
+    while time.time() < timeout_time:
+        try:
+            output = cli.run_json(
+                "zone create %s --email devstack@example.org" % zn)
+            created = True
+            log.debug("  zone created after %f" % (time.time() - t0))
+            break
+
+        except CalledProcessError as e:
+            if e.output == 'Duplicate Zone':
+                # dup zone, sleep and retry
+                time.sleep(1)
+                pass
+
+            elif e.output == 'over_quota':
+                raise RuntimeError('over_quota')
+
+            else:
+                raise
+
+    assert output['serial']
+
+    if not created:
+        raise RuntimeError('timeout')
+
+    if dig:
+        digger.reset_goals()
+        digger.add_goal(('zone_serial_ge', zn, int(output['serial'])))
+        digger.probe_resolver(timeout=timeout)
+        digger.wait_on_prober()
+
+    return output
+
+
+def test_create_list_delete_loop(cli, digger, cycles_num, zn='cld.org.'):
+    """Create, list, delete a zone in a loop
+    Monitor for propagation time
+    """
+    log.info("Test zone creation, list, deletion")
+    delete_zone_by_name(cli, zn, ignore_missing=True)
+
+    for cycle_cnt in range(cycles_num):
+        zone = create_zone_with_retry_on_duplicate(cli, digger, zn, dig=True)
+
+        zones = cli.runcsv('domain-list')
+        assert any(z['name'] == zn for z in zones), zones
+
+        cli.run('domain-delete %s' % zone['id'])
+
+        zones = cli.runcsv('domain-list')
+        assert not any(z['name'] == zn for z in zones), zones
+
+    log.info("done")
+
+
+def test_one_big_zone(cli, digger, zone_size):
+    """Create a zone with many records,
+    perform CRUD on records and monitor for propagation time
+    """
+    t0 = time.time()
+    zn = 'bigzone-%s.org.' % gen_random_name(12)
+    delete_zone_by_name(cli, zn, ignore_missing=True)
+    zone = create_zone_with_retry_on_duplicate(cli, digger, zn, dig=True)
+    assert 'serial' in zone, zone
+    assert 'id' in zone, zone
+    try:
+        digger.reset_goals()
+        digger.add_goal(('zone_serial_ge', zn, int(zone['serial'])))
+        digger.probe_resolver(timeout=60)
+
+        record_creation_threads = []
+        for record_num in range(zone_size):
+            record_name = "rec%d" % record_num
+            ipaddr = "127.%d.%d.%d" % (
+                (record_num >> 16) % 256,
+                (record_num >> 8) % 256,
+                record_num % 256,
+            )
+            t = Thread(target=create_and_probe_a_record,
+                       args=(cli, digger, zone['id'], record_name, ipaddr))
+            t.start()
+            record_creation_threads.append(t)
+            time.sleep(.5)
+
+        digger.wait_on_prober()
+
+    except KeyboardInterrupt:
+        log.info("Exiting on keyboard")
+        raise
+
+    finally:
+        digger.stop_prober()
+        delete_zone_by_name(cli, zone['name'])
+        log.info("Done in %ds" % (time.time() - t0))
+
+
+def test_servers_are_configured(cli):
+    servers = cli.runcsv('server-list')
+    assert servers[0]['name'] == 'ns1.devstack.org.'
+    log.info("done")
+
+
+def test_big_zone(args, cli, digger, tc):
+    log.info("Test creating many records in one big zone")
+
+    dns_latencies_ms = (1, 100)
+    dns_packet_losses = (1, 15)
+    zone_size = 20
+
+    for dns_latency_ms, dns_packet_loss_perc in product(dns_latencies_ms,
+                                                        dns_packet_losses):
+        tc.cleanup_tc()
+        tc.setup_tc(dns_latency_ms=dns_latency_ms,
+                    dns_packet_loss_perc=dns_packet_loss_perc)
+        log.info("Running test with DNS latency %dms packet loss %d%%" % (
+                    dns_latency_ms, dns_packet_loss_perc))
+        test_one_big_zone(cli, digger, zone_size)
+
+
+def run_tests(args, cli, digger, tc):
+    """Run the enabled integration tests (currently only test_big_zone)
+    """
+    # Disabled: test_servers_are_configured(cli)
+    # Disabled: test_create_list_delete_loop(cli, digger, 10)
+    test_big_zone(args, cli, digger, tc)
+
+
+def main():
+    args = parse_args()
+    loglevel = logging.DEBUG if args.debug else logging.INFO
+    logging.basicConfig(
+        level=loglevel,
+        format='%(relativeCreated)8d %(levelname)s %(funcName)20s %(message)s',
+    )
+
+    cli = DesignateCLI()
+    cli.setup_quota(10000)
+
+    digger = Digger()
+
+    delete_all_zones(cli)
+
+    tc = TrafficControl()
+    tc.cleanup_tc()
+
+    try:
+        run_tests(args, cli, digger, tc)
+    finally:
+        tc.cleanup_tc()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/devstack/networking_test_monitor_tc.sh b/devstack/networking_test_monitor_tc.sh
new file mode 100755
index 000000000..d32f8d019
--- /dev/null
+++ b/devstack/networking_test_monitor_tc.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+dev=lo  # loopback: the device whose tc state is displayed every second
+watch --interval 1 "tc -p -s -d  qdisc show dev ${dev}; echo; tc class show dev ${dev}; echo; tc filter show dev ${dev}"