Browse code

Import the ssd tool in libnetwork

Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>

Flavio Crisciani authored on 2018/02/08 02:20:55
Showing 3 changed files
1 1
new file mode 100755
... ...
@@ -0,0 +1,34 @@
0
# Image for the ssd troubleshooting tool: Alpine with Python 2, docker-py and
# the networking utilities installed below.
FROM alpine:3.7
ENV PACKAGES="\
    musl \
    linux-headers \
    build-base \
    util-linux \
    bash \
    git \
    ca-certificates \
    python2 \
    python2-dev \
    py-setuptools \
    iproute2 \
    curl \
    strace \
    drill \
    ipvsadm \
    iperf \
    ethtool \
"

# Install the packages, then make sure the unversioned python, python-config,
# easy_install and pip names exist (each symlink is created only if missing).
RUN echo \
    && apk add --no-cache $PACKAGES \
    && if [[ ! -e /usr/bin/python ]];        then ln -sf /usr/bin/python2.7 /usr/bin/python; fi \
    && if [[ ! -e /usr/bin/python-config ]]; then ln -sf /usr/bin/python2.7-config /usr/bin/python-config; fi \
    && if [[ ! -e /usr/bin/easy_install ]];  then ln -sf /usr/bin/easy_install-2.7 /usr/bin/easy_install; fi \
    && easy_install pip \
    && pip install --upgrade pip \
    && if [[ ! -e /usr/bin/pip ]]; then ln -sf /usr/bin/pip2.7 /usr/bin/pip; fi \
    && echo

# Fetch docker-py over HTTPS: the unauthenticated git:// transport is no longer
# served by GitHub. Installed before the script is copied so that editing
# ssd.py does not invalidate this layer.
RUN pip install git+https://github.com/docker/docker-py.git

# COPY is sufficient for a plain local file; ADD's extra behaviors are unused.
COPY ssd.py /
ENTRYPOINT [ "python", "/ssd.py"]
0 34
new file mode 100755
... ...
@@ -0,0 +1,47 @@
0
+# Docker Swarm Service Driller (ssd)
1
+
2
+ssd is a troubleshooting utility for Docker swarm networks. 
3
+
4
+### control-plane and datapath consistency check on a node
5
+ssd checks for the consistency between docker network control-plane (from the docker daemon in-memory state) and kernel data path programming. Currently the tool checks only for the consistency of the Load balancer (implemented using IPVS).
6
+
7
+In a three-node swarm cluster, the ssd status for an overlay network `ov2` that has three services running, each replicated to 3 instances, looks like this:
8
+
9
+````bash
10
+vagrant@net-1:~/code/go/src/github.com/docker/docker-e2e/tests$ docker run -v /var/run/docker.sock:/var/run/docker.sock -v /var/run/docker/netns:/var/run/docker/netns --privileged --net=host sanimej/ssd ov2
11
+Verifying LB programming for containers on network ov2
12
+Verifying container /s2.3.ltrdwef0iqf90rqauw3ehcs56...
13
+service s2... OK
14
+service s3... OK
15
+service s1... OK
16
+Verifying container /s3.3.nyhwvdvnocb4wftyhb8dr4fj8...
17
+service s2... OK
18
+service s3... OK
19
+service s1... OK
20
+Verifying container /s1.3.wwx5tuxhnvoz5vrb8ohphby0r...
21
+service s2... OK
22
+service s3... OK
23
+service s1... OK
24
+Verifying LB programming for containers on network ingress
25
+Verifying container Ingress...
26
+service web... OK
27
+````
28
+
29
+ssd checks the iptables programming required to direct an incoming packet addressed to `<host ip>:<published port>` to the right `<backend ip>:<target port>`.
30
+
31
+### control-plane consistency check across nodes in a cluster
32
+
33
+Docker networking uses a gossip protocol to synchronize networking state across nodes in a cluster. ssd's `gossip-consistency` command verifies whether the state maintained by all the nodes is consistent.
34
+
35
+In a three-node cluster with services running on an overlay network `ov2`, ssd's `gossip-consistency` command shows:
36
+````bash
37
+
38
+vagrant@net-1:~/code/go/src/github.com/docker/docker-e2e/tests$ docker run -v /var/run/docker.sock:/var/run/docker.sock -v /var/run/docker/netns:/var/run/docker/netns --privileged sanimej/ssd ov2 gossip-consistency
39
+Node id: sjfp0ca8f43rvnab6v7f21gq0 gossip hash c57d89094dbb574a37930393278dc282
40
+
41
+Node id: bg228r3q9095grj4wxkqs80oe gossip hash c57d89094dbb574a37930393278dc282
42
+
43
+Node id: 6jylcraipcv2pxdricqe77j5q gossip hash c57d89094dbb574a37930393278dc282
44
+````
45
+
46
+This is the hash digest of the control-plane state for the network `ov2` from each of the cluster nodes. If the values do not match, running `docker network inspect --verbose` on the individual nodes can help identify the specific difference.
0 47
new file mode 100755
... ...
@@ -0,0 +1,180 @@
0
+#!/usr/bin/python
1
+
2
+import sys, signal, time
3
+import docker
4
+import re
5
+import subprocess
6
+import json
7
+import hashlib
8
+
9
# Matches a dotted-quad IPv4 address with each octet limited to 0-255.
# The separators are escaped so only a literal "." is accepted; an unescaped
# "." would match any character and let plain digit runs pass as addresses.
ipv4match = re.compile(
    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])\.' +
    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])\.' +
    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])\.' +
    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])'
)
15
+
16
def check_iptables(name, plist):
    """Check the host DNAT rules for the published ports of a service.

    For every published port, verify that the nat table's DOCKER-INGRESS
    chain contains a TCP DNAT rule sending that port to the same port on the
    ingress sandbox's eth1 address. One line is printed per missing rule.

    name  -- service name; used only in the printed report.
    plist -- port description strings. After ':' and ',' are turned into
             spaces, the 2nd and 4th fields are taken as the target and the
             published port (presumably "Target: T, Publish: P" -- confirm
             against the daemon output). None or an empty list means there
             is nothing to check.
    """
    ports = []
    for port in plist or []:
        fields = port.replace(':', ' ').replace(',', ' ').split()
        if len(fields) < 4:
            # Report and skip instead of dying with an IndexError.
            print("Service {0}: unrecognized port entry '{1}'".format(name, port))
            continue
        ports.append((fields[1], fields[3]))

    if not ports:
        # Nothing to verify, so do not enter the ingress namespace at all.
        return

    # get the ingress sandbox's docker_gwbridge network IP.
    # published ports get DNAT'ed to this IP.
    ip = subprocess.check_output(['/usr/bin/nsenter', '--net=/var/run/docker/netns/ingress_sbox', '/bin/bash', '-c', 'ifconfig eth1 | grep \"inet\\ addr\" | cut -d: -f2 | cut -d\" \" -f1'])
    if not isinstance(ip, str):
        # check_output returns bytes on Python 3; a no-op on Python 2.
        ip = ip.decode()
    ip = ip.rstrip()

    for _target, published in ports:
        # `iptables -C` exits non-zero when the rule is absent. The command is
        # passed as an argument list so no shell parsing is involved.
        rule = [
            '/sbin/iptables', '-t', 'nat', '-C', 'DOCKER-INGRESS',
            '-p', 'tcp', '--dport', published,
            '-j', 'DNAT', '--to', '{0}:{1}'.format(ip, published),
        ]
        try:
            subprocess.check_output(rule)
        except subprocess.CalledProcessError:
            print("Service {0}: host iptables DNAT rule for port {1} -> ingress sandbox {2}:{3} missing".format(name, published, ip, published))
37
+
38
def get_namespaces(data, ingress=False):
    """Map sandbox names to the network-namespace paths to inspect.

    data    -- network inspect result; the keys of its "Containers" entry are
               passed to the Docker client as container identifiers.
    ingress -- when True, return only the fixed ingress sandbox entry and
               ignore `data`.

    Returns a dict of name -> namespace path. For containers, the name and
    path come from the container inspect fields "Name" and
    "NetworkSettings" / "SandboxKey". An absent or empty "Containers" entry
    yields an empty dict.
    """
    if ingress is True:
        return {"Ingress": "/var/run/docker/netns/ingress_sbox"}

    containers = {}
    for container_id in data.get("Containers") or {}:
        # One inspect call per container; `cli` is the module-level client
        # created by the script entry point.
        inspect = cli.inspect_container(str(container_id))
        containers[str(inspect["Name"])] = str(inspect["NetworkSettings"]["SandboxKey"])
    return containers
52
+
53
+
54
def _parse_ipvs(ipvs):
    """Return {fwmark: [backend ip, ...]} parsed from `ipvsadm -ln` output."""
    realmark = {}
    mark = ""
    for line in ipvs.splitlines():
        if "FWM" in line:
            # Start of a service entry; the first number on the line is
            # taken as its mark.
            mark = re.findall("[0-9]+", line)[0]
            realmark[mark] = []
        elif "->" in line:
            if mark == "":
                # A "->" line outside any FWM entry (e.g. a header line).
                continue
            ip = ipv4match.search(line)
            if ip is not None:
                realmark[mark].append(ip.group(0))
        else:
            # Any other line ends the current FWM entry.
            mark = ""
    return realmark


def check_network(nw_name, ingress=False):
    """Compare the daemon's load-balancer state with IPVS for one network.

    Reads the verbose network inspect data, builds the expected backend IPs
    per LB index, then for every sandbox on the network runs `ipvsadm -ln`
    inside its namespace and prints any difference.

    nw_name -- network to verify.
    ingress -- when True, only the ingress sandbox is checked and the host
               iptables rules of each service are verified as well.
    """
    print("Verifying LB programming for containers on network %s" % nw_name)

    data = cli.inspect_network(nw_name, verbose=True)

    services = data.get("Services") or {}
    fwmarks = {str(service): str(svalue["LocalLBIndex"]) for service, svalue in services.items()}
    # Reverse lookup used when reporting: LB index -> service name.
    names = {idx: sname for sname, idx in fwmarks.items()}

    stasks = {}
    for service, svalue in services.items():
        if service == "":
            continue
        stasks[fwmarks[str(service)]] = [str(task["EndpointIP"]) for task in svalue["Tasks"]]

        # for services in ingress network verify the iptables rules
        # that direct ingress (published port) to backend (target port)
        if ingress is True:
            check_iptables(service, svalue.get("Ports") or [])

    containers = get_namespaces(data, ingress)
    for container, namespace in containers.items():
        print("Verifying container %s..." % container)
        ipvs = subprocess.check_output(['/usr/bin/nsenter', '--net=%s' % namespace, '/usr/sbin/ipvsadm', '-ln'])
        if not isinstance(ipvs, str):
            # check_output returns bytes on Python 3; a no-op on Python 2.
            ipvs = ipvs.decode()
        realmark = _parse_ipvs(ipvs)

        # Report one-sided entries without mutating `stasks`: it is shared by
        # every container in this loop, so deleting from it would hide or
        # invert results for the containers checked afterwards.
        for key in sorted(set(realmark) - set(stasks)):
            print("LB Index %s present in IPVS but missing in docker daemon" % key)
        for key in sorted(set(stasks) - set(realmark)):
            print("LB Index %s present in docker daemon but missing in IPVS" % key)

        for key in realmark:
            if key not in stasks:
                continue
            service = names.get(key, "--Invalid--")
            # Compare the backend addresses themselves, not just how many
            # there are: equal counts with different IPs is still wrong.
            if set(realmark[key]) != set(stasks[key]):
                print("Incorrect LB Programming for service %s" % service)
                print("control-plane backend tasks:")
                for task in stasks[key]:
                    print(task)
                print("kernel IPVS backend tasks:")
                for task in realmark[key]:
                    print(task)
            else:
                print("service %s... OK" % service)
121
+
122
if __name__ == '__main__':
    # Script entry point.
    #   ssd.py <network>                     verify LB programming for <network>
    #                                        and for the ingress network
    #   ssd.py <network> gossip-consistency  print one state hash per node
    #   ssd.py <network> gossip-hash         internal: run by the global
    #                                        service started by the command above
    usage = 'Usage: ssd.py network-name [gossip-consistency]'
    if len(sys.argv) < 2:
        print(usage)
        # Non-zero status so callers can detect the usage error.
        sys.exit(1)

    # Module-level client; check_network() and get_namespaces() read it.
    cli = docker.APIClient(base_url='unix://var/run/docker.sock', version='auto')
    if len(sys.argv) == 3:
        command = sys.argv[2]
    else:
        command = 'default'

    if command == 'gossip-consistency':
        cspec = docker.types.ContainerSpec(
            image='sanimej/ssd',
            args=[sys.argv[1], 'gossip-hash'],
            mounts=[docker.types.Mount('/var/run/docker.sock', '/var/run/docker.sock', type='bind')]
        )
        mode = docker.types.ServiceMode(
            mode='global'
        )
        task_template = docker.types.TaskTemplate(cspec)

        cli.create_service(task_template, name='gossip-hash', mode=mode)
        try:
            #TODO change to a deterministic way to check if the service is up.
            time.sleep(5)
            output = cli.service_logs('gossip-hash', stdout=True, stderr=True, details=True)
            for line in output:
                print("Node id: %s gossip hash %s" % (line[line.find("=")+1:line.find(",")], line[line.find(" ")+1:]))
        finally:
            # Always remove the temporary service, even if reading logs failed.
            if cli.remove_service('gossip-hash') is not True:
                print("Deleting gossip-hash service failed")
    elif command == 'gossip-hash':
        data = cli.inspect_network(sys.argv[1], verbose=True)
        services = data["Services"]
        md5 = hashlib.md5()
        entries = []
        for service, value in services.items():
            entries.append(service)
            entries.append(value["VIP"])
            for task in value["Tasks"]:
                for key, val in task.items():
                    if isinstance(val, dict):
                        for k, v in val.items():
                            entries.append(v)
                    else:
                        entries.append(val)
        # Sort so the digest does not depend on dict iteration order.
        entries.sort()
        for e in entries:
            if not isinstance(e, bytes):
                # Hash a defined byte encoding of text entries.
                e = e.encode('utf-8')
            md5.update(e)
        print(md5.hexdigest())
        sys.stdout.flush()
        # Stay alive after printing so the task keeps running until the
        # service is removed.
        while True:
            signal.pause()
    elif command == 'default':
        if sys.argv[1] == "ingress":
            check_network("ingress", ingress=True)
        else:
            check_network(sys.argv[1])
            check_network("ingress", ingress=True)
    else:
        # An unknown command used to be ignored silently.
        print(usage)
        sys.exit(1)