Skip to content

Commit 6426d1e

Browse files
Flavio Criscianimavenugo
authored andcommitted
Service discovery race on serviceBindings delete. Bug on IP reuse (#1808)
* Correct SetMatrix documentation The SetMatrix is a generic data structure, so the description should not be tight to any specific use Signed-off-by: Flavio Crisciani <[email protected]> * Service Discovery reuse name and serviceBindings deletion - Added logic to handle name reuse from different services - Moved the deletion from the serviceBindings map at the end of the rmServiceBindings body to avoid race with new services Signed-off-by: Flavio Crisciani <[email protected]> * Avoid race on network cleanup Use the locker to avoid the race between the network deletion and new endpoints being created Signed-off-by: Flavio Crisciani <[email protected]> * CleanupServiceBindings to clean the SD records Allow the cleanupServicebindings to take care of the service discovery cleanup. Also avoid to trigger the cleanup for each endpoint from an SD point of view LB and SD will be separated in the future Signed-off-by: Flavio Crisciani <[email protected]> * Addressed comments Signed-off-by: Flavio Crisciani <[email protected]> * NetworkDB deleteEntry has to happen If there is an error locally guarantee that the delete entry on network DB is still honored Signed-off-by: Flavio Crisciani <[email protected]>
1 parent bfc5fe3 commit 6426d1e

File tree

7 files changed

+313
-142
lines changed

7 files changed

+313
-142
lines changed

agent.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -648,13 +648,13 @@ func (ep *endpoint) addServiceInfoToCluster(sb *sandbox) error {
648648
TaskAliases: ep.myAliases,
649649
EndpointIP: ep.Iface().Address().IP.String(),
650650
})
651-
652651
if err != nil {
653652
return err
654653
}
655654

656655
if agent != nil {
657656
if err := agent.networkDB.CreateEntry(libnetworkEPTable, n.ID(), ep.ID(), buf); err != nil {
657+
logrus.Warnf("addServiceInfoToCluster NetworkDB CreateEntry failed for %s %s err:%s", ep.id, n.id, err)
658658
return err
659659
}
660660
}
@@ -686,14 +686,21 @@ func (ep *endpoint) deleteServiceInfoFromCluster(sb *sandbox, method string) err
686686
name = ep.MyAliases()[0]
687687
}
688688

689+
if agent != nil {
690+
// First delete from networkDB then locally
691+
if err := agent.networkDB.DeleteEntry(libnetworkEPTable, n.ID(), ep.ID()); err != nil {
692+
logrus.Warnf("deleteServiceInfoFromCluster NetworkDB DeleteEntry failed for %s %s err:%s", ep.id, n.id, err)
693+
}
694+
}
695+
689696
if ep.Iface().Address() != nil {
690697
if ep.svcID != "" {
691698
// This is a task part of a service
692699
var ingressPorts []*PortConfig
693700
if n.ingress {
694701
ingressPorts = ep.ingressPorts
695702
}
696-
if err := c.rmServiceBinding(ep.svcName, ep.svcID, n.ID(), ep.ID(), name, ep.virtualIP, ingressPorts, ep.svcAliases, ep.myAliases, ep.Iface().Address().IP, "deleteServiceInfoFromCluster"); err != nil {
703+
if err := c.rmServiceBinding(ep.svcName, ep.svcID, n.ID(), ep.ID(), name, ep.virtualIP, ingressPorts, ep.svcAliases, ep.myAliases, ep.Iface().Address().IP, "deleteServiceInfoFromCluster", true); err != nil {
697704
return err
698705
}
699706
} else {
@@ -704,12 +711,6 @@ func (ep *endpoint) deleteServiceInfoFromCluster(sb *sandbox, method string) err
704711
}
705712
}
706713

707-
if agent != nil {
708-
if err := agent.networkDB.DeleteEntry(libnetworkEPTable, n.ID(), ep.ID()); err != nil {
709-
return err
710-
}
711-
}
712-
713714
logrus.Debugf("deleteServiceInfoFromCluster from %s END for %s %s", method, ep.svcName, ep.ID())
714715

715716
return nil
@@ -900,7 +901,7 @@ func (c *controller) handleEpTableEvent(ev events.Event) {
900901
logrus.Debugf("handleEpTableEvent DEL %s R:%v", eid, epRec)
901902
if svcID != "" {
902903
// This is a remote task part of a service
903-
if err := c.rmServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent"); err != nil {
904+
if err := c.rmServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent", true); err != nil {
904905
logrus.Errorf("failed removing service binding for %s epRec:%v err:%s", eid, epRec, err)
905906
return
906907
}

common/setmatrix.go

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,24 +10,26 @@ import (
1010
type SetMatrix interface {
1111
// Get returns the members of the set for a specific key as a slice.
1212
Get(key string) ([]interface{}, bool)
13-
// Contains is used to verify is an element is in a set for a specific key
13+
// Contains is used to verify if an element is in a set for a specific key
1414
// returns true if the element is in the set
1515
// returns true if there is a set for the key
1616
Contains(key string, value interface{}) (bool, bool)
17-
// Insert inserts the mapping between the IP and the endpoint identifier
18-
// returns true if the mapping was not present, false otherwise
19-
// returns also the number of endpoints associated to the IP
17+
// Insert inserts the value in the set of a key
18+
// returns true if the value is inserted (was not already in the set), false otherwise
19+
// returns also the length of the set for the key
2020
Insert(key string, value interface{}) (bool, int)
21-
// Remove removes the mapping between the IP and the endpoint identifier
22-
// returns true if the mapping was deleted, false otherwise
23-
// returns also the number of endpoints associated to the IP
21+
// Remove removes the value in the set for a specific key
22+
// returns true if the value is deleted, false otherwise
23+
// returns also the length of the set for the key
2424
Remove(key string, value interface{}) (bool, int)
25-
// Cardinality returns the number of elements in the set of a specific key
26-
// returns false if the key is not in the map
25+
// Cardinality returns the number of elements in the set for a key
26+
// returns false if the set is not present
2727
Cardinality(key string) (int, bool)
2828
// String returns the string version of the set, empty otherwise
29-
// returns false if the key is not in the map
29+
// returns false if the set is not present
3030
String(key string) (string, bool)
31+
// Returns all the keys in the map
32+
Keys() []string
3133
}
3234

3335
type setMatrix struct {
@@ -121,3 +123,13 @@ func (s *setMatrix) String(key string) (string, bool) {
121123
}
122124
return set.String(), ok
123125
}
126+
127+
func (s *setMatrix) Keys() []string {
128+
s.Lock()
129+
defer s.Unlock()
130+
keys := make([]string, 0, len(s.matrix))
131+
for k := range s.matrix {
132+
keys = append(keys, k)
133+
}
134+
return keys
135+
}

libnetwork_internal_test.go

Lines changed: 116 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"github.com/docker/libnetwork/driverapi"
1414
"github.com/docker/libnetwork/ipamapi"
1515
"github.com/docker/libnetwork/netlabel"
16+
"github.com/docker/libnetwork/netutils"
1617
"github.com/docker/libnetwork/testutils"
1718
"github.com/docker/libnetwork/types"
1819
)
@@ -382,8 +383,8 @@ func TestSRVServiceQuery(t *testing.T) {
382383
}
383384

384385
sr := svcInfo{
385-
svcMap: make(map[string][]net.IP),
386-
svcIPv6Map: make(map[string][]net.IP),
386+
svcMap: common.NewSetMatrix(),
387+
svcIPv6Map: common.NewSetMatrix(),
387388
ipMap: common.NewSetMatrix(),
388389
service: make(map[string][]servicePorts),
389390
}
@@ -440,6 +441,119 @@ func TestSRVServiceQuery(t *testing.T) {
440441
}
441442
}
442443

444+
func TestServiceVIPReuse(t *testing.T) {
445+
c, err := New()
446+
if err != nil {
447+
t.Fatal(err)
448+
}
449+
defer c.Stop()
450+
451+
n, err := c.NewNetwork("bridge", "net1", "", nil)
452+
if err != nil {
453+
t.Fatal(err)
454+
}
455+
defer func() {
456+
if err := n.Delete(); err != nil {
457+
t.Fatal(err)
458+
}
459+
}()
460+
461+
ep, err := n.CreateEndpoint("testep")
462+
if err != nil {
463+
t.Fatal(err)
464+
}
465+
466+
sb, err := c.NewSandbox("c1")
467+
if err != nil {
468+
t.Fatal(err)
469+
}
470+
defer func() {
471+
if err := sb.Delete(); err != nil {
472+
t.Fatal(err)
473+
}
474+
}()
475+
476+
err = ep.Join(sb)
477+
if err != nil {
478+
t.Fatal(err)
479+
}
480+
481+
// Add 2 services with same name but different service ID to share the same VIP
482+
n.(*network).addSvcRecords("ep1", "service_test", "serviceID1", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
483+
n.(*network).addSvcRecords("ep2", "service_test", "serviceID2", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
484+
485+
ipToResolve := netutils.ReverseIP("192.168.0.1")
486+
487+
ipList, _ := n.(*network).ResolveName("service_test", types.IPv4)
488+
if len(ipList) == 0 {
489+
t.Fatal("There must be the VIP")
490+
}
491+
if len(ipList) != 1 {
492+
t.Fatal("It must return only 1 VIP")
493+
}
494+
if ipList[0].String() != "192.168.0.1" {
495+
t.Fatal("The service VIP is 192.168.0.1")
496+
}
497+
name := n.(*network).ResolveIP(ipToResolve)
498+
if name == "" {
499+
t.Fatal("It must return a name")
500+
}
501+
if name != "service_test.net1" {
502+
t.Fatalf("It must return the service_test.net1 != %s", name)
503+
}
504+
505+
// Delete service record for one of the services, the IP should remain because one service is still associated with it
506+
n.(*network).deleteSvcRecords("ep1", "service_test", "serviceID1", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
507+
ipList, _ = n.(*network).ResolveName("service_test", types.IPv4)
508+
if len(ipList) == 0 {
509+
t.Fatal("There must be the VIP")
510+
}
511+
if len(ipList) != 1 {
512+
t.Fatal("It must return only 1 VIP")
513+
}
514+
if ipList[0].String() != "192.168.0.1" {
515+
t.Fatal("The service VIP is 192.168.0.1")
516+
}
517+
name = n.(*network).ResolveIP(ipToResolve)
518+
if name == "" {
519+
t.Fatal("It must return a name")
520+
}
521+
if name != "service_test.net1" {
522+
t.Fatalf("It must return the service_test.net1 != %s", name)
523+
}
524+
525+
// Delete again the service using the previous service ID, nothing should happen
526+
n.(*network).deleteSvcRecords("ep2", "service_test", "serviceID1", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
527+
ipList, _ = n.(*network).ResolveName("service_test", types.IPv4)
528+
if len(ipList) == 0 {
529+
t.Fatal("There must be the VIP")
530+
}
531+
if len(ipList) != 1 {
532+
t.Fatal("It must return only 1 VIP")
533+
}
534+
if ipList[0].String() != "192.168.0.1" {
535+
t.Fatal("The service VIP is 192.168.0.1")
536+
}
537+
name = n.(*network).ResolveIP(ipToResolve)
538+
if name == "" {
539+
t.Fatal("It must return a name")
540+
}
541+
if name != "service_test.net1" {
542+
t.Fatalf("It must return the service_test.net1 != %s", name)
543+
}
544+
545+
// Delete now using the second service ID, now all the entries should be gone
546+
n.(*network).deleteSvcRecords("ep2", "service_test", "serviceID2", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
547+
ipList, _ = n.(*network).ResolveName("service_test", types.IPv4)
548+
if len(ipList) != 0 {
549+
t.Fatal("All the VIPs should be gone now")
550+
}
551+
name = n.(*network).ResolveIP(ipToResolve)
552+
if name != "" {
553+
t.Fatalf("It must return empty no more services associated, instead:%s", name)
554+
}
555+
}
556+
443557
func TestIpamReleaseOnNetDriverFailures(t *testing.T) {
444558
if !testutils.IsRunningInContainer() {
445559
defer testutils.SetupTestOSContext(t)()

0 commit comments

Comments
 (0)