Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
adam.huang
go-libp2p
Commits
4ae3510f
Commit
4ae3510f
authored
Sep 16, 2016
by
Jeromy Johnson
Committed by
GitHub
Sep 16, 2016
Browse files
Merge pull request #107 from libp2p/feat/better-dialsync
Improve swarm dial sync code
parents
d3f70d18
126e1506
Changes
7
Hide whitespace changes
Inline
Side-by-side
p2p/net/swarm/dial_sync.go
0 → 100644
View file @
4ae3510f
package
swarm
import
(
"context"
"sync"
peer
"github.com/ipfs/go-libp2p-peer"
)
type
DialFunc
func
(
context
.
Context
,
peer
.
ID
)
(
*
Conn
,
error
)
func
NewDialSync
(
dfn
DialFunc
)
*
DialSync
{
return
&
DialSync
{
dials
:
make
(
map
[
peer
.
ID
]
*
activeDial
),
dialFunc
:
dfn
,
}
}
type
DialSync
struct
{
dials
map
[
peer
.
ID
]
*
activeDial
dialsLk
sync
.
Mutex
dialFunc
DialFunc
}
type
activeDial
struct
{
id
peer
.
ID
refCnt
int
refCntLk
sync
.
Mutex
cancel
func
()
err
error
conn
*
Conn
waitch
chan
struct
{}
ds
*
DialSync
}
func
(
dr
*
activeDial
)
wait
(
ctx
context
.
Context
)
(
*
Conn
,
error
)
{
defer
dr
.
decref
()
select
{
case
<-
dr
.
waitch
:
return
dr
.
conn
,
dr
.
err
case
<-
ctx
.
Done
()
:
return
nil
,
ctx
.
Err
()
}
}
func
(
ad
*
activeDial
)
incref
()
{
ad
.
refCntLk
.
Lock
()
defer
ad
.
refCntLk
.
Unlock
()
ad
.
refCnt
++
}
func
(
ad
*
activeDial
)
decref
()
{
ad
.
refCntLk
.
Lock
()
defer
ad
.
refCntLk
.
Unlock
()
ad
.
refCnt
--
if
ad
.
refCnt
<=
0
{
ad
.
cancel
()
ad
.
ds
.
dialsLk
.
Lock
()
delete
(
ad
.
ds
.
dials
,
ad
.
id
)
ad
.
ds
.
dialsLk
.
Unlock
()
}
}
func
(
ds
*
DialSync
)
DialLock
(
ctx
context
.
Context
,
p
peer
.
ID
)
(
*
Conn
,
error
)
{
ds
.
dialsLk
.
Lock
()
actd
,
ok
:=
ds
.
dials
[
p
]
if
!
ok
{
ctx
,
cancel
:=
context
.
WithCancel
(
context
.
Background
())
actd
=
&
activeDial
{
id
:
p
,
cancel
:
cancel
,
waitch
:
make
(
chan
struct
{}),
ds
:
ds
,
}
ds
.
dials
[
p
]
=
actd
go
func
(
ctx
context
.
Context
,
p
peer
.
ID
,
ad
*
activeDial
)
{
ad
.
conn
,
ad
.
err
=
ds
.
dialFunc
(
ctx
,
p
)
close
(
ad
.
waitch
)
ad
.
cancel
()
ad
.
waitch
=
nil
// to ensure nobody tries reusing this
}(
ctx
,
p
,
actd
)
}
actd
.
incref
()
ds
.
dialsLk
.
Unlock
()
return
actd
.
wait
(
ctx
)
}
p2p/net/swarm/dial_sync_test.go
0 → 100644
View file @
4ae3510f
package
swarm
import
(
"context"
"fmt"
"sync"
"testing"
"time"
peer
"github.com/ipfs/go-libp2p-peer"
)
func
getMockDialFunc
()
(
DialFunc
,
func
(),
context
.
Context
,
<-
chan
struct
{})
{
dfcalls
:=
make
(
chan
struct
{},
512
)
// buffer it large enough that we won't care
dialctx
,
cancel
:=
context
.
WithCancel
(
context
.
Background
())
ch
:=
make
(
chan
struct
{})
f
:=
func
(
ctx
context
.
Context
,
p
peer
.
ID
)
(
*
Conn
,
error
)
{
dfcalls
<-
struct
{}{}
defer
cancel
()
select
{
case
<-
ch
:
return
new
(
Conn
),
nil
case
<-
ctx
.
Done
()
:
return
nil
,
ctx
.
Err
()
}
}
o
:=
new
(
sync
.
Once
)
return
f
,
func
()
{
o
.
Do
(
func
()
{
close
(
ch
)
})
},
dialctx
,
dfcalls
}
func
TestBasicDialSync
(
t
*
testing
.
T
)
{
df
,
done
,
_
,
callsch
:=
getMockDialFunc
()
dsync
:=
NewDialSync
(
df
)
p
:=
peer
.
ID
(
"testpeer"
)
ctx
:=
context
.
Background
()
finished
:=
make
(
chan
struct
{})
go
func
()
{
_
,
err
:=
dsync
.
DialLock
(
ctx
,
p
)
if
err
!=
nil
{
t
.
Error
(
err
)
}
finished
<-
struct
{}{}
}()
go
func
()
{
_
,
err
:=
dsync
.
DialLock
(
ctx
,
p
)
if
err
!=
nil
{
t
.
Error
(
err
)
}
finished
<-
struct
{}{}
}()
// short sleep just to make sure we've moved around in the scheduler
time
.
Sleep
(
time
.
Millisecond
*
20
)
done
()
<-
finished
<-
finished
if
len
(
callsch
)
>
1
{
t
.
Fatal
(
"should only have called dial func once!"
)
}
}
func
TestDialSyncCancel
(
t
*
testing
.
T
)
{
df
,
done
,
_
,
dcall
:=
getMockDialFunc
()
dsync
:=
NewDialSync
(
df
)
p
:=
peer
.
ID
(
"testpeer"
)
ctx1
,
cancel1
:=
context
.
WithCancel
(
context
.
Background
())
finished
:=
make
(
chan
struct
{})
go
func
()
{
_
,
err
:=
dsync
.
DialLock
(
ctx1
,
p
)
if
err
!=
ctx1
.
Err
()
{
t
.
Error
(
"should have gotten context error"
)
}
finished
<-
struct
{}{}
}()
// make sure the above makes it through the wait code first
select
{
case
<-
dcall
:
case
<-
time
.
After
(
time
.
Second
)
:
t
.
Fatal
(
"timed out waiting for dial to start"
)
}
// Add a second dialwait in so two actors are waiting on the same dial
go
func
()
{
_
,
err
:=
dsync
.
DialLock
(
context
.
Background
(),
p
)
if
err
!=
nil
{
t
.
Error
(
err
)
}
finished
<-
struct
{}{}
}()
time
.
Sleep
(
time
.
Millisecond
*
20
)
// cancel the first dialwait, it should not affect the second at all
cancel1
()
select
{
case
<-
finished
:
case
<-
time
.
After
(
time
.
Second
)
:
t
.
Fatal
(
"timed out waiting for wait to exit"
)
}
// short sleep just to make sure we've moved around in the scheduler
time
.
Sleep
(
time
.
Millisecond
*
20
)
done
()
<-
finished
}
func
TestDialSyncAllCancel
(
t
*
testing
.
T
)
{
df
,
done
,
dctx
,
_
:=
getMockDialFunc
()
dsync
:=
NewDialSync
(
df
)
p
:=
peer
.
ID
(
"testpeer"
)
ctx1
,
cancel1
:=
context
.
WithCancel
(
context
.
Background
())
finished
:=
make
(
chan
struct
{})
go
func
()
{
_
,
err
:=
dsync
.
DialLock
(
ctx1
,
p
)
if
err
!=
ctx1
.
Err
()
{
t
.
Error
(
"should have gotten context error"
)
}
finished
<-
struct
{}{}
}()
// Add a second dialwait in so two actors are waiting on the same dial
go
func
()
{
_
,
err
:=
dsync
.
DialLock
(
ctx1
,
p
)
if
err
!=
ctx1
.
Err
()
{
t
.
Error
(
"should have gotten context error"
)
}
finished
<-
struct
{}{}
}()
cancel1
()
for
i
:=
0
;
i
<
2
;
i
++
{
select
{
case
<-
finished
:
case
<-
time
.
After
(
time
.
Second
)
:
t
.
Fatal
(
"timed out waiting for wait to exit"
)
}
}
// the dial should have exited now
select
{
case
<-
dctx
.
Done
()
:
case
<-
time
.
After
(
time
.
Second
)
:
t
.
Fatal
(
"timed out waiting for dial to return"
)
}
// should be able to successfully dial that peer again
done
()
_
,
err
:=
dsync
.
DialLock
(
context
.
Background
(),
p
)
if
err
!=
nil
{
t
.
Fatal
(
err
)
}
}
func
TestFailFirst
(
t
*
testing
.
T
)
{
var
count
int
f
:=
func
(
ctx
context
.
Context
,
p
peer
.
ID
)
(
*
Conn
,
error
)
{
if
count
>
0
{
return
new
(
Conn
),
nil
}
count
++
return
nil
,
fmt
.
Errorf
(
"gophers ate the modem"
)
}
ds
:=
NewDialSync
(
f
)
p
:=
peer
.
ID
(
"testing"
)
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
time
.
Second
*
5
)
defer
cancel
()
_
,
err
:=
ds
.
DialLock
(
ctx
,
p
)
if
err
==
nil
{
t
.
Fatal
(
"expected gophers to have eaten the modem"
)
}
c
,
err
:=
ds
.
DialLock
(
ctx
,
p
)
if
err
!=
nil
{
t
.
Fatal
(
err
)
}
if
c
==
nil
{
t
.
Fatal
(
"should have gotten a 'real' conn back"
)
}
}
p2p/net/swarm/dial_test.go
View file @
4ae3510f
...
...
@@ -415,7 +415,6 @@ func TestDialBackoff(t *testing.T) {
if
!
s1
.
backf
.
Backoff
(
s3p
)
{
t
.
Error
(
"s3 should be on backoff"
)
}
}
}
...
...
p2p/net/swarm/limiter.go
View file @
4ae3510f
package
swarm
import
(
"context"
"sync"
peer
"github.com/ipfs/go-libp2p-peer"
ma
"github.com/jbenet/go-multiaddr"
context
"golang.org/x/net/context"
conn
"github.com/libp2p/go-libp2p/p2p/net/conn"
addrutil
"github.com/libp2p/go-libp2p/p2p/net/swarm/addr"
...
...
p2p/net/swarm/limiter_test.go
View file @
4ae3510f
package
swarm
import
(
"context"
"fmt"
"math/rand"
"strconv"
...
...
@@ -10,7 +11,6 @@ import (
peer
"github.com/ipfs/go-libp2p-peer"
ma
"github.com/jbenet/go-multiaddr"
mafmt
"github.com/whyrusleeping/mafmt"
context
"golang.org/x/net/context"
conn
"github.com/libp2p/go-libp2p/p2p/net/conn"
)
...
...
p2p/net/swarm/swarm.go
View file @
4ae3510f
...
...
@@ -3,6 +3,7 @@
package
swarm
import
(
"context"
"fmt"
"io/ioutil"
"os"
...
...
@@ -32,7 +33,6 @@ import (
yamux
"github.com/whyrusleeping/go-smux-yamux"
mafilter
"github.com/whyrusleeping/multiaddr-filter"
ws
"github.com/whyrusleeping/ws-transport"
context
"golang.org/x/net/context"
)
var
log
=
logging
.
Logger
(
"swarm2"
)
...
...
@@ -76,7 +76,7 @@ type Swarm struct {
peers
pstore
.
Peerstore
connh
ConnHandler
dsync
d
ial
s
ync
dsync
*
D
ial
S
ync
backf
dialbackoff
dialT
time
.
Duration
// mainly for tests
...
...
@@ -134,6 +134,7 @@ func NewSwarm(ctx context.Context, listenAddrs []ma.Multiaddr,
dialer
:
conn
.
NewDialer
(
local
,
peers
.
PrivKey
(
local
),
wrap
),
}
s
.
dsync
=
NewDialSync
(
s
.
doDial
)
s
.
limiter
=
newDialLimiter
(
s
.
dialAddr
)
// configure Swarm
...
...
p2p/net/swarm/swarm_dial.go
View file @
4ae3510f
package
swarm
import
(
"context"
"errors"
"fmt"
"sync"
...
...
@@ -11,7 +12,6 @@ import (
ma
"github.com/jbenet/go-multiaddr"
conn
"github.com/libp2p/go-libp2p/p2p/net/conn"
addrutil
"github.com/libp2p/go-libp2p/p2p/net/swarm/addr"
context
"golang.org/x/net/context"
)
// Diagram of dial sync:
...
...
@@ -53,78 +53,6 @@ const defaultPerPeerRateLimit = 8
// subcomponent of Dial)
var
DialTimeout
=
time
.
Second
*
10
// dialsync is a small object that helps manage ongoing dials.
// this way, if we receive many simultaneous dial requests, one
// can do its thing, while the rest wait.
//
// this interface is so would-be dialers can just:
//
// for {
// c := findConnectionToPeer(peer)
// if c != nil {
// return c
// }
//
// // ok, no connections. should we dial?
// if ok, wait := dialsync.Lock(peer); !ok {
// <-wait // can optionally wait
// continue
// }
// defer dialsync.Unlock(peer)
//
// c := actuallyDial(peer)
// return c
// }
//
type
dialsync
struct
{
// ongoing is a map of tickets for the current peers being dialed.
// this way, we dont kick off N dials simultaneously.
ongoing
map
[
peer
.
ID
]
chan
struct
{}
lock
sync
.
Mutex
}
// Lock governs the beginning of a dial attempt.
// If there are no ongoing dials, it returns true, and the client is now
// scheduled to dial. Every other goroutine that calls startDial -- with
//the same dst -- will block until client is done. The client MUST call
// ds.Unlock(p) when it is done, to unblock the other callers.
// The client is not reponsible for achieving a successful dial, only for
// reporting the end of the attempt (calling ds.Unlock(p)).
//
// see the example below `dialsync`
func
(
ds
*
dialsync
)
Lock
(
dst
peer
.
ID
)
(
bool
,
chan
struct
{})
{
ds
.
lock
.
Lock
()
if
ds
.
ongoing
==
nil
{
// init if not ready
ds
.
ongoing
=
make
(
map
[
peer
.
ID
]
chan
struct
{})
}
wait
,
found
:=
ds
.
ongoing
[
dst
]
if
!
found
{
ds
.
ongoing
[
dst
]
=
make
(
chan
struct
{})
}
ds
.
lock
.
Unlock
()
if
found
{
return
false
,
wait
}
// ok! you're signed up to dial!
return
true
,
nil
}
// Unlock releases waiters to a dial attempt. see Lock.
// if Unlock(p) is called without calling Lock(p) first, Unlock panics.
func
(
ds
*
dialsync
)
Unlock
(
dst
peer
.
ID
)
{
ds
.
lock
.
Lock
()
wait
,
found
:=
ds
.
ongoing
[
dst
]
if
!
found
{
panic
(
"called dialDone with no ongoing dials to peer: "
+
dst
.
Pretty
())
}
delete
(
ds
.
ongoing
,
dst
)
// remove ongoing dial
close
(
wait
)
// release everyone else
ds
.
lock
.
Unlock
()
}
// dialbackoff is a struct used to avoid over-dialing the same, dead peers.
// Whenever we totally time out on a peer (all three attempts), we add them
// to dialbackoff. Then, whenevers goroutines would _wait_ (dialsync), they
...
...
@@ -246,8 +174,7 @@ func (s *Swarm) bestConnectionToPeer(p peer.ID) *Conn {
// gatedDialAttempt is an attempt to dial a node. It is gated by the swarm's
// dial synchronization systems: dialsync and dialbackoff.
func
(
s
*
Swarm
)
gatedDialAttempt
(
ctx
context
.
Context
,
p
peer
.
ID
)
(
*
Conn
,
error
)
{
var
logdial
=
lgbl
.
Dial
(
"swarm"
,
s
.
LocalPeer
(),
p
,
nil
,
nil
)
defer
log
.
EventBegin
(
ctx
,
"swarmDialAttemptSync"
,
logdial
)
.
Done
()
defer
log
.
EventBegin
(
ctx
,
"swarmDialAttemptSync"
,
p
)
.
Done
()
// check if we already have an open connection first
conn
:=
s
.
bestConnectionToPeer
(
p
)
...
...
@@ -255,57 +182,36 @@ func (s *Swarm) gatedDialAttempt(ctx context.Context, p peer.ID) (*Conn, error)
return
conn
,
nil
}
// check if there's an ongoing dial to this peer
if
ok
,
wait
:=
s
.
dsync
.
Lock
(
p
);
ok
{
defer
s
.
dsync
.
Unlock
(
p
)
// if this peer has been backed off, lets get out of here
if
s
.
backf
.
Backoff
(
p
)
{
log
.
Event
(
ctx
,
"swarmDialBackoff"
,
logdial
)
return
nil
,
ErrDialBackoff
}
// ok, we have been charged to dial! let's do it.
// if it succeeds, dial will add the conn to the swarm itself.
defer
log
.
EventBegin
(
ctx
,
"swarmDialAttemptStart"
,
logdial
)
.
Done
()
ctxT
,
cancel
:=
context
.
WithTimeout
(
ctx
,
s
.
dialT
)
conn
,
err
:=
s
.
dial
(
ctxT
,
p
)
cancel
()
log
.
Debugf
(
"dial end %s"
,
conn
)
if
err
!=
nil
{
log
.
Event
(
ctx
,
"swarmDialBackoffAdd"
,
logdial
)
s
.
backf
.
AddBackoff
(
p
)
// let others know to backoff
// ok, we failed. try again. (if loop is done, our error is output)
return
nil
,
fmt
.
Errorf
(
"dial attempt failed: %s"
,
err
)
}
log
.
Event
(
ctx
,
"swarmDialBackoffClear"
,
logdial
)
s
.
backf
.
Clear
(
p
)
// okay, no longer need to backoff
return
conn
,
nil
}
else
{
// we did not dial. we must wait for someone else to dial.
// if this peer has been backed off, lets get out of here
if
s
.
backf
.
Backoff
(
p
)
{
log
.
Event
(
ctx
,
"swarmDialBackoff"
,
p
)
return
nil
,
ErrDialBackoff
}
// check whether we should backoff first...
if
s
.
backf
.
Backoff
(
p
)
{
log
.
Event
(
ctx
,
"swarmDialBackoff"
,
logdial
)
return
nil
,
ErrDialBackoff
}
return
s
.
dsync
.
DialLock
(
ctx
,
p
)
}
defer
log
.
EventBegin
(
ctx
,
"swarmDialWait"
,
logdial
)
.
Done
()
select
{
case
<-
wait
:
// wait for that other dial to finish.
// doDial is an ugly shim method to retain all the logging and backoff logic
// of the old dialsync code
func
(
s
*
Swarm
)
doDial
(
ctx
context
.
Context
,
p
peer
.
ID
)
(
*
Conn
,
error
)
{
var
logdial
=
lgbl
.
Dial
(
"swarm"
,
s
.
LocalPeer
(),
p
,
nil
,
nil
)
// ok, we have been charged to dial! let's do it.
// if it succeeds, dial will add the conn to the swarm itself.
defer
log
.
EventBegin
(
ctx
,
"swarmDialAttemptStart"
,
logdial
)
.
Done
()
ctxT
,
cancel
:=
context
.
WithTimeout
(
ctx
,
s
.
dialT
)
conn
,
err
:=
s
.
dial
(
ctxT
,
p
)
cancel
()
log
.
Debugf
(
"dial end %s"
,
conn
)
if
err
!=
nil
{
log
.
Event
(
ctx
,
"swarmDialBackoffAdd"
,
logdial
)
s
.
backf
.
AddBackoff
(
p
)
// let others know to backoff
// see if it worked, OR we got an incoming dial in the meantime...
conn
:=
s
.
bestConnectionToPeer
(
p
)
if
conn
!=
nil
{
return
conn
,
nil
}
return
nil
,
ErrDialFailed
case
<-
ctx
.
Done
()
:
// or we may have to bail...
return
nil
,
ctx
.
Err
()
}
// ok, we failed. try again. (if loop is done, our error is output)
return
nil
,
fmt
.
Errorf
(
"dial attempt failed: %s"
,
err
)
}
log
.
Event
(
ctx
,
"swarmDialBackoffClear"
,
logdial
)
s
.
backf
.
Clear
(
p
)
// okay, no longer need to backoff
return
conn
,
nil
}
// dial is the actual swarm's dial logic, gated by Dial.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment