Resolved all deadlocks and fixed race between kill and
monitor.resetContainer
Fixes #7600
Signed-off-by: Alexandr Morozov <lk4d4math@gmail.com>
| ... | ... |
@@ -10,7 +10,6 @@ import ( |
| 10 | 10 |
"path" |
| 11 | 11 |
"path/filepath" |
| 12 | 12 |
"strings" |
| 13 |
- "sync" |
|
| 14 | 13 |
"syscall" |
| 15 | 14 |
"time" |
| 16 | 15 |
|
| ... | ... |
@@ -43,7 +42,7 @@ var ( |
| 43 | 43 |
) |
| 44 | 44 |
|
| 45 | 45 |
type Container struct {
|
| 46 |
- sync.Mutex |
|
| 46 |
+ *State |
|
| 47 | 47 |
root string // Path to the "home" of the container, including metadata. |
| 48 | 48 |
basefs string // Path to the graphdriver mountpoint |
| 49 | 49 |
|
| ... | ... |
@@ -55,7 +54,6 @@ type Container struct {
|
| 55 | 55 |
Args []string |
| 56 | 56 |
|
| 57 | 57 |
Config *runconfig.Config |
| 58 |
- State *State |
|
| 59 | 58 |
Image string |
| 60 | 59 |
|
| 61 | 60 |
NetworkSettings *NetworkSettings |
| ... | ... |
@@ -276,7 +274,7 @@ func (container *Container) Start() (err error) {
|
| 276 | 276 |
container.Lock() |
| 277 | 277 |
defer container.Unlock() |
| 278 | 278 |
|
| 279 |
- if container.State.IsRunning() {
|
|
| 279 |
+ if container.State.Running {
|
|
| 280 | 280 |
return nil |
| 281 | 281 |
} |
| 282 | 282 |
|
| ... | ... |
@@ -526,11 +524,11 @@ func (container *Container) KillSig(sig int) error {
|
| 526 | 526 |
defer container.Unlock() |
| 527 | 527 |
|
| 528 | 528 |
// We could unpause the container for them rather than returning this error |
| 529 |
- if container.State.IsPaused() {
|
|
| 529 |
+ if container.State.Paused {
|
|
| 530 | 530 |
return fmt.Errorf("Container %s is paused. Unpause the container before stopping", container.ID)
|
| 531 | 531 |
} |
| 532 | 532 |
|
| 533 |
- if !container.State.IsRunning() {
|
|
| 533 |
+ if !container.State.Running {
|
|
| 534 | 534 |
return nil |
| 535 | 535 |
} |
| 536 | 536 |
|
| ... | ... |
@@ -541,7 +539,7 @@ func (container *Container) KillSig(sig int) error {
|
| 541 | 541 |
// if the container is currently restarting we do not need to send the signal |
| 542 | 542 |
// to the process. Telling the monitor that it should exit on it's next event |
| 543 | 543 |
// loop is enough |
| 544 |
- if container.State.IsRestarting() {
|
|
| 544 |
+ if container.State.Restarting {
|
|
| 545 | 545 |
return nil |
| 546 | 546 |
} |
| 547 | 547 |
|
| ... | ... |
@@ -70,7 +70,7 @@ func (daemon *Daemon) Containers(job *engine.Job) engine.Status {
|
| 70 | 70 |
writeCont := func(container *Container) error {
|
| 71 | 71 |
container.Lock() |
| 72 | 72 |
defer container.Unlock() |
| 73 |
- if !container.State.IsRunning() && !all && n <= 0 && since == "" && before == "" {
|
|
| 73 |
+ if !container.State.Running && !all && n <= 0 && since == "" && before == "" {
|
|
| 74 | 74 |
return nil |
| 75 | 75 |
} |
| 76 | 76 |
if before != "" && !foundBefore {
|
| ... | ... |
@@ -87,7 +87,7 @@ func (daemon *Daemon) Containers(job *engine.Job) engine.Status {
|
| 87 | 87 |
return errLast |
| 88 | 88 |
} |
| 89 | 89 |
} |
| 90 |
- if len(filt_exited) > 0 && !container.State.IsRunning() {
|
|
| 90 |
+ if len(filt_exited) > 0 && !container.State.Running {
|
|
| 91 | 91 |
should_skip := true |
| 92 | 92 |
for _, code := range filt_exited {
|
| 93 | 93 |
if code == container.State.GetExitCode() {
|
| ... | ... |
@@ -110,7 +110,7 @@ func (m *containerMonitor) Start() error {
|
| 110 | 110 |
defer func() {
|
| 111 | 111 |
if afterRun {
|
| 112 | 112 |
m.container.Lock() |
| 113 |
- m.container.State.SetStopped(exitStatus) |
|
| 113 |
+ m.container.State.setStopped(exitStatus) |
|
| 114 | 114 |
defer m.container.Unlock() |
| 115 | 115 |
} |
| 116 | 116 |
m.Close() |
| ... | ... |
@@ -123,7 +123,7 @@ func (m *containerMonitor) Start() error {
|
| 123 | 123 |
m.container.RestartCount++ |
| 124 | 124 |
|
| 125 | 125 |
if err := m.container.startLoggingToDisk(); err != nil {
|
| 126 |
- m.resetContainer() |
|
| 126 |
+ m.resetContainer(false) |
|
| 127 | 127 |
|
| 128 | 128 |
return err |
| 129 | 129 |
} |
| ... | ... |
@@ -138,7 +138,7 @@ func (m *containerMonitor) Start() error {
|
| 138 | 138 |
// if we receive an internal error from the initial start of a container then lets |
| 139 | 139 |
// return it instead of entering the restart loop |
| 140 | 140 |
if m.container.RestartCount == 0 {
|
| 141 |
- m.resetContainer() |
|
| 141 |
+ m.resetContainer(false) |
|
| 142 | 142 |
|
| 143 | 143 |
return err |
| 144 | 144 |
} |
| ... | ... |
@@ -154,7 +154,7 @@ func (m *containerMonitor) Start() error {
|
| 154 | 154 |
if m.shouldRestart(exitStatus) {
|
| 155 | 155 |
m.container.State.SetRestarting(exitStatus) |
| 156 | 156 |
m.container.LogEvent("die")
|
| 157 |
- m.resetContainer() |
|
| 157 |
+ m.resetContainer(true) |
|
| 158 | 158 |
|
| 159 | 159 |
// sleep with a small time increment between each restart to help avoid issues cased by quickly |
| 160 | 160 |
// restarting the container because of some types of errors ( networking cut out, etc... ) |
| ... | ... |
@@ -168,7 +168,7 @@ func (m *containerMonitor) Start() error {
|
| 168 | 168 |
continue |
| 169 | 169 |
} |
| 170 | 170 |
m.container.LogEvent("die")
|
| 171 |
- m.resetContainer() |
|
| 171 |
+ m.resetContainer(true) |
|
| 172 | 172 |
return err |
| 173 | 173 |
} |
| 174 | 174 |
} |
| ... | ... |
@@ -243,7 +243,7 @@ func (m *containerMonitor) callback(command *execdriver.Command) {
|
| 243 | 243 |
} |
| 244 | 244 |
} |
| 245 | 245 |
|
| 246 |
- m.container.State.SetRunning(command.Pid()) |
|
| 246 |
+ m.container.State.setRunning(command.Pid()) |
|
| 247 | 247 |
|
| 248 | 248 |
// signal that the process has started |
| 249 | 249 |
// close channel only if not closed |
| ... | ... |
@@ -260,8 +260,13 @@ func (m *containerMonitor) callback(command *execdriver.Command) {
|
| 260 | 260 |
|
| 261 | 261 |
// resetContainer resets the container's IO and ensures that the command is able to be executed again |
| 262 | 262 |
// by copying the data into a new struct |
| 263 |
-func (m *containerMonitor) resetContainer() {
|
|
| 263 |
+// if lock is true, then container locked during reset |
|
| 264 |
+func (m *containerMonitor) resetContainer(lock bool) {
|
|
| 264 | 265 |
container := m.container |
| 266 |
+ if lock {
|
|
| 267 |
+ container.Lock() |
|
| 268 |
+ defer container.Unlock() |
|
| 269 |
+ } |
|
| 265 | 270 |
|
| 266 | 271 |
if container.Config.OpenStdin {
|
| 267 | 272 |
if err := container.stdin.Close(); err != nil {
|
| ... | ... |
@@ -1,7 +1,6 @@ |
| 1 | 1 |
package daemon |
| 2 | 2 |
|
| 3 | 3 |
import ( |
| 4 |
- "encoding/json" |
|
| 5 | 4 |
"fmt" |
| 6 | 5 |
"sync" |
| 7 | 6 |
"time" |
| ... | ... |
@@ -10,7 +9,7 @@ import ( |
| 10 | 10 |
) |
| 11 | 11 |
|
| 12 | 12 |
type State struct {
|
| 13 |
- sync.RWMutex |
|
| 13 |
+ sync.Mutex |
|
| 14 | 14 |
Running bool |
| 15 | 15 |
Paused bool |
| 16 | 16 |
Restarting bool |
| ... | ... |
@@ -29,9 +28,6 @@ func NewState() *State {
|
| 29 | 29 |
|
| 30 | 30 |
// String returns a human-readable description of the state |
| 31 | 31 |
func (s *State) String() string {
|
| 32 |
- s.RLock() |
|
| 33 |
- defer s.RUnlock() |
|
| 34 |
- |
|
| 35 | 32 |
if s.Running {
|
| 36 | 33 |
if s.Paused {
|
| 37 | 34 |
return fmt.Sprintf("Up %s (Paused)", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)))
|
| ... | ... |
@@ -50,16 +46,6 @@ func (s *State) String() string {
|
| 50 | 50 |
return fmt.Sprintf("Exited (%d) %s ago", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)))
|
| 51 | 51 |
} |
| 52 | 52 |
|
| 53 |
-type jState State |
|
| 54 |
- |
|
| 55 |
-// MarshalJSON for state is needed to avoid race conditions on inspect |
|
| 56 |
-func (s *State) MarshalJSON() ([]byte, error) {
|
|
| 57 |
- s.RLock() |
|
| 58 |
- b, err := json.Marshal(jState(*s)) |
|
| 59 |
- s.RUnlock() |
|
| 60 |
- return b, err |
|
| 61 |
-} |
|
| 62 |
- |
|
| 63 | 53 |
func wait(waitChan <-chan struct{}, timeout time.Duration) error {
|
| 64 | 54 |
if timeout < 0 {
|
| 65 | 55 |
<-waitChan |
| ... | ... |
@@ -77,14 +63,14 @@ func wait(waitChan <-chan struct{}, timeout time.Duration) error {
|
| 77 | 77 |
// immediatly. If you want wait forever you must supply negative timeout. |
| 78 | 78 |
// Returns pid, that was passed to SetRunning |
| 79 | 79 |
func (s *State) WaitRunning(timeout time.Duration) (int, error) {
|
| 80 |
- s.RLock() |
|
| 81 |
- if s.IsRunning() {
|
|
| 80 |
+ s.Lock() |
|
| 81 |
+ if s.Running {
|
|
| 82 | 82 |
pid := s.Pid |
| 83 |
- s.RUnlock() |
|
| 83 |
+ s.Unlock() |
|
| 84 | 84 |
return pid, nil |
| 85 | 85 |
} |
| 86 | 86 |
waitChan := s.waitChan |
| 87 |
- s.RUnlock() |
|
| 87 |
+ s.Unlock() |
|
| 88 | 88 |
if err := wait(waitChan, timeout); err != nil {
|
| 89 | 89 |
return -1, err |
| 90 | 90 |
} |
| ... | ... |
@@ -95,14 +81,14 @@ func (s *State) WaitRunning(timeout time.Duration) (int, error) {
|
| 95 | 95 |
// immediatly. If you want wait forever you must supply negative timeout. |
| 96 | 96 |
// Returns exit code, that was passed to SetStopped |
| 97 | 97 |
func (s *State) WaitStop(timeout time.Duration) (int, error) {
|
| 98 |
- s.RLock() |
|
| 98 |
+ s.Lock() |
|
| 99 | 99 |
if !s.Running {
|
| 100 | 100 |
exitCode := s.ExitCode |
| 101 |
- s.RUnlock() |
|
| 101 |
+ s.Unlock() |
|
| 102 | 102 |
return exitCode, nil |
| 103 | 103 |
} |
| 104 | 104 |
waitChan := s.waitChan |
| 105 |
- s.RUnlock() |
|
| 105 |
+ s.Unlock() |
|
| 106 | 106 |
if err := wait(waitChan, timeout); err != nil {
|
| 107 | 107 |
return -1, err |
| 108 | 108 |
} |
| ... | ... |
@@ -110,28 +96,33 @@ func (s *State) WaitStop(timeout time.Duration) (int, error) {
|
| 110 | 110 |
} |
| 111 | 111 |
|
| 112 | 112 |
func (s *State) IsRunning() bool {
|
| 113 |
- s.RLock() |
|
| 113 |
+ s.Lock() |
|
| 114 | 114 |
res := s.Running |
| 115 |
- s.RUnlock() |
|
| 115 |
+ s.Unlock() |
|
| 116 | 116 |
return res |
| 117 | 117 |
} |
| 118 | 118 |
|
| 119 | 119 |
func (s *State) GetPid() int {
|
| 120 |
- s.RLock() |
|
| 120 |
+ s.Lock() |
|
| 121 | 121 |
res := s.Pid |
| 122 |
- s.RUnlock() |
|
| 122 |
+ s.Unlock() |
|
| 123 | 123 |
return res |
| 124 | 124 |
} |
| 125 | 125 |
|
| 126 | 126 |
func (s *State) GetExitCode() int {
|
| 127 |
- s.RLock() |
|
| 127 |
+ s.Lock() |
|
| 128 | 128 |
res := s.ExitCode |
| 129 |
- s.RUnlock() |
|
| 129 |
+ s.Unlock() |
|
| 130 | 130 |
return res |
| 131 | 131 |
} |
| 132 | 132 |
|
| 133 | 133 |
func (s *State) SetRunning(pid int) {
|
| 134 | 134 |
s.Lock() |
| 135 |
+ s.setRunning(pid) |
|
| 136 |
+ s.Unlock() |
|
| 137 |
+} |
|
| 138 |
+ |
|
| 139 |
+func (s *State) setRunning(pid int) {
|
|
| 135 | 140 |
s.Running = true |
| 136 | 141 |
s.Paused = false |
| 137 | 142 |
s.Restarting = false |
| ... | ... |
@@ -140,11 +131,15 @@ func (s *State) SetRunning(pid int) {
|
| 140 | 140 |
s.StartedAt = time.Now().UTC() |
| 141 | 141 |
close(s.waitChan) // fire waiters for start |
| 142 | 142 |
s.waitChan = make(chan struct{})
|
| 143 |
- s.Unlock() |
|
| 144 | 143 |
} |
| 145 | 144 |
|
| 146 | 145 |
func (s *State) SetStopped(exitCode int) {
|
| 147 | 146 |
s.Lock() |
| 147 |
+ s.setStopped(exitCode) |
|
| 148 |
+ s.Unlock() |
|
| 149 |
+} |
|
| 150 |
+ |
|
| 151 |
+func (s *State) setStopped(exitCode int) {
|
|
| 148 | 152 |
s.Running = false |
| 149 | 153 |
s.Restarting = false |
| 150 | 154 |
s.Pid = 0 |
| ... | ... |
@@ -152,7 +147,6 @@ func (s *State) SetStopped(exitCode int) {
|
| 152 | 152 |
s.ExitCode = exitCode |
| 153 | 153 |
close(s.waitChan) // fire waiters for stop |
| 154 | 154 |
s.waitChan = make(chan struct{})
|
| 155 |
- s.Unlock() |
|
| 156 | 155 |
} |
| 157 | 156 |
|
| 158 | 157 |
// SetRestarting is when docker hanldes the auto restart of containers when they are |
| ... | ... |
@@ -172,9 +166,9 @@ func (s *State) SetRestarting(exitCode int) {
|
| 172 | 172 |
} |
| 173 | 173 |
|
| 174 | 174 |
func (s *State) IsRestarting() bool {
|
| 175 |
- s.RLock() |
|
| 175 |
+ s.Lock() |
|
| 176 | 176 |
res := s.Restarting |
| 177 |
- s.RUnlock() |
|
| 177 |
+ s.Unlock() |
|
| 178 | 178 |
return res |
| 179 | 179 |
} |
| 180 | 180 |
|
| ... | ... |
@@ -191,8 +185,8 @@ func (s *State) SetUnpaused() {
|
| 191 | 191 |
} |
| 192 | 192 |
|
| 193 | 193 |
func (s *State) IsPaused() bool {
|
| 194 |
- s.RLock() |
|
| 194 |
+ s.Lock() |
|
| 195 | 195 |
res := s.Paused |
| 196 |
- s.RUnlock() |
|
| 196 |
+ s.Unlock() |
|
| 197 | 197 |
return res |
| 198 | 198 |
} |