Commit 9e6b79a5 authored by Richard Musiol's avatar Richard Musiol Committed by Ian Lance Taylor

syscall: use CLONE_VFORK and CLONE_VM

This greatly improves the latency of starting a child process when
the Go process is using a lot of memory. Even though the kernel uses
copy-on-write, preparation for that can take up to several 100ms under
certain conditions. All other goroutines are suspended while starting
a subprocess so this latency directly affects total throughput.

With CLONE_VM the child process shares the same memory with the parent
process. On its own this would lead to conflicting use of the same
memory, so CLONE_VFORK is used to suspend the parent process until the
child releases the memory when switching to to the new program binary
via the exec syscall. When the parent process continues to run, one
has to consider the changes to memory that the child process did,
namely the return address of the syscall function needs to be restored
from a register.

A simple benchmark has shown a difference in latency of 16ms vs. 0.5ms
at 10GB memory usage. However, much higher latencies of several 100ms
have been observed in real world scenarios. For more information see
comments on #5838.

Fixes #5838

Change-Id: I6377d7bd8dcd00c85ca0c52b6683e70ce2174ba6
Reviewed-on: https://go-review.googlesource.com/37439Reviewed-by: default avatarIan Lance Taylor <iant@golang.org>
Run-TryBot: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 0a0186fb
...@@ -111,6 +111,29 @@ ok2: ...@@ -111,6 +111,29 @@ ok2:
MOVQ $0, err+72(FP) MOVQ $0, err+72(FP)
RET RET
// func rawVforkSyscall(trap, a1 uintptr) (r1, err uintptr)
TEXT ·rawVforkSyscall(SB),NOSPLIT,$0-32
MOVQ a1+8(FP), DI
MOVQ $0, SI
MOVQ $0, DX
MOVQ $0, R10
MOVQ $0, R8
MOVQ $0, R9
MOVQ trap+0(FP), AX // syscall entry
POPQ R12 // preserve return address
SYSCALL
PUSHQ R12
CMPQ AX, $0xfffffffffffff001
JLS ok2
MOVQ $-1, r1+16(FP)
NEGQ AX
MOVQ AX, err+24(FP)
RET
ok2:
MOVQ AX, r1+16(FP)
MOVQ $0, err+24(FP)
RET
// func gettimeofday(tv *Timeval) (err uintptr) // func gettimeofday(tv *Timeval) (err uintptr)
TEXT ·gettimeofday(SB),NOSPLIT,$0-16 TEXT ·gettimeofday(SB),NOSPLIT,$0-16
MOVQ tv+0(FP), DI MOVQ tv+0(FP), DI
......
...@@ -95,9 +95,12 @@ func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr ...@@ -95,9 +95,12 @@ func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr
// About to call fork. // About to call fork.
// No more allocation or calls of non-assembly functions. // No more allocation or calls of non-assembly functions.
runtime_BeforeFork() runtime_BeforeFork()
if runtime.GOARCH == "s390x" { switch {
case runtime.GOARCH == "amd64" && sys.Cloneflags&CLONE_NEWUSER == 0:
r1, err1 = rawVforkSyscall(SYS_CLONE, uintptr(SIGCHLD|CLONE_VFORK|CLONE_VM)|sys.Cloneflags)
case runtime.GOARCH == "s390x":
r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0) r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
} else { default:
r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
} }
if err1 != 0 { if err1 != 0 {
......
...@@ -375,3 +375,7 @@ func (msghdr *Msghdr) SetControllen(length int) { ...@@ -375,3 +375,7 @@ func (msghdr *Msghdr) SetControllen(length int) {
func (cmsg *Cmsghdr) SetLen(length int) { func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint32(length) cmsg.Len = uint32(length)
} }
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno) {
panic("not implemented")
}
...@@ -134,3 +134,5 @@ func (msghdr *Msghdr) SetControllen(length int) { ...@@ -134,3 +134,5 @@ func (msghdr *Msghdr) SetControllen(length int) {
func (cmsg *Cmsghdr) SetLen(length int) { func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint64(length) cmsg.Len = uint64(length)
} }
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno)
...@@ -215,3 +215,7 @@ func (msghdr *Msghdr) SetControllen(length int) { ...@@ -215,3 +215,7 @@ func (msghdr *Msghdr) SetControllen(length int) {
func (cmsg *Cmsghdr) SetLen(length int) { func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint32(length) cmsg.Len = uint32(length)
} }
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno) {
panic("not implemented")
}
...@@ -139,3 +139,7 @@ const ( ...@@ -139,3 +139,7 @@ const (
SYS_EPOLL_CREATE = 1042 SYS_EPOLL_CREATE = 1042
SYS_EPOLL_WAIT = 1069 SYS_EPOLL_WAIT = 1069
) )
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno) {
panic("not implemented")
}
...@@ -198,3 +198,7 @@ func (msghdr *Msghdr) SetControllen(length int) { ...@@ -198,3 +198,7 @@ func (msghdr *Msghdr) SetControllen(length int) {
func (cmsg *Cmsghdr) SetLen(length int) { func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint64(length) cmsg.Len = uint64(length)
} }
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno) {
panic("not implemented")
}
...@@ -218,3 +218,7 @@ func (msghdr *Msghdr) SetControllen(length int) { ...@@ -218,3 +218,7 @@ func (msghdr *Msghdr) SetControllen(length int) {
func (cmsg *Cmsghdr) SetLen(length int) { func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint32(length) cmsg.Len = uint32(length)
} }
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno) {
panic("not implemented")
}
...@@ -115,3 +115,7 @@ func (msghdr *Msghdr) SetControllen(length int) { ...@@ -115,3 +115,7 @@ func (msghdr *Msghdr) SetControllen(length int) {
func (cmsg *Cmsghdr) SetLen(length int) { func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint64(length) cmsg.Len = uint64(length)
} }
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno) {
panic("not implemented")
}
...@@ -286,3 +286,7 @@ func (msghdr *Msghdr) SetControllen(length int) { ...@@ -286,3 +286,7 @@ func (msghdr *Msghdr) SetControllen(length int) {
func (cmsg *Cmsghdr) SetLen(length int) { func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint64(length) cmsg.Len = uint64(length)
} }
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno) {
panic("not implemented")
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment